In [20]:
from glob import glob
import os
import sys
from io import BytesIO
from pypdf import PdfReader
import datetime
import time
import sqlite3

In [51]:
"""
Searches a directory for PDF files and returns a list of their paths.
Args: directory_path (str): The path to the directory to search.
Returns: list: A list of file paths to PDF files in the directory.
"""

def find_pdfs_in_directory(directory_path):
    filepathlist = [];
    for root, subdirs, files in os.walk(directory_path):
        for file in files:
            absolute_path = os.path.join(root, file)
            try:
                with open(absolute_path, "rb") as x:
                    bytes_stream = BytesIO(x.read())
                    if(bytes_stream.getvalue()[:4] == b'%PDF'):
                        filepathlist.append(absolute_path)
            except PermissionError:
                continue
    return filepathlist

"""
Converts a list into a dictionary with keys as filenames and values as filepaths.
Args: pdf list (str): A list of file paths to PDF files.
Returns: dict (str:str): keys as filenames and values as absolute filepaths.
"""

def list_to_dict(pdflist):
    pdfdict = {}
    for i in range(len(pdflist)):
        filename = os.path.basename(pdflist[i])
        reader = PdfReader(pdflist[i])
        num_pages = len(reader.pages)
        temp = os.path.getatime(pdflist[i])
        last_access_time = str(format(datetime.datetime.fromtimestamp(temp)))
        #last_access_time = time.strftime('%Y-%m-%d %H:%M:%S', )
        size_bytes = os.path.getsize(pdflist[i])
        if( pdfdict.get(pdflist[i]) == None):
            pdfdict.update({pdflist[i]:[filename, num_pages, size_bytes, last_access_time[0:19]]})
        else:`
            continue
    return pdfdict

"""

"""
def load_dict_to_db(pdfdict):
    con = sqlite3.connect("pdfinfo.db")
    cur = con.cursor()
    cur.execute("CREATE TABLE IF NOT EXISTS pdfinfo(absolute filename, filename, number of pages, file size bytes )")
    for key, values in pdfdict.items():
        command = key + ',' + values[0] + ',' + str(values[1]) + ',' + str(values[2]) + ',' + str(values[3])
        cur.execute("INSERT INTO pdfinfo (" + command + ")")
        
        
    

In [52]:
filepathlist = find_pdfs_in_directory("C:/Users/Viraj Gaur/Desktop/")
pdfdict = list_to_dict(filepathlist)
                        

In [53]:
pdfdict

{'C:/Users/Viraj Gaur/Desktop/Dungeons and dragons\\John_Nolan.pdf': ['John_Nolan.pdf',
  2,
  322923,
  '2025-06-14 09:52:58'],
 'C:/Users/Viraj Gaur/Desktop/Dungeons and dragons\\Lockgrim martin.pdf': ['Lockgrim martin.pdf',
  3,
  476357,
  '2025-06-14 09:52:58'],
 'C:/Users/Viraj Gaur/Desktop/Dungeons and dragons\\Lockgrim_Martin_PNG.pdf': ['Lockgrim_Martin_PNG.pdf',
  3,
  469113,
  '2025-06-14 09:52:58'],
 'C:/Users/Viraj Gaur/Desktop/Dungeons and dragons\\Nym_polip_img.pdf': ['Nym_polip_img.pdf',
  3,
  1065418,
  '2025-06-14 09:52:58'],
 'C:/Users/Viraj Gaur/Desktop/Dungeons and dragons\\StonemyClarkson.pdf': ['StonemyClarkson.pdf',
  3,
  951852,
  '2025-06-14 09:52:58'],
 'C:/Users/Viraj Gaur/Desktop/job applications\\Cover letter\\CCP_coverletter.pdf': ['CCP_coverletter.pdf',
  1,
  84917,
  '2025-06-14 09:52:58'],
 'C:/Users/Viraj Gaur/Desktop/job applications\\Cover letter\\Cover_letter.pdf': ['Cover_letter.pdf',
  1,
  634736,
  '2025-06-14 09:52:58'],
 'C:/Users/Viraj Ga

In [54]:
load_dict_to_db(pdfdict)

OperationalError: unrecognized token: ":"