In [1]:
import boto3
import os 
import fitz
import pandas as pd
import pytesseract
from PIL import Image

# Name of the S3 bucket that S3Client is constructed with further down.
BUCKET_NAME = "docs-search-bucket"
# Relative local directory path; same value as S3Client's default download_dir.
DOWNLOAD_DIR = "../data"

In [None]:
import boto3
import os 

class S3Client:
    """Small wrapper around a boto3 S3 client bound to one bucket.

    Args:
        bucket_name: Name of the bucket to list and download from.
        download_dir: Local directory that downloads are written to. It is
            created on construction if it does not exist.
    """

    # Key suffixes reported by list_supported_files(); the match is
    # case-sensitive, exactly like str.endswith.
    SUPPORTED_SUFFIXES = ('.txt', '.pdf', '.csv', '.png')

    def __init__(self, bucket_name, download_dir="../data"):
        self.s3 = boto3.client('s3')
        self.bucket_name = bucket_name
        self.download_dir = download_dir
        os.makedirs(download_dir, exist_ok=True)

    def list_supported_files(self):
        """Return the keys of every object in the bucket with a supported suffix.

        A single list_objects_v2 call returns at most one page of results, so
        the paginator is used to walk the whole bucket instead of silently
        stopping after the first page.

        Returns:
            list[str]: Matching object keys, in the order S3 returned them.
        """
        paginator = self.s3.get_paginator('list_objects_v2')
        keys = []
        for page in paginator.paginate(Bucket=self.bucket_name):
            # 'Contents' is absent from a page that holds no objects.
            for obj in page.get('Contents', []):
                if obj['Key'].endswith(self.SUPPORTED_SUFFIXES):
                    keys.append(obj['Key'])
        return keys

    def download_file(self, s3_key):
        """Download one object into download_dir and return its local path.

        The local file is named after the last path component of the key, so
        two keys that share a base name map to the same local file.

        Args:
            s3_key: Key of the object to download.

        Returns:
            str: Path of the downloaded file.
        """
        local_path = os.path.join(self.download_dir, os.path.basename(s3_key))
        self.s3.download_file(self.bucket_name, s3_key, local_path)
        return local_path

In [3]:
class TXTExtractor:
    """Extractor for plain-text files."""

    def extract_text(self, file_path: str) -> str:
        """Read the file at file_path as UTF-8 and return its full contents."""
        with open(file_path, mode='r', encoding='utf-8') as source:
            contents = source.read()
        return contents
        

class PDFExtractor():
    """Extractor for PDF files, backed by PyMuPDF (imported as fitz)."""

    def extract_text(self, file_path: str) -> str:
        """Return the text of every page of the PDF, concatenated in page order.

        Args:
            file_path: Path of the PDF file to open.

        Returns:
            str: The page texts joined with no separator.
        """
        # The document is used as a context manager so the underlying file is
        # released even when text extraction raises.
        with fitz.open(file_path) as doc:
            return "".join(page.get_text() for page in doc)
    
class CSVExtractor:
    """Extractor for CSV files, backed by pandas."""

    def extract_text(self, file_path: str) -> str:
        """Load the CSV and return its rendered table flattened onto one line.

        The frame is rendered with DataFrame.to_string() and every newline in
        that rendering is turned into a single space.
        """
        table = pd.read_csv(file_path)
        rendered_lines = table.to_string().split('\n')
        return ' '.join(rendered_lines)
    
class ImageExtractor():
    """Extractor for image files, using pytesseract OCR on a PIL image."""

    def extract_text(self, file_path: str) -> str:
        """Run OCR on the image at file_path and return the recognised text.

        Args:
            file_path: Path of the image file to open.

        Returns:
            str: Whatever pytesseract.image_to_string produces for the image.
        """
        # Image.open keeps the file handle open; the context manager closes
        # it once OCR has finished (or failed).
        with Image.open(file_path) as image:
            return pytesseract.image_to_string(image)
    
def get_extractor(file_path: str):
    """Pick an extractor instance from the suffix of file_path.

    The suffix comparison is case-sensitive. Returns a new TXTExtractor,
    PDFExtractor, CSVExtractor or ImageExtractor for ".txt", ".pdf", ".csv"
    and ".png" respectively, and None for anything else.
    """
    suffix_to_class = (
        (".txt", TXTExtractor),
        (".pdf", PDFExtractor),
        (".csv", CSVExtractor),
        (".png", ImageExtractor),
    )
    for suffix, extractor_class in suffix_to_class:
        if file_path.endswith(suffix):
            return extractor_class()
    return None

In [4]:
# conn = psycopg2.connect(
#     host="localhost",
#     user="postgres",
#     password="admin",
#     database="docsearch"  # default database
# )

# cur = conn.cursor()

# # Query to get all table names from the current schema
# cur.execute("""
#     SELECT table_name
#     FROM information_schema.tables
#     WHERE table_schema = 'public'
#     AND table_type = 'BASE TABLE';
# """)

# tables = cur.fetchall()

# print("Tables in the database:")
# for table in tables:
#     print(table[0])

# cur.close()
# conn.close()


In [5]:
# conn = psycopg2.connect(
#     host="localhost",
#     user="postgres",
#     password="admin",
#     database="docsearch"  # default database
# )

# conn.autocommit = True 

# cur = conn.cursor()
# cur.execute("CREATE DATABASE docsearch;")
# print("✅ Database 'docsearch' created successfully.")

# cur.close()
# conn.close()

In [None]:
import psycopg2
from psycopg2.extras import RealDictCursor

# NOTE(review): the connection credentials are hard-coded in the notebook;
# consider reading them from the environment before sharing it.
conn = psycopg2.connect(
    host="localhost",
    user="postgres",
    password="admin",
    database="docsearch"
)

try:
    # RealDictCursor returns dict-like rows keyed by column name.
    with conn.cursor(cursor_factory=RealDictCursor) as cur:
        cur.execute("SELECT file_name from documents;")
        response = cur.fetchall()
finally:
    # Close the connection even if the query fails.
    conn.close()

# Rows are keyed by column name, so positional indexing (row[0]) would raise
# KeyError here; look the value up by its column instead.
files = [row["file_name"] for row in response]

In [19]:
files

['folder-1/sample document.pdf',
 'folder-2/Sample image.png',
 'folder-3/sample.csv']

In [32]:
from psycopg2.extras import RealDictCursor

def list_file_name():
    """Return the file_name of every row in the documents table.

    Opens a new connection to the local "docsearch" database for the call and
    closes it before returning, so repeated calls do not accumulate open
    connections.

    Returns:
        list[str]: One file_name per row, in the order the query returned them.
    """
    # NOTE(review): credentials are hard-coded; consider reading them from
    # the environment.
    conn = psycopg2.connect(
        host="localhost",
        user="postgres",
        password="admin",
        database="docsearch")
    try:
        with conn.cursor() as cur:
            cur.execute("SELECT file_name from documents;")
            # Default cursor rows are tuples; the single column is at index 0.
            return [row[0] for row in cur.fetchall()]
    finally:
        conn.close()

In [33]:
list_file_name()

['folder-1/sample document.pdf',
 'folder-2/Sample image.png',
 'folder-3/sample.csv']

In [1]:
# Sample string whose UTF-8 byte length is measured in the next cell.
content = "salman is working"

In [2]:
# Number of bytes the string occupies once encoded as UTF-8.
len(content.encode('utf-8'))

17

In [9]:
# Evaluates to 10485760 (10 * 2**20); presumably 10 MiB in bytes, e.g. a size
# limit -- TODO confirm what it is used for.
10*1024*1024

10485760

In [None]:
# Same value as 10*1024*1024.
10485760

In [6]:
import psycopg2
from psycopg2.extras import RealDictCursor

class PostgresClient:
    """Stores extracted document text in PostgreSQL and searches it.

    One connection is opened on construction and reused for every call. Each
    public operation runs in its own transaction: it is committed on success
    and rolled back on error, so a failed statement does not leave the
    connection unusable for later calls.

    Args:
        db_name: Database to connect to.
        user: Database user.
        password: Password for that user.
        host: Server host name.
        port: Server port.
    """

    # NOTE(review): the default password is hard-coded; callers should pass
    # credentials explicitly rather than rely on the default.
    def __init__(self, db_name='docsearch', user='postgres', password='admin', host='localhost', port='5432'):
        self.conn = psycopg2.connect(dbname=db_name, user=user, password=password, host=host, port=port)
        self._create_table()

    def _create_table(self):
        """Create the documents table if it does not exist yet."""
        # "with connection" commits on success and rolls back on exception;
        # it does not close the connection.
        with self.conn:
            with self.conn.cursor() as cur:
                cur.execute('''
                    CREATE TABLE IF NOT EXISTS documents (
                        id SERIAL PRIMARY KEY,
                        file_name TEXT,
                        content TEXT
                    )
                ''')

    def index_file(self, file_name, content):
        """Store content under file_name, replacing any earlier entry for it.

        Existing rows for the same file_name are removed in the same
        transaction as the insert, so indexing a file again does not create
        duplicate rows (and therefore duplicate search hits).

        Args:
            file_name: Identifier stored in the file_name column.
            content: Extracted text stored in the content column.
        """
        with self.conn:
            with self.conn.cursor() as cur:
                cur.execute('DELETE FROM documents WHERE file_name = %s', (file_name,))
                cur.execute('INSERT INTO documents (file_name, content) VALUES (%s, %s)',
                            (file_name, content))

    @staticmethod
    def _escape_like(text):
        """Escape LIKE metacharacters so text is matched literally."""
        # Backslash must be escaped first, otherwise the escapes added for
        # '%' and '_' would themselves be doubled.
        return text.replace('\\', '\\\\').replace('%', '\\%').replace('_', '\\_')

    def search(self, query):
        """Return rows for documents whose content contains query.

        The match is a case-insensitive substring match. '%' and '_' inside
        query are matched literally rather than acting as wildcards.

        Args:
            query: Text to look for.

        Returns:
            list: Dict-like rows with a single 'file_name' key.
        """
        pattern = '%' + self._escape_like(query) + '%'
        # Wrapping the read in a transaction block ends the implicit
        # transaction instead of leaving it open after the call.
        with self.conn:
            with self.conn.cursor(cursor_factory=RealDictCursor) as cur:
                cur.execute("SELECT file_name FROM documents WHERE content ILIKE %s", (pattern,))
                return cur.fetchall()

    def close(self):
        """Close the underlying database connection."""
        self.conn.close()

In [None]:
# Bucket that holds the source documents and the local directory they are
# downloaded into.
BUCKET_NAME = "docs-search-bucket"
DOWNLOAD_DIR = "../data"

# DOWNLOAD_DIR is passed explicitly so that changing the constant above
# actually changes where files are downloaded, instead of the client quietly
# falling back to its own default.
s3 = S3Client(bucket_name=BUCKET_NAME, download_dir=DOWNLOAD_DIR)
db = PostgresClient()

In [None]:
db.search('John')


[RealDictRow([('file_name', 'folder-3/sample.csv')]),
 RealDictRow([('file_name', 'folder-3/sample.csv')])]

In [9]:
# Location of the Tesseract executable. The TESSERACT_CMD environment
# variable overrides the default Windows install path. The override is only
# applied when the file exists, so on machines without it pytesseract keeps
# its own default lookup instead of pointing at a missing executable.
_tesseract_exe = os.environ.get(
    "TESSERACT_CMD", r"C:\Program Files\Tesseract-OCR\tesseract.exe"
)
if os.path.isfile(_tesseract_exe):
    pytesseract.pytesseract.tesseract_cmd = _tesseract_exe

In [None]:
def run_indexing():
    """Download every supported file from S3 and index its extracted text.

    Uses the notebook-level `s3` and `db` clients. Each listed key is
    downloaded, an extractor is chosen from the local path, and the extracted
    text is stored under the original S3 key. Files for which no extractor is
    available are skipped.
    """
    for s3_key in s3.list_supported_files():
        downloaded_path = s3.download_file(s3_key)
        reader = get_extractor(downloaded_path)
        if not reader:
            continue
        db.index_file(s3_key, reader.extract_text(downloaded_path))


