In [None]:
import boto3
import os 
import fitz
import pandas as pd
import pytesseract
from PIL import Image

# Raw boto3 S3 client. NOTE(review): the name `s3` is rebound to an S3Client
# instance in a later cell, so this object is only reachable until that cell runs.
s3 = boto3.client('s3')

# Bucket that the S3Client instance in this notebook is constructed with.
BUCKET_NAME = "docs-search-bucket"
# Local download directory; same value as the `download_dir` default of S3Client.__init__.
DOWNLOAD_DIR = "../data"

In [None]:
class S3Client:
    """Small wrapper around a boto3 S3 client that is bound to one bucket.

    Instance attributes (set in ``__init__``):
        s3: boto3 S3 client used for every request.
        bucket_name: bucket that all operations target.
        download_dir: local directory that ``download_file`` writes into.
    """

    # Key suffixes reported by list_supported_files(); matched case-sensitively.
    SUPPORTED_SUFFIXES = ('.txt', '.pdf', '.csv', '.png')

    def __init__(self, bucket_name, download_dir="../data"):
        """Create the boto3 client and make sure ``download_dir`` exists.

        Args:
            bucket_name: name of the S3 bucket to operate on.
            download_dir: local directory for downloads; created if missing.
        """
        self.s3 = boto3.client('s3')
        self.bucket_name = bucket_name
        self.download_dir = download_dir
        os.makedirs(download_dir, exist_ok=True)

    def list_supported_files(self):
        """Return the keys in the bucket that end with a supported suffix.

        A single ``list_objects_v2`` call returns at most one page of keys, so
        the listing is driven through a paginator to cover the whole bucket.

        Returns:
            list[str]: matching keys in listing order; empty if none match.
        """
        paginator = self.s3.get_paginator('list_objects_v2')
        keys = []
        for page in paginator.paginate(Bucket=self.bucket_name):
            # 'Contents' is absent from a page that holds no objects.
            for entry in page.get('Contents', []):
                key = entry['Key']
                if key.endswith(self.SUPPORTED_SUFFIXES):
                    keys.append(key)
        return keys

    def download_file(self, s3_key):
        """Download ``s3_key`` into ``download_dir`` and return the local path.

        The local file name is the basename of the key, so two keys that share
        a basename under different prefixes are written to the same local file.

        Args:
            s3_key: key of the object to download.

        Returns:
            str: path of the downloaded file inside ``download_dir``.
        """
        local_path = os.path.join(self.download_dir, os.path.basename(s3_key))
        self.s3.download_file(self.bucket_name, s3_key, local_path)
        return local_path

    def generate_presigned_url(self, s3_key, expires_in=3600):
        """Return a presigned ``get_object`` URL for ``s3_key``.

        Args:
            s3_key: key of the object to expose.
            expires_in: value passed to boto3 as ``ExpiresIn``.

        Returns:
            Whatever the boto3 client's ``generate_presigned_url`` returns.
        """
        return self.s3.generate_presigned_url(
            'get_object',
            Params={'Bucket': self.bucket_name, 'Key': s3_key},
            ExpiresIn=expires_in,
        )

In [38]:
# Bucket wrapper used by the following cells. DOWNLOAD_DIR is passed explicitly so the
# configured constant, not the duplicated default in S3Client.__init__, decides the target.
s3 = S3Client(bucket_name=BUCKET_NAME, download_dir=DOWNLOAD_DIR)

In [43]:
# Keys in the bucket whose names end in .txt, .pdf, .csv or .png
# (the filter applied by S3Client.list_supported_files).
files = s3.list_supported_files()
files

['folder-1/sample document.pdf',
 'folder-2/Sample image.png',
 'folder-3/sample.csv']

In [44]:
# Download every listed key and keep the returned local paths in a named variable
# so later cells can use them without relying on `_` / Out[N].
local_paths = [s3.download_file(key) for key in files]
local_paths

['../data\\sample document.pdf',
 '../data\\Sample image.png',
 '../data\\sample.csv']

In [None]:
class TXTExtractor:
    """Extractor for plain-text files."""

    def extract_text(self, file_path: str) -> str:
        """Read ``file_path`` as UTF-8 text and return its full contents."""
        with open(file_path, mode='r', encoding='utf-8') as source:
            contents = source.read()
        return contents
        

class PDFExtractor():
    """Extractor for PDF files, backed by PyMuPDF (``fitz``)."""

    def extract_text(self, file_path: str) -> str:
        """Return the text of every page of ``file_path``, concatenated in page order.

        The document is closed before returning, including when text
        extraction raises, so the underlying file handle is not leaked.
        """
        doc = fitz.open(file_path)
        try:
            # Join once instead of growing a string page by page.
            return "".join(page.get_text() for page in doc)
        finally:
            doc.close()
    
class CSVExtractor:
    """Extractor for CSV files, backed by pandas."""

    def extract_text(self, file_path: str) -> str:
        """Load ``file_path`` with ``pd.read_csv`` and return ``DataFrame.to_string()``."""
        return pd.read_csv(file_path).to_string()
    
class ImageExtractor():
    """Extractor for image files: OCR via pytesseract on a PIL image."""

    def extract_text(self, file_path: str) -> str:
        """Open ``file_path`` with PIL and return ``pytesseract.image_to_string`` of it.

        The image is opened in a ``with`` block so it is closed once OCR has
        finished (or failed) instead of staying open.
        """
        with Image.open(file_path) as image:
            return pytesseract.image_to_string(image)

In [47]:
def get_extractor(file_path: str):
    """Return a new extractor instance chosen by the suffix of ``file_path``.

    The suffix is compared with a case-sensitive ``str.endswith`` against
    ".txt", ".pdf", ".csv" and ".png". Returns None when none of them match.
    """
    known_suffixes = (".txt", ".pdf", ".csv", ".png")
    matched = next((sfx for sfx in known_suffixes if file_path.endswith(sfx)), None)
    if matched is None:
        return None

    # Looked up only after a suffix matched; unsupported paths return above.
    extractor_by_suffix = {
        ".txt": TXTExtractor,
        ".pdf": PDFExtractor,
        ".csv": CSVExtractor,
        ".png": ImageExtractor,
    }
    return extractor_by_suffix[matched]()

In [2]:
from elasticsearch import Elasticsearch