In [1]:
import os
import mimetypes
import pdfplumber
import pytesseract
from PIL import Image
import pandas as pd
import json

BLOCK_SIZE = 8

def extract_and_chunk(file_path):
    filetype = mimetypes.guess_type(file_path)[0]

    if filetype == 'application/pdf':
        return chunk_pdf(file_path)
    elif filetype == 'text/plain':
        return chunk_text(open(file_path).read())
    elif filetype == 'text/csv':
        return chunk_text(pd.read_csv(file_path).to_string(index=False))
    elif filetype == 'application/json':
        return chunk_text(json.dumps(json.load(open(file_path)), indent=2))
    elif filetype and filetype.startswith('image/'):
        return [handle_image(file_path)]
    else:
        raise ValueError("Unsupported file type.")

def chunk_pdf(file_path):
    with pdfplumber.open(file_path) as pdf:
        pages = [page.extract_text() or "" for page in pdf.pages]
        return ["\n".join(pages[i:i+BLOCK_SIZE]) for i in range(0, len(pages), BLOCK_SIZE)]

def chunk_text(text):
    paras = text.split("\n\n")
    return ["\n\n".join(paras[i:i+BLOCK_SIZE]) for i in range(0, len(paras), BLOCK_SIZE)]

def handle_image(file_path):
    text = pytesseract.image_to_string(Image.open(file_path)).strip()
    if len(text.split()) > 10:
        return text 
    else:
        return f"[IMAGE]{file_path}"


In [2]:
import os
import mimetypes

def detect_and_extract_text(file_path):
    filetype = mimetypes.guess_type(file_path)[0]

    if filetype == 'application/pdf':
        return extract_text_from_pdf(file_path)
    elif filetype == 'text/plain':
        return extract_text_from_txt(file_path)
    elif filetype == 'text/csv':
        return extract_text_from_csv(file_path)
    elif filetype == 'application/json':
        return extract_text_from_json(file_path)
    elif filetype and filetype.startswith('image/'):
        return extract_text_from_image(file_path)
    else:
        raise ValueError("Unsupported file type.")

In [35]:
from PIL import Image
from google import genai
import os
import dotenv
dotenv.load_dotenv()

client = genai.Client(api_key=os.getenv("GEMINI_API_KEY"))

def convert_to_markdown_with_gemini(content):
    if content.startswith("[IMAGE]"):
        path = content.replace("[IMAGE]", "").strip()
        image = Image.open(path)
        response = client.models.generate_content(
            model="gemini-2.0-flash",
            contents=[image, """Generate a single-line caption for the image in plain text. Do not include any Markdown formatting or image syntax—only output the caption itself."""]
        )
        caption = response.text.strip()
        return f"![{caption}]({os.path.basename(path)})"
    
    else:
        prompt = """Generate only the Markdown for the following content. The output should use actual line breaks and spacing as required by Markdown syntax—do not include \n, escaped characters, or any other formatting artifacts. The result should be ready to paste directly into a .md file and render correctly."""
        response = client.models.generate_content(
            model="gemini-2.0-flash",
            contents=[content, prompt]
        )

        return response.text.strip()

In [None]:
import requests
import uuid
def pipeline(file_path):
    content_blocks = extract_and_chunk(file_path)
    markdown_blocks = [convert_to_markdown_with_gemini(block) for block in content_blocks]
    full_markdown = "\n\n".join(markdown_blocks)
    requests.post("http://localhost:9876/addData", json={"id": uuid.uuid4().hex,"text": full_markdown})
    print(full_markdown)
    return full_markdown

In [33]:
# pipeline("./cat.jpg")
# pipeline("./test-123.pdf")
pipeline('./emc-2.csv')
# pipeline('./ss.png')
# pipeline('./emc-2.json')

```markdown
| Event Name              | Team Reg | Team Part | External Reg | External Part | 1st Reg | 1st Part | 2nd Reg | 2nd Part | 2nd Lateral Reg | 2nd Lateral Part | 3rd Reg | 3rd Part | 3rd Lateral Reg | 3rd Lateral Part | 4th Reg | 4th Part | 4th Lateral Reg | 4th Lateral Part |
|-------------------------|----------|-----------|--------------|---------------|---------|----------|---------|----------|-----------------|--------------------|---------|----------|-----------------|--------------------|---------|----------|-----------------|--------------------|
| Animeverse              | 45       | 39        | 18           | 15            | 53      | 51       | 29      | 29       | 8               | 6                  | 13      | 11       | 1               | 1                  | 1       | 1        | 1               | 1                  |
| Locked in Reality       | 38       | 32        | 21           | 15            | 51      | 51       | 24      | 13       | 1               | 0  

'```markdown\n| Event Name              | Team Reg | Team Part | External Reg | External Part | 1st Reg | 1st Part | 2nd Reg | 2nd Part | 2nd Lateral Reg | 2nd Lateral Part | 3rd Reg | 3rd Part | 3rd Lateral Reg | 3rd Lateral Part | 4th Reg | 4th Part | 4th Lateral Reg | 4th Lateral Part |\n|-------------------------|----------|-----------|--------------|---------------|---------|----------|---------|----------|-----------------|--------------------|---------|----------|-----------------|--------------------|---------|----------|-----------------|--------------------|\n| Animeverse              | 45       | 39        | 18           | 15            | 53      | 51       | 29      | 29       | 8               | 6                  | 13      | 11       | 1               | 1                  | 1       | 1        | 1               | 1                  |\n| Locked in Reality       | 38       | 32        | 21           | 15            | 51      | 51       | 24      | 13       | 1               

In [32]:
searchResp = requests.post("http://localhost:9876/searchData", json={
  "text": "sherlocked",
  "n_results": 3
})
print(searchResp.json())

{'results': [{'id': 'bc42a5a8662345308cd073a39074af08', 'text': '![A grey and white cat with wide eyes looking intently.](cat.jpg)'}, {'id': 'doc5', 'text': 'Search is done using a text query and returns the top N similar documents.'}, {'id': 'doc2', 'text': 'ChromaDB is used for managing and querying embeddings efficiently.'}], 'status': 'success'}
