IMPORTS NEEDED FOR THE CODE

In [4]:
import os
import io
import re
import json
import requests
import tempfile
from datetime import datetime

from fastapi import FastAPI, HTTPException
from pydantic import BaseModel
from pymongo import MongoClient

from PIL import Image
import pytesseract
import fitz
from docx import Document

import ollama

MongoDB CONNECTION

In [5]:
# MongoDB connection used by the whole service.
#
# The connection string is taken from the MONGODB_URI environment variable when
# it is set, so deployments can supply their own credentials without editing
# this file.
#
# SECURITY(review): the fallback URI below embeds a username and password in
# source. It is kept only so existing setups keep working unchanged; the
# credential should be rotated and the fallback removed once MONGODB_URI is
# configured everywhere.
client = MongoClient(
    os.environ.get(
        "MONGODB_URI",
        "mongodb+srv://daktrboys05_db_user:gdgclubproject@to-do-list.qmqixqe.mongodb.net/",
    )
)
db = client["tries_db"]
# NOTE: the name "assingment" (sic) is referenced by fetch_submission, so it is
# kept as-is here; it is the handle for the "submissions" collection.
assingment = db["submissions"]
# Collection that receives the extracted question blocks.
questions_collection = db["questions"]



CREATE FASTAPI APP

In [6]:
# FastAPI application instance; the ingest endpoint in this file is registered
# on it through the @app.post decorator.
app = FastAPI()

REQUEST MODEL

In [7]:
class EvaluateRequest(BaseModel):
    """Request body for the ingest endpoint.

    Both identifiers are copied onto every question-block document the
    endpoint stores and are echoed back in its response.
    """

    # Identifier of the exam the ingested file belongs to.
    exam_id: str
    # Identifier of the submission; passed to fetch_submission as the lookup key.
    submission_id: str

SAFE JSON

In [8]:
def safe_json(text: str):
    """Best-effort extraction of a JSON object embedded in free text.

    The span from the first ``{`` to the last ``}`` in *text* is parsed as
    JSON. If that fails, a normalised copy is tried in which single quotes
    become double quotes and newlines/tabs become spaces.

    Args:
        text: Arbitrary text that may contain a JSON object.

    Returns:
        The parsed value, or an empty dict when nothing parseable is found
        or any error occurs (this function never raises).
    """
    try:
        found = re.search(r"\{.*\}", text, re.DOTALL)
        if found is None:
            return {}

        candidate = found.group(0)
        # Fallback form: tolerate Python-style quoting and raw control
        # whitespace, both of which strict JSON rejects.
        normalised = (
            candidate.replace("'", '"').replace("\n", " ").replace("\t", " ")
        )

        for attempt in (candidate, normalised):
            try:
                return json.loads(attempt)
            except Exception:
                continue

        return {}

    except Exception:
        return {}


DOWNLOAD FILE

In [9]:
def download_file(file_url: str) -> str:
    """Download *file_url* to a temporary file and return that file's path.

    The temporary file keeps the extension found in the URL path (the part
    before any ``?`` query string). It is created with ``delete=False``, so
    the caller is responsible for removing it when done.

    Args:
        file_url: URL of the file to fetch.

    Returns:
        Path of the temporary file holding the downloaded bytes.

    Raises:
        requests.RequestException: If the request fails or the server
            answers with an error status.
    """
    # The response is used as a context manager so the streamed connection is
    # always released, including on error.
    with requests.get(file_url, stream=True, timeout=20) as response:
        response.raise_for_status()

        ext = os.path.splitext(file_url.split("?")[0])[1]

        tmp = tempfile.NamedTemporaryFile(delete=False, suffix=ext)
        try:
            with tmp:
                for chunk in response.iter_content(8192):
                    tmp.write(chunk)
        except Exception:
            # Do not leave a partially written file behind when the transfer
            # breaks midway; the caller never learns its path.
            os.remove(tmp.name)
            raise

        return tmp.name

PDF EXTRACTION

In [10]:
def extract_text_from_pdf(pdf_path):
    """Extract text from a PDF, including OCR of the images on each page.

    For every page, the page's text layer is collected first (when it is
    non-empty), followed by the OCR output of each image listed for that
    page (when the OCR output is non-empty).

    Args:
        pdf_path: Path of the PDF file to read.

    Returns:
        A list of stripped, non-empty text fragments in page order.
    """
    texts = []

    # Open the document as a context manager so it is closed even when
    # extraction fails part-way; an open handle can otherwise keep the
    # underlying file locked when the caller tries to delete it.
    with fitz.open(pdf_path) as doc:
        for page in doc:
            page_text = page.get_text().strip()
            if page_text:
                texts.append(page_text)

            for img in page.get_images(full=True):
                # The first item of each entry is passed to extract_image to
                # obtain the raw image bytes.
                base = doc.extract_image(img[0])
                with Image.open(io.BytesIO(base["image"])) as img_pil:
                    ocr = pytesseract.image_to_string(
                        img_pil, lang="eng", config="--psm 6"
                    ).strip()
                if ocr:
                    texts.append(ocr)

    return texts


DOCX EXTRACTION

In [11]:
def extract_text_from_docx(docx_path):
    """Extract text from a .docx file, including OCR of embedded images.

    Text is collected in this order: body paragraphs, table cells, then for
    each section its header paragraphs followed by its footer paragraphs,
    and finally the OCR output of embedded images. Empty or whitespace-only
    fragments are skipped.

    Args:
        docx_path: Path of the .docx file to read.

    Returns:
        All collected fragments joined with newlines (an empty string when
        nothing was found).
    """
    doc = Document(docx_path)
    collected = []

    def add_if_text(value):
        # Keep only fragments that still contain something after stripping.
        stripped = value.strip()
        if stripped:
            collected.append(stripped)

    for para in doc.paragraphs:
        add_if_text(para.text)

    for table in doc.tables:
        for row in table.rows:
            for cell in row.cells:
                add_if_text(cell.text)

    for section in doc.sections:
        for para in section.header.paragraphs:
            add_if_text(para.text)
        for para in section.footer.paragraphs:
            add_if_text(para.text)

    for rel in doc.part.rels.values():
        # External relationships (e.g. hyperlinks or linked pictures) have no
        # target part inside the package; reading target_part on them raises,
        # so a URL that merely contains "image" must not reach the OCR step.
        if rel.is_external:
            continue
        if "image" in rel.target_ref:
            with Image.open(io.BytesIO(rel.target_part.blob)) as img:
                text = pytesseract.image_to_string(
                    img, lang="eng", config="--psm 6"
                )
            add_if_text(text)

    return "\n".join(collected)


IMAGE EXTRACTION

In [12]:
def extract_text_from_image(image_path):
    """Run OCR on an image file and return the recognised text.

    Args:
        image_path: Path of the image file to read.

    Returns:
        The OCR output with leading and trailing whitespace removed.
    """
    # Close the image as soon as OCR is done; a lingering open handle can
    # prevent the caller from deleting the file afterwards.
    with Image.open(image_path) as img:
        return pytesseract.image_to_string(img).strip()


QUESTION SPLITTER

In [13]:
def split_by_questions(text: str):
    """Split *text* into question blocks and label each one.

    A block starts at a marker of the form ``[<n> mark(s)] Question:``
    (case-insensitive) and runs up to the next such marker or the end of
    the text. Anything before the first marker is dropped.

    Args:
        text: Full extracted text of a document.

    Returns:
        A list with one string per block, each prefixed by a
        ``[QUESTION_<k>_START]`` line (k counts from 1). Empty when the
        text contains no marker.
    """
    pattern = r"(\[\d+\s*marks?\]\s*Question:.*?)(?=\[\d+\s*marks?\]\s*Question:|$)"

    labelled = []
    found_blocks = re.finditer(pattern, text, flags=re.IGNORECASE | re.DOTALL)
    for number, found in enumerate(found_blocks, start=1):
        body = found.group(1).strip()
        labelled.append(f"[QUESTION_{number}_START]\n{body}")

    return labelled


FILE TYPE HANDLING

In [14]:
def extract_text(file_path):
    """Extract the text of a file and split it into question blocks.

    The extractor is chosen from the file extension (case-insensitive):
    ``.pdf``, ``.docx``, or an image (``.png``, ``.jpg``, ``.jpeg``).

    Args:
        file_path: Path of the file to process.

    Returns:
        The list produced by ``split_by_questions`` for the extracted text.

    Raises:
        ValueError: If the extension is not one of the supported types.
    """
    suffix = os.path.splitext(file_path)[1].lower()

    if suffix == ".pdf":
        # The PDF extractor yields a list of fragments; join them first.
        raw_text = "\n".join(extract_text_from_pdf(file_path))
    elif suffix == ".docx":
        raw_text = extract_text_from_docx(file_path)
    elif suffix in (".png", ".jpg", ".jpeg"):
        raw_text = extract_text_from_image(file_path)
    else:
        raise ValueError("Unsupported file type")

    return split_by_questions(raw_text)


FETCH ASSIGNMENT FROM MONGODB

In [None]:
def fetch_submission(submission_id: str):
    """Look up a single submission by its ``_id``.

    Only the ``answer_file_url`` field is requested through the projection.

    Args:
        submission_id: Value matched against the document's ``_id``.

    Returns:
        Whatever ``find_one`` returns for the query (per PyMongo, the
        matching document or ``None`` when there is no match).
    """
    query = {"_id": submission_id}
    projection = {"answer_file_url": 1}
    return assingment.find_one(query, projection)

MAIN ENDPOINT

In [None]:
@app.post("/ingest/anserkey-file")
def ingest_from_file(payload: EvaluateRequest):
    """Download a submission's file, split it into question blocks and store them.

    Steps: look up the submission, download the file referenced by its
    ``answer_file_url``, extract and split the text, then insert one
    document per question block into ``questions_collection``. The
    downloaded temporary file is removed afterwards in every case.

    Args:
        payload: Request body carrying ``exam_id`` and ``submission_id``.

    Returns:
        A dict with the two identifiers, the number of inserted blocks
        (``inserted_blocks``) and their ids as strings (``ids``).

    Raises:
        HTTPException: 404 when the submission does not exist; 400 when it
            has no file URL or the file type is unsupported; 502 when the
            file cannot be downloaded.
    """
    # Stays None until a file has actually been downloaded, so the cleanup
    # below can never touch a path this request did not create.
    local_file_path = None

    try:
        submission = fetch_submission(payload.submission_id)
        if not submission:
            raise HTTPException(status_code=404, detail="Submission not found")

        file_url = submission.get("answer_file_url")
        if not file_url:
            raise HTTPException(status_code=400, detail="No file URL in submission")

        try:
            local_file_path = download_file(file_url)
        except requests.RequestException as exc:
            raise HTTPException(
                status_code=502, detail="Could not download submission file"
            ) from exc

        try:
            question_blocks = extract_text(local_file_path)
        except ValueError as exc:
            # extract_text raises ValueError for unsupported file types;
            # report that as a client error instead of a server crash.
            raise HTTPException(status_code=400, detail=str(exc)) from exc

        inserted_ids = []

        for idx, block in enumerate(question_blocks, start=1):
            doc = {
                "exam_id": payload.exam_id,
                "submission_id": payload.submission_id,
                "block_number": idx,
                "raw_text": block,
                "created_at": datetime.utcnow()
            }

            result = questions_collection.insert_one(doc)
            inserted_ids.append(str(result.inserted_id))

        return {
            "exam_id": payload.exam_id,
            "submission_id": payload.submission_id,
            "inserted_blocks": len(inserted_ids),
            "ids": inserted_ids
        }

    finally:
        # Best-effort cleanup of the temporary download; a failure to delete
        # must not mask the real outcome of the request.
        if local_file_path and os.path.isfile(local_file_path):
            try:
                os.remove(local_file_path)
            except OSError:
                pass
    