In [146]:
from sentence_transformers import SentenceTransformer

from PIL import Image
import numpy as np
import pytesseract
import fitz
import os
import io
from docx import Document
import regex as re

READING THE DOCUMENT

In [147]:
# Path of the document to process. extract_text() chooses the extractor from
# the file extension (.pdf, .docx, .png/.jpg/.jpeg).
file_path = "test.pdf"

In [148]:
def extract_text_from_docx(docx_path):
    """Extract the readable text of a .docx file as one newline-joined string.

    Text is collected in this order: body paragraphs, table cells,
    section headers/footers, then OCR output of embedded images.
    Blank (whitespace-only) pieces are dropped and every piece is stripped.

    Args:
        docx_path: Path (or file-like object) accepted by ``docx.Document``.

    Returns:
        str: All collected pieces joined with "\\n" ("" when nothing was found).
    """
    doc = Document(docx_path)
    collected = []

    # Body paragraphs
    for para in doc.paragraphs:
        text = para.text.strip()
        if text:
            collected.append(text)

    # Tables
    for table in doc.tables:
        for row in table.rows:
            for cell in row.cells:
                cell_text = cell.text.strip()
                if cell_text:
                    collected.append(cell_text)

    # Headers & footers
    for section in doc.sections:
        for part in (section.header, section.footer):
            # A header/footer linked to the previous section has no content of
            # its own; reading it would repeat the earlier section's text.
            if part.is_linked_to_previous:
                continue
            for para in part.paragraphs:
                part_text = para.text.strip()
                if part_text:
                    collected.append(part_text)

    # Embedded images → OCR
    for rel in doc.part.rels.values():
        # Select by relationship type rather than by target name: an external
        # hyperlink whose URL merely contains "image" has no target part, and
        # an embedded picture need not have "image" in its file name.
        if rel.is_external or not rel.reltype.endswith("/image"):
            continue

        try:
            img = Image.open(io.BytesIO(rel.target_part.blob))
        except OSError:
            # Pillow could not identify/decode this picture (vector formats
            # are a common case); skip it instead of aborting the extraction.
            continue

        with img:
            ocr_text = pytesseract.image_to_string(
                img,
                lang="eng",
                config="--psm 6"
            ).strip()
        if ocr_text:
            collected.append(ocr_text)

    return "\n".join(collected)

In [149]:
def extract_text_from_pdf(pdf_path):
    """Extract text from every page of a PDF, using OCR where needed.

    For each page, in order:
      1. the page's digital text layer (if non-empty),
      2. OCR text of each image referenced by the page,
      3. if the page has neither text nor images, OCR of the page rendered
         at 300 dpi.

    Args:
        pdf_path: Path of the PDF, as accepted by ``fitz.open``.

    Returns:
        list[str]: Stripped, non-empty text pieces in reading order.
    """
    texts = []
    # xref -> OCR text. The same image object (e.g. a logo) can be referenced
    # by many pages; OCR it once and reuse the result for every occurrence.
    ocr_by_xref = {}

    # Context manager guarantees the document is closed even if OCR fails.
    with fitz.open(pdf_path) as doc:
        for page in doc:
            # Digital text
            page_text = page.get_text().strip()
            if page_text:
                texts.append(page_text)

            # OCR embedded images
            images = page.get_images(full=True)
            for img in images:
                xref = img[0]
                if xref not in ocr_by_xref:
                    image_bytes = doc.extract_image(xref)["image"]
                    try:
                        img_pil = Image.open(io.BytesIO(image_bytes))
                    except OSError:
                        # Pillow cannot decode this image format; skip it
                        # rather than abort the whole document.
                        ocr_by_xref[xref] = ""
                    else:
                        with img_pil:
                            ocr_by_xref[xref] = pytesseract.image_to_string(
                                img_pil,
                                lang="eng",
                                config="--psm 6"
                            ).strip()

                if ocr_by_xref[xref]:
                    texts.append(ocr_by_xref[xref])

            # Full-page OCR fallback
            if not page_text and not images:
                pix = page.get_pixmap(dpi=300)
                page_img = Image.frombytes("RGB", [pix.width, pix.height], pix.samples)
                ocr_text = pytesseract.image_to_string(page_img, lang="eng").strip()

                if ocr_text:
                    texts.append(ocr_text)

    return texts


In [150]:
def extract_text_from_image(image_path):
    """OCR a single image file and return the recognised text.

    Args:
        image_path: Path of the image, as accepted by ``PIL.Image.open``.

    Returns:
        str: OCR output with leading/trailing whitespace removed.
    """
    # Open via a context manager so the underlying file handle is released
    # as soon as OCR has finished.
    with Image.open(image_path) as img:
        return pytesseract.image_to_string(img).strip()

In [None]:
import re

def split_by_questions(text: str) -> list[str]:
    """Split exam text into one chunk per question.

    A question starts at a marker of the form "[<n> mark(s)] Question:"
    (case-insensitive) and runs until the next such marker or the end of the
    text. Anything before the first marker is discarded.

    Args:
        text: Full extracted document text.

    Returns:
        list[str]: One stripped chunk per question, each prefixed with a
        "[QUESTION_<i>_START]" line (i counts from 1). Empty when no marker
        is found.
    """
    question_re = re.compile(
        r"(\[\d+\s*marks?\]\s*Question:.*?)(?=\[\d+\s*marks?\]\s*Question:|$)",
        flags=re.IGNORECASE | re.DOTALL,
    )
    return [
        f"[QUESTION_{number}_START]\n{body.strip()}"
        for number, body in enumerate(question_re.findall(text), start=1)
    ]

In [152]:
def extract_text(file_path):
    """Extract text records from a document, dispatching on file extension.

    * ``.pdf``  – page texts are joined and split into per-question chunks
      with ``split_by_questions``.
    * ``.docx`` / ``.png`` / ``.jpg`` / ``.jpeg`` – the whole extracted text
      is returned as a single record.

    Args:
        file_path: Path of the document; the extension check is
            case-insensitive.

    Returns:
        list[str]: Text records (empty when nothing could be extracted).

    Raises:
        ValueError: If the extension is not one of the supported types.
    """
    ext = os.path.splitext(file_path)[1].lower()

    if ext == ".pdf":
        pdf_texts = extract_text_from_pdf(file_path)
        full_text = "\n".join(pdf_texts)
        return split_by_questions(full_text)

    if ext == ".docx":
        text = extract_text_from_docx(file_path)
    elif ext in (".png", ".jpg", ".jpeg"):
        text = extract_text_from_image(file_path)
    else:
        raise ValueError("Unsupported file type")

    # Wrap the string in a list; list(text) would split it into single
    # characters, producing one "record" (and one embedding) per character.
    return [text] if text else []

CHUNKING AND EMBEDDING THE EXTRACTED TEXT

In [153]:
class embeddingManager:
  """Loads a SentenceTransformer model and turns texts into embeddings.

  Attributes are set in ``__init__``:
    model_name: Name of the Hugging Face sentence-embedding model.
    model: The loaded SentenceTransformer, or None if loading failed.
  """

  def __init__(self,model_name : str = "all-MiniLM-L6-v2"):
    """Store the model name and try to load the model immediately.

    A load failure is reported by ``_load_model`` but does not raise here;
    ``generate_embeddings`` retries the load on first use.
    """
    #hugging face model for sentence embedding
    self.model_name = model_name
    self.model = None
    self._load_model()

  def _load_model(self):
    """Best-effort load of ``self.model_name`` into ``self.model``.

    On failure the error is printed and ``self.model`` is left unchanged
    (None if no model was loaded before).
    """
    try:
      print(f"Loading embedding model: {self.model_name}")
      self.model = SentenceTransformer(self.model_name)
      print(f"Embedding model loaded successfully.Embedding dimension: {self.model.get_sentence_embedding_dimension()}")
    except Exception as e:
      print(f"Error loading embedding model: {e}")

  def generate_embeddings(self,texts:list[str]) -> np.ndarray:#returns numpy array
    """Encode ``texts`` with the loaded model.

    Args:
      texts: Strings to embed.

    Returns:
      The result of ``self.model.encode(texts, show_progress_bar=True)``.

    Raises:
      RuntimeError: If the model is not loaded and reloading it fails.
    """
    if self.model is None:
      self._load_model()
    if self.model is None:
      # _load_model only prints on failure; fail here with a clear message
      # instead of an AttributeError on None further down.
      raise RuntimeError(
        f"Embedding model '{self.model_name}' could not be loaded; cannot generate embeddings."
      )
    print(f"Generating embedding for {len(texts)} texts....")
    embeddings = self.model.encode(texts, show_progress_bar = True)
    print("Embedding generated successfully.")
    return embeddings

In [157]:
# Load the embedding model once for this run.
embedding_manager = embeddingManager()

# Extract once and reuse the result: every extract_text() call re-parses the
# file and re-runs OCR, so calling it for each print/encode step is wasteful.
texts = extract_text(file_path)
print("number of records: ",len(texts))
print("extracted texts:", texts)
for text in texts:
    print("starts:",text)
    print("\n")

# Last expression of the cell: the embeddings array is shown as cell output.
embedding_manager.generate_embeddings(texts)

Loading embedding model: all-MiniLM-L6-v2
Embedding model loaded successfully.Embedding dimension: 384
number of records:  6
extracted texts: ['[QUESTION_1_START]\n[10 marks]\nQuestion: Explain the concept of ACID properties in database transactions. Discuss how each property ensures data\nconsistency and provide real-world scenarios where violation of these properties could lead to problems.\nEvaluation Rubric:\nTrait\nWeight\nDescription\nConcept Coverage\n40%\nComprehensive explanation of all 4 ACID properties (Atomicity, Consistency, Isolation\nReal-World Application\n30%\nClear examples of transactions and scenarios where violations cause problems\nLogical Flow\n20%\nWell-organized answer with clear connections between properties\nClarity & Language\n10%\nClear writing, appropriate terminology usage\nQuestion 2', '[QUESTION_2_START]\n[8 marks]\nQuestion: Write a SQL query to find the top 5 departments by average salary, excluding departments with fewer than\n10 employees. Include 

Batches: 100%|██████████| 1/1 [00:00<00:00,  6.69it/s]

Embedding generated successfully.





array([[ 0.01815865,  0.03243175, -0.08225077, ...,  0.04441799,
         0.02949115, -0.05169402],
       [ 0.01402685,  0.04991702,  0.10901149, ..., -0.08241593,
        -0.01403602,  0.08955504],
       [-0.00900713, -0.00058494, -0.02288687, ...,  0.06090765,
        -0.02194507,  0.0628505 ],
       [-0.0082739 ,  0.03229178, -0.04765171, ..., -0.06478573,
        -0.04062334,  0.03412547],
       [ 0.05345527,  0.08826678, -0.02615643, ...,  0.0467669 ,
        -0.07550272,  0.01124337],
       [ 0.01976677,  0.10287796,  0.00367258, ..., -0.1330917 ,
        -0.05959409, -0.00589298]], shape=(6, 384), dtype=float32)