In [81]:
# Step 1: Imports and Paths

import os
import pandas as pd
from PIL import Image
import pytesseract
from docx import Document
from transformers import pipeline


# Point pytesseract at the locally installed Tesseract executable.
pytesseract.pytesseract.tesseract_cmd = r"C:\Program Files\Tesseract-OCR\tesseract.exe"

# Dataset locations, given as paths relative to the working directory
# (edit these if the data lives elsewhere).
TRAIN_CSV, TEST_CSV = "train.csv", "test.csv"
TRAIN_FOLDER, TEST_FOLDER = "train folder", "test folder"

# Question-answering pipeline used to pull answers out of document text.
qa_model = pipeline(
    "question-answering",
    model="distilbert-base-uncased-distilled-squad",
)


Device set to use cpu


In [85]:
# Tell pytesseract where the Tesseract executable is installed.
pytesseract.pytesseract.tesseract_cmd = r"C:\Program Files\Tesseract-OCR\tesseract.exe"

# Question-answering pipeline that answers each metadata question
# against the text extracted from a document.
qa_model = pipeline(
    "question-answering",
    model="distilbert-base-uncased-distilled-squad",
)

# The six metadata questions asked of every document, in output order.
STANDARD_QUESTIONS = [
    "What is the Agreement Value?",
    "What is the Agreement Start Date?",
    "What is the Agreement End Date?",
    "What is the Renewal Notice (Days)?",
    "Who is Party One?",
    "Who is Party Two?",
]

# Text extractor
def extract_text(file_path):
    """Return the text content of a .docx or .png file.

    Parameters
    ----------
    file_path : str
        Path to the document. The extension is matched case-insensitively.

    Returns
    -------
    str
        For ``.docx`` files, the paragraph texts joined with newlines.
        For ``.png`` files, the string returned by Tesseract OCR.
        An empty string for any other extension, or when reading fails
        (the error is printed rather than raised).
    """
    lowered = file_path.lower()

    if lowered.endswith(".docx"):
        try:
            doc = Document(file_path)
            return "\n".join(p.text for p in doc.paragraphs)
        except Exception as e:
            print(f"DOCX Error: {e}")
            return ""

    if lowered.endswith(".png"):
        try:
            # Use the image as a context manager so the underlying file
            # handle is released as soon as OCR has finished.
            with Image.open(file_path) as image:
                return pytesseract.image_to_string(image)
        except Exception as e:
            print(f"Image Error: {e}")
            return ""

    # Unsupported file type.
    return ""

# Answer generator
def ask_metadata_questions(file_path):
    """Ask every standard metadata question about one document.

    Parameters
    ----------
    file_path : str
        Path to the document passed to ``extract_text``.

    Returns
    -------
    pandas.DataFrame
        One row per entry in ``STANDARD_QUESTIONS`` with the columns
        ``"File Name"``, ``"Question"`` and ``"Answer"``. The answer is
        ``"Not Found"`` when the QA model raises for that question.
        An empty DataFrame is returned when no text could be extracted.
    """
    context = extract_text(file_path)
    # Treat whitespace-only text (e.g. OCR of a blank image) as "no text":
    # there is nothing for the model to answer from.
    if not context or not context.strip():
        print("❌ No text extracted.")
        return pd.DataFrame()

    file_name = os.path.basename(file_path)
    rows = []
    for question in STANDARD_QUESTIONS:
        try:
            answer = qa_model(question=question, context=context)["answer"]
        except Exception as e:
            # Catch Exception rather than everything so Ctrl+C / kernel
            # interrupts still stop the loop, and report why the model failed.
            print(f"QA Error for {question!r}: {e}")
            answer = "Not Found"
        rows.append({"File Name": file_name, "Question": question, "Answer": answer})

    return pd.DataFrame(rows)



Device set to use cpu


In [87]:
# Interactive demo: ask for a document path, show the extracted answers,
# and save them to a CSV file in the working directory.
file_path = input("Enter file path (e.g., train/123.docx or test/456.png): ").strip()

if not os.path.exists(file_path):
    print("File not found.")
else:
    result_df = ask_metadata_questions(file_path)
    display(result_df)
    result_df.to_csv("output_answers.csv", index=False)


Enter file path (e.g., train/123.docx or test/456.png):  Train folder/44737744-Maddireddy-Bhargava-Reddy-Rental-Agreement.docx


Unnamed: 0,File Name,Question,Answer
0,44737744-Maddireddy-Bhargava-Reddy-Rental-Agreement.docx,What is the Agreement Value?,Rs\t9.99.7.9°\t p.m.
1,44737744-Maddireddy-Bhargava-Reddy-Rental-Agreement.docx,What is the Agreement Start Date?,2010
2,44737744-Maddireddy-Bhargava-Reddy-Rental-Agreement.docx,What is the Agreement End Date?,2010
3,44737744-Maddireddy-Bhargava-Reddy-Rental-Agreement.docx,What is the Renewal Notice (Days)?,11 months
4,44737744-Maddireddy-Bhargava-Reddy-Rental-Agreement.docx,Who is Party One?,Resident iai__|iroofo
5,44737744-Maddireddy-Bhargava-Reddy-Rental-Agreement.docx,Who is Party Two?,Resident iai__|iroofo


In [120]:
!pip install fastapi uvicorn nest_asyncio python-multipart transformers pytesseract python-docx Pillow


Collecting fastapi
  Downloading fastapi-0.115.14-py3-none-any.whl.metadata (27 kB)
Collecting uvicorn
  Downloading uvicorn-0.35.0-py3-none-any.whl.metadata (6.5 kB)
Collecting python-multipart
  Downloading python_multipart-0.0.20-py3-none-any.whl.metadata (1.8 kB)
Collecting starlette<0.47.0,>=0.40.0 (from fastapi)
  Downloading starlette-0.46.2-py3-none-any.whl.metadata (6.2 kB)
Downloading fastapi-0.115.14-py3-none-any.whl (95 kB)
Downloading uvicorn-0.35.0-py3-none-any.whl (66 kB)
Downloading python_multipart-0.0.20-py3-none-any.whl (24 kB)
Downloading starlette-0.46.2-py3-none-any.whl (72 kB)
Installing collected packages: python-multipart, uvicorn, starlette, fastapi
Successfully installed fastapi-0.115.14 python-multipart-0.0.20 starlette-0.46.2 uvicorn-0.35.0


In [122]:
import os
import tempfile

import nest_asyncio
import pytesseract
import uvicorn
from docx import Document
from fastapi import FastAPI, UploadFile, File
from fastapi.responses import JSONResponse
from PIL import Image
from transformers import pipeline

# Patch asyncio to allow nested event loops, so the server can be started
# from inside Jupyter's already-running loop.
nest_asyncio.apply()

# Question-answering pipeline used by the API to answer each question.
qa_model = pipeline(
    "question-answering",
    model="distilbert-base-uncased-distilled-squad",
)

# Metadata questions asked of every uploaded document; each question
# string is also used as a key in the JSON response.
QUESTIONS = [
    "What is the Agreement Value?",
    "What is the Agreement Start Date?",
    "What is the Agreement End Date?",
    "What is the Renewal Notice (Days)?",
    "Who is Party One?",
    "Who is Party Two?",
]

# Text extraction function
def extract_text(file_path):
    """Return the text content of a .docx or .png file.

    Parameters
    ----------
    file_path : str
        Path to the document. The extension is matched case-insensitively,
        so ``scan.PNG`` and ``Lease.DOCX`` are handled like their lowercase
        forms.

    Returns
    -------
    str
        For ``.docx`` files, the paragraph texts joined with newlines.
        For ``.png`` files, the string returned by Tesseract OCR.
        An empty string for any other extension.

    Errors raised while opening or reading the file are not caught here
    and propagate to the caller.
    """
    lowered = file_path.lower()

    if lowered.endswith(".docx"):
        doc = Document(file_path)
        return "\n".join(p.text for p in doc.paragraphs)

    if lowered.endswith(".png"):
        # Context manager closes the image's file handle once OCR is done;
        # an open handle could otherwise block deleting the file on Windows.
        with Image.open(file_path) as image:
            return pytesseract.image_to_string(image)

    # Unsupported file type.
    return ""

# QA pipeline
def ask_metadata(text):
    """Answer every entry in ``QUESTIONS`` against the given document text.

    Parameters
    ----------
    text : str
        Document text used as the QA context.

    Returns
    -------
    dict
        Maps each question string to the model's answer, or to
        ``"Not found"`` when the text is blank or the model raises for
        that question.
    """
    # Nothing to search in: skip the model calls, which could only fail.
    if not text or not text.strip():
        return {question: "Not found" for question in QUESTIONS}

    results = {}
    for question in QUESTIONS:
        try:
            answer = qa_model(question=question, context=text)["answer"]
        except Exception:
            # Exception (not a bare except) so that interrupts and
            # shutdown signals are not swallowed.
            answer = "Not found"
        results[question] = answer
    return results

# Create FastAPI app
app = FastAPI()

@app.post("/extract/")
async def extract_metadata(file: UploadFile = File(...)):
    """Extract metadata answers from an uploaded .docx or .png document.

    The upload is written to a temporary file, its text is extracted,
    each question is answered against that text, and the temporary file
    is removed again.

    Parameters
    ----------
    file : UploadFile
        The uploaded document. Only its extension is taken from the
        client-supplied filename.

    Returns
    -------
    JSONResponse
        JSON object mapping each question to its answer.
    """
    # The client-supplied filename is untrusted: keep only its extension
    # (needed to pick the extractor) and let tempfile choose a unique,
    # safe path. This also avoids collisions between concurrent uploads
    # that share a filename, and copes with a missing filename.
    suffix = os.path.splitext(file.filename or "")[1]
    with tempfile.NamedTemporaryFile(delete=False, suffix=suffix) as tmp:
        tmp.write(await file.read())
        temp_path = tmp.name

    try:
        text = extract_text(temp_path)
        metadata = ask_metadata(text)
    finally:
        # Always clean up, even when extraction or QA raises.
        os.remove(temp_path)

    return JSONResponse(content=metadata)


Device set to use cpu


In [None]:
# Run the FastAPI server from within Jupyter.
# Serves `app` on http://127.0.0.1:8000; this call blocks the cell until the
# server is interrupted, so nothing placed after it runs while it is serving.
uvicorn.run(app, host="127.0.0.1", port=8000)


INFO:     Started server process [12852]
INFO:     Waiting for application startup.
INFO:     Application startup complete.
INFO:     Uvicorn running on http://127.0.0.1:8000 (Press CTRL+C to quit)


INFO:     127.0.0.1:59029 - "GET / HTTP/1.1" 404 Not Found
INFO:     127.0.0.1:59269 - "GET /docs HTTP/1.1" 200 OK
INFO:     127.0.0.1:59269 - "GET /openapi.json HTTP/1.1" 200 OK
INFO:     127.0.0.1:59282 - "POST /extract/ HTTP/1.1" 200 OK
INFO:     127.0.0.1:59288 - "POST /extract/ HTTP/1.1" 200 OK
INFO:     127.0.0.1:59348 - "GET /docs HTTP/1.1" 200 OK
INFO:     127.0.0.1:59348 - "GET /openapi.json HTTP/1.1" 200 OK
