In [1]:
!apt-get install -y poppler-utils
!pip install pdf2image google-cloud-vision google-generativeai

Reading package lists... Done
Building dependency tree... Done
Reading state information... Done
The following NEW packages will be installed:
  poppler-utils
0 upgraded, 1 newly installed, 0 to remove and 41 not upgraded.
Need to get 186 kB of archives.
After this operation, 697 kB of additional disk space will be used.
Get:1 http://security.ubuntu.com/ubuntu jammy-security/main amd64 poppler-utils amd64 22.02.0-2ubuntu0.12 [186 kB]
Fetched 186 kB in 1s (274 kB/s)
Selecting previously unselected package poppler-utils.
(Reading database ... 125082 files and directories currently installed.)
Preparing to unpack .../poppler-utils_22.02.0-2ubuntu0.12_amd64.deb ...
Unpacking poppler-utils (22.02.0-2ubuntu0.12) ...
Setting up poppler-utils (22.02.0-2ubuntu0.12) ...
Processing triggers for man-db (2.10.2-1) ...
Collecting pdf2image
  Downloading pdf2image-1.17.0-py3-none-any.whl.metadata (6.2 kB)
Collecting google-cloud-vision
  Downloading google_cloud_vision-3.11.0-py3-none-any.whl.metadat

In [None]:
submitted_json_root = "/content/submitted_docs_json"
digilocker_json_root = "/content/digilocker_mock_json"

for d in [submitted_json_root, digilocker_json_root]:
    os.makedirs(d, exist_ok=True)

In [1]:
# -*- coding: utf-8 -*-
"""Final_prototype_1.ipynb"""

import os
import zipfile
import json
from pdf2image import convert_from_path
from google.cloud import vision
import google.generativeai as genai

# --------------------------
# Setup directories & install dependencies
# --------------------------


os.environ["GOOGLE_APPLICATION_CREDENTIALS"] = "/content/industrial-net-477410-a1-41426c2aeb41.json"
genai.configure(api_key="AIzaSyBwzZ-IjSa4OEg-ErJkOHtpCBgl_Sr-GiU")

zip_path1 = "/content/submitted_docs.zip"
zip_path2 = "/content/Digilocker_mock.zip"
submitted_dir = "/content/submitted_docs"
mock_digilocker_dir = "/content/digilocker_mock"
output_root = "/content/Verification_results"

for d in [submitted_dir, mock_digilocker_dir]:
    os.makedirs(d, exist_ok=True)

# --------------------------
# Extract PDFs from ZIPs
# --------------------------
def extract_pdfs_from_zip(zip_path, extract_dir):
    with zipfile.ZipFile(zip_path, 'r') as zip_ref:
        for f in zip_ref.namelist():
            if f.lower().endswith(".pdf"):
                zip_ref.extract(f, extract_dir)
    print(f"✅ Extracted PDFs from {os.path.basename(zip_path)}")

extract_pdfs_from_zip(zip_path1, submitted_dir)
extract_pdfs_from_zip(zip_path2, mock_digilocker_dir)




✅ Extracted PDFs from submitted_docs.zip
✅ Extracted PDFs from Digilocker_mock.zip


In [3]:
# --------------------------
# Base prompt for JSON extraction
# --------------------------
base_prompt = """
You are a structured-data extractor for document verification.
Given OCR text, return a clean JSON in the required format for that document.
Recognize the document type automatically (Aadhaar, PAN, Passport, CBSE 10th/12th, College Degree).
Follow these format rules strictly:
- Use clear field names and numeric values for marks or IDs
- Include "Document_Type" and "Verified_Fields" keys
- Return ONLY valid JSON, no explanations.
OCR TEXT:
"""

# --------------------------
# Google Vision OCR function
# --------------------------
client = vision.ImageAnnotatorClient()

def vision_ocr_pdf(pdf_path):
    pages = convert_from_path(pdf_path, dpi=300)
    full_text = ""

    for i, page in enumerate(pages):
        img_path = f"/content/tmp_page_{i}.jpg"
        page.save(img_path, "JPEG")
        with open(img_path, "rb") as f:
            image = vision.Image(content=f.read())
        resp = client.document_text_detection(image=image)
        full_text += f"\n---- PAGE {i+1} ----\n{resp.full_text_annotation.text}"

    return full_text

# --------------------------
# Gemini JSON extraction
# --------------------------
model = genai.GenerativeModel("models/gemini-2.5-flash")

def extract_json_from_text(ocr_text):
    prompt = base_prompt + ocr_text
    try:
        resp = model.generate_content(prompt)
        js_text = resp.text.strip().strip("```json").strip("```")
        parsed_json = json.loads(js_text)
    except Exception as e:
        parsed_json = {"RawText": ocr_text[:400], "Error": str(e)}
    return parsed_json

# --------------------------
# Verifier: OCR + JSON extraction
# --------------------------
def verifier(pdf_path):
    ocr_text = vision_ocr_pdf(pdf_path)
    json_data = extract_json_from_text(ocr_text)
    return json_data




# --------------------------
# Function to get corresponding file in zip2
# --------------------------
def caller(zip1_file_path, zip2_root, zip1_root):
    # Get path relative to zip1_root
    rel_path = os.path.relpath(zip1_file_path, start=zip1_root)
    # Build corresponding path in zip2_root
    zip2_file_path = os.path.join(zip2_root, rel_path)
    return zip2_file_path if os.path.exists(zip2_file_path) else None

# --------------------------
# Compare two PDFs and generate differences JSON
# --------------------------
def compare_pdfs(pdf_path1, pdf_path2, output_dir):
    json1 = verifier(pdf_path1)
    print(json1)
    json2 = verifier(pdf_path2)
    print(json2)

    compare_prompt = f"""
You are a JSON comparison assistant.
Given two JSON objects representing the same document, identify any differences.
Return a JSON object with the following structure:

- "Document_Name": the base name of the document
- "Differences": "Yes" or "No"
- "Details": a list of objects for each differing field. Each object should have:
    - "Field": name of the field
    - "Reason": explanation of the difference

JSON 1:
{json.dumps(json1, indent=4)}

JSON 2:
{json.dumps(json2, indent=4)}

Return ONLY valid JSON.
"""

    resp = model.generate_content(compare_prompt)
    js_text = resp.text.strip().strip("```json").strip("```")

    try:
        comparison_json = json.loads(js_text)
    except Exception as e:
        comparison_json = {
            "Document_Name": os.path.basename(pdf_path1),
            "Differences": "Error",
            "Details": [{"Field": "N/A", "Reason": str(e)}]
        }

    os.makedirs(output_dir, exist_ok=True)
    output_file = os.path.join(output_dir, os.path.basename(pdf_path1).replace(".pdf", "_comparison.json"))
    with open(output_file, "w", encoding="utf-8") as f:
        json.dump(comparison_json, f, indent=4, ensure_ascii=False)

    return output_file

# --------------------------
# Loop through all candidates and compare their documents
# --------------------------
def compare_all_candidates(zip1_root, zip2_root, output_root):
    os.makedirs(output_root, exist_ok=True)

    for cand_folder in os.listdir(zip1_root):
        cand_path1 = os.path.join(zip1_root, cand_folder)
        if not os.path.isdir(cand_path1):
            continue


        candidate_result = {"Candidate": cand_folder, "Documents": []}

        for file_name in os.listdir(cand_path1):
            file_path1 = os.path.join(cand_path1, file_name)
            if not os.path.isfile(file_path1):
                continue


            file_path2 = caller(file_path1, zip2_root, zip1_root)
            if file_path2 is None:
                candidate_result["Documents"].append({
                    "Document_Name": file_name,
                    "Differences": "File missing in zip2",
                    "Details": [{"Field": "N/A", "Reason": "Corresponding file not found"}]
                })
                continue

            comparison_json_file = compare_pdfs(file_path1, file_path2, output_root)
            with open(comparison_json_file, "r", encoding="utf-8") as f:
                comparison_data = json.load(f)
            candidate_result["Documents"].append(comparison_data)

        candidate_json_file = os.path.join(output_root, f"{cand_folder}_comparison.json")
        with open(candidate_json_file, "w", encoding="utf-8") as f:
            json.dump(candidate_result, f, indent=4, ensure_ascii=False)

        print(f"✅ Candidate comparison saved: {candidate_json_file}")

In [4]:
compare_all_candidates(submitted_dir, mock_digilocker_dir, output_root)


{'Document_Type': 'CBSE 10th Marksheet', 'Verified_Fields': {'Student_Name': 'LAKSHY AGGARWAL', 'Roll_Number': '14120763', 'Mother_Name': 'SHALU AGGARWAL', 'Father_Name': 'RAJEEV KUMAR AGGARWAL', 'Date_of_Birth': '18-09-2006', 'School_Code': '25110', 'School_Name': 'LITTLE FLOWERS PUB SSS SHIVAJI PK SHAHDARA DL', 'Examination_Year': 2022, 'Result': 'PASS', 'Date_of_Issue': '22-07-2022', 'Registration_Number': 'D122251100282', 'Marks_Obtained': {'ENGLISH LNG & LIT': 71, 'HINDI COURSE-A': 75, 'MATHEMATICS STANDARD': 81, 'SCIENCE': 74, 'SOCIAL SCIENCE': 75, 'INFORMATION TECHNOLOGY': 59}}}
{'Document_Type': 'CBSE_10th_Mark_Sheet', 'Verified_Fields': {'Name': 'LAKSHY AGGARWAL', 'Roll_Number': '14120763', 'Mother_Name': 'SHALU AGGARWAL', 'Father_Name': 'RAJEEV KUMAR AGGARWAL', 'Date_of_Birth': '18-09-2006', 'School_Code': '25110', 'School_Name': 'LITTLE FLOWERS PUB SSS SHIVAJI PK SHAHDARA DL', 'Exam_Year': 2022, 'Result': 'PASS', 'Marks_Details': [{'Subject': 'ENGLISH LNG & LIT', 'Theory_Mar

In [None]:
from google.colab import drive
drive.mount('/content/drive')

In [None]:
def verifier(pdf_path):
    ocr_text = vision_ocr_pdf(pdf_path)
    json_data = extract_json_from_text(ocr_text)
    return json_data

In [5]:
a=vision_ocr_pdf("/content/submitted_docs/cand001/cand001_10thmarksheet.pdf")

In [6]:
print(extract_json_from_text(a))

{'Document_Type': 'CBSE 10th Marksheet', 'Name': 'LAKSHY AGGARWAL', 'Roll_No': '14120763', 'Mother_Name': 'SHALU AGGARWAL', 'Father_Name': 'RAJEEV KUMAR AGGARWAL', 'Date_of_Birth': '18-09-2006', 'School_Code': '25110', 'School_Name': 'LITTLE FLOWERS PUB SSS SHIVAJI PK SHAHDARA DL', 'Examination_Year': 2022, 'Date_of_Issue': '22-07-2022', 'Result': 'PASS', 'Subjects': [{'Subject_Name': 'ENGLISH LNG & LIT', 'Internal_Marks': 19, 'Theory_Marks': 52, 'Total_Marks': 71, 'Grade': 'B1'}, {'Subject_Name': 'HINDI COURBE-A', 'Internal_Marks': 20, 'Theory_Marks': 55, 'Total_Marks': 75, 'Grade': 'B1'}, {'Subject_Name': 'MATHEMATICS STANDARD', 'Internal_Marks': 62, 'Theory_Marks': 19, 'Total_Marks': 81, 'Grade': 'A2'}, {'Subject_Name': 'SCIENCE', 'Internal_Marks': 54, 'Theory_Marks': 20, 'Total_Marks': 74, 'Grade': 'B2'}, {'Subject_Name': 'SOCIAL SCIENCE', 'Internal_Marks': 50, 'Theory_Marks': 19, 'Total_Marks': 75, 'Grade': 'B2'}, {'Subject_Name': 'INFORMATION TECHNOLOGY', 'Internal_Marks': 18, 'T

In [None]:
print(a)


---- PAGE 1 ----
21409
0049923
26110-00282
Reg D122251100282
केन्द्रीय माध्यमिक शिक्षा बोर्ड
CENTRAL BOARD OF SECONDARY EDUCATION
अंक विवरणिका सह प्रमाण पत्र
MARKS STATEMENT CUM CERTIFICATE
माध्यमिक विद्यालय परीक्षा, 2013
SECONDARY SCHOOL EXAMINATION, 2022
This is cotify that
LAKSHY AGGARWAL
Roll No
14120763
SHALU AGGARWAL
Guardian's Name RAJEEV KUMAR AGGARWAL
od
18-09-2006 18TH SEPTEMBER TWO THOUSAND SIX
Sulmol
25110- LITTLE FLOWERS PUB SSS SHIVAJI PK SHAHDARA DL
ên thrive artent lengt i, has returned Scholastic Achievements us iniler |
BUD
CON
SUBJECT
MARKE ONTAINED
Teotra inc
THECY
TIL FIL
योग (सब्दों में)
TOTAL TOTAL (IN WORDS)
GRADE
184
ENGLISH LNG & LIT
019
071
SEVENTY ONE
002
HINDI COURBE-A
020
070
SEVENTY FIVE
041
LATHEMATICS STANDARD
062
081
EIGHTY ONE
066
SCIENCE
054
020
074
SEVENTY FOUR
5622
C1
B1
067
SOCIAL SCIENCE
050
01일
075
SEVENTY FIVE
B2
ADDITIONAL SUBJECT
402
INFORMATION TECHNOLOGY
01B
041
08
FIFTY NINE
D2
दिल्ली Dulhi
22-07-2022
PASS
when Th
Castruller of Examinatio

In [None]:
import time
import google.generativeai as genai

genai.configure(api_key="AIzaSyBwzZ-IjSa4OEg-ErJkOHtpCBgl_Sr-GiU")
model = genai.GenerativeModel("models/gemini-2.5-flash")

test_prompt = "Extract JSON for this short OCR text: LAKSHY AGGARWAL Roll No 14120763"

start_time = time.time()
resp = model.generate_content(test_prompt)
end_time = time.time()

print("Response time:", end_time - start_time, "seconds")
print(resp.text)


KeyboardInterrupt: 

In [None]:
import google.generativeai as genai
genai.configure(api_key="AIzaSyBwzZ-IjSa4OEg-ErJkOHtpCBgl_Sr-GiU")
models = genai.list_models()

for m in models:
    print(m)

ReadTimeout: HTTPConnectionPool(host='localhost', port=33695): Read timed out. (read timeout=60.0)

In [None]:
import google.generativeai as genai

# Configure only the API key
genai.configure(api_key="AIzaSyBwzZ-IjSa4OEg-ErJkOHtpCBgl_Sr-GiU")

# Optional: check endpoint
print(genai.configuration.api_base)  # should be https://api.generativeai.googleapis.com


AttributeError: module 'google.generativeai' has no attribute 'configuration'

In [None]:
import os
import google.generativeai as genai

# Make sure no local endpoint is set
os.environ.pop("GENAI_API_BASE", None)

# Configure SDK with your API key
genai.configure(api_key="AIzaSyCz9gXJ7czXY5Y72Rss3Wbg02iCQupiw_Q")


In [None]:
model = genai.GenerativeModel("models/gemini-2.5-flash")
resp = model.generate_content("hi")


ReadTimeout: HTTPConnectionPool(host='localhost', port=33695): Read timed out. (read timeout=600.0)