<a href="https://colab.research.google.com/github/SohamNigam/Vision/blob/main/OCR(final).ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
# Install all required packages
!pip install google-cloud-vision pillow opencv-python google-generativeai pdf2image
!apt-get update -qq && apt-get install -y poppler-utils
print("✅ Installation complete!")

Collecting google-cloud-vision
  Downloading google_cloud_vision-3.10.2-py3-none-any.whl.metadata (9.6 kB)
Collecting pdf2image
  Downloading pdf2image-1.17.0-py3-none-any.whl.metadata (6.2 kB)
Downloading google_cloud_vision-3.10.2-py3-none-any.whl (527 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m527.9/527.9 kB[0m [31m8.1 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading pdf2image-1.17.0-py3-none-any.whl (11 kB)
Installing collected packages: pdf2image, google-cloud-vision
Successfully installed google-cloud-vision-3.10.2 pdf2image-1.17.0


W: Skipping acquire of configured file 'main/source/Sources' as repository 'https://r2u.stat.illinois.edu/ubuntu jammy InRelease' does not seem to provide it (sources.list entry misspelt?)
Reading package lists... Done
Building dependency tree... Done
Reading state information... Done
The following NEW packages will be installed:
  poppler-utils
0 upgraded, 1 newly installed, 0 to remove and 40 not upgraded.
Need to get 186 kB of archives.
After this operation, 697 kB of additional disk space will be used.
Get:1 http://archive.ubuntu.com/ubuntu jammy-updates/main amd64 poppler-utils amd64 22.02.0-2ubuntu0.8 [186 kB]
Fetched 186 kB in 0s (466 kB/s)
Selecting previously unselected package poppler-utils.
(Reading database ... 126284 files and directories currently installed.)
Preparing to unpack .../poppler-utils_22.02.0-2ubuntu0.8_amd64.deb ...
Unpacking poppler-utils (22.02.0-2ubuntu0.8) ...
Setting up poppler-utils (22.02.0-2ubuntu0.8) ...
Processing triggers for man-db (2.10.2-1) ...


In [2]:
from google.colab import drive
drive.mount('/content/drive')


Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).
Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [2]:
input_folder = '/content/drive/MyDrive/OCR/circular'   # Folder with original PDFs
output_folder = '/content/drive/MyDrive/OCR/Renamed_Files' # Folder to save renamed PDFs

import os
os.makedirs(output_folder, exist_ok=True)


In [10]:
import google.generativeai as genai
genai.configure(api_key="AIzaSyAbZY56YX_TDAk3gcQnEMgO9CwhFwc3cgI")  # Only needed if not in env variable

chat_model = genai.GenerativeModel('gemini-2.5-pro')
# or for other model types:
# chat_model = genai.GenerativeModel('gemini-1.5-pro')

In [4]:
from google.colab import files
import os

# Upload your Google Cloud service account JSON file
print("Please upload your Google Cloud service account JSON file:")
uploaded = files.upload()

# Set up authentication
credential_file = list(uploaded.keys())[0]
os.environ['GOOGLE_APPLICATION_CREDENTIALS'] = credential_file
print(f"✅ Credentials set: {credential_file}")

Please upload your Google Cloud service account JSON file:


Saving vision-ocr-466709-44d985fb6a51.json to vision-ocr-466709-44d985fb6a51.json
✅ Credentials set: vision-ocr-466709-44d985fb6a51.json


In [5]:
import os
import re
import io
import cv2
import numpy as np
from datetime import datetime
from pathlib import Path
from PIL import Image, ImageEnhance
from pdf2image import convert_from_path
from google.cloud import vision

client = vision.ImageAnnotatorClient()

def preprocess_image(image):
    """Enhance image quality for better OCR"""
    # Convert to OpenCV format
    opencv_image = cv2.cvtColor(np.array(image), cv2.COLOR_RGB2BGR)
    gray = cv2.cvtColor(opencv_image, cv2.COLOR_BGR2GRAY)

    # Apply noise reduction
    denoised = cv2.fastNlMeansDenoising(gray)

    # Sharpen the image
    kernel = np.array([[-1,-1,-1], [-1,9,-1], [-1,-1,-1]])
    sharpened = cv2.filter2D(denoised, -1, kernel)

    # Enhance contrast
    clahe = cv2.createCLAHE(clipLimit=2.0, tileGridSize=(8,8))
    contrast_enhanced = clahe.apply(sharpened)

    # Convert back to PIL and enhance
    processed_image = Image.fromarray(contrast_enhanced)
    enhancer = ImageEnhance.Contrast(processed_image)
    processed_image = enhancer.enhance(1.2)

    return processed_image.convert('RGB')

def extract_text_from_pdf(image):
    """Extract text using Google Vision API"""
    try:
        # Convert image to bytes
        img_byte_arr = io.BytesIO()
        image.save(img_byte_arr, format='PNG', optimize=True, quality=95)
        img_byte_arr = img_byte_arr.getvalue()

        # Create Vision API request
        vision_image = vision.Image(content=img_byte_arr)
        image_context = vision.ImageContext(language_hints=['mr', 'hi', 'eng'])

        # Perform OCR
        response = client.document_text_detection(
            image=vision_image,
            image_context=image_context
        )

        if response.error.message:
            raise Exception(f'Vision API Error: {response.error.message}')

        return response.full_text_annotation.text if response.full_text_annotation else ""

    except Exception as e:
        print(f"OCR Error: {e}")
        return ""


def extract_keywords(text, max_words=3):
    """Extract subject keywords with better Marathi support"""
    # Enhanced stop words including more Marathi terms
    stop_words = {
        'the', 'and', 'or', 'but', 'in', 'on', 'at', 'to', 'for', 'of', 'with',
        'का', 'की', 'के', 'में', 'पर', 'से', 'को', 'और', 'है', 'में',  # Hindi
        'चा', 'ची', 'चे', 'मध्ये', 'वर', 'पासून', 'आणि', 'या', 'ते', 'ला', 'ने',  # Marathi
        'शासन', 'कार्यालय', 'विभाग', 'मंत्रालय', 'संदर्भ', 'दिनांक'  # Common govt terms
    }

    # Look for "विषय" pattern specifically
    vishay_patterns = [
        r'विषय\s*[:।]\s*(.+?)(?:\n|।|\.)',  # विषय: content
        r'विषय\s*[-—]\s*(.+?)(?:\n|।|\.)',   # विषय - content
        r'विषय\s+(.+?)(?:\n|।|\.|,)',        # विषय content
    ]

    # Try to find subject using विषय patterns first
    for pattern in vishay_patterns:
        match = re.search(pattern, text, re.IGNORECASE)
        if match:
            subject_line = match.group(1).strip()
            if subject_line and len(subject_line) > 2:
                # Clean and extract meaningful words from subject line
                subject_words = re.findall(r'\b[A-Za-z\u0900-\u097F]{3,}\b', subject_line)
                filtered_words = [w for w in subject_words if w.lower() not in stop_words]
                if filtered_words:
                    return filtered_words[:max_words]

    # Fallback: Extract from first few lines if विषय not found
    lines = text.split('\n')[:15]  # First 15 lines
    words = []

    for line in lines:
        # Skip common headers
        if any(skip in line.lower() for skip in ['शासन', 'सरकार', 'government', 'कार्यालय']):
            continue

        line_words = re.findall(r'\b[A-Za-z\u0900-\u097F]{3,}\b', line)
        words.extend(line_words[:3])  # Take first 3 words per line

    # Filter meaningful words
    meaningful_words = []
    seen = set()
    for word in words:
        word_lower = word.lower()
        if (word_lower not in stop_words and
            word_lower not in seen and
            len(word) >= 3):
            meaningful_words.append(word)
            seen.add(word_lower)
            if len(meaningful_words) >= max_words:
                break

    return meaningful_words

def extract_accurate_header_date(text):
    """
    Extracts the header date (main top date) from the first 15 lines of OCR text.
    Prioritizes lines with 'दिनांक', then finds first valid date pattern if not found.
    """
    import re
    date_regex = r'\b\d{1,2}[/-]\d{1,2}[/-]\d{2,4}\b'
    lines = text.splitlines()
    header_zone = lines[:15]  # Adjust as needed

    # First, look for lines with 'दिनांक'
    for line in header_zone:
        if 'दिनांक' in line or 'दिन' in line:
            match = re.search(date_regex, line)
            if match:
                return match.group(0)
    # If not found, match any date pattern in the first 15 lines
    for line in header_zone:
        match = re.search(date_regex, line)
        if match:
            return match.group(0)
    return None

def extract_full_vishay_text(text):
    """
    Extracts the full block of text following "विषय" until the next major section (like संदर्भ or महोदय)
    or the end of the document.
    """
    import re
    # Pattern to find "विषय" and capture everything after it
    pattern_start = r'(?:^|\n)\s*विषय\s*[:।\-–—]?\s*(.+)'
    match_start = re.search(pattern_start, text)

    if not match_start:
        # Fallback: check for 'subject' in English if text is English
        pattern_start_eng = r'(?:^|\n)\s*subject\s*[:\-–—]?\s*(.+)'
        match_start_eng = re.search(pattern_start_eng, text, flags=re.I)
        if not match_start_eng:
            return None # Subject not found

        # If English subject found, set start index and adjust text
        start_index = match_start_eng.start(1)
        subject_text = text[start_index:].strip()
        # Look for common English section headers to mark the end
        end_match = re.search(r'(?:^|\n)\s*(Reference|Sir|Madam|Dear|Yours)', subject_text, flags=re.I)

    else:
        # If Marathi/Hindi विषय found, set start index and adjust text
        start_index = match_start.start(1)
        subject_text = text[start_index:].strip()
        # Look for common Marathi/Hindi section headers to mark the end
        end_match = re.search(r'(?:^|\n)\s*(संदर्भ|महोदय|महोदया)', subject_text)


    if end_match:
        # If an end pattern is found, truncate the text at that point
        end_index = end_match.start()
        return subject_text[:end_index].strip()
    else:
        # If no end pattern is found, return the rest of the text
        return subject_text.strip()


def clean_filename(text):
    # ...
    text = re.sub(r"[\/.\-]", "_", text)
    text = re.sub(r"_+", "_", text)
    return text



print("✅ All processing functions ready!")

✅ All processing functions ready!


In [6]:
def process_single_pdf(pdf_path):
    """Process one PDF and return extracted header data"""
    try:
        print(f"🔄 Processing: {os.path.basename(pdf_path)}")

        # Convert to image
        pages = convert_from_path(pdf_path, first_page=1, last_page=1, dpi=300)
        if not pages:
            return {"error": "Failed to convert PDF to image"}

        print("  ✓ PDF converted to image")

        # Preprocess image
        processed_image = preprocess_image(pages[0])
        print("  ✓ Image preprocessed")

        # Extract text
        text = extract_text_from_pdf(processed_image)
        if not text:
            return {"error": "No text extracted"}

        print(f"  ✓ Text extracted ({len(text)} characters)")

        # Extract values
        header_date = extract_accurate_header_date(text)
        vishay_line = extract_full_vishay_text(text) # Changed from extract_vishay_line

        # Return only what’s needed
        return {
            "success": True,
            "original_file": Path(pdf_path).name,
            "header_date": header_date,
            "vishay": vishay_line,
            "header_text": text[:800] + "..." if len(text) > 800 else text
        }

    except Exception as e:
        return {"error": f"Processing failed: {str(e)}"}

In [19]:
from google.colab import files
import shutil
from datetime import datetime
import re
import google.generativeai as genai
from google.colab import userdata

# 1. Configure Gemini API key
try:
    genai.configure(api_key=userdata.get('AIzaSyCfJrDk8CG80aLPQRix4GEWSYE1dldPHD0'))
except Exception as e:
    print(f"Error configuring Gemini API: {e}")
    print("Please make sure you have added your GOOGLE_API_KEY to Colab secrets.")
    exit()

gemini_model = genai.GenerativeModel("gemini-2.5-pro")

# 2. Utility: Convert Devanagari digits to English
def convert_devanagari_to_english(text):
    devanagari_digits = "०१२३४५६७८९"
    eng_digits = "0123456789"
    return text.translate(str.maketrans(devanagari_digits, eng_digits))

# 3. Utility: Summarize subject using Gemini API
def summarize_vishay_with_gemini(vishay_text, gemini_model):
    if not vishay_text or len(vishay_text.strip()) < 4:
        return "Document"
    prompt = (
        "तुमचा कार्य हा PDF मधून मिळालेला विषय ओळखून त्याचं ३ ते ७ शब्दांत सारांश तयार करणे आहे. "
        "कृपया मूळ भाषेतच उत्तर द्या. इंग्रजी अथवा ट्रान्सलिटरेशनमध्ये उत्तर देऊ नका.\n"
        f"\nविषय: {vishay_text}\n\n"
        "✅ उत्तरात केवळ सारांश लिहा. इतर काही नाही."
    )
    try:
        response = gemini_model.generate_content(prompt)
        if response and hasattr(response, "text") and response.text:
            summary = response.text.strip()
            return summary or "Document"
        else:
            print("❌ Gemini API returned an empty/invalid response.")
            return "Document"
    except Exception as e:
        print(f"❌ Gemini summarization failed: {e}")
        return "Document"

# 4. Utility: Clean filename for filesystem safety, preserving language/script
def clean_filename(text):
    # Safely replaces / . - with underscores (hyphen last, no ranges)
    text = re.sub(r"[/.\-]", "_", text)
    text = re.sub(r"_+", "_", text)
    return text.strip('_')[:100]

# 5. Main flow: Upload, process, summarize, rename
print("📤 Upload a PDF to process and rename:")
uploaded_pdf = files.upload()

if uploaded_pdf:
    pdf_filename = list(uploaded_pdf.keys())[0]
    result = process_single_pdf(pdf_filename)

    if result.get("success"):
        # Extract full (multi-line) विषय block
        subject_raw = extract_full_vishay_text(result['header_text'])
        summarized_subject = summarize_vishay_with_gemini(subject_raw, gemini_model)

        # Fallback logic if Gemini returns nothing
        if not summarized_subject or summarized_subject == "Document":
            if result.get('vishay'):
                words = re.findall(r'\b[\u0900-\u097F\w]{3,}\b', result['vishay'])
                summarized_subject = "_".join(words[:4]) if words else "Document"
            else:
                summarized_subject = "Document"

        # Clean the subject for filename use
        clean_subject = clean_filename(summarized_subject)

        # Prepare and clean date for filename
        date_raw = result['header_date'] if result['header_date'] else datetime.now().strftime("%Y%m%d")
        date_eng = convert_devanagari_to_english(date_raw)
        date_formatted = re.sub(r"[/.\-]", "_", date_eng)
        date_formatted = re.sub(r"_+", "_", date_formatted)

        # Build, trim, and save the final filename
        final_filename = f"{clean_subject}_{date_formatted}.pdf"
        final_filename = final_filename[:200]

        shutil.copy2(pdf_filename, final_filename)
        # files.download(final_filename)

        print(f"\n✅ Downloaded as: {final_filename}")
        print(f"🔖 विषय (Subject): {result.get('vishay', 'N/A')}")
        print(f"📅 Header Date: {result.get('header_date', 'N/A')}")
    else:
        print(f"❌ Error: {result['error']}")

Error configuring Gemini API: Secret AIzaSyCfJrDk8CG80aLPQRix4GEWSYE1dldPHD0 does not exist.
Please make sure you have added your GOOGLE_API_KEY to Colab secrets.
📤 Upload a PDF to process and rename:


ERROR:root:Internal Python error in the inspect module.
Below is the traceback from this internal error.



Traceback (most recent call last):
  File "/usr/local/lib/python3.11/dist-packages/IPython/core/interactiveshell.py", line 3553, in run_code
    exec(code_obj, self.user_global_ns, self.user_ns)
  File "/tmp/ipython-input-19-2309107472.py", line 55, in <cell line: 0>
    uploaded_pdf = files.upload()
                   ^^^^^^^^^^^^^^
  File "/usr/local/lib/python3.11/dist-packages/google/colab/files.py", line 72, in upload
    uploaded_files = _upload_files(multiple=True)
                     ^^^^^^^^^^^^^^^^^^^^^^^^^^^^
  File "/usr/local/lib/python3.11/dist-packages/google/colab/files.py", line 164, in _upload_files
    result = _output.eval_js(
             ^^^^^^^^^^^^^^^^
  File "/usr/local/lib/python3.11/dist-packages/google/colab/output/_js.py", line 40, in eval_js
    return _message.read_reply_from_input(request_id, timeout_sec)
           ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
  File "/usr/local/lib/python3.11/dist-packages/google/colab/_message.py", line 96,

TypeError: object of type 'NoneType' has no len()

In [11]:
import os
from shutil import copy2
from datetime import datetime # Import datetime

# List all PDF files in input folder
pdf_files = [f for f in os.listdir(input_folder) if f.lower().endswith('.pdf')]

# Process only the first 10 files
for filename in pdf_files[:10]:
    full_path = os.path.join(input_folder, filename)
    print(f"📄 Processing: {filename}")

    # Use the existing process_single_pdf function
    result = process_single_pdf(full_path)

    if result.get("success"):
        # Extract full (multi-line) विषय block from the result
        subject_raw = result.get('vishay')

        # Step 4: Summarize subject using Gemini
        # Pass the gemini_model which is defined in ALAQBJf_-q_o
        summarized_subject = summarize_vishay_with_gemini(subject_raw, chat_model) # Using chat_model as defined in 79P_e-BfyPLv

        # Fallback logic if Gemini returns nothing
        if not summarized_subject or summarized_subject == "Document":
            if subject_raw:
                words = re.findall(r'\b[\u0900-\u097F\w]{3,}\b', subject_raw)
                summarized_subject = "_".join(words[:4]) if words else "Document"
            else:
                summarized_subject = "Document"

        # Step 5: Clean filename parts
        clean_subject = clean_filename(summarized_subject) # Using existing clean_filename
        date_raw = result.get('header_date') # Get date from result

        # Prepare and clean date for filename
        date_raw = result['header_date'] if result['header_date'] else datetime.now().strftime("%Y%m%d")
        date_eng = convert_devanagari_to_english(date_raw) # Using existing convert_devanagari_to_english
        date_formatted = re.sub(r"[/.\-]", "_", date_eng)
        date_formatted = re.sub(r"_+", "_", date_formatted)


        # Step 6: Build new filename
        new_filename = f"{clean_subject}_{date_formatted}.pdf"
        output_file_path = os.path.join(output_folder, new_filename)

        # Step 7: Copy renamed file to output folder
        copy2(full_path, output_file_path)

        print(f"✅ Saved as: {new_filename}\n")
    else:
        print(f"❌ Error processing {filename}: {result['error']}\n")

📄 Processing: 523 09042025.pdf
🔄 Processing: 523 09042025.pdf
  ✓ PDF converted to image
  ✓ Image preprocessed
  ✓ Text extracted (1376 characters)
✅ Saved as: Document_09_04_2025.pdf

📄 Processing: कंत्राटदारांच्या सहभागाविषयी व ईएसआय अनुपालनाविषयी.pdf
🔄 Processing: कंत्राटदारांच्या सहभागाविषयी व ईएसआय अनुपालनाविषयी.pdf
  ✓ PDF converted to image
  ✓ Image preprocessed
  ✓ Text extracted (1224 characters)
✅ Saved as: कंत्राटदारांचा सहभाग व ईएसआय अनुपालन तपशील_07_04_2025.pdf

📄 Processing: 524 09042025.pdf
🔄 Processing: 524 09042025.pdf
  ✓ PDF converted to image
  ✓ Image preprocessed
  ✓ Text extracted (1458 characters)
✅ Saved as: Document_09_04_2025.pdf

📄 Processing: आधार नोडल.pdf
🔄 Processing: आधार नोडल.pdf
  ✓ PDF converted to image
  ✓ Image preprocessed
  ✓ Text extracted (1625 characters)
✅ Saved as: आधारसाठी नोडल अधिकाऱ्याची नियुक्ती करणे_08_04_2025.pdf

📄 Processing: online-training-e-Office-computer-system-8-april_0.pdf
🔄 Processing: online-training-e-Office-computer-syst

ERROR:tornado.access:503 POST /v1beta/models/gemini-2.5-pro:generateContent?%24alt=json%3Benum-encoding%3Dint (127.0.0.1) 2949.60ms


✅ Saved as: पुणे मनपा क्रीडा अधिकाऱ्याची वेतन संरचना_07_04_2025.pdf

📄 Processing: JE antim.pdf
🔄 Processing: JE antim.pdf
  ✓ PDF converted to image
  ✓ Image preprocessed
  ✓ Text extracted (2365 characters)
✅ Saved as: कनिष्ठ अभियंता (स्थापत्य) अंतिम ज्येष्ठता यादी_08_04_2025.pdf

📄 Processing: IMG_0001_29.pdf
🔄 Processing: IMG_0001_29.pdf
  ✓ PDF converted to image
  ✓ Image preprocessed
  ✓ Text extracted (1208 characters)
✅ Saved as: शुभम बोरसुतकर यांच्या वेतन व्यवस्थेबाबत_07_04_2025.pdf

📄 Processing: 344 04042025.pdf
🔄 Processing: 344 04042025.pdf
  ✓ PDF converted to image
  ✓ Image preprocessed
  ✓ Text extracted (1962 characters)
✅ Saved as: Document_04_04_2025.pdf

