In [1]:
# Install all required packages
!pip install google-cloud-vision pdf2image pillow opencv-python
!apt-get update -qq && apt-get install -y poppler-utils
print("✅ Installation complete!")


Collecting google-cloud-vision
  Downloading google_cloud_vision-3.10.2-py3-none-any.whl.metadata (9.6 kB)
Collecting pdf2image
  Downloading pdf2image-1.17.0-py3-none-any.whl.metadata (6.2 kB)
Downloading google_cloud_vision-3.10.2-py3-none-any.whl (527 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m527.9/527.9 kB[0m [31m9.4 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading pdf2image-1.17.0-py3-none-any.whl (11 kB)
Installing collected packages: pdf2image, google-cloud-vision
Successfully installed google-cloud-vision-3.10.2 pdf2image-1.17.0


W: Skipping acquire of configured file 'main/source/Sources' as repository 'https://r2u.stat.illinois.edu/ubuntu jammy InRelease' does not seem to provide it (sources.list entry misspelt?)
Reading package lists... Done
Building dependency tree... Done
Reading state information... Done
The following NEW packages will be installed:
  poppler-utils
0 upgraded, 1 newly installed, 0 to remove and 40 not upgraded.
Need to get 186 kB of archives.
After this operation, 697 kB of additional disk space will be used.
Get:1 http://archive.ubuntu.com/ubuntu jammy-updates/main amd64 poppler-utils amd64 22.02.0-2ubuntu0.8 [186 kB]
Fetched 186 kB in 0s (592 kB/s)
Selecting previously unselected package poppler-utils.
(Reading database ... 126284 files and directories currently installed.)
Preparing to unpack .../poppler-utils_22.02.0-2ubuntu0.8_amd64.deb ...
Unpacking poppler-utils (22.02.0-2ubuntu0.8) ...
Setting up poppler-utils (22.02.0-2ubuntu0.8) ...
Processing triggers for man-db (2.10.2-1) ...


In [2]:
from google.colab import files
import os

# Upload your Google Cloud service account JSON file
print("Please upload your Google Cloud service account JSON file:")
uploaded = files.upload()

# Set up authentication
credential_file = list(uploaded.keys())[0]
os.environ['GOOGLE_APPLICATION_CREDENTIALS'] = credential_file
print(f"✅ Credentials set: {credential_file}")


Please upload your Google Cloud service account JSON file:


Saving vision-ocr-466709-44d985fb6a51.json to vision-ocr-466709-44d985fb6a51.json
✅ Credentials set: vision-ocr-466709-44d985fb6a51.json


In [3]:
# Test Google Vision API connection
try:
    from google.cloud import vision
    client = vision.ImageAnnotatorClient()
    print("🎉 Google Vision API connected successfully!")
    print("Ready to process your Marathi/Hindi/English PDFs!")
except Exception as e:
    print(f"❌ Connection error: {e}")
    print("Check your credentials and ensure Vision API is enabled")


🎉 Google Vision API connected successfully!
Ready to process your Marathi/Hindi/English PDFs!


In [4]:
import os
import re
import io
import cv2
import numpy as np
from datetime import datetime
from pathlib import Path
from PIL import Image, ImageEnhance
from pdf2image import convert_from_path
from google.cloud import vision

def preprocess_image(image):
    """Enhance image quality for better OCR"""
    # Convert to OpenCV format
    opencv_image = cv2.cvtColor(np.array(image), cv2.COLOR_RGB2BGR)
    gray = cv2.cvtColor(opencv_image, cv2.COLOR_BGR2GRAY)

    # Apply noise reduction
    denoised = cv2.fastNlMeansDenoising(gray)

    # Sharpen the image
    kernel = np.array([[-1,-1,-1], [-1,9,-1], [-1,-1,-1]])
    sharpened = cv2.filter2D(denoised, -1, kernel)

    # Enhance contrast
    clahe = cv2.createCLAHE(clipLimit=2.0, tileGridSize=(8,8))
    contrast_enhanced = clahe.apply(sharpened)

    # Convert back to PIL and enhance
    processed_image = Image.fromarray(contrast_enhanced)
    enhancer = ImageEnhance.Contrast(processed_image)
    processed_image = enhancer.enhance(1.2)

    return processed_image.convert('RGB')

def extract_text_from_pdf(image):
    """Extract text using Google Vision API"""
    try:
        # Convert image to bytes
        img_byte_arr = io.BytesIO()
        image.save(img_byte_arr, format='PNG', optimize=True, quality=95)
        img_byte_arr = img_byte_arr.getvalue()

        # Create Vision API request
        vision_image = vision.Image(content=img_byte_arr)
        image_context = vision.ImageContext(language_hints=['mr', 'hi'])

        # Perform OCR
        response = client.document_text_detection(
            image=vision_image,
            image_context=image_context
        )

        if response.error.message:
            raise Exception(f'Vision API Error: {response.error.message}')

        return response.full_text_annotation.text if response.full_text_annotation else ""

    except Exception as e:
        print(f"OCR Error: {e}")
        return ""

def extract_dates(text, response):
    import re
    pattern = re.compile(r'\b\d{1,2}[/-]\d{1,2}[/-]\d{2,4}\b')
    all_dates = pattern.findall(text)

    # quick fallback if you don’t want to parse bounding boxes yet
    top_date = all_dates[-1] if all_dates else None

    return top_date, all_dates

def extract_keywords(text, max_words=3):
    """Extract subject keywords with better Marathi support"""
    # Enhanced stop words including more Marathi terms
    stop_words = {
        'the', 'and', 'or', 'but', 'in', 'on', 'at', 'to', 'for', 'of', 'with',
        'का', 'की', 'के', 'में', 'पर', 'से', 'को', 'और', 'है', 'में',  # Hindi
        'चा', 'ची', 'चे', 'मध्ये', 'वर', 'पासून', 'आणि', 'या', 'ते', 'ला', 'ने',  # Marathi
        'शासन', 'कार्यालय', 'विभाग', 'मंत्रालय', 'संदर्भ', 'दिनांक'  # Common govt terms
    }

    # Look for "विषय" pattern specifically
    vishay_patterns = [
        r'विषय\s*[:।]\s*(.+?)(?:\n|।|\.)',  # विषय: content
        r'विषय\s*[-—]\s*(.+?)(?:\n|।|\.)',   # विषय - content
        r'विषय\s+(.+?)(?:\n|।|\.|,)',        # विषय content
    ]

    # Try to find subject using विषय patterns first
    for pattern in vishay_patterns:
        match = re.search(pattern, text, re.IGNORECASE)
        if match:
            subject_line = match.group(1).strip()
            if subject_line and len(subject_line) > 2:
                # Clean and extract meaningful words from subject line
                subject_words = re.findall(r'\b[A-Za-z\u0900-\u097F]{3,}\b', subject_line)
                filtered_words = [w for w in subject_words if w.lower() not in stop_words]
                if filtered_words:
                    return filtered_words[:max_words]

    # Fallback: Extract from first few lines if विषय not found
    lines = text.split('\n')[:15]  # First 15 lines
    words = []

    for line in lines:
        # Skip common headers
        if any(skip in line.lower() for skip in ['शासन', 'सरकार', 'government', 'कार्यालय']):
            continue

        line_words = re.findall(r'\b[A-Za-z\u0900-\u097F]{3,}\b', line)
        words.extend(line_words[:3])  # Take first 3 words per line

    # Filter meaningful words
    meaningful_words = []
    seen = set()
    for word in words:
        word_lower = word.lower()
        if (word_lower not in stop_words and
            word_lower not in seen and
            len(word) >= 3):
            meaningful_words.append(word)
            seen.add(word_lower)
            if len(meaningful_words) >= max_words:
                break

    return meaningful_words

def extract_main_header_date(text):
    import re
    date_pattern = r'\b\d{1,2}[/-]\d{1,2}[/-]\d{2,4}\b'
    dates = re.findall(date_pattern, text)
    # Picks last matched date, which (in Vision's block order) is often the header date
    return dates[-1] if dates else None

def clean_filename(filename):
    """Create valid filename from extracted content"""
    import re
    # Replace invalid filesystem characters with underscores
    filename = re.sub(r'[<>:"/\\|?*]', '_', filename)
    # Replace whitespace with underscores
    filename = re.sub(r'\s+', '_', filename)
    # Keep only alphanumeric, underscores, hyphens, and dots
    filename = re.sub(r'[^\w\-_.]', '', filename)  # Fixed: escaped the hyphen
    # Remove multiple consecutive underscores
    filename = re.sub(r'_+', '_', filename)
    # Remove leading/trailing underscores
    filename = filename.strip('_')
    return filename[:100]

print("✅ All processing functions ready!")

✅ All processing functions ready!


In [8]:
def process_single_pdf(pdf_path):
    """Process one PDF and return results"""
    try:
        print(f"🔄 Processing: {os.path.basename(pdf_path)}")

        # Convert PDF to image (first page)
        pages = convert_from_path(pdf_path, first_page=1, last_page=1, dpi=300)
        if not pages:
            return {"error": "Failed to convert PDF to image"}

        print("  ✓ PDF converted to image")

        # Enhance image quality
        processed_image = preprocess_image(pages[0])
        print("  ✓ Image preprocessed")

        # Extract text with Vision API
        text = extract_text_from_pdf(processed_image)
        if not text:
            return {"error": "No text extracted"}

        print(f"  ✓ Text extracted: {len(text)} characters")

        # Extract information
        keywords = extract_keywords(text)
        header_date = extract_main_header_date(header_text)
        vishay_line = extract_vishay_line(header_text)




        # Generate new filename
        subject = "_".join(keywords) if keywords else "Document"
        date_str = header_date if header_date else datetime.now().strftime("%Y%m%d")
        date_str = re.sub(r'[/\-.]', '', str(date_str))

        new_filename = clean_filename(f"{subject}_{date_str}") + ".pdf"

        return {
        "success": True,
        "original_file": Path(pdf_path).name,
        "keywords": keywords,
        "header_date": header_date,
        "all_dates": all_dates,
        "header_text": text[:800] + "..." if len(text) > 800 else text,
        "text_length": len(text),
        "word_count": len(text.split())
     }


    except Exception as e:
        return {"error": f"Processing failed: {str(e)}"}

print("✅ PDF processing function ready!")

✅ PDF processing function ready!


In [9]:
from google.colab import files
import shutil

# Upload and process single PDF
print("📤 Upload a PDF file to process:")
uploaded_pdf = files.upload()

if uploaded_pdf:
    pdf_filename = list(uploaded_pdf.keys())[0]

    # Process the PDF
    result = process_single_pdf(pdf_filename) # Call the processing function

    if result.get("success"):
        print(f"\n🎉 SUCCESS!")
        print(f"📄 Original filename: {result['original_file']}")
        print(f"🔑 Subject keywords: {', '.join(result['keywords']) if result['keywords'] else 'None'}")
        print(f"📅 Main (header) date: {result['header_date'] if result['header_date'] else 'None'}")
        print(f"📅 All dates found: {', '.join(result['all_dates']) if result['all_dates'] else 'None'}")
        print(f"📊 Header text length: {result['text_length']} characters")
        print(f"📝 Word count: {result['word_count']} words")


        # Generate new filename
        subject = "_".join(result['keywords']) if result['keywords'] else "Document"
        # Use header_date if available, otherwise current date
        date_str = result['header_date'] if result['header_date'] else datetime.now().strftime("%Y%m%d")
        date_str = re.sub(r'[/\-.]', '', str(date_str))

        # Construct new filename
        new_filename = clean_filename(f"{subject}_{date_str}") + ".pdf"
        result['new_filename'] = new_filename # Add new filename to result

        # Create renamed file
        shutil.copy2(pdf_filename, result['new_filename'])

        # Download renamed file
        files.download(result['new_filename'])
        print(f"📥 Downloaded: {result['new_filename']}")

        # Show text preview
        print(f"\n📖 Header Text Preview:")
        print("=" * 50)
        print(result['header_text']) # Use header_text for preview
        print("=" * 50)

    else:
        print(f"❌ Error: {result['error']}")

    print(f"\n🔄 To process another PDF, just run this cell again!")

📤 Upload a PDF file to process:


Saving 1077.pdf to 1077.pdf
🔄 Processing: 1077.pdf
  ✓ PDF converted to image
  ✓ Image preprocessed
  ✓ Text extracted: 2814 characters

🎉 SUCCESS!
📄 Original filename: 1077.pdf
🔑 Subject keywords: पुण, महानगरपालिक, आस्थापनेवरील
📅 Main (header) date: २८/०३/२०२५
📅 All dates found: २८/३/२०२५, २८/०३/२०२५, २६/०३/२०२५, २८/०३/२०२५
📊 Header text length: 2814 characters
📝 Word count: 385 words


<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

📥 Downloaded: पण_महनगरपलक_आसथपनवरल_२८०३२०२५.pdf

📖 Header Text Preview:
J
मा. अति. महा. आयुक्त (ज),
पुणे महानगरपालिका.
ठराव क्र. :- १/१०७७
दिनांक :- २८/३/२०२५
-
आज्ञा पत्र -
अति.महा. आयुक्त (ज) कार्यालय,
पुणे महानगरपालिका, पुणे - ०५.
जा.क्र. :- अतिमआ/साप्रवि/आस्था/प्र-५/१५२७८
दिनांक :- २८/०३/२०२५
विषय:- पुणे महानगरपालिका आस्थापनेवरील तांत्रिक सेवा संवर्गातील “क्षेत्रिय
अतिक्रमण निरीक्षक, श्रेणी-३, या पदावरून विभागीय अतिक्रमण अधिकारी,
संदर्भ:-
श्रेणी - २" या वरिष्ठ पदावर तात्पुरती पदोन्नतीने नेमणूक देणेबाबत.
१) मा.महापालिका आयुक्त यांचा ठराव क्र. १/३६५, दि. २६/०३/२०२५.
२) मा.अतिरिक्त
२) मा. अतिरिक्त महापालिका आयुक्त (ज) ठ.क्र. १/१०७६, दि. २८/०३/२०२५
पुणे महानगरपालिका प्रशासनाकडील तांत्रिक सेवा संवर्गातील "क्षेत्रिय अतिक्रमण निरीक्षक, श्रेणी-
३, या पदावरून “विभागीय अतिक्रमण अधिकारी, श्रेणी-२” (७ व्या वेतन आयोगानुसार पे मेट्रिक्स S- १५
: ४१८००-१३२३००) या पदावर बढ...

🔄 To process another PDF, just run this cell again!
