<a href="https://colab.research.google.com/github/SohamNigam/Vision/blob/main/OCR(pt_2).ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [43]:
# Install all required packages
!pip install google-cloud-vision pillow opencv-python google-generativeai pdf2image
!apt-get update -qq && apt-get install -y poppler-utils fonts-indic
print("✅ Installation complete!")

W: Skipping acquire of configured file 'main/source/Sources' as repository 'https://r2u.stat.illinois.edu/ubuntu jammy InRelease' does not seem to provide it (sources.list entry misspelt?)
Reading package lists... Done
Building dependency tree... Done
Reading state information... Done
poppler-utils is already the newest version (22.02.0-2ubuntu0.8).
The following additional packages will be installed:
  fonts-beng fonts-beng-extra fonts-deva fonts-deva-extra fonts-gargi
  fonts-gubbi fonts-gujr fonts-gujr-extra fonts-guru fonts-guru-extra
  fonts-kalapi fonts-knda fonts-lohit-beng-assamese fonts-lohit-beng-bengali
  fonts-lohit-deva fonts-lohit-gujr fonts-lohit-guru fonts-lohit-knda
  fonts-lohit-mlym fonts-lohit-orya fonts-lohit-taml
  fonts-lohit-taml-classical fonts-lohit-telu fonts-mlym fonts-nakula
  fonts-navilu fonts-orya fonts-orya-extra fonts-pagul fonts-sahadeva
  fonts-samyak-deva fonts-samyak-gujr fonts-samyak-mlym fonts-samyak-taml
  fonts-sarai fonts-smc fonts-smc-anjalio

In [45]:
from google.colab import drive
drive.mount('/content/drive', force_remount=True)


Mounted at /content/drive


In [46]:
input_folder = '/content/drive/MyDrive/OCR/circular'   # Folder with original PDFs
output_folder = '/content/drive/MyDrive/OCR/Renamed_files(pt4)' # Folder to save renamed PDFs

import os
os.makedirs(output_folder, exist_ok=True)


In [47]:
import google.generativeai as genai
from google.colab import userdata
genai.configure(api_key='AIzaSyDlPj5vbcQsYCzzDfcCoH_lLhLhrufxGMY')  # Only needed if not in env variable

chat_model = genai.GenerativeModel('gemini-2.5-pro')

In [48]:
from google.colab import files
import os

# Upload your Google Cloud service account JSON file
print("Please upload your Google Cloud service account JSON file:")
uploaded = files.upload()

# Set up authentication
credential_file = list(uploaded.keys())[0]
os.environ['GOOGLE_APPLICATION_CREDENTIALS'] = credential_file
print(f"✅ Credentials set: {credential_file}")

Please upload your Google Cloud service account JSON file:


Saving vision-ocr-466709-44d985fb6a51.json to vision-ocr-466709-44d985fb6a51 (3).json
✅ Credentials set: vision-ocr-466709-44d985fb6a51 (3).json


In [49]:
import os
import re
import io
import cv2
import numpy as np
from datetime import datetime
from pathlib import Path
from PIL import Image, ImageEnhance, ImageDraw, ImageFont
from pdf2image import convert_from_path
from google.cloud import vision

client = vision.ImageAnnotatorClient()

def preprocess_image(image):
    """Enhance image quality for better OCR"""
    # Convert to OpenCV format
    opencv_image = cv2.cvtColor(np.array(image), cv2.COLOR_RGB2BGR)
    gray = cv2.cvtColor(opencv_image, cv2.COLOR_BGR2GRAY)

    # Apply noise reduction
    denoised = cv2.fastNlMeansDenoising(gray)

    # Sharpen the image
    kernel = np.array([[-1,-1,-1], [-1,9,-1], [-1,-1,-1]])
    sharpened = cv2.filter2D(denoised, -1, kernel)

    # Enhance contrast
    clahe = cv2.createCLAHE(clipLimit=2.0, tileGridSize=(8,8))
    contrast_enhanced = clahe.apply(sharpened)

    # Convert back to PIL and enhance
    processed_image = Image.fromarray(contrast_enhanced)
    enhancer = ImageEnhance.Contrast(processed_image)
    processed_image = enhancer.enhance(1.2)

    return processed_image.convert('RGB')


def extract_text_from_pdf(image):
    """Extract text using Google Vision API"""
    try:
        # Convert image to bytes
        img_byte_arr = io.BytesIO()
        image.save(img_byte_arr, format='PNG', optimize=True, quality=95)
        img_byte_arr = img_byte_arr.getvalue()

        # Create Vision API request
        vision_image = vision.Image(content=img_byte_arr)
        image_context = vision.ImageContext(language_hints=['mr', 'hi', 'eng'])

        # Perform OCR
        response = client.document_text_detection(
            image=vision_image,
            image_context=image_context
        )

        if response.error.message:
            raise Exception(f'Vision API Error: {response.error.message}')

        return response.full_text_annotation.text if response.full_text_annotation else ""

    except Exception as e:
        print(f"OCR Error: {e}")
        return ""


def extract_keywords(text, max_words=3):
    """Extract subject keywords with better Marathi support"""
    # Enhanced stop words including more Marathi terms
    stop_words = {
        'the', 'and', 'or', 'but', 'in', 'on', 'at', 'to', 'for', 'of', 'with',
        'का', 'की', 'के', 'में', 'पर', 'से', 'को', 'और', 'है', 'में',  # Hindi
        'चा', 'ची', 'चे', 'मध्ये', 'वर', 'पासून', 'आणि', 'या', 'ते', 'ला', 'ने',  # Marathi
        'शासन', 'कार्यालय', 'विभाग', 'मंत्रालय', 'संदर्भ', 'दिनांक'  # Common govt terms
    }

    # Look for "विषय" pattern specifically
    vishay_patterns = [
        r'विषय\s*[:।]\s*(.+?)(?:\n|।|\.)',  # विषय: content
        r'विषय\s*[-—]\s*(.+?)(?:\n|।|\.)',   # विषय - content
        r'विषय\s+(.+?)(?:\n|।|\.|,)',        # विषय content
    ]

    # Try to find subject using विषय patterns first
    for pattern in vishay_patterns:
        match = re.search(pattern, text, re.IGNORECASE)
        if match:
            subject_line = match.group(1).strip()
            if subject_line and len(subject_line) > 2:
                # Clean and extract meaningful words from subject line
                subject_words = re.findall(r'\b[A-Za-z\u0900-\u097F]{3,}\b', subject_line)
                filtered_words = [w for w in subject_words if w.lower() not in stop_words]
                if filtered_words:
                    return filtered_words[:max_words]

    # Fallback: Extract from first few lines if विषय not found
    lines = text.split('\n')[:15]  # First 15 lines
    words = []

    for line in lines:
        # Skip common headers
        if any(skip in line.lower() for skip in ['शासन', 'सरकार', 'government', 'कार्यालय']):
            continue

        line_words = re.findall(r'\b[A-Za-z\u0900-\u097F]{3,}\b', line)
        words.extend(line_words[:3])  # Take first 3 words per line

    # Filter meaningful words
    meaningful_words = []
    seen = set()
    for word in words:
        word_lower = word.lower()
        if (word_lower not in stop_words and
            word_lower not in seen and
            len(word) >= 3):
            meaningful_words.append(word)
            seen.add(word_lower)
            if len(meaningful_words) >= max_words:
                break

    return meaningful_words

def extract_document_date(ocr_text):
    """
    Extracts the document date from OCR text using context-aware patterns.
    Supports Devanagari and English numerals and several separators.
    """
    date_patterns = [
        r'(\d{1,2}[\/\-\.]\d{1,2}[\/\-\.]\d{2,4})',  # English digits
        r'([०१२३४५६७८९]{1,2}[\/\-\.][०१२३४५६७८९]{1,2}[\/\-\.][०१२३४५६७८९]{2,4})',  # Devanagari digits
    ]
    date_line_keywords = ['दिनांक', 'Date', 'Dated', 'तारीख', 'Tarikh']

    lines = ocr_text.split('\n')
    # Look for dates near keywords
    for idx, line in enumerate(lines):
        for word in date_line_keywords:
            if word in line:
                # Same line
                for pat in date_patterns:
                    found = re.search(pat, line)
                    if found:
                        return found.group()
                # Next 2 lines
                for offset in [1, 2]:
                    if idx + offset < len(lines):
                        for pat in date_patterns:
                            found = re.search(pat, lines[idx + offset])
                            if found:
                                return found.group()
    # Fallback: search header region
    for line in lines[:15]:
        for pat in date_patterns:
            found = re.search(pat, line)
            if found:
                return found.group()
    return ""


def extract_full_vishay_text(text):
    """
    Extracts the full block of text following "विषय" until the next major section (like संदर्भ or महोदय)
    or the end of the document.
    """
    # Pattern to find "विषय" and capture everything after it
    pattern_start = r'(?:^|\n)\s*विषय\s*[:।\-–—]?\s*(.+)'
    match_start = re.search(pattern_start, text)

    if not match_start:
        # Fallback: check for 'subject' in English if text is English
        pattern_start_eng = r'(?:^|\n)\s*subject\s*[:\-–—]?\s*(.+)'
        match_start_eng = re.search(pattern_start_eng, text, flags=re.I)
        if not match_start_eng:
            return None # Subject not found

        # If English subject found, set start index and adjust text
        start_index = match_start_eng.start(1)
        subject_text = text[start_index:].strip()
        # Look for common English section headers to mark the end
        end_match = re.search(r'(?:^|\n)\s*(Reference|Sir|Madam|Dear|Yours)', subject_text, flags=re.I)

    else:
        # If Marathi/Hindi विषय found, set start index and adjust text
        start_index = match_start.start(1)
        subject_text = text[start_index:].strip()
        # Look for common Marathi/Hindi section headers to mark the end
        end_match = re.search(r'(?:^|\n)\s*(संदर्भ|महोदय|महोदया)', subject_text)


    if end_match:
        # If an end pattern is found, truncate the text at that point
        end_index = end_match.start()
        return subject_text[:end_index].strip()
    else:
        # If no end pattern is found, return the rest of the text
        return subject_text.strip()


def clean_filename(text):
    # ...
    text = re.sub(r"[\/.\-]", "_", text)
    text = re.sub(r"_+", "_", text)
    return text


# ── Automatic Confusion Map Generation (synthetic) ──
# Build synthetic Devanagari date strings and get Vision OCR confusions
def build_confusion_map(client, fonts, n_samples=1000):
    """
    client: Google Vision client
    fonts: list of font-file paths on Colab (e.g. Devanagari .ttf)
    """
    # generate many random dates in Devanagari
    true_digits = []
    ocr_digits  = []
    for _ in range(n_samples):
        d = np.random.randint(1,29)
        m = np.random.randint(1,13)
        y = np.random.choice([2023,2024,2025])
        date_str = f"{d:02d}/{m:02d}/{y}"
        # render date_str into image with random font
        font = ImageFont.truetype(np.random.choice(fonts), size=48)
        img = Image.new("RGB", (300,80), "white")
        ImageDraw.Draw(img).text((10,10), date_str.translate(str.maketrans("0123456789","०१२३४५६७८९")), font=font, fill="black")
        # OCR
        from io import BytesIO
        buf = BytesIO(); img.save(buf, format="PNG"); buf.seek(0)
        vision_img = {"content": buf.read()}
        resp = client.text_detection(image=vision_img)
        text = resp.text_annotations[0].description.strip()
        # align digits
        for t,o in zip(date_str, text):
            if t.isdigit() and o.isdigit():
                true_digits.append(int(t))
                ocr_digits.append(int(o))
    # build matrix
    C = np.zeros((10,10), int)
    for t,o in zip(true_digits, ocr_digits):
        C[t, o] += 1
    # map top-3 confusers
    return { d: [int(c) for c in C[d].argsort()[::-1] if c!=d][:3] for d in range(10) }

# Call once at startup to get conf_map
# point to any Devanagari fonts uploaded in Colab, e.g. in /usr/share/fonts/truetype
fonts = ["/usr/share/fonts/truetype/fonts-gujr-extra/Lohit-Gujarati.ttf"] # Using a font from fonts-indic
conf_map = build_confusion_map(client, fonts) # Using the existing client

# Validators for date & amount
_date_rx = re.compile(r'^(\d{1,2})[\/\-\.](\d{1,2})[\/\-\.](\d{2,4})$')
_amt_rx  = re.compile(r'^(₹|रु)\s?([\d,]+)$')

def _valid_date(tok):
    m = _date_rx.match(tok)
    if not m: return False
    d,mn,y = map(int, m.groups())
    y += 2000 if y<50 else 0
    return 1<=d<=31 and 1<=mn<=12 and 1900<=y<=2050

def _valid_amount(tok):
    m = _amt_rx.match(tok)
    if not m: return False
    digits = m.group(2).replace(",", "")
    return int(digits)%5==0

def repair_token(tok):
    if not any(ch.isdigit() for ch in tok):
        return tok
    for i,ch in enumerate(tok):
        if ch.isdigit():
            for alt in conf_map[int(ch)]:
                cand = tok[:i] + str(alt) + tok[i+1:]
                if _valid_date(cand) or _valid_amount(cand):
                    return cand
    return tok

def numeric_postprocess(text):
    parts = re.split(r'(\s+)', text)
    return ''.join(repair_token(p) for p in parts)


print("✅ All processing functions ready!")

✅ All processing functions ready!


In [55]:
import numpy as np
import re

# ── Automatic Confusion Map Generation (synthetic) ──
# Build synthetic Devanagari date strings and get Vision OCR confusions
def build_confusion_map(client, fonts, n_samples=1000):
    """
    client: Google Vision client
    fonts: list of font-file paths on Colab (e.g. Devanagari .ttf)
    """
    # generate many random dates in Devanagari
    true_digits = []
    ocr_digits  = []
    for _ in range(n_samples):
        d = np.random.randint(1,29)
        m = np.random.randint(1,13)
        y = np.random.choice([2023,2024,2025])
        date_str = f"{d:02d}/{m:02d}/{y}"
        # render date_str into image with random font
        from PIL import Image, ImageDraw, ImageFont
        font = ImageFont.truetype(np.random.choice(fonts), size=48)
        img = Image.new("RGB", (300,80), "white")
        ImageDraw.Draw(img).text((10,10), date_str.translate(str.maketrans("0123456789","०१२३४५६७८९")), font=font, fill="black")
        # OCR
        from io import BytesIO
        buf = BytesIO(); img.save(buf, format="PNG"); buf.seek(0)
        vision_img = {"content": buf.read()}
        resp = client.text_detection(image=vision_img)
        text = resp.text_annotations[0].description.strip()
        # align digits
        for t,o in zip(date_str, text):
            if t.isdigit() and o.isdigit():
                true_digits.append(int(t))
                ocr_digits.append(int(o))
    # build matrix
    C = np.zeros((10,10), int)
    for t,o in zip(true_digits, ocr_digits):
        C[t, o] += 1
    # map top-3 confusers
    return { d: [int(c) for c in C[d].argsort()[::-1] if c!=d][:3] for d in range(10) }

# Call once at startup to get conf_map
from google.cloud import vision
vision_client = vision.ImageAnnotatorClient()
# point to any Devanagari fonts uploaded in Colab, e.g. in /usr/share/fonts/truetype
fonts = ["/usr/share/fonts/truetype/fonts-gujr-extra/Lohit-Gujarati.ttf"] # Using a font from fonts-indic
conf_map = build_confusion_map(vision_client, fonts)

# Validators for date & amount
_date_rx = re.compile(r'^(\d{1,2})[\/\-\.](\d{1,2})[\/\-\.](\d{2,4})$')
_amt_rx  = re.compile(r'^(₹|रु)\s?([\d,]+)$')

def _valid_date(tok):
    m = _date_rx.match(tok)
    if not m: return False
    d,mn,y = map(int, m.groups())
    y += 2000 if y<50 else 0
    return 1<=d<=31 and 1<=mn<=12 and 1900<=y<=2050

def _valid_amount(tok):
    m = _amt_rx.match(tok)
    if not m: return False
    digits = m.group(2).replace(",", "")
    return int(digits)%5==0

def repair_token(tok):
    if not any(ch.isdigit() for ch in tok):
        return tok
    for i,ch in enumerate(tok):
        if ch.isdigit():
            for alt in conf_map[int(ch)]:
                cand = tok[:i] + str(alt) + tok[i+1:]
                if _valid_date(cand) or _valid_amount(cand):
                    return cand
    return tok

def numeric_postprocess(text):
    parts = re.split(r'(\s+)', text)
    return ''.join(repair_token(p) for p in parts)

In [56]:
def process_single_pdf(pdf_path):
    """Process one PDF and return extracted header data"""
    try:
        print(f"🔄 Processing: {os.path.basename(pdf_path)}")

        # Convert to image
        pages = convert_from_path(pdf_path, first_page=1, last_page=1, dpi=300)
        if not pages:
            return {"error": "Failed to convert PDF to image"}

        print("  ✓ PDF converted to image")

        # Preprocess image
        processed_image = preprocess_image(pages[0])
        print("  ✓ Image preprocessed")

        # Extract text
        text = extract_text_from_pdf(processed_image)
        if not text:
            return {"error": "No text extracted"}

        print(f"  ✓ Text extracted ({len(text)} characters)")

        # Extract values
        header_date = extract_accurate_header_date(text)
        vishay_line = extract_full_vishay_text(text) # Changed from extract_vishay_line

        # Return only what’s needed
        return {
            "success": True,
            "original_file": Path(pdf_path).name,
            "header_date": header_date,
            "vishay": vishay_line,
            "header_text": text[:800] + "..." if len(text) > 800 else text
        }

    except Exception as e:
        return {"error": f"Processing failed: {str(e)}"}

In [57]:
from google.colab import files
import shutil
from datetime import datetime
import re
import google.generativeai as genai
from google.colab import userdata

# 2. Utility: Convert Devanagari digits to English
def convert_devanagari_to_english(text):
    devanagari_digits = "०१२३४५६७८९"
    eng_digits = "0123456789"
    return text.translate(str.maketrans(devanagari_digits, eng_digits))

# 3. Utility: Summarize subject using Gemini API
def summarize_vishay_with_gemini(vishay_text, gemini_model):
    if not vishay_text or len(vishay_text.strip()) < 4:
        return "Document"
    prompt = (
        "तुमचा कार्य हा PDF मधून मिळालेला विषय ओळखून त्याचं ३ ते ७ शब्दांत सारांश तयार करणे आहे. "
        "कृपया मूळ भाषेतच उत्तर द्या. इंग्रजी अथवा ट्रान्सलिटरेशनमध्ये उत्तर देऊ नका.\n"
        f"\nविषय: {vishay_text}\n\n"
        "✅ उत्तरात केवळ सारांश लिहा. इतर काही नाही."
    )
    try:
        response = gemini_model.generate_content(prompt)
        if response and hasattr(response, "text") and response.text:
            summary = response.text.strip()
            return summary or "Document"
        else:
            print("❌ Gemini API returned an empty/invalid response.")
            return "Document"
    except Exception as e:
        print(f"❌ Gemini summarization failed: {e}")
        return "Document"

# 4. Utility: Clean filename for filesystem safety, preserving language/script
def clean_filename(text):
    # Safely replaces / . - with underscores (hyphen last, no ranges)
    text = re.sub(r"[/.\-]", "_", text)
    text = re.sub(r"_+", "_", text)
    return text.strip('_')[:100]

# 5. Main flow: Upload, process, summarize, rename
# print("📤 Upload a PDF to process and rename:")
# uploaded_pdf = files.upload()

# if uploaded_pdf:
#     pdf_filename = list(uploaded_pdf.keys())[0]
#     result = process_single_pdf(pdf_filename)

#     if result.get("success"):
#         # Extract full (multi-line) विषय block
#         subject_raw = extract_full_vishay_text(result['header_text'])
#         summarized_subject = summarize_vishay_with_gemini(subject_raw, gemini_model)

#         # Fallback logic if Gemini returns nothing
#         if not summarized_subject or summarized_subject == "Document":
#             if result.get('vishay'):
#                 words = re.findall(r'\b[\u0900-\u097F\w]{3,}\b', result['vishay'])
#                 summarized_subject = "_".join(words[:4]) if words else "Document"
#             else:
#                 summarized_subject = "Document"

#         # Clean the subject for filename use
#         clean_subject = clean_filename(summarized_subject)

#         # Prepare and clean date for filename
#         date_raw = result['header_date'] if result['header_date'] else datetime.now().strftime("%Y%m%d")
#         date_eng = convert_devanagari_to_english(date_raw)
#         date_formatted = re.sub(r"[/.\-]", "_", date_eng)
#         date_formatted = re.sub(r"_+", "_", date_formatted)

#         # Build, trim, and save the final filename
#         final_filename = f"{clean_subject}_{date_formatted}.pdf"
#         final_filename = final_filename[:200]

#         shutil.copy2(pdf_filename, final_filename)
#         # files.download(final_filename)

#         print(f"\n✅ Downloaded as: {final_filename}")
#         print(f"🔖 विषय (Subject): {result.get('vishay', 'N/A')}")
#         print(f"📅 Header Date: {result.get('header_date', 'N/A')}")
#     else:
#         print(f"❌ Error: {result['error']}")

In [60]:
import os
from shutil import copy2
from datetime import datetime # Import datetime
import re # Import re for date formatting

# List all PDF files in input folder
pdf_files = [f for f in os.listdir(input_folder) if f.lower().endswith('.pdf')]

# Process only the first 10 files
for filename in pdf_files[:10]: # Limit to first 10 for testing
    full_path = os.path.join(input_folder, filename)
    result = process_single_pdf(full_path) # Use the existing function

    if result.get("success"):
        # Access extracted data directly from the result dictionary
        subject_raw = result.get('vishay', '') # Get vishay from result
        date_raw = result.get('header_date', datetime.now().strftime("%Y%m%d")) # Get date from result or use current date

        # Use the functions defined in cell ALAQBJf_-q_o
        summarized_subject = summarize_vishay_with_gemini(subject_raw, chat_model) # Use chat_model
        clean_subject = clean_filename(summarized_subject)

        # Prepare and clean date for filename
        date_eng = convert_devanagari_to_english(date_raw)
        date_formatted = re.sub(r"[/.\-]", "_", date_eng)
        date_formatted = re.sub(r"_+", "_", date_formatted)

        # Build, trim, and save the final filename
        final_filename = f"{clean_subject}_{date_formatted}.pdf"
        final_filename = final_filename[:200]

        output_file_path = os.path.join(output_folder, final_filename)
        try:
            copy2(full_path, output_file_path)
            print(f"✅ Saved as: {final_filename}\n")
        except Exception as e:
            print(f"❌ Error saving {filename} to {output_file_path}: {e}\n")


    else:
        print(f"❌ Error processing {filename}: {result['error']}\n")

🔄 Processing: 523 09042025.pdf
  ✓ PDF converted to image
  ✓ Image preprocessed
  ✓ Text extracted (1376 characters)
✅ Saved as: Document_09_04_2025.pdf

🔄 Processing: कंत्राटदारांच्या सहभागाविषयी व ईएसआय अनुपालनाविषयी.pdf
  ✓ PDF converted to image
  ✓ Image preprocessed
  ✓ Text extracted (1224 characters)
✅ Saved as: कंत्राटदारांचा सहभाग व ईएसआय अनुपालन माहिती_07_04_2025.pdf

🔄 Processing: 524 09042025.pdf
  ✓ PDF converted to image
  ✓ Image preprocessed
  ✓ Text extracted (1458 characters)
✅ Saved as: Document_09_04_2025.pdf

🔄 Processing: आधार नोडल.pdf
  ✓ PDF converted to image
  ✓ Image preprocessed
  ✓ Text extracted (1625 characters)
✅ Saved as: आधार नोडल अधिकाऱ्याची नियुक्ती करणे_08_04_2025.pdf

🔄 Processing: online-training-e-Office-computer-system-8-april_0.pdf
  ✓ PDF converted to image
  ✓ Image preprocessed
  ✓ Text extracted (1141 characters)
✅ Saved as: पुणे मनपात ई_ऑफिस प्रणालीचे ऑनलाईन प्रशिक्षण_06_08_2024.pdf

🔄 Processing: online-training-e-Office-computer-system