# **Multilingual Meme OCR Pipeline**


---



# Project Overview

This notebook implements an Optical Character Recognition (OCR) pipeline specifically optimized for meme content containing Bangla, English, and Banglish (Bengali text written in Roman script). It utilizes a fine-tuned Vision-Language Model (VLM) to extract text and a heuristic-based classifier to determine the language.

# Key Features
**Model:** nanonets/Nanonets-OCR-s (Based on Qwen2.5-VL).

**Optimization:** Uses 4-bit Quantization (NF4) via bitsandbytes to run efficiently on Kaggle's Tesla T4 (16GB VRAM) without Out-Of-Memory (OOM) errors.

**Memory Management:** Implements aggressive garbage collection and resolution capping (max 768px) to maintain stability during batch processing.

**Language Classification:** Custom logic to distinguish between Bangla (Unicode), English (Dictionary check), and Banglish.

# Output
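The notebook generates a CSV file with the following columns:
- **image_filename**: The file name of the meme image the row was produced from.
- **extracted_text**: The text extracted from the meme image using OCR.
- **Language**: The detected language classification (Bangla, English, Banglish, or Unknown when no usable text was extracted).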

# Pipeline Architecture
1. Environment Setup
The notebook installs the latest development version of transformers directly from GitHub, together with qwen-vl-utils, to support the Qwen2.5-VL architecture.

Libraries: torch, transformers, accelerate, bitsandbytes, pillow.

2. Language Classification Logic
A custom function classify_language(text) categorizes the extracted text:

Bangla: Detected via Unicode range (\u0980 to \u09FF).

English: Detected by calculating the density of common English stop words against the total word count.

Banglish: If the text uses Roman characters but lacks sufficient English vocabulary density, it is classified as Banglish.

In [None]:
!pip install -q git+https://github.com/huggingface/transformers.git accelerate qwen-vl-utils pandas torch pillow bitsandbytes

In [None]:
import os
import gc
import torch
import pandas as pd
from PIL import Image
from pathlib import Path
from tqdm.auto import tqdm
from transformers import Qwen2_5_VLForConditionalGeneration, AutoProcessor, BitsAndBytesConfig

# --- CONFIGURATION ---
# Root directory that is searched recursively for meme images to OCR.
IMAGE_DIR = "/kaggle/input/poli-meme-decode-cuet-cse-fest/PoliMemeDecode/Test/Image"
# Directory that receives the checkpoint CSV and the final results CSV.
OUTPUT_DIR = "/kaggle/working"
# Model identifier passed to from_pretrained() for both the model and its processor.
MODEL_PATH = "nanonets/Nanonets-OCR-s"

# ======================================================
# 1. LANGUAGE CLASSIFIER
# ======================================================
def classify_language(text) -> str:
    """Classify OCR output as "Bangla", "English", "Banglish" or "Unknown".

    Rules, applied in order:

    1. Empty input, or fewer than 2 characters after trimming whitespace,
       yields "Unknown".
    2. Any character in the Bengali Unicode block (U+0980-U+09FF) yields
       "Bangla".
    3. Otherwise the text is split on whitespace and the share of tokens
       found in a small list of common English words decides the label:
       below 20% is "Banglish", anything else is "English".

    Tokens that are empty once surrounding punctuation is stripped (for
    example "!!!" or "...") are ignored, so decorative punctuation neither
    dilutes the English ratio nor turns punctuation-only text into
    "Banglish".

    Args:
        text: The extracted text; may be None or empty.

    Returns:
        One of "Bangla", "English", "Banglish", "Unknown".
    """
    if not text or len(text.strip()) < 2:
        return "Unknown"

    # A single Bengali-script character is enough to call the text Bangla.
    if any('\u0980' <= char <= '\u09FF' for char in text):
        return "Bangla"

    common_english = {
        'the', 'is', 'a', 'an', 'and', 'to', 'in', 'of', 'for', 'it',
        'you', 'me', 'he', 'she', 'that', 'this', 'what', 'when', 'why',
        'how', 'good', 'bad', 'day', 'night', 'lol', 'pov', 'bro', 'meme',
    }

    stripped = (w.lower().strip(".,!?\"'") for w in text.split())
    # Drop tokens that were pure punctuation; they are not words.
    words = [w for w in stripped if w]
    if not words:
        return "Unknown"

    english_word_count = sum(1 for w in words if w in common_english)
    if (english_word_count / len(words)) < 0.2:
        return "Banglish"
    return "English"

# ======================================================
# 2. LOAD MODEL IN 4-BIT MODE (The Fix for OOM)
# ======================================================
print("Loading Model in 4-bit quantization...")

# 4-bit NF4 weights with double quantization; compute runs in float16.
# Author's estimate: this reduces VRAM usage from ~7GB to ~2.5GB (not verified here).
bnb_config = BitsAndBytesConfig(
    load_in_4bit=True,
    bnb_4bit_use_double_quant=True,
    bnb_4bit_quant_type="nf4",
    bnb_4bit_compute_dtype=torch.float16,
)

# Only the two from_pretrained() calls can fail, so only they sit in the try.
try:
    model = Qwen2_5_VLForConditionalGeneration.from_pretrained(
        MODEL_PATH,
        quantization_config=bnb_config,  # load the weights quantized to 4-bit
        attn_implementation="eager",     # plain attention implementation (author: T4 safe)
        device_map="auto",
    )
    processor = AutoProcessor.from_pretrained(MODEL_PATH)
except Exception as e:
    print(f"Load Error: {e}")
    # Bare raise re-raises the active exception with its original traceback.
    raise
else:
    print("Model loaded! VRAM usage should be very low now.")

# ======================================================
# 3. PROCESSING LOOP WITH RESOLUTION LIMITS
# ======================================================
def process_image(image_path, max_dimension=768, max_new_tokens=512):
    """Run OCR on a single meme image and return the extracted text.

    The image is loaded as RGB, shrunk in place so that its longest side is
    at most ``max_dimension`` pixels, and sent to the module-level ``model``
    through the module-level ``processor`` using a fixed extraction prompt.
    Decoding is greedy (``do_sample=False``).

    Args:
        image_path: Path to the image file (``str`` or ``pathlib.Path``).
        max_dimension: Upper bound for the longest image side, in pixels.
            Larger images are downscaled with Lanczos resampling; smaller
            ones are left untouched.
        max_new_tokens: Maximum number of tokens the model may generate.

    Returns:
        The decoded model output with surrounding whitespace stripped, or an
        empty string if anything went wrong. Failures are reported with
        ``print`` and never propagate, so one bad image cannot abort a long
        run.
    """
    # Accept plain strings too, so the error message below can always use .name.
    image_path = Path(image_path)

    try:
        # convert() returns a fully loaded copy, so the file handle can be
        # released as soon as the with-block exits.
        with Image.open(image_path) as source:
            image = source.convert("RGB")

        # Cap the resolution to keep the vision encoder's memory use bounded.
        if max(image.size) > max_dimension:
            image.thumbnail((max_dimension, max_dimension), Image.LANCZOS)

        prompt = "Extract text from this meme. If it is Banglish, write it exactly as shown. Do not translate."

        messages = [
            {"role": "user", "content": [{"type": "image", "image": image}, {"type": "text", "text": prompt}]}
        ]

        # Prepare inputs
        text_input = processor.apply_chat_template(messages, tokenize=False, add_generation_prompt=True)
        inputs = processor(text=[text_input], images=[image], padding=True, return_tensors="pt").to(model.device)

        # Generate
        with torch.no_grad():
            generated_ids = model.generate(
                **inputs,
                max_new_tokens=max_new_tokens,
                do_sample=False
            )

        # generate() returns prompt + completion; keep only the completion.
        generated_ids_trimmed = [
            out_ids[len(in_ids):]
            for in_ids, out_ids in zip(inputs.input_ids, generated_ids)
        ]

        output_text = processor.batch_decode(
            generated_ids_trimmed, skip_special_tokens=True, clean_up_tokenization_spaces=False
        )[0]

        # Drop every reference to the tensors (the trimmed slices included)
        # before the cache is emptied in the finally clause.
        del inputs, generated_ids, generated_ids_trimmed, image

        return output_text.strip()

    except Exception as e:
        # Deliberate best-effort: log a short message and carry on.
        print(f"Error on {image_path.name}: {str(e)[:100]}")
        return ""

    finally:
        # Runs on both the success and the failure path.
        torch.cuda.empty_cache()

# ======================================================
# 4. EXECUTION
# ======================================================
# Collect every supported image file under IMAGE_DIR (recursively), in a
# stable sorted order. is_file() skips directories that merely happen to
# carry an image-like suffix.
IMAGE_SUFFIXES = {'.jpg', '.jpeg', '.png', '.webp'}
image_files = sorted(
    p for p in Path(IMAGE_DIR).rglob('*')
    if p.is_file() and p.suffix.lower() in IMAGE_SUFFIXES
)
print(f"Found {len(image_files)} images")

# Fixed column order, so that even a run with zero images writes a CSV
# that still has a header row.
RESULT_COLUMNS = ["image_filename", "extracted_text", "Language"]

results = []
checkpoint_path = f"{OUTPUT_DIR}/checkpoint.csv"

for i, img_path in enumerate(tqdm(image_files)):
    text = process_image(img_path)
    lang = classify_language(text)

    results.append({
        "image_filename": img_path.name,
        "extracted_text": text,
        "Language": lang
    })

    # Save frequently: rewrite the checkpoint on every 10th iteration
    # (starting with the first). Same encoding as the final file so both
    # CSVs open identically.
    if i % 10 == 0:
        pd.DataFrame(results, columns=RESULT_COLUMNS).to_csv(
            checkpoint_path, index=False, encoding='utf-8-sig'
        )
        gc.collect()

# Final Save
df = pd.DataFrame(results, columns=RESULT_COLUMNS)
df.to_csv(f"{OUTPUT_DIR}/final_meme_texts.csv", index=False, encoding='utf-8-sig')
print(f"Finished! Saved to {OUTPUT_DIR}/final_meme_texts.csv")