In [None]:
# 1. INSTALLATION: Tesseract and Python libraries

# Install system packages for Tesseract OCR
!apt-get update -qq
!apt-get install -y tesseract-ocr libtesseract-dev

# Install Python libraries
!pip install -q pytesseract opencv-python pillow matplotlib

import os
import cv2
import pytesseract
import numpy as np
from PIL import Image
from matplotlib import pyplot as plt
from google.colab.patches import cv2_imshow
from google.colab import files

# Point pytesseract to the Tesseract executable installed above
pytesseract.pytesseract.tesseract_cmd = "/usr/bin/tesseract"

print("Tesseract version:", pytesseract.get_tesseract_version())
print("Setup complete.")

## 1. Installation

Install Tesseract OCR and the required Python libraries, then import everything needed for OCR and image processing in Google Colab.

## 2. Image Upload

Upload one or more scanned answer sheet images from your local machine using `google.colab.files.upload()`. The first uploaded image will be used in the following steps.

In [None]:
# 2. IMAGE UPLOAD: let the user pick scanned answer sheet image(s)
uploaded = files.upload()

# The keys of the upload result are used as the image file names
image_names = [name for name in uploaded.keys()]
if len(image_names) == 0:
    raise RuntimeError("No image uploaded. Please run this cell again and upload an image.")

# Only the first uploaded file is carried forward as the active image
image_path = image_names[0]
print(f"Using image: {image_path}")

## 3. Display Original Image

Read the uploaded answer sheet image with OpenCV and display it using `cv2_imshow()` to verify that the correct file was loaded.

In [None]:
# 3. DISPLAY ORIGINAL IMAGE
image_bgr = cv2.imread(image_path)

# A None result means the image could not be read; stop with a clear error.
if image_bgr is not None:
    # The array is passed to cv2_imshow as loaded (OpenCV uses BGR channel order)
    cv2_imshow(image_bgr)
else:
    raise RuntimeError(f"Failed to read image at path: {image_path}")

## 4. Image Preprocessing

Convert the image to grayscale, reduce noise with Gaussian blur, and apply adaptive thresholding to create a high-contrast, binarized image suitable for OCR. The processed image will be displayed.

In [None]:
# 4. IMAGE PREPROCESSING: grayscale, blur, adaptive threshold
# Produces `thresh`, a single-channel binarized image that the OCR cell reads.

# Reload the original image to ensure we start from a clean copy
image_bgr = cv2.imread(image_path)

# cv2.imread returns None instead of raising when the file cannot be read.
# Fail here with a clear message rather than with an obscure error in cvtColor.
if image_bgr is None:
    raise RuntimeError(f"Failed to read image at path: {image_path}")

# Convert to grayscale
gray = cv2.cvtColor(image_bgr, cv2.COLOR_BGR2GRAY)

# Apply Gaussian blur (5x5 kernel) for noise reduction
blur = cv2.GaussianBlur(gray, (5, 5), 0)

# Adaptive thresholding for binarization
thresh = cv2.adaptiveThreshold(
    blur,
    255,                             # value given to pixels above the local threshold
    cv2.ADAPTIVE_THRESH_GAUSSIAN_C,  # threshold from a Gaussian-weighted neighbourhood mean
    cv2.THRESH_BINARY,
    11,                              # neighbourhood (block) size in pixels
    2,                               # constant subtracted from the weighted mean
)

cv2_imshow(thresh)

## 5. OCR Extraction

Run Tesseract OCR on the preprocessed (thresholded) image using `pytesseract.image_to_string` with configuration `--oem 3 --psm 6`. The extracted text is printed clearly for review.

In [None]:
# 5. OCR EXTRACTION: run Tesseract on the thresholded image from step 4
# The config string is handed to Tesseract unchanged (engine mode 3, page
# segmentation mode 6).
custom_config = r"--oem 3 --psm 6"

# The single-channel `thresh` array is passed to pytesseract as-is
extracted_text = pytesseract.image_to_string(thresh, config=custom_config)

# Header, blank line, then the recognised text
print("========== EXTRACTED TEXT ==========:\n", extracted_text, sep="\n")

## 6. Save and Download Extracted Text

Save the OCR output to a `.txt` file and download it to your machine using `google.colab.files.download()`. If the text-correction cell (section 7) has already been run, the corrected text is saved instead of the raw OCR text.

In [None]:
# 6. SAVE OUTPUT: write extracted or corrected text to file and download
output_filename = "extracted_text.txt"

# Walk the candidates in priority order: corrected text first, raw OCR text
# second. A candidate counts only if it exists, is a string, and is not blank.
for _candidate_name in ("corrected_text", "extracted_text"):
    _candidate = globals().get(_candidate_name)
    if isinstance(_candidate, str) and _candidate.strip():
        text_to_save = _candidate
        break
else:
    # Neither variable held usable text
    raise RuntimeError("No OCR text available to save. Run the OCR (and optional correction) cells first.")

with open(output_filename, mode="w", encoding="utf-8") as out_file:
    out_file.write(text_to_save)

print(f"Saved text to {output_filename}")

# Hand the saved file to the Colab download helper
files.download(output_filename)

## 7. Transformer-based Text Correction

Use a transformer-based sequence-to-sequence language model to analyse the OCR output and generate a corrected version of the extracted text. Run this cell after the OCR extraction cell (section 5); to save the corrected text, run the save cell (section 6) after this one.

In [None]:
# 7. BERT-STYLE TEXT CORRECTION USING A TRANSFORMER MODEL
# This cell refines `extracted_text` and produces `corrected_text`.
# Run the OCR extraction cell first so `extracted_text` is defined.

!pip install -q transformers torch
from transformers import AutoTokenizer, AutoModelForSeq2SeqLM, pipeline

model_name = "prithivida/grammar_error_correcter_v1"  # transformer-based text correction model
corrector = pipeline("text2text-generation", model=model_name, tokenizer=model_name)

if 'extracted_text' not in globals():
    raise RuntimeError("`extracted_text` is not defined. Run the OCR extraction cell first.")

input_text = extracted_text.strip()
if not input_text:
    raise ValueError("`extracted_text` is empty. Make sure OCR ran correctly.")

outputs = corrector(input_text, max_length=max(64, len(input_text.split()) * 3))
corrected_text = outputs[0].get("generated_text", outputs[0].get("translation_text", input_text))

print("=== ORIGINAL OCR TEXT ===\n")
print(input_text)
print("\n=== CORRECTED TEXT ===\n")
print(corrected_text)

# Deep Learning Handwriting OCR (TrOCR-based)

The following cells implement a handwriting-optimized OCR pipeline using Microsoft's TrOCR model, which usually gives much higher accuracy on handwritten answer sheets than classic Tesseract or EasyOCR.

## 8. Install TrOCR Dependencies

Install the required deep learning libraries and Hugging Face Transformers. Run this in Colab (GPU is recommended but not required for small images).

In [None]:
# 8. INSTALL REQUIRED LIBRARIES FOR TrOCR
!pip install -q transformers pillow torch torchvision torchaudio

print("Installed Transformers + PyTorch dependencies for TrOCR.")

## 9. Upload Answer Sheet Image (for TrOCR)

Upload the handwritten answer sheet you want to process with the TrOCR model.

In [None]:
# 9. UPLOAD ANSWER SHEET IMAGE (TrOCR)
from google.colab import files

uploaded_trocr = files.upload()

# An empty result means nothing was uploaded
if not uploaded_trocr:
    raise RuntimeError("No image uploaded. Please upload a handwritten answer sheet.")

# The first key of the upload result is used as the image path
trocr_img_path = next(iter(uploaded_trocr.keys()))
print(f"Uploaded for TrOCR: {trocr_img_path}")

## 10. Load and Preprocess Image for TrOCR

Convert the uploaded image to grayscale, enhance its contrast, and display it before feeding it into the TrOCR model.

In [None]:
# 10. LOAD IMAGE & PREPROCESS FOR TrOCR
# Produces `trocr_enhanced`, a grayscale, auto-contrasted PIL image that the
# TrOCR extraction cell reads.
from PIL import Image, ImageOps
import matplotlib.pyplot as plt

# Load image for TrOCR
trocr_image = Image.open(trocr_img_path)

# PIL does not apply the EXIF orientation tag on load, so photos taken with a
# phone can come in rotated. Normalise the orientation before any processing.
trocr_image = ImageOps.exif_transpose(trocr_image)

# Convert to grayscale and auto-contrast to help handwriting OCR
trocr_gray = ImageOps.grayscale(trocr_image)
trocr_enhanced = ImageOps.autocontrast(trocr_gray)

plt.imshow(trocr_enhanced, cmap="gray")
plt.axis("off")
plt.title("Preprocessed image for TrOCR")
# Render the figure explicitly so the cell does not echo the Text repr of plt.title
plt.show()

## 11. Load TrOCR Handwriting Model

Use Microsoft's `microsoft/trocr-small-handwritten` model, which is trained specifically for handwritten text.

In [None]:
# 11. LOAD TrOCR HANDWRITING MODEL
from transformers import TrOCRProcessor, VisionEncoderDecoderModel

# One checkpoint identifier is shared by the processor and the model
trocr_model_name = "microsoft/trocr-small-handwritten"

# Processor first, then the encoder-decoder model
processor = TrOCRProcessor.from_pretrained(trocr_model_name)
trocr_model = VisionEncoderDecoderModel.from_pretrained(trocr_model_name)

print(f"Loaded TrOCR model: {trocr_model_name}")

## 12. Perform TrOCR-based OCR Extraction

Run the TrOCR model on the preprocessed image to get high-quality handwriting text extraction.

In [None]:
# 12. PERFORM TrOCR-BASED OCR EXTRACTION
# Reads `trocr_enhanced`, `processor` and `trocr_model` from earlier cells and
# produces `trocr_text`.
import torch

# Use the GPU when one is available; otherwise stay on the CPU. The model and
# its input tensor must live on the same device.
trocr_device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
trocr_model.to(trocr_device)

# Upscale the preprocessed image to help the model read small handwriting
# NOTE(review): the processor may resize its input to a fixed resolution, in
# which case this upscale has little effect - confirm against the checkpoint's
# preprocessing config.
# NOTE(review): TrOCR checkpoints are presumably trained on single text-line
# images; a full page may need to be split into lines first - verify.
scale_factor = 2
w, h = trocr_enhanced.size
trocr_up = trocr_enhanced.resize((w * scale_factor, h * scale_factor))

# Ensure image is in RGB format for the processor
trocr_rgb = trocr_up.convert("RGB")

# Prepare image for model and place it on the model's device
trocr_pixel_values = processor(images=trocr_rgb, return_tensors="pt").pixel_values
trocr_pixel_values = trocr_pixel_values.to(trocr_device)

# Generate text with beam search; no gradients are needed for inference
with torch.no_grad():
    generated_ids = trocr_model.generate(
        trocr_pixel_values,
        num_beams=5,
        max_length=256,
        early_stopping=True,
    )

# Bring the token ids back to the CPU before decoding them to text
trocr_text = processor.batch_decode(generated_ids.cpu(), skip_special_tokens=True)[0]

print("===== TrOCR EXTRACTED TEXT =====\n")
print(trocr_text)
print("\n===============================")

## 13. Save TrOCR Output to File

Save the TrOCR extracted text to a .txt file and download it from Colab.

In [None]:
# 13. SAVE TrOCR OUTPUT TO FILE
from google.colab import files as colab_files

trocr_output_filename = "deep_ocr_output.txt"

# Write the TrOCR text as UTF-8, replacing any existing file of the same name
with open(trocr_output_filename, mode="w", encoding="utf-8") as trocr_out:
    trocr_out.write(trocr_text)

print(f"Saved TrOCR text to {trocr_output_filename}")

# Hand the saved file to the Colab download helper
colab_files.download(trocr_output_filename)

## 14. Optional: Language-model Refinement of TrOCR Text

Optionally refine `trocr_text` using a text-correction model (the same grammar-correction model used in section 7) to fix spelling and grammar issues in the raw OCR output.

In [None]:
# 14. OPTIONAL: REFINE TrOCR TEXT WITH A TEXT-CORRECTION MODEL
!pip install -q transformers torch
from transformers import AutoTokenizer, AutoModelForSeq2SeqLM, pipeline

if 'trocr_text' not in globals():
    raise RuntimeError("`trocr_text` is not defined. Run the TrOCR extraction cell first.")

lm_model_name = "prithivida/grammar_error_correcter_v1"
trocr_corrector = pipeline("text2text-generation", model=lm_model_name, tokenizer=lm_model_name)

raw_trocr = trocr_text.strip()
if not raw_trocr:
    raise ValueError("`trocr_text` is empty. Make sure TrOCR ran correctly.")

lm_outputs = trocr_corrector(raw_trocr, max_length=max(64, len(raw_trocr.split()) * 3))
trocr_text_corrected = lm_outputs[0].get("generated_text", lm_outputs[0].get("translation_text", raw_trocr))

print("=== RAW TrOCR TEXT ===\n")
print(raw_trocr)
print("\n=== CORRECTED TrOCR TEXT ===\n")
print(trocr_text_corrected)