In [None]:
!pip install transformers
!pip install torchvision

In [None]:
!pip install opencv-python

In [None]:
!pip install pdf2image

In [None]:
!pip install pymupdf

In [None]:
import fitz  # PyMuPDF
import torch
import cv2
import numpy as np
from PIL import Image
from transformers import TrOCRProcessor, VisionEncoderDecoderModel
import matplotlib.pyplot as plt

In [None]:
# Choose the compute device first: CUDA when torch reports it, otherwise CPU.
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

# Large handwritten TrOCR checkpoint. The processor turns images into pixel
# tensors and decodes generated ids; the model produces those ids.
processor = TrOCRProcessor.from_pretrained("microsoft/trocr-large-handwritten")
model = VisionEncoderDecoderModel.from_pretrained("microsoft/trocr-large-handwritten")

# Kept as the trailing expression of the cell, exactly like the original.
model.to(device)


In [None]:
def extract_lines_from_image(pil_image):
    """Split a page image into line crops ordered from top to bottom.

    The page is Otsu-thresholded (inverted), dilated with a wide, 5 px tall
    rectangular kernel so that ink on one text line merges into a single blob,
    and the bounding box of every external contour is cropped from the
    original (un-thresholded) pixels.

    Args:
        pil_image: Page image; converted with ``np.array`` and passed to an
            RGB->gray conversion, so it is expected to be RGB.

    Returns:
        A list of RGB ``PIL.Image`` crops, sorted by the top ``y`` coordinate
        of their bounding box. Boxes with ``h <= 20`` or ``w <= 50`` are
        dropped.
    """
    page = np.array(pil_image)
    grayscale = cv2.cvtColor(page, cv2.COLOR_RGB2GRAY)
    _, ink_mask = cv2.threshold(grayscale, 0, 255, cv2.THRESH_BINARY_INV + cv2.THRESH_OTSU)

    # Kernel width is 1/25 of the page width, but never less than 20 px.
    smear_width = max(20, page.shape[1] // 25)
    smear_kernel = cv2.getStructuringElement(cv2.MORPH_RECT, (smear_width, 5))
    blobs = cv2.dilate(ink_mask, smear_kernel, iterations=1)

    contours, _ = cv2.findContours(blobs, cv2.RETR_EXTERNAL, cv2.CHAIN_APPROX_SIMPLE)

    # Stable sort on the box's y coordinate keeps the original ordering.
    boxes = sorted((cv2.boundingRect(c) for c in contours), key=lambda box: box[1])

    return [
        Image.fromarray(page[top:top + height, left:left + width]).convert("RGB")
        for (left, top, width, height) in boxes
        if height > 20 and width > 50
    ]


In [None]:
def preprocess_line_image(line_img):
    """Clean up one cropped text line before it is passed to the OCR model.

    Steps, in order: grayscale -> CLAHE contrast enhancement -> non-local-means
    denoising -> 3x3 sharpening -> inverted mean adaptive threshold -> 4x
    upscale.

    Args:
        line_img: Line crop; converted with ``np.array`` and treated as RGB.

    Returns:
        An RGB ``PIL.Image`` of the thresholded line at four times the input
        width and height.
    """
    gray = cv2.cvtColor(np.array(line_img), cv2.COLOR_RGB2GRAY)

    # Local contrast enhancement followed by denoising.
    equalised = cv2.createCLAHE(clipLimit=2.0, tileGridSize=(8, 8)).apply(gray)
    smooth = cv2.fastNlMeansDenoising(equalised, h=15)

    # 3x3 sharpening kernel (centre weight 5, four neighbours -1).
    sharpen_kernel = np.array([[0, -1, 0],
                               [-1, 5, -1],
                               [0, -1, 0]])
    crisp = cv2.filter2D(smooth, -1, sharpen_kernel)

    # NOTE(review): THRESH_BINARY_INV maps pixels darker than their local mean
    # to 255, i.e. strokes become light on a dark background. Confirm this is
    # the polarity the OCR model expects.
    mask = cv2.adaptiveThreshold(
        crisp, 255, cv2.ADAPTIVE_THRESH_MEAN_C, cv2.THRESH_BINARY_INV, 15, 8
    )

    # cv2.resize takes (width, height); scale both by 4.
    rows, cols = mask.shape[0], mask.shape[1]
    upscaled = cv2.resize(mask, (cols * 4, rows * 4))
    return Image.fromarray(upscaled).convert("RGB")


In [None]:
def run_ocr_on_page(pil_image, page_number):
    """Run line segmentation and OCR on one page and print the results.

    Uses the notebook-level ``processor``, ``model`` and ``device`` together
    with ``extract_lines_from_image`` and ``preprocess_line_image``.

    Args:
        pil_image: Page image handed to ``extract_lines_from_image``.
        page_number: Number shown in the printed page header.

    Returns:
        None. Output is printed: one header line, then one line per
        recognised text that passes the filter.
    """
    line_crops = extract_lines_from_image(pil_image)
    print(f"\n📄 Page {page_number} — Total Detected Lines: {len(line_crops)}")

    for line_no, crop in enumerate(line_crops, start=1):
        cleaned = preprocess_line_image(crop)

        encoded = processor(images=cleaned, return_tensors="pt")
        token_ids = model.generate(encoded.pixel_values.to(device), max_length=512)
        decoded = processor.batch_decode(token_ids, skip_special_tokens=True)[0]
        stripped = decoded.strip()

        # Skip likely gibberish: 3 characters or fewer, or no letters at all.
        if len(stripped) <= 3 or not any(ch.isalpha() for ch in decoded):
            continue
        print(f"🖋️ Line {line_no}: {stripped}")


In [None]:
pdf_path = "/kaggle/input/exam-paper/iot1 004.pdf"  # change this!

# Open the PDF in a context manager so the document handle is released once
# every page has been processed (previously it was never closed).
with fitz.open(pdf_path) as doc:
    for i in range(len(doc)):
        page = doc[i]
        # Rasterise the page at 300 dpi and wrap the pixmap's raw samples as an
        # RGB PIL image.
        pix = page.get_pixmap(dpi=300)
        pil_img = Image.frombytes("RGB", [pix.width, pix.height], pix.samples)

        # Page numbers shown to the user are 1-based.
        run_ocr_on_page(pil_img, i + 1)


In [None]:
!pip install pymupdf transformers opencv-python pyspellchecker --quiet


In [None]:
import fitz  # PyMuPDF
import cv2
import numpy as np
from PIL import Image
import torch
from transformers import TrOCRProcessor, VisionEncoderDecoderModel
from spellchecker import SpellChecker


In [None]:
# Compute device: CUDA when available, CPU otherwise.
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

# Same large handwritten checkpoint for both the processor and the model.
processor = TrOCRProcessor.from_pretrained("microsoft/trocr-large-handwritten")
model = VisionEncoderDecoderModel.from_pretrained("microsoft/trocr-large-handwritten")
model.to(device)

# Spell checker used by correct_spelling() to post-process recognised text.
spell = SpellChecker()


In [None]:
def preprocess_line_image(line_img):
    """Prepare a cropped text line for OCR.

    Note: this redefines the function of the same name from an earlier cell
    with the same processing steps.

    The image goes through grayscale conversion, CLAHE, non-local-means
    denoising, a 3x3 sharpening filter, an inverted mean adaptive threshold
    and finally a 4x resize.

    Args:
        line_img: Line crop; converted with ``np.array`` and treated as RGB.

    Returns:
        RGB ``PIL.Image`` holding the thresholded, 4x-enlarged line.
    """
    stage = cv2.cvtColor(np.array(line_img), cv2.COLOR_RGB2GRAY)

    # Contrast enhancement, then denoising.
    stage = cv2.createCLAHE(clipLimit=2.0, tileGridSize=(8, 8)).apply(stage)
    stage = cv2.fastNlMeansDenoising(stage, h=15)

    # Sharpen with a 3x3 kernel.
    sharpen_kernel = np.array([[0, -1, 0], [-1, 5, -1], [0, -1, 0]])
    stage = cv2.filter2D(stage, -1, sharpen_kernel)

    # Inverted adaptive threshold (block size 15, constant 8).
    stage = cv2.adaptiveThreshold(
        stage, 255, cv2.ADAPTIVE_THRESH_MEAN_C, cv2.THRESH_BINARY_INV, 15, 8
    )

    # cv2.resize expects (width, height).
    target_size = (stage.shape[1] * 4, stage.shape[0] * 4)
    return Image.fromarray(cv2.resize(stage, target_size)).convert("RGB")


In [None]:
def extract_lines_from_image(pil_image):
    """Split a page image into line crops ordered from top to bottom.

    The page is Otsu-thresholded (inverted) and dilated with a wide, 5 px tall
    rectangular kernel so the ink of one text line merges into one blob; each
    external contour's bounding box is then cropped from the original pixels.

    Args:
        pil_image: Page image; converted with ``np.array`` and treated as RGB.

    Returns:
        A list of RGB ``PIL.Image`` crops sorted by the top ``y`` coordinate
        of their bounding box. Boxes with ``h <= 20`` or ``w <= 50`` are
        dropped as noise.
    """
    img = np.array(pil_image)
    gray = cv2.cvtColor(img, cv2.COLOR_RGB2GRAY)
    _, binary = cv2.threshold(gray, 0, 255, cv2.THRESH_BINARY_INV + cv2.THRESH_OTSU)

    # Guard the kernel width: img.shape[1] // 25 evaluates to 0 for images
    # narrower than 25 px, which is not a usable structuring-element size.
    # The 20 px floor matches the earlier definition of this function in the
    # notebook.
    kernel_width = max(20, img.shape[1] // 25)
    kernel = cv2.getStructuringElement(cv2.MORPH_RECT, (kernel_width, 5))
    dilated = cv2.dilate(binary, kernel, iterations=1)

    contours, _ = cv2.findContours(dilated, cv2.RETR_EXTERNAL, cv2.CHAIN_APPROX_SIMPLE)
    lines = []

    # Sort by the top edge so the crops follow reading order.
    for cnt in sorted(contours, key=lambda c: cv2.boundingRect(c)[1]):
        x, y, w, h = cv2.boundingRect(cnt)
        if h > 20 and w > 50:
            cropped = img[y:y + h, x:x + w]
            lines.append(Image.fromarray(cropped).convert("RGB"))
    return lines


In [None]:
def correct_spelling(text):
    """Spell-correct ``text`` word by word with the notebook-level ``spell``.

    The text is split on whitespace; each word is replaced by
    ``spell.correction(word)`` unless that returns ``None``, in which case the
    original word is kept. The result is re-joined with single spaces.

    Args:
        text: Recognised text to correct.

    Returns:
        The corrected text as a single space-separated string.
    """
    corrected = []
    for word in text.split():
        # Look each word up once; the lookup used to run twice per word.
        suggestion = spell.correction(word)
        corrected.append(word if suggestion is None else suggestion)
    return ' '.join(corrected)

def run_ocr_on_page(pil_image, page_num):
    """OCR every detected line of a page and print spell-corrected text.

    Relies on the notebook-level ``processor``, ``model`` and ``device`` and on
    ``extract_lines_from_image``, ``preprocess_line_image`` and
    ``correct_spelling``.

    Args:
        pil_image: Page image handed to ``extract_lines_from_image``.
        page_num: Number shown in the printed page header.

    Returns:
        None. A header is printed, followed by one line per recognised text
        that is longer than 3 characters and contains at least one letter.
    """
    segments = extract_lines_from_image(pil_image)
    print(f"\n📄 Page {page_num} - Total Lines: {len(segments)}")

    for position, segment in enumerate(segments, start=1):
        prepared = preprocess_line_image(segment)
        model_input = processor(images=prepared, return_tensors="pt").pixel_values.to(device)
        output_ids = model.generate(model_input, max_length=512)
        raw = processor.batch_decode(output_ids, skip_special_tokens=True)[0].strip()

        # Drop very short or letter-free results before spell correction.
        if len(raw) <= 3 or not any(ch.isalpha() for ch in raw):
            continue
        print(f"🖋️ Line {position}: {correct_spelling(raw)}")


In [None]:
pdf_path = "/kaggle/input/exam-paper/iot1 004.pdf"  # Change this

# The context manager closes the PDF when the loop finishes; the document was
# previously left open.
with fitz.open(pdf_path) as doc:
    for i in range(len(doc)):
        page = doc[i]
        # Render at 300 dpi and build an RGB PIL image from the raw samples.
        pix = page.get_pixmap(dpi=300)
        pil_img = Image.frombytes("RGB", [pix.width, pix.height], pix.samples)
        # Pages are reported with 1-based numbers.
        run_ocr_on_page(pil_img, i + 1)


In [None]:
import fitz  # PyMuPDF
from PIL import Image
import os

pdf_path = "/kaggle/input/exam-paper/iot1 004.pdf"
output_dir = "page_images"
os.makedirs(output_dir, exist_ok=True)

# Render every page to a PNG. The context manager closes the PDF afterwards;
# the document was previously left open.
with fitz.open(pdf_path) as doc:
    for i in range(len(doc)):
        page = doc[i]
        pix = page.get_pixmap(dpi=300)
        img = Image.frombytes("RGB", [pix.width, pix.height], pix.samples)
        # Zero-padded, 1-based page number so a sorted listing follows page order.
        img.save(f"{output_dir}/page_{i+1:03}.png")


In [None]:
import cv2
import numpy as np

def segment_lines_from_page(image_path, save_dir, page_num):
    """Cut a saved page image into line crops and write them to ``save_dir``.

    The page is Otsu-thresholded (inverted) and dilated with a kernel spanning
    80% of the page width so each text line becomes one blob. Every blob's
    bounding box that is taller than 20 px and wider than 50 px is cropped
    from the original image and saved as
    ``page{page_num:03}_line{line_id:03}.png``, numbered from top to bottom.

    Args:
        image_path: Path of the page image to read with ``cv2.imread``.
        save_dir: Directory for the line crops; created if missing.
        page_num: Page number embedded in the output file names.

    Raises:
        FileNotFoundError: If ``image_path`` cannot be read as an image.
    """
    os.makedirs(save_dir, exist_ok=True)

    img = cv2.imread(image_path)
    if img is None:
        # cv2.imread reports failure by returning None rather than raising.
        raise FileNotFoundError(f"Could not read page image: {image_path}")
    gray = cv2.cvtColor(img, cv2.COLOR_BGR2GRAY)

    # Threshold for binary image
    _, binary = cv2.threshold(gray, 0, 255, cv2.THRESH_BINARY_INV + cv2.THRESH_OTSU)

    # Dilate horizontally to group lines. The width is clamped to at least
    # 1 px so a tiny image cannot produce a zero-width kernel.
    kernel_width = max(1, int(img.shape[1] * 0.8))
    kernel = cv2.getStructuringElement(cv2.MORPH_RECT, (kernel_width, 5))
    dilated = cv2.dilate(binary, kernel, iterations=1)

    # Find line contours
    contours, _ = cv2.findContours(dilated, cv2.RETR_EXTERNAL, cv2.CHAIN_APPROX_SIMPLE)
    line_id = 1

    # Sort by the top edge so files are numbered in reading order.
    for cnt in sorted(contours, key=lambda c: cv2.boundingRect(c)[1]):
        x, y, w, h = cv2.boundingRect(cnt)
        if h > 20 and w > 50:  # ignore small noise
            cropped = img[y:y + h, x:x + w]
            out_path = os.path.join(save_dir, f"page{page_num:03}_line{line_id:03}.png")
            cv2.imwrite(out_path, cropped)
            line_id += 1


In [None]:
# Segment every rendered page in "page_images" into line crops under "line_images".
line_dir = "line_images"
os.makedirs(line_dir, exist_ok=True)

# Sorted listing so pages are processed in file-name order.
page_images = sorted(os.listdir("page_images"))

for i, filename in enumerate(page_images):
    path = os.path.join("page_images", filename)
    # The page number passed on is the 1-based position in the sorted listing.
    segment_lines_from_page(path, line_dir, i + 1)


In [None]:
import os

# os.listdir returns names in arbitrary order, so list the folder once and
# sort it: the count and the preview then come from the same snapshot and the
# preview is repeatable between runs.
line_files = sorted(os.listdir("line_images"))
print("Total lines:", len(line_files))
print(line_files[:10])  # print a few


In [None]:
import matplotlib.pyplot as plt
import cv2
import os

folder = "line_images"
images = sorted(os.listdir(folder))[:10]  # show first 10

# One tall figure with up to ten stacked rows, one row per line crop.
fig = plt.figure(figsize=(10, 20))
for idx, img_name in enumerate(images):
    # OpenCV loads BGR; convert to RGB so matplotlib shows the right colours.
    img = cv2.cvtColor(cv2.imread(os.path.join(folder, img_name)), cv2.COLOR_BGR2RGB)
    ax = fig.add_subplot(10, 1, idx + 1)
    ax.imshow(img)
    ax.axis("off")
    ax.set_title(img_name)
fig.tight_layout()
plt.show()


In [None]:
from transformers import TrOCRProcessor, VisionEncoderDecoderModel
from PIL import Image
import torch

# Device is a plain string here: "cuda" when available, otherwise "cpu".
device = "cuda" if torch.cuda.is_available() else "cpu"

# Base handwritten TrOCR checkpoint, moved to the selected device.
processor = TrOCRProcessor.from_pretrained("microsoft/trocr-base-handwritten")
model = VisionEncoderDecoderModel.from_pretrained("microsoft/trocr-base-handwritten")
model = model.to(device)

# OCR every saved line crop in file-name order.
line_dir = "line_images"
image_files = sorted(os.listdir(line_dir))

for i, file in enumerate(image_files):
    image_path = os.path.join(line_dir, file)
    image = Image.open(image_path).convert("RGB")

    encoded = processor(images=image, return_tensors="pt")
    pixel_values = encoded.pixel_values.to(device)
    generated_ids = model.generate(pixel_values)
    text = processor.batch_decode(generated_ids, skip_special_tokens=True)[0]

    print(f"📄 Line {i+1:03}: {text}")


In [None]:
def run_both_directions_ocr(image):
    """OCR an image as-is and mirrored, and return the more letter-rich text.

    Uses the notebook-level ``processor``, ``model`` and ``device``.

    Args:
        image: PIL image; its left-right mirror is produced with
            ``image.transpose(Image.FLIP_LEFT_RIGHT)``.

    Returns:
        The text recognised from the mirrored image if it contains strictly
        more alphabetic characters than the text from the original image;
        otherwise the text from the original image.
    """
    def _recognise(candidate):
        # Encode, generate and decode a single image.
        encoded = processor(images=candidate, return_tensors="pt")
        token_ids = model.generate(encoded.pixel_values.to(device))
        return processor.batch_decode(token_ids, skip_special_tokens=True)[0]

    def _letter_count(text):
        return sum(ch.isalpha() for ch in text)

    mirrored = image.transpose(Image.FLIP_LEFT_RIGHT)

    as_is_text = _recognise(image)
    mirrored_text = _recognise(mirrored)

    # Heuristic: only the number of letters is compared; ties keep the original.
    if _letter_count(mirrored_text) > _letter_count(as_is_text):
        return mirrored_text
    return as_is_text


In [None]:
!apt-get install -y poppler-utils


In [None]:
from pdf2image import convert_from_path
import os

pdf_path = "/kaggle/input/exam-paper/iot1 004.pdf"  # <- adjust to your file's actual path
output_folder = "page_images"
os.makedirs(output_folder, exist_ok=True)

# Convert the first page to image. Only page 1 is requested, so the remaining
# pages are no longer rendered at 300 dpi just to be thrown away.
pages = convert_from_path(pdf_path, dpi=300, first_page=1, last_page=1)
pages[0].save(os.path.join(output_folder, "page001.png"))

print("✅ PDF page converted to image successfully!")



In [None]:
import os
import cv2
import numpy as np
from PIL import Image
import matplotlib.pyplot as plt
from transformers import TrOCRProcessor, VisionEncoderDecoderModel
import torch

In [None]:
# Load the TrOCR processor and model (base handwritten checkpoint) and place
# the model on the GPU when CUDA is available, otherwise on the CPU.
device = "cuda" if torch.cuda.is_available() else "cpu"
processor = TrOCRProcessor.from_pretrained("microsoft/trocr-base-handwritten")
model = VisionEncoderDecoderModel.from_pretrained("microsoft/trocr-base-handwritten")
model = model.to(device)

# STEP 1 — Line segmentation from a single page image
def segment_lines_from_page(image_path, output_dir="line_images_page1", page_num=1):
    """Segment one page image into line crops saved as ``line_NNN.png``.

    The page is Otsu-thresholded (inverted) and dilated with a kernel spanning
    30% of the page width so each text line becomes one blob. Blobs whose
    bounding box is 150 px tall or more are discarded; the rest are cropped
    from the original image in top-to-bottom order.

    Args:
        image_path: Path of the page image to read with ``cv2.imread``.
        output_dir: Directory for the line crops; created if missing.
        page_num: Page number, used only in the printed summary.

    Raises:
        FileNotFoundError: If ``image_path`` cannot be read as an image.

    NOTE(review): file names do not include ``page_num`` and existing files in
    ``output_dir`` are not removed, so crops from another page or an earlier
    run can remain in the folder.
    """
    os.makedirs(output_dir, exist_ok=True)

    img = cv2.imread(image_path)
    if img is None:
        # cv2.imread reports failure by returning None rather than raising.
        raise FileNotFoundError(f"Could not read page image: {image_path}")
    gray = cv2.cvtColor(img, cv2.COLOR_BGR2GRAY)

    # Binary inverse + dilation to detect horizontal lines. The kernel width
    # is clamped to at least 1 px so a tiny image cannot produce a zero-width
    # kernel.
    _, binary = cv2.threshold(gray, 0, 255, cv2.THRESH_BINARY_INV + cv2.THRESH_OTSU)
    kernel_width = max(1, int(img.shape[1] * 0.3))
    kernel = cv2.getStructuringElement(cv2.MORPH_RECT, (kernel_width, 5))
    dilated = cv2.dilate(binary, kernel, iterations=1)

    # Step 1: Find contours after dilation
    contours, _ = cv2.findContours(dilated, cv2.RETR_EXTERNAL, cv2.CHAIN_APPROX_SIMPLE)

    # Step 2: Filter out oversized contours (height limit is tunable)
    filtered_contours = []
    for c in contours:
        x, y, w, h = cv2.boundingRect(c)
        if h < 150:
            filtered_contours.append(c)

    # Step 3: Sort top-to-bottom
    lines = sorted(filtered_contours, key=lambda c: cv2.boundingRect(c)[1])

    # Step 4: Save cropped line images
    for i, c in enumerate(lines):
        x, y, w, h = cv2.boundingRect(c)
        line_img = img[y:y + h, x:x + w]
        cv2.imwrite(os.path.join(output_dir, f"line_{i+1:03}.png"), line_img)

    print(f"✅ Extracted {len(lines)} line images from page {page_num}")

# STEP 2 — Preprocess each line image for better OCR
def preprocess_line_for_ocr(pil_image):
    """Build the upright and mirrored OCR inputs for one line crop.

    The crop is converted to grayscale, contrast-enhanced with CLAHE, resized
    to a fixed 1024x256 canvas and expanded back to three channels. A
    horizontally flipped copy is produced alongside it.

    Args:
        pil_image: Line crop; converted with ``np.array`` and treated as RGB.

    Returns:
        Tuple ``(upright, mirrored)`` of ``PIL.Image`` objects.
    """
    gray = cv2.cvtColor(np.array(pil_image), cv2.COLOR_RGB2GRAY)

    # CLAHE contrast enhancement
    equalised = cv2.createCLAHE(clipLimit=2.0, tileGridSize=(8, 8)).apply(gray)

    # Fixed target size (width 1024, height 256); aspect ratio is not preserved.
    canvas = cv2.resize(equalised, (1024, 256))

    # Single-channel -> 3-channel, then mirror around the vertical axis.
    upright = cv2.cvtColor(canvas, cv2.COLOR_GRAY2RGB)
    mirrored = cv2.flip(upright, 1)

    return Image.fromarray(upright), Image.fromarray(mirrored)
   



In [None]:




# STEP 3 — Smart OCR: Try both directions, pick better
def run_both_directions_ocr(image_normal, image_flipped):
    """OCR both orientations of a line and return the more letter-rich text.

    Uses the notebook-level ``processor``, ``model`` and ``device``.

    Args:
        image_normal: Line image in its original orientation.
        image_flipped: The same line, mirrored.

    Returns:
        The text from ``image_flipped`` if it contains strictly more
        alphabetic characters than the text from ``image_normal``; otherwise
        the text from ``image_normal``.
    """
    def _recognise(candidate):
        # Encode, generate and decode a single image.
        encoded = processor(images=candidate, return_tensors="pt")
        token_ids = model.generate(encoded.pixel_values.to(device))
        return processor.batch_decode(token_ids, skip_special_tokens=True)[0]

    def _letter_count(text):
        return sum(ch.isalpha() for ch in text)

    normal_text = _recognise(image_normal)
    flipped_text = _recognise(image_flipped)

    # Heuristic: prefer the flipped reading only when it has more letters.
    prefer_flipped = _letter_count(flipped_text) > _letter_count(normal_text)
    return flipped_text if prefer_flipped else normal_text

# STEP 4 — Run all steps on one page
page_image_path = "page_images/page001.png"  # Replace with your actual path
line_dir = "line_images_page1"

# Cut the page into line crops saved under line_dir.
segment_lines_from_page(page_image_path, line_dir, page_num=1)

print("\n🧠 Extracting text from lines...\n")
image_files = sorted(os.listdir(line_dir))
seen_lines = set()  # stripped texts already printed, used to drop duplicates

for i, filename in enumerate(image_files):
    image_path = os.path.join(line_dir, filename)
    image = Image.open(image_path).convert("RGB")

    pre_normal, pre_flipped = preprocess_line_for_ocr(image)
    text = run_both_directions_ocr(pre_normal, pre_flipped)
    stripped = text.strip()

    # Skip garbage: fewer than 3 characters or fewer than 3 letters.
    if len(stripped) < 3 or sum(c.isalpha() for c in text) < 3:
        continue

    # Skip lines whose stripped text was already printed.
    if stripped in seen_lines:
        continue
    seen_lines.add(stripped)

    # Everything reaching this point has at least 3 characters, so the former
    # "[EMPTY/NO TEXT]" branch could never run and has been removed.
    print(f"📄 Line {i+1:03}: {text}")


In [None]:
import matplotlib.pyplot as plt

def preview_segmented_lines(folder):
    """Display every image in ``folder``, one matplotlib figure per file.

    Files are shown in sorted file-name order and titled ``Line 1``,
    ``Line 2``, ... by position. The total number of files is printed first.

    Args:
        folder: Directory containing the segmented line images.
    """
    names = sorted(os.listdir(folder))
    print(f"Total Segments: {len(names)}")

    for number, name in enumerate(names, start=1):
        bgr = cv2.imread(os.path.join(folder, name))
        # OpenCV loads BGR; matplotlib expects RGB.
        plt.imshow(cv2.cvtColor(bgr, cv2.COLOR_BGR2RGB))
        plt.title(f"Line {number}")
        plt.axis('off')
        plt.show()


In [None]:
# Show every line crop saved under "line_images_page1", one figure per crop.
preview_segmented_lines("line_images_page1")
