# Imports

In [None]:
import os
import pandas as pd
from pathlib import Path
import torch
import easyocr
import traceback
from tqdm import tqdm
import gc
from pix2text import Pix2Text

# Path Configuration

In [None]:
# Placeholder paths: replace each string with a real location before running.
# Directory that holds the caption images to run OCR on.
CAPTION_IMAGE_DIR = "YOUR CAPTION IMAGE DIRECTORY"
# CSV that the EasyOCR pass appends its per-image results to.
EASYOCR_OUTPUT_CSV = "EASYOCR OUTPUT CSV PATH"
# CSV that the Pix2Text pass appends its per-image results to.
PIX2TEXT_OUTPUT_CSV = "PIX2TEXT OUTPUT CSV PATH"
# CSV that receives the merged EasyOCR + Pix2Text results.
FINAL_OUTPUT_CSV = "FINAL OUTPUT CSV PATH"

# EasyOCR

### Initialize OCR Model

In [None]:
# Build the EasyOCR reader once and reuse it for every image.
# ['en'] restricts recognition to English; gpu=True requests GPU inference.
reader = easyocr.Reader(['en'], gpu=True)

### Settings

In [None]:
# Number of results buffered before each CSV append, and the running count
# of images processed by the main loop.
BATCH_SAVE = 100
counter = 0

# Lower-case file suffixes that are treated as images.
image_extensions = {
    '.png',
    '.jpg',
    '.jpeg',
    '.tif',
    '.tiff',
}

# Input image directory and the EasyOCR results file.
img_dir, csv_path = Path(CAPTION_IMAGE_DIR), Path(EASYOCR_OUTPUT_CSV)

### Initialize 

In [None]:
# Create the results CSV with only a header row when it is missing or empty,
# so that later batches can be appended with header=False.
if not csv_path.exists() or csv_path.stat().st_size == 0:
    pd.DataFrame(columns=['image_name', 'easyocr_ocr']).to_csv(csv_path, index=False)

# Resume support: image names already present in the CSV are skipped by the
# main loop.
df_existing = pd.read_csv(csv_path)
done_set = set(df_existing["image_name"])

# Optional warm-up: run one recognition before the main loop. It is skipped
# (instead of raising) when the sample path is still the placeholder or does
# not point at a file, so the rest of this cell still runs.
sample_img = "SAMPLE IMAGE PATH"
if Path(sample_img).is_file():
    warmup_result = reader.readtext(str(sample_img), detail=0)
    print("[INFO] Warm-up complete.")
else:
    warmup_result = None
    print(f"[WARN] Warm-up skipped: sample image not found at {sample_img!r}.")

# Collect the image files in a stable (sorted) order; directories whose name
# happens to end in an image suffix are ignored.
all_imgs = sorted(
    p for p in img_dir.iterdir()
    if p.is_file() and p.suffix.lower() in image_extensions
)
# Buffer of result rows not yet written to the CSV.
records = []

### Main Processing

In [None]:
# Run EasyOCR over every image that is not yet in the results CSV. Results are
# appended in batches of BATCH_SAVE so an interrupted run can be resumed.
for img_path in tqdm(all_imgs, desc="Easy OCR Processing"):
    img_name = img_path.name
    if img_name in done_set:
        continue

    try:
        # detail=0 is passed so the returned items can be joined directly
        # into a single space-separated string.
        easy_texts = reader.readtext(str(img_path), detail=0)
        easyocr_ocr = " ".join(easy_texts)
    except Exception as e:
        # Best effort: keep going, but report the failure instead of hiding it.
        # The image is still recorded with empty text, so it will be treated
        # as done on the next run.
        tqdm.write(f"[WARN] EasyOCR failed on {img_name}: {e!r}")
        easyocr_ocr = ""

    records.append({
        "image_name": img_name,
        "easyocr_ocr": easyocr_ocr,
    })
    # Mark as handled so re-running this cell does not append duplicates.
    done_set.add(img_name)
    counter += 1

    # Batch save: flush on the buffer size itself so the trigger stays correct
    # even if `counter` and `records` get out of step between cell re-runs.
    if len(records) >= BATCH_SAVE:
        # Explicit column order keeps appended rows aligned with the header.
        df_batch = pd.DataFrame(records, columns=["image_name", "easyocr_ocr"])
        df_batch.to_csv(csv_path, mode='a', header=False, index=False)
        print(f"Added {len(records)} records to {csv_path}")

        # Release memory held by the flushed batch and cached GPU blocks.
        records.clear()
        torch.cuda.empty_cache()
        gc.collect()

# Flush whatever is left after the last full batch.
if records:
    pd.DataFrame(records, columns=["image_name", "easyocr_ocr"]).to_csv(
        csv_path, mode='a', header=False, index=False
    )
    print(f"Added final {len(records)} records to {csv_path}")
    records.clear()

print("Processing completed!")

# Pix2Text

### Initialize OCR Model

In [None]:
# Prefer CUDA when PyTorch reports it as available; otherwise use the CPU.
if torch.cuda.is_available():
    device = 'cuda'
else:
    device = 'cpu'
print(f"[INFO] Pix2Text is using device: {device}")

# det_model='mfd' selects the Mathematical Formula Detection model.
ocr_agent = Pix2Text(det_model='mfd', device=device)

### Settings

In [None]:
# Number of results buffered before each CSV append, and the running count
# of images processed by the main loop.
BATCH_SAVE = 100
counter = 0

# Lower-case file suffixes that are treated as images.
image_extensions = {
    '.png',
    '.jpg',
    '.jpeg',
    '.tif',
    '.tiff',
}

# Input image directory and the Pix2Text results file.
img_dir, csv_path = Path(CAPTION_IMAGE_DIR), Path(PIX2TEXT_OUTPUT_CSV)

### Initialize

In [None]:
# Create the results CSV with only a header row when it is missing or empty,
# so that later batches can be appended with header=False.
if not csv_path.exists() or csv_path.stat().st_size == 0:
    pd.DataFrame(columns=['image_name', 'pix2text_ocr']).to_csv(csv_path, index=False)

# Resume support: image names already present in the CSV are skipped by the
# main loop.
df_existing = pd.read_csv(csv_path)
done_set = set(df_existing["image_name"])

# Optional warm-up: run one recognition before the main loop. It is skipped
# (instead of raising) when the sample path is still the placeholder or does
# not point at a file, so the rest of this cell still runs.
sample_img = "SAMPLE IMAGE PATH"
if Path(sample_img).is_file():
    warmup_result = ocr_agent.recognize(str(sample_img))
    print("[INFO] Warm-up complete.")
else:
    warmup_result = None
    print(f"[WARN] Warm-up skipped: sample image not found at {sample_img!r}.")

# Collect the image files in a stable (sorted) order; directories whose name
# happens to end in an image suffix are ignored.
all_imgs = sorted(
    p for p in img_dir.iterdir()
    if p.is_file() and p.suffix.lower() in image_extensions
)
# Buffer of result rows not yet written to the CSV.
records = []


### Main Processing

In [None]:
# Run Pix2Text over every image that is not yet in the results CSV. Results are
# appended in batches of BATCH_SAVE so an interrupted run can be resumed.
for img_path in tqdm(all_imgs, desc="Pix2Text Processing"):
    img_name = img_path.name
    if img_name in done_set:
        continue

    try:
        # NOTE(review): the value returned by recognize() is stored as-is; if
        # it is not a plain string, the CSV will contain its string
        # representation. Confirm against the installed Pix2Text version.
        pix2text_ocr = ocr_agent.recognize(str(img_path))
    except Exception as e:
        # Best effort: keep going, but report the failure instead of hiding it.
        # The image is still recorded with empty text, so it will be treated
        # as done on the next run.
        tqdm.write(f"[WARN] Pix2Text failed on {img_name}: {e!r}")
        pix2text_ocr = ""

    records.append({
        "image_name": img_name,
        "pix2text_ocr": pix2text_ocr,
    })
    # Mark as handled so re-running this cell does not append duplicates.
    done_set.add(img_name)
    counter += 1

    # Batch save: flush on the buffer size itself so the trigger stays correct
    # even if `counter` and `records` get out of step between cell re-runs.
    if len(records) >= BATCH_SAVE:
        # Explicit column order keeps appended rows aligned with the header.
        df_batch = pd.DataFrame(records, columns=["image_name", "pix2text_ocr"])
        df_batch.to_csv(csv_path, mode='a', header=False, index=False)
        print(f"Added {len(records)} records to {csv_path}")

        # Release memory held by the flushed batch and cached GPU blocks.
        records.clear()
        torch.cuda.empty_cache()
        gc.collect()

# Flush whatever is left after the last full batch.
if records:
    pd.DataFrame(records, columns=["image_name", "pix2text_ocr"]).to_csv(
        csv_path, mode='a', header=False, index=False
    )
    print(f"Added final {len(records)} records to {csv_path}")
    records.clear()

print("Processing completed!")

# Merge the two datasets of OCR results

In [None]:
# Load the per-engine result files written by the two OCR passes.
df1 = pd.read_csv(EASYOCR_OUTPUT_CSV)
df2 = pd.read_csv(PIX2TEXT_OUTPUT_CSV)

# Keep a single row per image (the most recently appended one). Without this,
# a name that appears more than once in either file would multiply rows in
# the join below.
df1_filtered = df1.drop_duplicates(subset="image_name", keep="last")
df2_filtered = df2.drop_duplicates(subset="image_name", keep="last")

# The inner join keeps only images present in both files, so no separate
# intersection step is needed.
merged_df = pd.merge(df1_filtered, df2_filtered, on="image_name", how="inner")

# Save
merged_df.to_csv(FINAL_OUTPUT_CSV, index=False)
print(f"✅ Merged {len(merged_df)} records, keeping only images with both OCR results.")