## Installing Libraries

In [None]:
!pip install pytesseract transformers torchvision
!pip install paddleocr
!pip install paddlepaddle --extra-index-url https://pypi.org/simple
!pip install easyocr
! pip install qwen-vl-utils

Looking in indexes: https://pypi.org/simple, https://pypi.org/simple


In [None]:
! pip install OpenAI
! pip install datasets
! pip install transformers



## Importing Libraries

In [None]:
import pytesseract
from PIL import Image
import os
import easyocr
import numpy as np
import torch
from transformers import Qwen2VLForConditionalGeneration, AutoProcessor
from qwen_vl_utils import process_vision_info
from openai import OpenAI
from paddleocr import PaddleOCR
import json
from tqdm.auto import tqdm
from google.colab import files
from transformers import AutoTokenizer, AutoModelForCausalLM
from datasets import load_dataset

# OCR

## Tesseract OCR

In [None]:
def ocr_tesseract(image: Image.Image) -> dict:
    """Run Tesseract OCR with the Arabic language model on an image.

    Args:
        image: PIL image to recognise.

    Returns:
        dict with two keys:
            'text': the non-blank recognised words joined by single spaces.
            'confidence': mean Tesseract confidence over those words, or 0.0
                when no word carries a usable confidence.
    """
    custom_config = r'-l ara --oem 3 --psm 6'
    data = pytesseract.image_to_data(image, config=custom_config, output_type=pytesseract.Output.DICT)

    words = []
    confidences = []
    # Walk text and confidence together so the average only covers the words
    # that actually end up in the returned text.
    for word, conf in zip(data['text'], data['conf']):
        if not word.strip():
            continue
        words.append(word)
        # Depending on the pytesseract/Tesseract version, 'conf' entries may be
        # ints, floats or numeric strings (e.g. '96.06'); float() accepts all.
        try:
            conf_value = float(conf)
        except (TypeError, ValueError):
            continue
        # Tesseract uses -1 as the "no confidence" placeholder.
        if conf_value >= 0:
            confidences.append(conf_value)

    text = ' '.join(words)
    avg_conf = sum(confidences) / len(confidences) if confidences else 0.0

    return {
        'text': text,
        'confidence': avg_conf
    }


## EasyOCR

In [None]:
def ocr_easyocr(image: Image.Image) -> dict:
    """Recognise text in an image with the module-level EasyOCR reader.

    Args:
        image: PIL image; it is converted to RGB before recognition.

    Returns:
        dict with 'text' (all detected fragments joined by single spaces) and
        'confidence' (mean of the per-fragment confidences). When nothing is
        detected, 'text' is '' and 'confidence' is 0.0.
    """
    rgb_pixels = np.array(image.convert("RGB"))
    detections = easyocr_reader.readtext(rgb_pixels, detail=1)

    # Guard clause: no detections at all.
    if not detections:
        return {'text': '', 'confidence': 0.0}

    # With detail=1 every detection unpacks as (box, text, confidence).
    fragments = [fragment for _box, fragment, _score in detections]
    scores = [score for _box, _fragment, score in detections]

    return {
        'text': ' '.join(fragments),
        'confidence': sum(scores) / len(scores),
    }

## PaddleOCR

In [None]:
def ocr_paddle(image: Image.Image) -> dict:
    """Recognise text in an image with the module-level PaddleOCR engine.

    Args:
        image: PIL image; it is converted to RGB before recognition.

    Returns:
        dict with 'text' (recognised lines joined by single spaces) and
        'confidence' (mean of the per-line confidences). When the engine
        returns nothing, 'text' is '' and 'confidence' is 0.0.
    """
    rgb_pixels = np.array(image.convert("RGB"))
    pages = ocr_engine.ocr(rgb_pixels, cls=True)

    # The engine may return None, an empty list, or a list whose first entry is empty.
    if not pages or not pages[0]:
        return {'text': '', 'confidence': 0.0}

    # Each entry's second element holds the (text, confidence) pair.
    recognised = [entry[1] for entry in pages[0]]
    line_texts = [pair[0] for pair in recognised]
    line_scores = [pair[1] for pair in recognised]

    return {
        'text': ' '.join(line_texts),
        'confidence': sum(line_scores) / len(line_scores),
    }


## AIN-7B

In [None]:
def ocr_qwen2vl(
    image: Image.Image,
    model,
    processor,
    prompt: str = "استخرج النص الموجود في هذه الصورة فقط.",
    max_new_tokens: int = 128
) -> dict:
    """
    Performs OCR by prompting a Qwen2-VL style vision-language model.

    Args:
        image (PIL.Image.Image): Input image.
        model: The Qwen2VLForConditionalGeneration model.
        processor: The corresponding AutoProcessor.
        prompt (str): Prompt in Arabic asking the model to perform OCR.
        max_new_tokens (int): Upper bound on generated tokens. The default
            (128) matches the previous hard-coded value; raise it for images
            with long passages, otherwise the transcription is cut off.

    Returns:
        dict: 'text' holds the stripped model output; 'confidence' is always
        None because no confidence score is computed here.
    """
    messages = [
        {
            "role": "user",
            "content": [
                {"type": "image", "image": image},
                {"type": "text", "text": prompt}
            ],
        }
    ]

    text = processor.apply_chat_template(messages, tokenize=False, add_generation_prompt=True)
    image_inputs, video_inputs = process_vision_info(messages)
    inputs = processor(
        text=[text],
        images=image_inputs,
        videos=video_inputs,
        padding=True,
        return_tensors="pt",
    ).to(model.device)

    with torch.no_grad():
        generated_ids = model.generate(**inputs, max_new_tokens=max_new_tokens)
        # Drop the prompt tokens so only the newly generated continuation is decoded.
        generated_ids_trimmed = [
            out_ids[len(in_ids):] for in_ids, out_ids in zip(inputs.input_ids, generated_ids)
        ]
        output_text = processor.batch_decode(
            generated_ids_trimmed, skip_special_tokens=True, clean_up_tokenization_spaces=False
        )[0]

    return {
        'text': output_text.strip(),
        'confidence': None
    }


# Post-Processing

## GPT-4o

In [None]:
import re

# Code points stripped: U+0610-U+061A, U+064B-U+065F and U+06D6-U+06ED.
_DIACRITICS_PATTERN = re.compile(r'[\u0610-\u061A\u064B-\u065F\u06D6-\u06ED]')


def remove_diacritics(text):
    """Return `text` with every character in the diacritic ranges above removed."""
    return _DIACRITICS_PATTERN.sub('', text)

In [None]:
def gpt4o_correct_arabic_text(input_text):
    """Ask GPT-4o to fix OCR/spelling mistakes in an Arabic text.

    Uses the module-level `client` and strips diacritics from the input
    before sending it.

    Args:
        input_text: Raw OCR text.

    Returns:
        The corrected text with surrounding whitespace removed. Returns ""
        when the input is empty or blank (no API call is made), when the model
        returns no content, or when the request fails (the error is printed).
    """
    # Nothing to correct: skip the API round-trip instead of asking the model
    # to "fix" an empty string.
    if not input_text or not input_text.strip():
        return ""

    clean_text = remove_diacritics(input_text)
    try:
        response = client.chat.completions.create(
            model="gpt-4o",
            messages=[
                {
                    "role": "system",
                    "content": "Fix any OCR or spelling mistakes. Remove diacritics if present. Only return the corrected sentence without explanation."
                },
                {
                    "role": "user",
                    "content": f"النص الأصلي: {clean_text}"
                }
            ],
            temperature=0.0  # Deterministic output
        )
        # The message content is optional in the API response; treat None as empty.
        content = response.choices[0].message.content
        return (content or "").strip()
    except Exception as e:
        # Best effort: report the failure and let the caller record an empty correction.
        print(f"GPT-4o error: {e}")
        return ""


## ALLaM-7B

In [None]:
def allam7b_correct_arabic_text(input_text, model, tokenizer):
    """Ask a causal language model (ALLaM-7B) to fix OCR/spelling mistakes.

    Args:
        input_text: OCR text to correct.
        model: Causal LM exposing `.device` and `.generate(...)`.
        tokenizer: Matching tokenizer; called on the prompt and used to decode.

    Returns:
        Only the text generated after the prompt, stripped of surrounding
        whitespace. Returns "" for empty or blank input without running the model.
    """
    # Nothing to correct: avoid a generation pass on an empty text.
    if not input_text or not input_text.strip():
        return ""

    prompt = (
        "أصلح الأخطاء الإملائية أو الناتجة عن التعرف البصري على الحروف (OCR) في النص التالي."
        " أرجو إعادة النص مصححاً فقط بدون أي شرح.\n"
        f"النص الأصلي: {input_text}"
    )
    inputs = tokenizer(prompt, return_tensors="pt").to(model.device)
    prompt_length = inputs["input_ids"].shape[1]
    with torch.no_grad():
        output = model.generate(**inputs, max_new_tokens=128)
    # For a decoder-only model, generate() returns the prompt tokens followed by
    # the new ones. Decoding the full sequence would put the instruction and the
    # original text into the "corrected" output, so keep only the continuation.
    new_tokens = output[0][prompt_length:]
    corrected_text = tokenizer.decode(new_tokens, skip_special_tokens=True).strip()
    return corrected_text

## Processing in Action

### User Profiles

In [None]:
# Maps each user name (lower case) to the list of OCR engine keys that user runs.
# The engine keys are the same strings used as keys of `ocr_function_map`.
user_profile = {
    'yassin': ['tesseract'],
    'akshat': ['paddle'],
    'mj': ['easyocr'],
    'jordan': ['ain-7b']
}

## Utility Functions

In [None]:
def save_json(path, data):
    """Serialise `data` as pretty-printed UTF-8 JSON at `path`.

    Missing parent directories are created. The file is written to a sibling
    temporary file and then moved into place, so an interrupted run never
    leaves a truncated file at `path`. This matters because the run loops
    treat an existing output file as "already done".

    Args:
        path: Destination file path; may be a bare filename.
        data: JSON-serialisable object.

    Raises:
        Whatever `open`, `json.dump` or `os.replace` raise; the temporary file
        is removed before the exception propagates.
    """
    parent_dir = os.path.dirname(path)
    # A bare filename has no directory part, and os.makedirs('') would raise.
    if parent_dir:
        os.makedirs(parent_dir, exist_ok=True)

    tmp_path = f"{path}.tmp"
    try:
        with open(tmp_path, 'w', encoding='utf-8') as f:
            json.dump(data, f, ensure_ascii=False, indent=2)
        os.replace(tmp_path, path)
    except BaseException:
        # Includes KeyboardInterrupt: do not leave a partial temp file behind.
        if os.path.exists(tmp_path):
            os.remove(tmp_path)
        raise

def run_ocr_and_save(dataset_name, dataset, ocr_model_name, ocr_func, output_dir, text_field):
    """Run one OCR engine over every example of a dataset and save the results.

    Args:
        dataset_name: Short dataset name, used in the output filename.
        dataset: Iterable of examples exposing 'image' and `text_field`.
        ocr_model_name: Engine key, stored in each record and in the filename.
        ocr_func: Callable taking an RGB PIL image and returning a dict with
            'text' and 'confidence'.
        output_dir: Directory receiving "ocr-<dataset>-<engine>.json".
        text_field: Name of the ground-truth text column.
    """
    def _ocr_record(example):
        # Build the result record for a single dataset example.
        rgb_image = example['image'].convert("RGB")
        reference_text = example[text_field]
        prediction = ocr_func(rgb_image)
        return {
            'ocr_model': ocr_model_name,
            'ocr_text': prediction['text'],
            'confidence': prediction['confidence'],
            'ground_truth': reference_text
        }

    records = [_ocr_record(example) for example in tqdm(dataset)]
    target_file = os.path.join(output_dir, f"ocr-{dataset_name}-{ocr_model_name}.json")
    save_json(target_file, records)

def run_post_correction(dataset_name, ocr_model_name, llm_name, correct_func, output_dir, ocr_dir):
    """Post-correct previously saved OCR results with an LLM and save the output.

    Reads "ocr-<dataset>-<engine>.json" from `ocr_dir`, strips diacritics from
    both the OCR text and the ground truth, passes the cleaned OCR text to
    `correct_func`, and writes "post_correction-<dataset>-<engine>-<llm>.json"
    into `output_dir`.

    Args:
        dataset_name: Short dataset name.
        ocr_model_name: Engine key of the OCR results to correct.
        llm_name: Name of the correcting model, stored in each record.
        correct_func: Callable mapping a text to its corrected version.
        output_dir: Directory for the post-correction file.
        ocr_dir: Directory holding the OCR result file.
    """
    source_file = os.path.join(ocr_dir, f"ocr-{dataset_name}-{ocr_model_name}.json")
    target_file = os.path.join(output_dir, f"post_correction-{dataset_name}-{ocr_model_name}-{llm_name}.json")

    with open(source_file, 'r', encoding='utf-8') as f:
        ocr_records = json.load(f)

    def _correct_record(record):
        # Clean both texts first, then ask the LLM for a correction.
        raw_ocr = record['ocr_text']
        cleaned_ocr = remove_diacritics(raw_ocr)
        raw_reference = record['ground_truth']
        cleaned_reference = remove_diacritics(raw_reference)
        return {
            'ocr_model': ocr_model_name,
            'llm': llm_name,
            'ocr_text_raw': raw_ocr,
            'ocr_text': cleaned_ocr,
            'corrected_text': correct_func(cleaned_ocr),
            'ground_truth': raw_reference,
            'ground_truth_cleaned': cleaned_reference
        }

    progress = tqdm(ocr_records, desc=f"{llm_name} on {dataset_name}-{ocr_model_name}")
    corrected_records = [_correct_record(record) for record in progress]

    save_json(target_file, corrected_records)



def free_memory():
    """Release Python garbage and then PyTorch's cached CUDA memory.

    Garbage collection runs first so that tensors kept alive only by
    unreachable reference cycles are destroyed before the CUDA cache is
    emptied; in the opposite order their memory would stay cached.
    """
    import gc
    gc.collect()
    torch.cuda.empty_cache()

## Environment setup

In [None]:
# Mount Google Drive at /content/drive; force_remount is passed as True.
from google.colab import drive
drive.mount('/content/drive', force_remount = True)

Mounted at /content/drive


In [None]:
# Ask who is running the notebook; the answer selects the OCR engines to run.
# Accepted names are the keys of `user_profile` (matched case-insensitively),
# so the prompt loop and the profile table cannot drift apart.
user = ''
while user.lower() not in user_profile:
  # Strip surrounding whitespace so an answer like "yassin " is accepted.
  user = input('Enter your name: Yassin - MJ - Akshat - Jordan\n').strip()

Enter your name: Yassin - MJ - Akshat - Jordan
yassin


In [None]:
# Ask for an existing directory that will hold all results. os.path.isdir is
# used (not os.path.exists) so that the path of a regular file is rejected here
# rather than failing later in os.makedirs.
output_dir = ''
while not os.path.isdir(output_dir):
    output_dir = input("Enter an existing output dir (e.g., /content/drive/MyDrive/righting-writing_results): ").strip()
    if not os.path.isdir(output_dir):
        tqdm.write(f"Path '{output_dir}' is not an existing directory. Please try again.")

# Sub-directories for the two pipeline phases.
ocr_dir = os.path.join(output_dir, 'ocr')
post_correction_dir = os.path.join(output_dir, 'post_correction')
os.makedirs(ocr_dir, exist_ok=True)
os.makedirs(post_correction_dir, exist_ok=True)


Enter an existing output dir (e.g., /content/drive/MyDrive/righting-writing_results): /content/drive/My Drive/Arabic_data/Righting-Writing


In [None]:
# Read the OpenAI API key from a file uploaded through the Colab file picker,
# so the key is never written into the notebook itself.
print("Upload your GPT API key file (e.g., api_key.txt):")
uploaded = files.upload()
if not uploaded:
    raise RuntimeError("No API key file was uploaded; re-run this cell and choose a file.")

file_name = next(iter(uploaded))
# Use the uploaded bytes directly instead of re-opening a file by name: Colab
# may save the upload under a renamed path when a same-named file already exists.
# 'utf-8-sig' also drops a leading byte-order mark, which would corrupt the key.
gpt_api_key = uploaded[file_name].decode('utf-8-sig').strip()
if not gpt_api_key:
    raise ValueError(f"Uploaded file '{file_name}' is empty; expected it to contain the API key.")

client = OpenAI(api_key=gpt_api_key)

Upload your GPT API key file (e.g., api_key.txt):


Saving gpt-api-key.txt to gpt-api-key (2).txt


In [None]:
# OCR engines assigned to the current user.
ocr_models_to_run = user_profile[user.lower()]

# Dataset name suffixes; each is appended to "ahmedheakl/arocrbench_" when loading.
datasets_list = [
    'synthesizear',
    'patsocr',
    'historyar',
    'historicalbooks',
    'adab',
    'muharaf',
    'onlinekhatt',
    'khatt',
    'isippt',
    'arabicocr',
    'hindawi',
    'evarest',
]

In [None]:
# Initialise only the OCR engines selected for the current user. The objects
# created here are module-level globals read by the ocr_* functions.
if 'easyocr' in ocr_models_to_run:
    # Arabic EasyOCR reader, created with gpu=False.
    easyocr_reader = easyocr.Reader(['ar'], gpu=False)
if 'paddle' in ocr_models_to_run:
    # Arabic PaddleOCR engine with the angle classifier enabled and logging off.
    ocr_engine = PaddleOCR(use_angle_cls=True, lang='ar', show_log=False)
if 'ain-7b' in ocr_models_to_run:
    # Load the "MBZUAI/AIN" checkpoint and its processor in a single tuple assignment.
    qwen_model, qwen_processor = Qwen2VLForConditionalGeneration.from_pretrained("MBZUAI/AIN", torch_dtype="auto", device_map="auto"), AutoProcessor.from_pretrained("MBZUAI/AIN")
if 'tesseract' in ocr_models_to_run:
    # Download the Arabic traineddata file, move it into the tessdata directory
    # and point TESSDATA_PREFIX at that directory.
    # NOTE(review): the path assumes a Tesseract 4.00 directory layout - confirm
    # it matches the Tesseract version installed in the runtime.
    !wget -O ara.traineddata https://github.com/tesseract-ocr/tessdata/raw/main/ara.traineddata
    !mkdir -p /usr/share/tesseract-ocr/4.00/tessdata/
    !mv ara.traineddata /usr/share/tesseract-ocr/4.00/tessdata/
    os.environ["TESSDATA_PREFIX"] = "/usr/share/tesseract-ocr/4.00/tessdata"

--2025-04-22 13:07:55--  https://github.com/tesseract-ocr/tessdata/raw/main/ara.traineddata
Resolving github.com (github.com)... 20.205.243.166
Connecting to github.com (github.com)|20.205.243.166|:443... connected.
HTTP request sent, awaiting response... 302 Found
Location: https://raw.githubusercontent.com/tesseract-ocr/tessdata/main/ara.traineddata [following]
--2025-04-22 13:07:56--  https://raw.githubusercontent.com/tesseract-ocr/tessdata/main/ara.traineddata
Resolving raw.githubusercontent.com (raw.githubusercontent.com)... 185.199.108.133, 185.199.109.133, 185.199.110.133, ...
Connecting to raw.githubusercontent.com (raw.githubusercontent.com)|185.199.108.133|:443... connected.
HTTP request sent, awaiting response... 200 OK
Length: 2494806 (2.4M) [application/octet-stream]
Saving to: ‘ara.traineddata’


2025-04-22 13:07:56 (251 MB/s) - ‘ara.traineddata’ saved [2494806/2494806]



In [None]:
# Maps each OCR engine key to a callable taking a PIL image and returning a
# dict with 'text' and 'confidence'. Lambdas are used so the engine objects
# (which exist only for the engines selected above) are looked up at call time.
ocr_function_map = {
    'tesseract': lambda img: ocr_tesseract(img),
    'easyocr': lambda img: ocr_easyocr(img),
    'paddle': lambda img: ocr_paddle(img),
    # ocr_qwen2vl requires the model and processor as positional arguments;
    # omitting them raises a TypeError on the first image.
    'ain-7b': lambda img: ocr_qwen2vl(
        img,
        qwen_model,
        qwen_processor,
        prompt="استخرج النص الموجود في هذه الصورة فقط."
    )
}

## Run Scripts

### OCR Phase

In [None]:
# OCR phase: for every dataset, run each selected engine whose result file is
# not on disk yet. Existing result files act as a resume marker.
for dataset_name in tqdm(datasets_list, desc="OCR Datasets"):
    # Skip the (potentially slow) dataset download when nothing is left to do.
    pending_check = (
        os.path.exists(os.path.join(ocr_dir, f"ocr-{dataset_name}-{engine}.json"))
        for engine in ocr_models_to_run
    )
    if all(pending_check):
        tqdm.write(f"Skipping {dataset_name}: all OCR outputs already exist.")
        continue

    # Prefer the 'test' split and fall back to 'train' when loading it fails.
    repo_id = f"ahmedheakl/arocrbench_{dataset_name}"
    try:
        ds = load_dataset(repo_id, split='test')
    except Exception:
        try:
            ds = load_dataset(repo_id, split='train')
            tqdm.write(f"Using 'train' split for {dataset_name} (no 'test' split found).")
        except Exception as e:
            tqdm.write(f"Failed to load {dataset_name}: {e}")
            continue

    # The first matching column name is used as the ground-truth text field.
    text_field = next(
        (name for name in ('text', 'gt_text', 'ground_truth', 'answer') if name in ds.column_names),
        None,
    )
    if text_field is None:
        tqdm.write(f"Skipping {dataset_name}: no usable ground-truth text field found.")
        continue

    for ocr_model in tqdm(ocr_models_to_run, desc=f"OCR Models [{dataset_name}]", leave=False):
        output_path = os.path.join(ocr_dir, f"ocr-{dataset_name}-{ocr_model}.json")
        if not os.path.exists(output_path):
            tqdm.write(f"Running OCR: {ocr_model} on {dataset_name}")
            run_ocr_and_save(dataset_name, ds, ocr_model, ocr_function_map[ocr_model], ocr_dir, text_field)
        else:
            tqdm.write(f"Skipping {ocr_model} on {dataset_name}: OCR output already exists.")


OCR Datasets:   0%|          | 0/12 [00:00<?, ?it/s]

Skipping synthesizear: all OCR outputs already exist.
Skipping patsocr: all OCR outputs already exist.
Skipping historyar: all OCR outputs already exist.
Skipping historicalbooks: all OCR outputs already exist.
Skipping adab: all OCR outputs already exist.
Skipping muharaf: all OCR outputs already exist.
Skipping onlinekhatt: all OCR outputs already exist.
Skipping khatt: all OCR outputs already exist.
Skipping isippt: all OCR outputs already exist.
Skipping arabicocr: all OCR outputs already exist.
Skipping hindawi: all OCR outputs already exist.
Skipping evarest: all OCR outputs already exist.


### Post-Correction Phase

In [None]:
# Post-correction phase: run each LLM over every saved OCR result that does not
# have a post-correction file yet. Existing files act as a resume marker.
models = ['gpt-4o', 'allam-7b']

# Track what could not be processed so the final message reflects what happened.
failed_llms = []          # LLMs that could not be loaded
missing_ocr_outputs = []  # OCR result files that were expected but not found

for llm_name in models:
    if llm_name == 'gpt-4o':
        correct_func = gpt4o_correct_arabic_text
    else:
        # ALLaM-7B is loaded locally; ask the user to confirm the runtime first.
        confirm = input("Have you switched to an A100 runtime? (y/n): ").strip().lower()
        if confirm != 'y':
            tqdm.write("Please switch to A100: Runtime > Change runtime type > GPU > A100, then rerun.")
            raise RuntimeError("Switch to A100 and re-run this cell to continue with ALLam-7B.")

        try:
            tokenizer = AutoTokenizer.from_pretrained("ALLaM-AI/ALLaM-7B-Instruct-preview", trust_remote_code=True)
            model = AutoModelForCausalLM.from_pretrained("ALLaM-AI/ALLaM-7B-Instruct-preview", trust_remote_code=True, device_map="auto")
            correct_func = lambda txt: allam7b_correct_arabic_text(txt, model, tokenizer)
        except Exception as e:
            tqdm.write(f"Failed to load ALLam-7B: {e}")
            failed_llms.append(llm_name)
            continue

    # Shared post-correction loop for both models
    for dataset_name in tqdm(datasets_list, desc=f"Post-Correction [{llm_name}]"):
        for ocr_model in tqdm(ocr_models_to_run, desc=f"{llm_name} correcting {dataset_name}", leave=False):
            input_path = os.path.join(ocr_dir, f"ocr-{dataset_name}-{ocr_model}.json")
            output_path = os.path.join(post_correction_dir, f"post_correction-{dataset_name}-{ocr_model}-{llm_name}.json")

            if not os.path.exists(input_path):
                tqdm.write(f"Skipping {dataset_name} with {ocr_model}: OCR output not found.")
                if input_path not in missing_ocr_outputs:
                    missing_ocr_outputs.append(input_path)
                continue

            if os.path.exists(output_path):
                tqdm.write(f"Skipping: {output_path} already exists.")
                continue

            tqdm.write(f"Post-correcting {ocr_model} output on {dataset_name} using {llm_name}")
            run_post_correction(dataset_name, ocr_model, llm_name, correct_func, post_correction_dir, ocr_dir)

    # Cleanup only after ALLam-7B
    if llm_name == 'allam-7b':
        del model
        del tokenizer
        free_memory()

# Only claim success when every model ran and every OCR input was available.
if failed_llms or missing_ocr_outputs:
    if failed_llms:
        tqdm.write(f"Post-correction incomplete: could not load {', '.join(failed_llms)}.")
    if missing_ocr_outputs:
        tqdm.write(f"Post-correction incomplete: {len(missing_ocr_outputs)} OCR output file(s) were missing.")
else:
    tqdm.write("All OCR and post-correction tasks completed successfully!")


Post-Correction [gpt-4o]:   0%|          | 0/12 [00:00<?, ?it/s]

gpt-4o correcting synthesizear:   0%|          | 0/1 [00:00<?, ?it/s]

Post-correcting tesseract output on synthesizear using gpt-4o


gpt-4o on synthesizear-tesseract:   0%|          | 0/500 [00:00<?, ?it/s]

gpt-4o correcting patsocr:   0%|          | 0/1 [00:00<?, ?it/s]

Post-correcting tesseract output on patsocr using gpt-4o


gpt-4o on patsocr-tesseract:   0%|          | 0/500 [00:00<?, ?it/s]

gpt-4o correcting historyar:   0%|          | 0/1 [00:00<?, ?it/s]

Post-correcting tesseract output on historyar using gpt-4o


gpt-4o on historyar-tesseract:   0%|          | 0/200 [00:00<?, ?it/s]

gpt-4o correcting historicalbooks:   0%|          | 0/1 [00:00<?, ?it/s]

Post-correcting tesseract output on historicalbooks using gpt-4o


gpt-4o on historicalbooks-tesseract:   0%|          | 0/10 [00:00<?, ?it/s]

gpt-4o correcting adab:   0%|          | 0/1 [00:00<?, ?it/s]

Post-correcting tesseract output on adab using gpt-4o


gpt-4o on adab-tesseract:   0%|          | 0/200 [00:00<?, ?it/s]

gpt-4o correcting muharaf:   0%|          | 0/1 [00:00<?, ?it/s]

Post-correcting tesseract output on muharaf using gpt-4o


gpt-4o on muharaf-tesseract:   0%|          | 0/200 [00:00<?, ?it/s]

gpt-4o correcting onlinekhatt:   0%|          | 0/1 [00:00<?, ?it/s]

Post-correcting tesseract output on onlinekhatt using gpt-4o


gpt-4o on onlinekhatt-tesseract:   0%|          | 0/200 [00:00<?, ?it/s]

gpt-4o correcting khatt:   0%|          | 0/1 [00:00<?, ?it/s]

Post-correcting tesseract output on khatt using gpt-4o


gpt-4o on khatt-tesseract:   0%|          | 0/200 [00:00<?, ?it/s]

gpt-4o correcting isippt:   0%|          | 0/1 [00:00<?, ?it/s]

Post-correcting tesseract output on isippt using gpt-4o


gpt-4o on isippt-tesseract:   0%|          | 0/500 [00:00<?, ?it/s]

gpt-4o correcting arabicocr:   0%|          | 0/1 [00:00<?, ?it/s]

Post-correcting tesseract output on arabicocr using gpt-4o


gpt-4o on arabicocr-tesseract:   0%|          | 0/50 [00:00<?, ?it/s]

gpt-4o correcting hindawi:   0%|          | 0/1 [00:00<?, ?it/s]

Post-correcting tesseract output on hindawi using gpt-4o


gpt-4o on hindawi-tesseract:   0%|          | 0/200 [00:00<?, ?it/s]

gpt-4o correcting evarest:   0%|          | 0/1 [00:00<?, ?it/s]

Post-correcting tesseract output on evarest using gpt-4o


gpt-4o on evarest-tesseract:   0%|          | 0/800 [00:00<?, ?it/s]

Have you switched to an A100 runtime? (y/n): n
Please switch to A100: Runtime > Change runtime type > GPU > A100, then rerun.


RuntimeError: Switch to A100 and re-run this cell to continue with ALLam-7B.