## 1. Installation

In [None]:
%%capture
# Install the notebook's dependencies. `%%capture` has to stay on the first line
# of the cell; it hides the (long) pip output.
import os, re

# Colab is detected by looking for "COLAB_" inside the concatenated names of all
# environment variables.
if "COLAB_" not in "".join(os.environ.keys()):
    # Not Colab: plain install, pip resolves unsloth's dependencies itself.
    !pip install unsloth
else:
    # Do this only in Colab notebooks! Otherwise use pip install unsloth
    # Choose the xformers build from the installed torch "major.minor" version:
    # 2.9 and 2.8 each get a dedicated build, anything else gets 0.0.29.post3.
    import torch; v = re.match(r"[0-9]{1,}\.[0-9]{1,}", str(torch.__version__)).group(0)
    xformers = "xformers==" + ("0.0.33.post1" if v=="2.9" else "0.0.32.post2" if v=="2.8" else "0.0.29.post3")
    # `--no-deps` installs the named packages without touching their dependencies;
    # `{xformers}` is interpolated from the Python variable above.
    !pip install --no-deps bitsandbytes accelerate {xformers} peft trl triton cut_cross_entropy unsloth_zoo
    !pip install sentencepiece protobuf "datasets==4.3.0" "huggingface_hub>=0.34.0" hf_transfer
    !pip install --no-deps unsloth
# The installs below run in every environment (they are outside the if/else).
# transformers and trl are pinned to exact versions.
!pip install transformers==4.56.2
!pip install --no-deps trl==0.22.2
# jiwer supplies the CER metric imported later in this notebook; einops, addict
# and easydict are presumably needed by the model's remote code - TODO confirm.
!pip install jiwer einops addict easydict

## 2. Environment Setup (Colab / Kaggle)

In [None]:
# Detect the runtime. IS_KAGGLE is True when the KAGGLE_KERNEL_RUN_TYPE
# environment variable is present; later cells branch on this flag.
IS_KAGGLE = "KAGGLE_KERNEL_RUN_TYPE" in os.environ

if IS_KAGGLE:
    print("\nüìÅ Using Kaggle environment settings")
    print("‚úÖ Kaggle input/output paths will be used")
else:
    # Google Drive can only be mounted from inside Colab. Probe for the Colab
    # package instead of assuming it, so that running this cell anywhere else
    # (e.g. a local Jupyter server) does not abort with ModuleNotFoundError.
    try:
        from google.colab import drive
    except ImportError:
        drive = None

    if drive is None:
        print("\nGoogle Colab not detected - skipping Google Drive mount")
    else:
        print("\nüìÅ Mounting Google Drive...")
        drive.mount('/content/drive')
        print("‚úÖ Google Drive mounted!")

## 3. Choose Model to Evaluate

Select which model to evaluate:
- **Baseline**: Pretrained DeepSeek OCR (before finetuning)
- **Finetuned**: LoRA finetuned model

In [None]:
# ⚠️ SELECT MODEL TO EVALUATE
# Set to 'baseline' or 'finetuned'
MODEL_TYPE = 'finetuned'  # Change this to 'baseline' to evaluate pretrained model

# Fail fast on a typo. The model-loading cell treats every value other than
# 'baseline' as the fine-tuned model, so a misspelling such as 'Baseline' would
# otherwise evaluate the wrong model and label the saved results with the typo.
if MODEL_TYPE not in ('baseline', 'finetuned'):
    raise ValueError(
        f"MODEL_TYPE must be 'baseline' or 'finetuned', got {MODEL_TYPE!r}"
    )

print(f"\nüéØ Selected model: {MODEL_TYPE.upper()}")

In [None]:
from unsloth import FastVisionModel
import torch
from transformers import AutoModel
from huggingface_hub import snapshot_download

# Set before the model is loaded; presumably turns off Unsloth's warning about
# uninitialized weights - TODO confirm against the Unsloth docs.
os.environ["UNSLOTH_WARN_UNINITIALIZED"] = '0'

# --- Resolve MODEL_PATH -------------------------------------------------------
if MODEL_TYPE != 'baseline':
    # Any value other than 'baseline' selects the LoRA fine-tuned checkpoint.
    print("\nüì¶ Loading finetuned model...")
    if IS_KAGGLE:
        # ⚠️ Change this to the Kaggle input folder that holds your fine-tuned model
        MODEL_PATH = '/kaggle/input/deepseek-ocr-lora'
        print(f"üì¶ Kaggle: Loading model from: {MODEL_PATH}")
    else:
        # ⚠️ Change this to the location of your fine-tuned model on Google Drive
        MODEL_PATH = '/content/drive/MyDrive/deepseek_ocr_lora'
        print(f"üì¶ Colab: Loading model from: {MODEL_PATH}")
else:
    print("\nüì¶ Loading baseline (pretrained) model...")
    # Fetch the pretrained weights from the Hugging Face Hub into ./deepseek_ocr
    snapshot_download("unsloth/DeepSeek-OCR", local_dir="deepseek_ocr")
    MODEL_PATH = "./deepseek_ocr"
    print(f"‚úÖ Downloaded baseline model to: {MODEL_PATH}")

# --- Load the model and switch it to inference mode -----------------------------
load_options = dict(
    load_in_4bit=False,
    auto_model=AutoModel,
    trust_remote_code=True,
    unsloth_force_compile=True,
    use_gradient_checkpointing="unsloth",
)
model, tokenizer = FastVisionModel.from_pretrained(MODEL_PATH, **load_options)
FastVisionModel.for_inference(model)

print(f"‚úÖ Model loaded successfully!")
print(f"‚úì Evaluating: {MODEL_TYPE.upper()} model")

## 4. Download and Extract Test Dataset

**Colab:** Update `ZIP_PATH` to your Google Drive location

**Kaggle:** Add dataset to notebook, it will be available at `/kaggle/input/`

In [None]:
import zipfile
import json
from PIL import Image
from tqdm import tqdm

def _unpack_archive(archive_path, destination):
    """Extract every member of the zip file at `archive_path` into `destination`."""
    with zipfile.ZipFile(archive_path, 'r') as zip_ref:
        zip_ref.extractall(destination)


# Resolve EXTRACT_DIR (and unpack the archive when there is one) per environment
if IS_KAGGLE:
    # ⚠️ Change this to the dataset folder attached to your Kaggle notebook
    KAGGLE_INPUT = '/kaggle/input/uit-hwdb-word/UIT_HWDB_word'

    if not os.path.exists(KAGGLE_INPUT):
        # Input folder is missing: fall back to a path relative to the notebook
        EXTRACT_DIR = '../data/UIT_HWDB_word'
        print(f"üìÇ Local: Using dataset from: {EXTRACT_DIR}")
    else:
        zip_files = [name for name in os.listdir(KAGGLE_INPUT) if name.endswith('.zip')]

        if not zip_files:
            # No archive present: treat the input folder as the extracted dataset
            EXTRACT_DIR = KAGGLE_INPUT
            print(f"üìÇ Kaggle: Using dataset from: {EXTRACT_DIR}")
        else:
            # Unpack the first archive found into the writable working directory
            ZIP_PATH = os.path.join(KAGGLE_INPUT, zip_files[0])
            EXTRACT_DIR = '/kaggle/working/UIT_HWDB_word'

            print(f"üì¶ Kaggle: Extracting dataset from: {ZIP_PATH}")
            print(f"üìÇ Extracting to: {EXTRACT_DIR}")

            _unpack_archive(ZIP_PATH, '/kaggle/working/')

            print(f"‚úÖ Extraction complete!")
else:
    # ⚠️ Change ZIP_PATH to the location of your zip file on Google Drive
    ZIP_PATH = '/content/drive/MyDrive/UIT_HWDB_word.zip'
    EXTRACT_DIR = '/content/UIT_HWDB_word'

    print(f"üì¶ Colab: Extracting dataset from: {ZIP_PATH}")
    print(f"üìÇ Extracting to: {EXTRACT_DIR}")

    _unpack_archive(ZIP_PATH, '/content/')

    print(f"‚úÖ Extraction complete!")

# The test split lives in the 'test_data' sub-folder of the dataset
TEST_DIR = os.path.join(EXTRACT_DIR, 'test_data')

print(f"\n‚úì Test directory: {TEST_DIR}")
print(f"‚úì Test exists: {os.path.exists(TEST_DIR)}")

## 5. Load Test Dataset

In [None]:
def load_test_dataset(base_dir):
    """
    Load the test samples (image path + ground-truth label) found under `base_dir`.

    Every sub-folder of `base_dir` is expected to hold a `label.json` file that
    maps image file names to their label text. Sub-folders without a
    `label.json`, and labelled images whose file does not exist, are skipped.

    Args:
        base_dir: Directory whose sub-folders contain the images and label files.

    Returns:
        list[dict]: One dict per image with the keys 'image_path' (path to the
        image file) and 'ground_truth' (label with surrounding whitespace
        stripped). Folders are visited in numeric order of their names.
    """
    def _folder_order(name):
        # Numeric folder names sort by value, so "10" comes after "2". Any other
        # directory (e.g. ".ipynb_checkpoints") is ordered after the numeric ones
        # by name instead of crashing the int() conversion.
        try:
            return (0, int(name), "")
        except ValueError:
            return (1, 0, name)

    test_samples = []
    folders = sorted(
        (f for f in os.listdir(base_dir) if os.path.isdir(os.path.join(base_dir, f))),
        key=_folder_order,
    )

    for folder in tqdm(folders, desc=f"Loading {os.path.basename(base_dir)}"):
        folder_dir = os.path.join(base_dir, folder)
        label_path = os.path.join(folder_dir, 'label.json')
        if not os.path.exists(label_path):
            continue

        # Load labels: {image file name: label text}
        with open(label_path, 'r', encoding='utf-8') as f:
            labels = json.load(f)

        # Keep only the entries whose image file is actually present
        for img_name, label in labels.items():
            img_path = os.path.join(folder_dir, img_name)
            if not os.path.exists(img_path):
                continue

            test_samples.append({
                'image_path': img_path,
                'ground_truth': label.strip()
            })

    return test_samples

# Load test dataset
print("\nüì• Loading test data...")
test_dataset = load_test_dataset(TEST_DIR)

# Later cells divide by the number of evaluated samples, so an empty dataset
# would only surface there as a ZeroDivisionError. Stop here with a clear message.
if not test_dataset:
    raise RuntimeError(
        f"No test samples found under {TEST_DIR!r}. Check the dataset path and "
        "that each sub-folder contains a label.json next to its images."
    )

print(f"‚úÖ Loaded {len(test_dataset):,} test samples")

## 6. Define CER Metric

Character Error Rate (CER) is the character-level edit distance between the predicted and ground-truth text, divided by the number of characters in the ground truth. Lower is better; 0 means a perfect match.

In [None]:
from jiwer import cer

def calculate_cer(predictions, references):
    """
    Compute the Character Error Rate (CER) of predictions against references.

    Thin wrapper around `jiwer.cer`. Note the argument order: this function
    takes predictions first, while jiwer is called with the references first.

    Args:
        predictions: List of predicted texts
        references: List of ground truth texts, in the same order as `predictions`

    Returns:
        float: CER as returned by `jiwer.cer` (lower is better).
    """
    # NOTE(review): per the jiwer documentation the score for a list of pairs
    # appears to be pooled over the whole list (total character edits divided by
    # total reference characters) rather than the mean of per-sample CERs, and it
    # can exceed 1.0 when predictions contain many extra characters - confirm
    # against the installed jiwer version.
    return cer(references, predictions)

# Smoke-test the metric: first a near-miss pair, then an exact match
test_pred, test_ref = "xin ch√†o", "xin chao"
near_miss_cer = calculate_cer([test_pred], [test_ref])
print(f"Example CER: {near_miss_cer:.4f}")

exact_match_cer = calculate_cer(['test'], ['test'])
print(f"Perfect match CER: {exact_match_cer:.4f}")

## 7. Run Inference on Test Set

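This runs inference on the test samples. As written, the cell below evaluates only the first 500 samples to keep the run short; to evaluate the full test set, call `run_inference` without a sample limit.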

In [None]:
import tempfile
from tqdm import tqdm

def run_inference(model, tokenizer, test_dataset, max_samples=None):
    """
    Run OCR inference over the test dataset.

    Each sample's image is passed to `model.infer` with a fixed "Free OCR" prompt.
    A sample that fails (an exception, or a result that is not text) is reported
    and scored as an empty prediction so that predictions and ground truths stay
    aligned by index.

    Args:
        model: Loaded model exposing an `infer(tokenizer, prompt=..., image_file=..., ...)` method
        tokenizer: Model tokenizer
        test_dataset: List of dicts with 'image_path' and 'ground_truth' keys
        max_samples: Maximum number of samples to evaluate, taken from the start
            of `test_dataset` (None = all, 0 = none)

    Returns:
        predictions: List of predicted texts ("" for failed samples)
        ground_truths: List of ground truth texts, aligned with `predictions`
    """
    predictions = []
    ground_truths = []
    failed = 0

    # Compare against None explicitly so that max_samples=0 yields no samples
    # instead of silently falling through to the full dataset.
    samples_to_process = test_dataset if max_samples is None else test_dataset[:max_samples]

    prompt = "<image>\nFree OCR. "

    for sample in tqdm(samples_to_process, desc="Running inference"):
        try:
            # Run inference
            result = model.infer(
                tokenizer,
                prompt=prompt,
                image_file=sample['image_path'],
                output_path='./output',
                base_size=1024,
                image_size=640,
                crop_mode=True,
                save_results=False,
                test_compress=False
            )

            # NOTE(review): if every sample fails here with a NoneType result,
            # check whether the model's `infer` only returns the decoded text in
            # an evaluation mode (e.g. an `eval_mode=True` flag) - confirm
            # against the model's remote code.
            if not isinstance(result, str):
                raise TypeError(
                    f"model.infer returned {type(result).__name__} instead of text"
                )
            prediction = result.strip()

        except Exception as e:
            print(f"\n‚ö†Ô∏è Error processing {sample['image_path']}: {e}")
            # Use an empty prediction to maintain alignment
            prediction = ""
            failed += 1

        predictions.append(prediction)
        ground_truths.append(sample['ground_truth'])

    # Failures count as empty predictions and inflate the CER, so make them visible
    if failed:
        print(f"\n{failed} of {len(predictions)} samples failed and were scored as empty predictions")

    return predictions, ground_truths

# Run inference on test set
# Number of test samples to evaluate, taken from the start of the dataset.
# 500 keeps the run short; set to None to evaluate the full test set (takes longer).
MAX_EVAL_SAMPLES = 500

print("\nüîÑ Running inference on test set...")
predictions, ground_truths = run_inference(
    model, tokenizer, test_dataset, max_samples=MAX_EVAL_SAMPLES
)

print(f"\n‚úÖ Completed inference on {len(predictions):,} samples")

## 8. Calculate Evaluation Metrics

In [None]:
# CER computed once over all evaluated (prediction, ground truth) pairs
overall_cer = calculate_cer(predictions, ground_truths)

# Report block, framed by a 50-character rule
rule = "="*50
print(rule)
print(f"üìä EVALUATION RESULTS - {MODEL_TYPE.upper()} MODEL")
print(rule)
print(f"\nüìà Character Error Rate (CER): {overall_cer:.4f} ({overall_cer*100:.2f}%)")
print(f"üìä Total samples evaluated: {len(predictions):,}")
print(f"‚úì Accuracy (1 - CER): {(1-overall_cer)*100:.2f}%")
print("\n" + rule)

## 9. Show Sample Predictions

Display some example predictions to qualitatively assess performance.

In [None]:
import random
from IPython.display import display

# Show random samples
num_samples = 10
# Fixed seed so that "Restart & Run All" shows the same examples every time;
# set SAMPLE_SEED to None to draw a fresh selection on each run.
SAMPLE_SEED = 42
sample_indices = random.Random(SAMPLE_SEED).sample(
    range(len(predictions)), min(num_samples, len(predictions))
)

print("\nüîç Sample Predictions:\n")
for i, idx in enumerate(sample_indices, 1):
    print(f"\n--- Sample {i} ---")
    print(f"Ground Truth: '{ground_truths[idx]}'")
    print(f"Prediction:   '{predictions[idx]}'")

    # Calculate CER for this sample
    sample_cer = calculate_cer([predictions[idx]], [ground_truths[idx]])
    match = "‚úÖ" if predictions[idx] == ground_truths[idx] else "‚ùå"
    print(f"CER: {sample_cer:.4f} {match}")

    # Display image; predictions share their index with test_dataset. The
    # context manager closes the image file once it has been rendered.
    img_path = test_dataset[idx]['image_path']
    with Image.open(img_path) as img:
        display(img)

## 10. Detailed Error Analysis

In [None]:
import numpy as np

# Per-sample CER and exact-match count over the aligned prediction/reference pairs
sample_cers = [
    calculate_cer([pred], [ref]) for pred, ref in zip(predictions, ground_truths)
]
exact_matches = sum(
    1 for pred, ref in zip(predictions, ground_truths) if pred == ref
)


def _percent(count, total):
    """Return `count` as a percentage of `total`."""
    return count/total*100


# Statistics
print("\nüìä DETAILED ANALYSIS")
print("="*50)
print(f"\n‚úì Exact matches: {exact_matches}/{len(predictions)} ({_percent(exact_matches, len(predictions)):.2f}%)")
print(f"\nüìà CER Statistics:")
print(f"  - Mean CER: {np.mean(sample_cers):.4f}")
print(f"  - Median CER: {np.median(sample_cers):.4f}")
print(f"  - Min CER: {np.min(sample_cers):.4f}")
print(f"  - Max CER: {np.max(sample_cers):.4f}")
print(f"  - Std Dev: {np.std(sample_cers):.4f}")

# CER distribution: count the samples falling into each error band
print(f"\nüìä CER Distribution:")
perfect = len([c for c in sample_cers if c == 0.0])
low = len([c for c in sample_cers if 0.0 < c <= 0.1])
medium = len([c for c in sample_cers if 0.1 < c <= 0.3])
high = len([c for c in sample_cers if c > 0.3])

n_scored = len(sample_cers)
print(f"  - Perfect (CER = 0.0): {perfect} ({_percent(perfect, n_scored):.2f}%)")
print(f"  - Low (0.0 < CER ‚â§ 0.1): {low} ({_percent(low, n_scored):.2f}%)")
print(f"  - Medium (0.1 < CER ‚â§ 0.3): {medium} ({_percent(medium, n_scored):.2f}%)")
print(f"  - High (CER > 0.3): {high} ({_percent(high, n_scored):.2f}%)")

## 11. Show Worst Predictions

Identify and display samples with highest CER for error analysis.

In [None]:
# Find worst predictions: sample indices ordered from highest to lowest CER
sorted_indices = np.argsort(sample_cers)[::-1]
worst_n = 5

print(f"\n‚ùå Top {worst_n} Worst Predictions:\n")
for i, idx in enumerate(sorted_indices[:worst_n], 1):
    print(f"\n--- Worst #{i} (CER: {sample_cers[idx]:.4f}) ---")
    print(f"Ground Truth: '{ground_truths[idx]}'")
    print(f"Prediction:   '{predictions[idx]}'")

    # Display image; the context manager closes the image file once it has
    # been rendered instead of leaving the handle open.
    img_path = test_dataset[idx]['image_path']
    with Image.open(img_path) as img:
        display(img)

## 12. Save Results

Save evaluation results for later analysis.

In [None]:
import pandas as pd

# Per-sample results table; predictions share their index with test_dataset
evaluated_image_paths = [
    test_dataset[i]['image_path'] for i, _ in enumerate(predictions)
]
results_df = pd.DataFrame({
    'image_path': evaluated_image_paths,
    'ground_truth': ground_truths,
    'prediction': predictions,
    'cer': sample_cers
})

# Choose the output location for the current environment
if IS_KAGGLE:
    RESULTS_PATH = f"/kaggle/working/evaluation_results_{MODEL_TYPE}.csv"
    print(f"üíæ Kaggle: Saving results to working directory...")
else:
    RESULTS_PATH = f"/content/drive/MyDrive/evaluation_results_{MODEL_TYPE}.csv"
    print(f"üíæ Colab: Saving results to Google Drive...")

# Write the per-sample CSV
results_df.to_csv(RESULTS_PATH, index=False, encoding='utf-8')
print(f"‚úÖ Results saved to: {RESULTS_PATH}")

# Preview the first rows
print("\nüìã Results Summary:")
print(results_df.head(10))

# Aggregate statistics, written as JSON next to the CSV
summary = dict(
    model_type=MODEL_TYPE,
    overall_cer=overall_cer,
    total_samples=len(predictions),
    exact_matches=exact_matches,
    accuracy=1 - overall_cer,
    mean_cer=np.mean(sample_cers),
    median_cer=np.median(sample_cers),
    std_cer=np.std(sample_cers),
)

summary_path = RESULTS_PATH.replace('.csv', '_summary.json')
with open(summary_path, 'w', encoding='utf-8') as summary_file:
    json.dump(summary, summary_file, indent=2, ensure_ascii=False)

print(f"‚úÖ Summary saved to: {summary_path}")

---

## Done! 🎉

You've successfully evaluated the model on Vietnamese handwriting test data.

**Summary:**
- ✅ Selected model type (baseline or finetuned)
- ✅ Loaded model and ran inference
- ✅ Calculated Character Error Rate (CER)
- ✅ Performed error analysis
- ✅ Saved detailed results

**Next steps:**
- Change `MODEL_TYPE` to evaluate the other model
- Compare baseline vs finetuned performance
- Try different inference parameters (base_size, crop_mode)
- Evaluate on full test set