# Installing dependencies

In [35]:
!apt install tesseract-ocr libtesseract-dev

Reading package lists... Done
Building dependency tree... Done
Reading state information... Done
libtesseract-dev is already the newest version (4.1.1-2.1build1).
tesseract-ocr is already the newest version (4.1.1-2.1build1).
0 upgraded, 0 newly installed, 0 to remove and 38 not upgraded.


In [36]:
!pip install hdf5storage Levenshtein jiwer



In [37]:
!pip install -q git+https://github.com/allansdefreitas/yolov10.git
!pip install -q supervision

  Installing build dependencies ... [?25l[?25hdone
  Getting requirements to build wheel ... [?25l[?25hdone
  Preparing metadata (pyproject.toml) ... [?25l[?25hdone


In [38]:
!wget https://github.com/moured/YOLOv10-Document-Layout-Analysis/releases/download/doclaynet_weights/yolov10x_best.pt

--2025-09-04 15:53:15--  https://github.com/moured/YOLOv10-Document-Layout-Analysis/releases/download/doclaynet_weights/yolov10x_best.pt
Resolving github.com (github.com)... 140.82.113.3
Connecting to github.com (github.com)|140.82.113.3|:443... connected.
HTTP request sent, awaiting response... 302 Found
Location: https://release-assets.githubusercontent.com/github-production-release-asset/809399250/e52eefec-ac07-4944-997c-59e48e23474b?sp=r&sv=2018-11-09&sr=b&spr=https&se=2025-09-04T16%3A35%3A13Z&rscd=attachment%3B+filename%3Dyolov10x_best.pt&rsct=application%2Foctet-stream&skoid=96c2d410-5711-43a1-aedd-ab1947aa7ab0&sktid=398a6654-997b-47e9-b12b-9515b896b4de&skt=2025-09-04T15%3A34%3A27Z&ske=2025-09-04T16%3A35%3A13Z&sks=b&skv=2018-11-09&sig=DxuqihqLlPcLPYWQ9DqaCnUm8EmQFuOFfEhBKOqnqu0%3D&jwt=eyJ0eXAiOiJKV1QiLCJhbGciOiJIUzI1NiJ9.eyJpc3MiOiJnaXRodWIuY29tIiwiYXVkIjoicmVsZWFzZS1hc3NldHMuZ2l0aHVidXNlcmNvbnRlbnQuY29tIiwia2V5Ijoia2V5MSIsImV4cCI6MTc1NzAwMTQ5NSwibmJmIjoxNzU3MDAxMTk1LCJwYXRoIjoic

In [39]:
!pip install reportlab



# Model & Inference Code

## Dewarping Model Architecture (Flow Generator)

In [40]:
import os
import torch
import torch.nn as nn
import torch.nn.functional as F
import torchvision.models as models
from torch.utils.data import Dataset, DataLoader
import torch.optim as optim
from torch.cuda.amp import GradScaler, autocast
import hdf5storage as h5
import cv2
import numpy as np
from einops import rearrange
import time
import argparse
from torchvision import transforms
import torchvision.utils as vutils
import wandb  # Optional for logging
import matplotlib.pyplot as plt


# ---------------------------
# Model Architecture (Your Transformer+U-Net)
# ---------------------------
class TransformerBlock(nn.Module):
    """Pre-norm transformer block: self-attention and an MLP, each with a residual connection.

    Args:
        dim: token embedding size.
        heads: number of attention heads.
        mlp_ratio: hidden width of the MLP as a multiple of ``dim``.
        p: dropout probability passed to the attention layer only.
    """

    def __init__(self, dim, heads, mlp_ratio=4.0, p=0.0):
        super().__init__()
        hidden_dim = int(dim * mlp_ratio)
        self.n1 = nn.LayerNorm(dim)
        self.attn = nn.MultiheadAttention(dim, heads, dropout=p, batch_first=False)
        self.n2 = nn.LayerNorm(dim)
        self.mlp = nn.Sequential(
            nn.Linear(dim, hidden_dim),
            nn.GELU(),
            nn.Linear(hidden_dim, dim),
        )

    def forward(self, x):
        """Apply the block to a sequence-first tensor ``x`` of shape [HW, B, D]."""
        normed = self.n1(x)
        attended, _ = self.attn(normed, normed, normed, need_weights=False)
        x = x + attended
        return x + self.mlp(self.n2(x))

class MultiStageTransformerEncoder(nn.Module):
    """Stack of stages, each a strided patch-embedding conv followed by transformer blocks.

    Stage ``i`` projects its input with a Conv2d whose kernel size and stride are
    both ``patch_sizes[i]`` and then runs ``depths[i]`` TransformerBlocks with
    ``heads[i]`` heads over the flattened feature map.
    """

    def __init__(self, img_channels=3, embed_dims=[64,128,256], patch_sizes=[8,16,2], depths=[2,2,2], heads=[2,4,8]):
        super().__init__()
        self.stages = nn.ModuleList()
        self.embed_dims = embed_dims
        in_ch = img_channels
        for idx, dim in enumerate(embed_dims):
            patch = patch_sizes[idx]
            stage = nn.ModuleDict({
                "proj": nn.Conv2d(in_ch, dim, kernel_size=patch, stride=patch),
                "blocks": nn.ModuleList([TransformerBlock(dim, heads[idx]) for _ in range(depths[idx])]),
            })
            self.stages.append(stage)
            in_ch = dim  # the next stage consumes this stage's channels

    def forward(self, x):
        """Return one feature map per stage, in stage order.

        Every stage downsamples by its patch size, so the first entry has the
        largest spatial size and the last entry the smallest.
        """
        features = []
        for stage in self.stages:
            x = stage["proj"](x)  # [B, D, h, w]
            _, _, h, w = x.shape
            tokens = rearrange(x, "b d h w -> (h w) b d")
            for block in stage["blocks"]:
                tokens = block(tokens)
            x = rearrange(tokens, "(h w) b d -> b d h w", h=h, w=w)
            features.append(x)
        return features

class UNetDecoder(nn.Module):
    """Three-step upsampling decoder that fuses the encoder feature maps into an ``out_ch`` map.

    Expects ``skips`` as produced by the encoder: ``skips[-1]`` is the starting
    feature map, ``skips[1]`` and ``skips[0]`` are concatenated after the first
    and second upsampling step respectively.
    """

    def __init__(self, embed_dims=[64,128,256], out_ch=2):
        super().__init__()
        d0, d1, d2 = embed_dims[0], embed_dims[1], embed_dims[2]
        half = d0 // 2

        def conv_relu(in_ch, width):
            # 3x3 convolution that keeps the spatial size, followed by in-place ReLU
            return [nn.Conv2d(in_ch, width, 3, padding=1), nn.ReLU(True)]

        self.up1 = nn.ConvTranspose2d(d2, d1, 2, 2)
        self.c1 = nn.Sequential(*conv_relu(d1 * 2, d1), *conv_relu(d1, d1))
        self.up2 = nn.ConvTranspose2d(d1, d0, 2, 2)
        self.c2 = nn.Sequential(*conv_relu(d0 * 2, d0), *conv_relu(d0, d0))
        self.up3 = nn.ConvTranspose2d(d0, half, 2, 2)
        self.c3 = nn.Sequential(*conv_relu(half, half))
        self.out = nn.Conv2d(half, out_ch, 1)

    @staticmethod
    def _concat_skip(x, skip):
        """Resize ``skip`` bilinearly to ``x``'s spatial size and concatenate along channels."""
        resized = F.interpolate(skip, size=x.shape[-2:], mode='bilinear', align_corners=False)
        return torch.cat([x, resized], dim=1)

    def forward(self, skips):
        x = self.up1(skips[-1])
        x = self.c1(self._concat_skip(x, skips[1]))

        x = self.up2(x)
        x = self.c2(self._concat_skip(x, skips[0]))

        x = self.c3(self.up3(x))
        return self.out(x)

class FlowGenerator(nn.Module):
    """Predicts a 2-channel flow field at the spatial size of the input image.

    ``max_disp`` is stored on the module but is not applied in ``forward``:
    the tanh-based clamp that used it is disabled and the decoder output is
    multiplied by a fixed factor of 10 instead.
    """

    def __init__(self, img_channels=3, max_disp=48.0):
        super().__init__()
        self.enc = MultiStageTransformerEncoder(img_channels=img_channels)
        self.dec = UNetDecoder()
        self.max_disp = max_disp

    def forward(self, x):
        _, _, height, width = x.shape
        flow = self.dec(self.enc(x))
        # bring the decoder output back to the input resolution
        flow = F.interpolate(flow, size=(height, width), mode='bilinear', align_corners=False)
        # Disabled magnitude constraint, kept for reference:
        # flow = torch.tanh(flow) * self.max_disp
        return flow * 10.0  # EXPERIMENT: 10x amplification for visibility

## Dewarping inference funcs

In [41]:
def apply_bm_doc3d(img, bm_pix, align_corners=True, padding_mode="border", verbose=False, save_path=None):
    """
    Resample an image through a backward map given in absolute pixel coordinates.

    Args:
        img: (B, C, H, W) tensor, the warped image.
        bm_pix: (B, 2, H, W) tensor; channel 0 holds x pixel coordinates and
                channel 1 holds y pixel coordinates to sample from.
        align_corners: selects the pixel -> [-1, 1] normalisation and is
                forwarded to ``F.grid_sample``.
        padding_mode: forwarded to ``F.grid_sample`` (e.g. 'border' or 'zeros').
        verbose: when true, plot the input next to the result.
        save_path: if given (and ``verbose`` is true), the plot is saved here.

    Returns:
        (B, C, H, W) tensor with the resampled (unwarped) image.
    """
    _, _, height, width = img.shape
    map_x = bm_pix[:, 0, :, :]
    map_y = bm_pix[:, 1, :, :]

    # pixel coordinates -> normalised [-1, 1] coordinates expected by grid_sample
    if align_corners:
        grid_x = (map_x / (width - 1)) * 2 - 1
        grid_y = (map_y / (height - 1)) * 2 - 1
    else:
        grid_x = (2 * map_x + 1) / width - 1
        grid_y = (2 * map_y + 1) / height - 1

    sampling_grid = torch.stack([grid_x, grid_y], dim=-1)  # (B, H, W, 2)
    rectified = F.grid_sample(
        img,
        sampling_grid,
        mode="bilinear",
        padding_mode=padding_mode,
        align_corners=align_corners,
    )

    if not verbose:
        return rectified

    # side-by-side preview of input and result
    panels = (('input', prepare_tensor(img)), ('unwarped', prepare_tensor(rectified)))
    _, axes = plt.subplots(1, 2)
    for ax in axes:
        ax.set_xticks([])
        ax.set_yticks([])
    for ax, (title, picture) in zip(axes, panels):
        ax.imshow(picture)
        ax.title.set_text(title)
    if save_path is not None:
        plt.savefig(save_path)
    plt.show()

    return rectified


def scale_flow_to_resolution(flow_low_res, low_res, high_res):
    """
    Resize a flow field to a new resolution and rescale its values to match.

    flow_low_res: [1, 2, H_low, W_low] tensor (channel 0 = x, channel 1 = y)
    low_res: (H_low, W_low)
    high_res: (H_high, W_high)

    Returns the input tensor itself when both resolutions are equal; otherwise
    a new tensor of spatial size ``high_res``.
    """
    if low_res == high_res:
        return flow_low_res

    src_h, src_w = low_res
    dst_h, dst_w = high_res

    # spatial resize first ...
    upsampled = F.interpolate(
        flow_low_res, size=(dst_h, dst_w), mode='bilinear', align_corners=False
    )

    # ... then stretch the values by the per-axis size ratio
    upsampled[:, 0, :, :].mul_(dst_w / src_w)  # x coordinates
    upsampled[:, 1, :, :].mul_(dst_h / src_h)  # y coordinates

    return upsampled
    
    
def dewarp_high_res_scaled(model, high_res_img_path, device, out_path=None, target_size=(448, 448)):
    """
    (Main inference function)
    Dewarp a full-resolution image using a flow predicted on a downscaled copy.

    The image is resized to ``target_size`` for the model, the predicted map is
    scaled back to the original resolution with ``scale_flow_to_resolution`` and
    applied to the full-resolution image through ``apply_bm_doc3d`` (i.e. it is
    interpreted as absolute pixel coordinates).

    Args:
        model: callable mapping a [1, 3, h, w] float tensor to a [1, 2, h, w] map.
        high_res_img_path: path of the image to dewarp.
        device: torch device the tensors are moved to.
        out_path: if given, the result is written there via ``save_dewarped_result``.
        target_size: size handed to ``cv2.resize``, i.e. (width, height).

    Returns:
        (dewarped, high_res_img): the dewarped [1, 3, H, W] tensor and the
        original image as an RGB numpy array.

    Raises:
        FileNotFoundError: if the image cannot be read.
    """
    # Load high-res image (cv2.imread returns None instead of raising on failure)
    high_res_bgr = cv2.imread(high_res_img_path)
    if high_res_bgr is None:
        raise FileNotFoundError(f"Could not read image: {high_res_img_path}")
    high_res_img = cv2.cvtColor(high_res_bgr, cv2.COLOR_BGR2RGB)
    original_size = high_res_img.shape[:2]  # (H, W)

    # Resize to training size for prediction
    img_resized = cv2.resize(high_res_img, target_size)
    img_tensor = torch.from_numpy(img_resized).float().permute(2, 0, 1) / 255.0
    img_tensor = img_tensor.unsqueeze(0).to(device)

    # Predict flow on resized image
    with torch.no_grad():
        flow_low_res = model(img_tensor)  # [1, 2, 448, 448] for the default target_size

    # Take (H, W) from the prediction itself: target_size is in cv2's (W, H)
    # order, so passing it as (H, W) would swap the scale factors for
    # non-square sizes.
    low_res_size = tuple(flow_low_res.shape[-2:])
    flow_high_res = scale_flow_to_resolution(flow_low_res, low_res_size, original_size)

    # Prepare high-res image tensor
    img_high_res_tensor = torch.from_numpy(high_res_img).float().permute(2, 0, 1) / 255.0
    img_high_res_tensor = img_high_res_tensor.unsqueeze(0).to(device)

    # Apply scaled flow to high-res image
    dewarped_high_res = apply_bm_doc3d(img_high_res_tensor, flow_high_res, align_corners=True)

    # Save
    if out_path is not None:
        save_dewarped_result(dewarped_high_res, out_path)

    return dewarped_high_res, high_res_img

## Visualization helper funcs

In [42]:
from io import BytesIO
from PIL import Image


def prepare_tensor(tensor):
    """Convert an image tensor/array into a channels-last numpy array for plotting.

    Torch tensors are detached (if they require grad), moved to CPU and turned
    into numpy first. A 4-D input loses its batch dimension (first item kept).
    If the leading dimension is then 1, 3 or 4 it is treated as the channel
    axis and moved to the end.
    """
    if not isinstance(tensor, np.ndarray):
        if tensor.requires_grad:
            tensor = tensor.detach()
        tensor = tensor.cpu().numpy()

    if tensor.ndim == 4:
        tensor = tensor[0]  # keep the first batch item only
    if tensor.shape[0] in (1, 3, 4):  # looks channels-first
        tensor = tensor.transpose(1, 2, 0)
    return tensor

def visualize_flow(flow, save_path=None, verbose=True):
    """Visualize a flow field as a colour image (hue = direction, brightness = magnitude).

    Args:
        flow: torch tensor or numpy array of shape [2, H, W] or [B, 2, H, W]
              (only the first batch item is used).
        save_path: if given, the BGR image is written there with ``cv2.imwrite``.
        verbose: when true, show the image with matplotlib.

    Returns:
        The visualisation as an (H, W, 3) uint8 array in BGR channel order.
    """
    # Accept tensors (also ones on GPU or requiring grad) and plain arrays,
    # without a bare except hiding unrelated errors.
    if hasattr(flow, 'detach'):
        flow = flow.detach()
    if hasattr(flow, 'cpu'):
        flow = flow.cpu()
    flow = np.asarray(flow, dtype=np.float32)  # cartToPolar needs float input
    if flow.ndim == 4:
        flow = flow[0]

    # Convert flow to HSV color representation
    h, w = flow.shape[1:]
    hsv = np.zeros((h, w, 3), dtype=np.uint8)

    # Magnitude and angle
    mag, ang = cv2.cartToPolar(
        np.ascontiguousarray(flow[0]), np.ascontiguousarray(flow[1])
    )

    # Normalize for visualization (OpenCV's 8-bit hue range is half the angle in degrees)
    hsv[..., 0] = ang * 180 / np.pi / 2
    hsv[..., 1] = 255
    hsv[..., 2] = cv2.normalize(mag, None, 0, 255, cv2.NORM_MINMAX)

    # Convert to BGR and save
    bgr = cv2.cvtColor(hsv, cv2.COLOR_HSV2BGR)

    if save_path is not None:
        cv2.imwrite(save_path, bgr)

    # Create plot
    if verbose:
        plt.figure(figsize=(10, 8))
        # matplotlib expects RGB; showing the BGR array directly swaps red and blue
        plt.imshow(cv2.cvtColor(bgr, cv2.COLOR_BGR2RGB))
        plt.axis('off')
        plt.title('Optical Flow Visualization')
        plt.tight_layout()
        plt.show()

    return bgr

def save_dewarped_result(dewarped_tensor, out_path, clip=True, verbose=False):
    """
    Write a dewarped RGB image to disk with OpenCV.

    Args:
        dewarped_tensor: torch.Tensor (or array) of shape [1, 3, H, W] or [3, H, W] in range [0, 1]
        out_path: path to save the image
        clip: whether to clip values to [0, 1] before the 8-bit conversion
        verbose: whether to print debug information

    Returns:
        The value returned by ``cv2.imwrite``.
    """
    image = dewarped_tensor
    # detach -> cpu -> numpy, applying each step only if the object offers it
    for step in ('detach', 'cpu', 'numpy'):
        if hasattr(image, step):
            image = getattr(image, step)()

    if image.ndim == 4:  # [B, C, H, W] -> first batch item
        image = image[0]
    if image.ndim == 3:  # [C, H, W] -> [H, W, C]
        image = image.transpose(1, 2, 0)

    if clip:
        image = np.clip(image, 0, 1)

    # 8-bit conversion, then RGB -> BGR because OpenCV writes BGR
    image_u8 = (image * 255).astype(np.uint8)
    dewarped_image = cv2.cvtColor(image_u8, cv2.COLOR_RGB2BGR)

    success = cv2.imwrite(out_path, dewarped_image)

    if verbose:
        print(f"Saved dewarped image to: {out_path}")
        print(f"Image shape: {dewarped_image.shape}")
        print(f"Value range: [{image.min():.3f}, {image.max():.3f}]")
        print(f"Save successful: {success}")

    return success

# OCR Code

In [43]:
import pytesseract
from PIL import Image
import Levenshtein as lv
from jiwer import cer


# TODO: extend to add the ability for bounding boxes extraction
def extract_OCR_text(filepath):
    """Run Tesseract (``--oem 1``) on the image at ``filepath`` and return the recognised text."""
    with Image.open(filepath) as page:
        return pytesseract.image_to_string(page, config="--oem 1")

def calculate_OCR_metrics(GT_text, OCR_text):
    """Compare OCR output with reference text.

    Returns a dict with:
        'CER': ``jiwer.cer`` with ``GT_text`` as the first argument and ``OCR_text`` as the second
        'ED':  Levenshtein edit distance between the two strings
    """
    metrics = {}
    metrics['CER'] = cer(GT_text, OCR_text)
    metrics['ED'] = lv.distance(OCR_text, GT_text)
    return metrics

# Layout Analysis Code

In [44]:
import supervision as sv
from ultralytics import YOLOv10


# Layout detector built from the 'yolov10x_best.pt' weights file; the relative path
# means the file must be in the current working directory (it is fetched by the wget cell).
doc_layout_model = YOLOv10('yolov10x_best.pt')

def detect_img_layout(path, verbose=False):
    """
    Run the document-layout detector on an image file.

    names: {0: 'Caption', 1: 'Footnote', 2: 'Formula', 3: 'List-item', 4: 'Page-footer', 5: 'Page-header', 
            6: 'Picture', 7: 'Section-header', 8: 'Table', 9: 'Text', 10: 'Title'}

    Args:
        path: path of the image to analyse.
        verbose: when true, plot the image with boxes and labels drawn on it.

    Returns:
        ``sv.Detections`` built from the first result of the model.
    """
    results = doc_layout_model(source=path, conf=0.2, iou=0.8)[0]
    detections = sv.Detections.from_ultralytics(results)

    if verbose:
        # The image is only needed for the preview, so it is decoded only here.
        image = cv2.imread(path)
        bounding_box_annotator = sv.BoxAnnotator()
        label_annotator = sv.LabelAnnotator()

        annotated_image = bounding_box_annotator.annotate(
            scene=image, detections=detections)
        annotated_image = label_annotator.annotate(
            scene=annotated_image, detections=detections)

        sv.plot_image(annotated_image)

    return detections

# Testing

## Loading trained models

In [None]:
# Runtime settings: CUDA when available, otherwise CPU.
config = dict(
    device=torch.device('cuda' if torch.cuda.is_available() else 'cpu'),
    max_disp=48.0,
)

# Checkpoint label -> checkpoint file path.
ckpt_path_dict = dict(
    kaggle_5k_98ep='/kaggle/input/supervised-dewarping-training/checkpoints/checkpoint_epoch_98.pth',
    cc_100k_8ep='/kaggle/input/supervised_dewarping_model/pytorch/epoch-8/1/checkpoint_epoch_8.pth',
    cc_100k_24ep='/kaggle/input/supervised_dewarping_model/pytorch/default-epoch-24/1/checkpoint_epoch_24.pth',
)

In [None]:
# Pull the two settings out of `config` into plain names used by the cells below.
device, max_disp = config['device'], config['max_disp']
print(f'Using device: {device}, max_disp: {max_disp}')

In [None]:
# One FlowGenerator per checkpoint, keyed by the checkpoint label.
models_dict = {}

for ckpt, path in ckpt_path_dict.items():
    model = FlowGenerator(max_disp=max_disp).to(device)
    # NOTE(review): torch.load unpickles the file; only load checkpoints from a trusted source.
    checkpoint = torch.load(path, map_location=device)
    model.load_state_dict(checkpoint['model_state_dict'])
    model.eval()  # these models are only used for inference below
    models_dict[ckpt] = model

models_dict.keys()

## Inference testing

In [None]:
# Distorted sample images (DIR300 'dist' folder) used by the inference, OCR and layout cells below.
img_paths = ['/kaggle/input/dir300/DIR300/dist/1.png', '/kaggle/input/dir300/DIR300/dist/100.png', '/kaggle/input/dir300/DIR300/dist/106.png', '/kaggle/input/dir300/DIR300/dist/103.png', '/kaggle/input/dir300/DIR300/dist/12.png']

# Print each path and show the image it points to.
for path in img_paths:
    print(path)
    display(Image.open(path))

In [None]:
# Dewarp every sample with every loaded model. Each result is written to a
# relative path "<image name without its last 4 characters>-<model label>.png",
# which the OCR and layout cells read back later.
for warped_img_path in img_paths:
    print(warped_img_path)
    for name, model in models_dict.items():
        dewarped, high_res_img = dewarp_high_res_scaled(model, warped_img_path, device, out_path=f"{os.path.basename(warped_img_path)[:-4]}-{name}.png")

        # Show the input image once per sample, on the 'kaggle_5k_98ep' iteration.
        if name == 'kaggle_5k_98ep':
            plt.figure(figsize=(10, 8))
            plt.imshow(prepare_tensor(high_res_img))
            plt.axis('off')
            plt.show()

        print(name)
        
        # Show this model's dewarped output.
        plt.figure(figsize=(10, 8))
        plt.imshow(prepare_tensor(dewarped))
        plt.axis('off')
        plt.show()

## OCR

In [None]:
# Ground-truth images (DIR300 'gt' folder), listed in the same order as img_paths.
gt_img_paths = ['/kaggle/input/dir300/DIR300/gt/1.png', '/kaggle/input/dir300/DIR300/gt/100.png', '/kaggle/input/dir300/DIR300/gt/106.png', '/kaggle/input/dir300/DIR300/gt/103.png', '/kaggle/input/dir300/DIR300/gt/12.png']

# For each sample: OCR the ground truth and the warped image, then OCR each
# model's saved dewarped PNG, printing CER / edit distance against the ground-truth text.
for warped_img_path, gt_img_path in zip(img_paths, gt_img_paths):
    print(warped_img_path, gt_img_path)
    gt_text = extract_OCR_text(gt_img_path)
    warped_text = extract_OCR_text(warped_img_path)
    metrics = calculate_OCR_metrics(gt_text, warped_text)
    print("GT vs Warped:", metrics)
    for name, model in models_dict.items():
        # Same file name that the inference cell used as out_path.
        dewarped_img_path = f"{os.path.basename(warped_img_path)[:-4]}-{name}.png"
        dewarped_text = extract_OCR_text(dewarped_img_path)
        metrics = calculate_OCR_metrics(gt_text, dewarped_text)
        print(f"GT vs {name}: {metrics}")

## Layout analysis 

In [None]:
# Ground-truth images (DIR300 'gt' folder), listed in the same order as img_paths.
gt_img_paths = ['/kaggle/input/dir300/DIR300/gt/1.png', '/kaggle/input/dir300/DIR300/gt/100.png', '/kaggle/input/dir300/DIR300/gt/106.png', '/kaggle/input/dir300/DIR300/gt/103.png', '/kaggle/input/dir300/DIR300/gt/12.png']

# For each sample, plot detected layout boxes on the ground truth, the warped
# image and each model's saved dewarped PNG.
for warped_img_path, gt_img_path in zip(img_paths, gt_img_paths):
    print(warped_img_path, gt_img_path)
    detect_img_layout(gt_img_path, verbose=True)
    detect_img_layout(warped_img_path, verbose=True)
    for name, model in models_dict.items():
        # Same file name that the inference cell used as out_path.
        dewarped_img_path = f"{os.path.basename(warped_img_path)[:-4]}-{name}.png"
        detect_img_layout(dewarped_img_path, verbose=True)

# PDF generation

## Text only

In [None]:
from reportlab.lib.pagesizes import letter
from reportlab.pdfgen import canvas
from IPython.display import IFrame

def text_to_pdf(text, filename="output.pdf"):
    """Write plain text to a letter-size PDF with 1-inch margins.

    Each input line is drawn on its own row. Lines wider than the printable
    width are wrapped at word boundaries; a single word that is wider than the
    page is still drawn on one row. A new page starts when the bottom margin
    is reached.

    Args:
        text: text to write; newlines separate lines.
        filename: path of the PDF to create.
    """
    from reportlab.lib.utils import simpleSplit

    font_name, font_size = "Helvetica", 12
    margin = 72        # 1 inch
    line_height = 15

    # Create a canvas with letter page size
    c = canvas.Canvas(filename, pagesize=letter)
    width, height = letter
    max_width = width - 2 * margin

    # Set font and starting position
    c.setFont(font_name, font_size)
    x, y = margin, height - margin

    for line in text.split("\n"):
        # Only re-flow lines that do not fit: simpleSplit collapses whitespace
        # and returns nothing for blank lines, so short lines are drawn as-is.
        if c.stringWidth(line, font_name, font_size) > max_width:
            segments = simpleSplit(line, font_name, font_size, max_width) or [line]
        else:
            segments = [line]

        for segment in segments:
            c.drawString(x, y, segment)
            y -= line_height  # move down for next line
            if y < margin:  # start a new page if out of space
                c.showPage()
                c.setFont(font_name, font_size)  # font is reset on a new page
                y = height - margin

    # Save the PDF
    c.save()
    print(f"PDF saved as {filename}")


def img_to_pdf(img_path, pdf_path, verbose=True):
    """OCR an image and write the recognised text to a PDF.

    Args:
        img_path: image to run OCR on.
        pdf_path: destination PDF path.
        verbose: when true, display the input image and the resulting PDF.

    Returns:
        The OCR text extracted from ``img_path``.
    """
    if verbose:
        # context manager so the image file is closed after it has been shown
        with Image.open(img_path) as img:
            display(img)

    text = extract_OCR_text(img_path)
    # Write the text of THIS image (the previous code wrote the unrelated global `gt_text`).
    text_to_pdf(text, pdf_path)
    if verbose:
        display(IFrame(pdf_path, width=600, height=400))

    return text

In [None]:
# Text-only PDF for ground-truth image 1; the returned OCR text is the cell output.
img_to_pdf('/kaggle/input/dir300/DIR300/gt/1.png', 'text_only-gt_1.pdf')

In [None]:
# Text-only PDFs for sample 100: ground truth, distorted input, and a dewarped PNG
# read from /kaggle/working (file name matches the '<stem>-<model label>.png' pattern
# used by the inference cell for label 'cc_100k_24ep').
img_to_pdf('/kaggle/input/dir300/DIR300/gt/100.png', 'text_only-gt_100.pdf')
img_to_pdf('/kaggle/input/dir300/DIR300/dist/100.png', 'text_only-dist_100.pdf')
img_to_pdf('/kaggle/working/100-cc_100k_24ep.png', 'text_only-100-cc_100k_24ep.pdf')

# Evaluation benchmarks

## DIR300

In [56]:
# Source (original authors of the DIR300 dataset): https://github.com/fh2019ustc/DocGeoNet/blob/main/OCR_eval_DIR300.py

def Levenshtein_Distance(str1, str2):
    """Return the Levenshtein edit distance between ``str1`` and ``str2``.

    Insertions, deletions and substitutions each cost 1. Only the previous row
    of the dynamic-programming table is kept, so memory is O(len(str2)) rather
    than a full (len(str1)+1) x (len(str2)+1) matrix.
    """
    # Distance from the empty prefix of str1 to every prefix of str2.
    previous = list(range(len(str2) + 1))
    for i, ch1 in enumerate(str1, start=1):
        current = [i]  # distance from str1[:i] to the empty prefix of str2
        for j, ch2 in enumerate(str2, start=1):
            substitution = previous[j - 1] + (0 if ch1 == ch2 else 1)
            current.append(min(previous[j] + 1, current[j - 1] + 1, substitution))
        previous = current

    return previous[len(str2)]

def cal_cer_ed(path_ours, tail='_rec'):
    """Dry run of the DIR300 OCR evaluation: print the expected files that are missing.

    For each of the 90 evaluated image ids, the ground-truth path
    (``path_gt + id + '.png'``) and the candidate path
    (``path_ours + id + tail``) are checked and printed if they do not exist.
    The metric computation itself is disabled (kept below as comments), so the
    CER / ED lines print ``nan``.

    Args:
        path_ours: directory prefix (with trailing separator) of the images to evaluate.
        tail: suffix appended to the image id to form the file name.
    """
    path_gt='/kaggle/input/dir300/DIR300 FULL/gt/'
    cer1=[]
    ed1=[]
    lis=[5,7,8,10,12,27,28,29,31,36,53,55,60,61,62,63,64,65,66,67,68,69,70,71,72,73,74,85,94,96]+\
         [103,107,108,111,115,126,128,129,130,133,135,139,140,148,149,151,159,160,161,162,163,164,165,166,167,169,170,173,174,177]+\
         [201,202,203,205,217,218,222,223,225,227,228,237,238,239,264,265,266,271,273,277,278,285,286,288,291,294,295,296,298,300]  # 90 images in DIR300
    print(len(lis))
    # The ids are listed in ascending order, so iterating them directly visits
    # the same images in the same order as scanning 1..300 and filtering.
    for i in lis:
        gt_file = path_gt+str(i)+'.png'
        our_file = path_ours+str(i) + tail
        if not os.path.exists(gt_file):
            print(gt_file)
        if not os.path.exists(our_file):
            print(our_file)
        # gt=Image.open(path_gt+str(i)+'.png')
        # img1=Image.open(path_ours+str(i) + tail)
        # content_gt=pytesseract.image_to_string(gt)
        # content1=pytesseract.image_to_string(img1)
        # l1=Levenshtein_Distance(content_gt,content1)
        # ed1.append(l1)
        # cer1.append(l1/len(content_gt))
    # np.mean of an empty list emits RuntimeWarnings; print nan directly instead.
    print('CER: ', np.mean(cer1) if cer1 else float('nan'))
    print('ED:  ', np.mean(ed1) if ed1 else float('nan'))

def evalu(path_ours, tail):
    """Entry point kept from the DIR300 reference script: forwards both arguments to ``cal_cer_ed``."""
    cal_cer_ed(path_ours, tail=tail)

In [57]:
# Run the check on the distorted inputs themselves (files named '<id>.png').
evalu("/kaggle/input/dir300/DIR300 FULL/dist/", ".png")

90
CER:  nan
ED:   nan


In [58]:
import os
from PIL import Image
import pytesseract
import numpy as np

def cal_cer_ed(path_ours, tail='_rec', save_text=True, output_dir='extracted_text'):
    """Compute mean CER and edit distance on the 90 DIR300 OCR-evaluation images.

    For every evaluated id, Tesseract is run on the ground-truth image and on
    the candidate image (``path_ours + id + tail``); the edit distance between
    the two texts and that distance divided by the ground-truth length (CER)
    are collected.

    Args:
        path_ours: directory prefix (with trailing separator) of the images to evaluate.
        tail: suffix appended to the image id to form the file name.
        save_text: when true, write both OCR texts to ``output_dir`` as
            ``gt_<id>.txt`` and ``our_<id>.txt``.
        output_dir: directory for the saved text files (created if needed).

    Returns:
        (cer1, ed1): per-image CER values and edit distances, in list order.
    """
    path_gt = '/kaggle/input/dir300/DIR300 FULL/gt/'
    cer1 = []
    ed1 = []
    lis = [5,7,8,10,12,27,28,29,31,36,53,55,60,61,62,63,64,65,66,67,68,69,70,71,72,73,74,85,94,96] + \
         [103,107,108,111,115,126,128,129,130,133,135,139,140,148,149,151,159,160,161,162,163,164,165,166,167,169,170,173,174,177] + \
         [201,202,203,205,217,218,222,223,225,227,228,237,238,239,264,265,266,271,273,277,278,285,286,288,291,294,295,296,298,300]  # 90 images in DIR300

    print(len(lis))

    # exist_ok avoids the check-then-create race and is a no-op when the directory exists
    if save_text:
        os.makedirs(output_dir, exist_ok=True)

    # The ids are listed in ascending order, so iterating them directly visits
    # the same images in the same order as scanning 1..300 and filtering.
    for i in lis:
        # context managers close the image files once OCR is done
        with Image.open(path_gt + str(i) + '.png') as gt:
            content_gt = pytesseract.image_to_string(gt)
        with Image.open(path_ours + str(i) + tail) as img1:
            content1 = pytesseract.image_to_string(img1)

        # Save extracted text to files if requested
        if save_text:
            # Save ground truth text
            gt_filename = os.path.join(output_dir, f'gt_{i}.txt')
            with open(gt_filename, 'w', encoding='utf-8') as f:
                f.write(content_gt)

            # Save our method's text
            our_filename = os.path.join(output_dir, f'our_{i}.txt')
            with open(our_filename, 'w', encoding='utf-8') as f:
                f.write(content1)

        l1 = Levenshtein_Distance(content_gt, content1)
        ed1.append(l1)
        # Guard against an empty ground-truth OCR result, which would divide by
        # zero; in that case the CER falls back to the raw edit distance.
        cer1.append(l1 / max(len(content_gt), 1))

    print('CER: ', np.mean(cer1))
    print('ED:  ', np.mean(ed1))

    # Return the results for further analysis if needed
    return cer1, ed1

In [None]:
# Baseline: score the distorted inputs (files named '<id>.png'); OCR texts are saved under 'text_results'.
cer_results, ed_results = cal_cer_ed('/kaggle/input/dir300/DIR300 FULL/dist/', '.png', output_dir='text_results')

90
