In [None]:
import numpy as np
import pandas as pd
import os
import seaborn as sns
import torch
import cv2
import glob
import json
import matplotlib.pyplot as plt
import torch.nn as nn
import torch.nn.functional as functional
import matplotlib.patches as patches
import xml.etree.ElementTree as ET
import torchvision.transforms.functional as F

from PIL import Image
from torch.utils.data import Dataset
from torch.utils.data import DataLoader
from torch.nn.utils.rnn import pad_sequence
from torchvision import transforms
from torchvision.transforms.functional import to_tensor
from torchinfo import summary
from tqdm import tqdm
from sklearn.model_selection import train_test_split
from torchinfo import summary
from tensorflow.keras.backend import ctc_decode

In [None]:
# Dataset locations (Kaggle input mounts).
BASE_DIR          = '/kaggle/input/car-plates-ocr/data/'
BASE_DIR_CPD      = '/kaggle/input/car-plate-detection/'
BASE_DIR_CPD_ANNO = '/kaggle/input/car-plate-detection/annotations/'
BASE_DIR_CPD_IMG  = '/kaggle/input/car-plate-detection/images/'
# Size of a cropped plate fed to the OCR model.
# NOTE(review): presumably (width, height) -- the OCR summary cell uses a (1, 3, 50, 120) input; confirm.
IMAGE_SIZE        = (120, 50)
# Size every full car image is resized to for the detection model.
IMAGE_SIZE_WHOLE  = (220, 220)
DEVICE            = "cuda" if torch.cuda.is_available() else "cpu"
# NOTE(review): EPOCHS, LEARNING_RATE and OUTPUT are not referenced by any visible cell;
# the training cells hard-code their own epoch count and learning rate.
EPOCHS            = 15
LEARNING_RATE     = 1e-3
BATCH_SIZE        = 32
OUTPUT            = 74 # Upper + lower case, numbers, unique symbols

# car-plate-detection dataset

In [None]:
# data = {
#     "filename": [],
#     "bbox": []
# }
# for file in os.listdir(BASE_DIR_CPD_ANNO):
#     tree = ET.parse(os.path.join(BASE_DIR_CPD_ANNO, file))
#     root = tree.getroot()
#     for obj in root.findall('object'):
#         data['filename'].append(os.path.splitext(file)[0])
        
#         bndbox_elem = obj.find('bndbox')
#         bbox = [int(bndbox_elem.find('xmin').text), int(bndbox_elem.find('ymin').text), int(bndbox_elem.find('xmax').text),  int(bndbox_elem.find('ymax').text)]
#         data['bbox'].append(bbox)
        
# df = pd.DataFrame(data)
# df[df['filename'].duplicated() == True]

In [None]:
# plt.imshow(BASE_DIR_CPD_IMG + "Cars106.png")

In [None]:
# bboxes = []
# for idx, col in df.iterrows():
#     tree = ET.parse(BASE_DIR_CPD_ANNO + col['filename'] + ".xml")
#     root = tree.getroot()
#     for obj in root.findall('object'):
#         bndbox_elem = obj.find('bndbox')
#         bbox = [int(bndbox_elem.find('xmin').text), int(bndbox_elem.find('ymin').text), int(bndbox_elem.find('xmax').text),  int(bndbox_elem.find('ymax').text)]
#         bboxes.append(bbox)

# car-plates-ocr dataset

In [None]:
# Normalize the JSON data and create a DataFrame
# Load the raw annotation file: one record per image, each carrying a list of plates under 'nums'.
df = pd.read_json(BASE_DIR + 'train.json', orient='records')

# Flatten to one row per plate: every entry of 'nums' becomes a row, keeping the image 'file'.
df = pd.json_normalize(df.to_dict(orient='records'), 'nums', ['file'])

In [None]:
def convert_to_format(points):
    """Return the axis-aligned bounding box that encloses a set of corner points.

    The box is taken as the min/max over *all* points, so the result does not
    depend on the order in which the corners are listed.

    Parameters:
    - points: iterable of (x, y) pairs (the plate corner points).

    Returns:
    - list: [x_0, y_0, x_1, y_1] with x_0 <= x_1 and y_0 <= y_1.

    Raises:
    - ValueError: if `points` is empty.
    """
    xs = [point[0] for point in points]
    ys = [point[1] for point in points]
    return [min(xs), min(ys), max(xs), max(ys)]

# Derive an [x_0, y_0, x_1, y_1] box for every row from its 'box' corner points.
df['bbox'] = df['box'].apply(convert_to_format)

In [None]:
# Preview the first rows, including the 'bbox' column added above.
df.head()

In [None]:
# Sanity check: draw the derived bounding box on the first annotated image.
first_row = df.iloc[0]
bounding_boxes = first_row['bbox']
box = first_row['box']
fig, ax = plt.subplots(1)

# Plot the image (opened via BASE_DIR like show_cars; the handle is closed once drawn)
with Image.open(BASE_DIR + first_row['file']) as gg:
    ax.imshow(gg)

# Plot the single [x_0, y_0, x_1, y_1] box
x_0, y_0, x_1, y_1 = bounding_boxes
width = x_1 - x_0
height = y_1 - y_0
rect = patches.Rectangle((x_0, y_0), width, height, linewidth=1, edgecolor='r', facecolor='none')
ax.add_patch(rect)

In [None]:
def show_cars(images, labels, bboxes, count=10):
    """Plot car images in a two-row grid with their plate box and label.

    Parameters:
    - images: sequence of image paths relative to BASE_DIR.
    - labels: sequence of plate strings, shown as the x-label of each panel.
    - bboxes: sequence of [x_0, y_0, x_1, y_1] boxes, drawn in red.
    - count: maximum number of images to draw (capped at len(images)).

    Returns:
    - None. Nothing is drawn when there are no images to show.
    """
    # Honour `count` as an upper bound instead of silently overwriting it.
    count = min(count, len(images))
    if count <= 0:
        return

    rows = 2 # 2 rows
    columns = (count + rows - 1) // rows  # ceil(count / rows)

    fig, ax = plt.subplots(rows, columns, figsize=(10,5))
    ax = ax.flatten()

    for i in range(count):
        with Image.open(BASE_DIR + images[i]) as car_image:
            ax[i].imshow(car_image)
        x_0, y_0, x_1, y_1 = bboxes[i]
        rect = patches.Rectangle((x_0, y_0), x_1 - x_0, y_1 - y_0,
                                 linewidth=1, edgecolor='r', facecolor='none')
        ax[i].set_xlabel(labels[i])
        ax[i].set_xticks([])
        ax[i].set_yticks([])
        ax[i].add_patch(rect)

    # Remove unused
    for j in range(count, rows * columns):
        fig.delaxes(ax[j])
        

# Draw ten randomly sampled rows; no seed is passed, so the selection differs between runs.
samples = df.sample(10)
show_cars(samples['file'].to_list(), samples['text'].to_list(), samples['bbox'].to_list())

In [None]:
def order_points(pts):
    """Arrange four corner points as top-left, top-right, bottom-right, bottom-left.

    The top-left corner has the smallest x + y and the bottom-right the largest;
    the top-right corner has the smallest y - x and the bottom-left the largest.

    Parameters:
    - pts: four (x, y) points in any order.

    Returns:
    - np.ndarray: float32 array of shape (4, 2) in the order described above.
    """
    corners = np.array(pts)
    coord_sum = corners.sum(axis=1)
    coord_diff = np.diff(corners, axis=1)  # y - x for every point

    ordered = np.zeros((4, 2), dtype="float32")
    ordered[0] = corners[np.argmin(coord_sum)]   # top-left
    ordered[1] = corners[np.argmin(coord_diff)]  # top-right
    ordered[2] = corners[np.argmax(coord_sum)]   # bottom-right
    ordered[3] = corners[np.argmax(coord_diff)]  # bottom-left
    return ordered

def four_point_transform(image, pts):
    """Warp the quadrilateral `pts` of `image` into an upright rectangle.

    Parameters:
    - image: source image (anything np.array() can convert).
    - pts: four (x, y) corner points of the region, in any order.

    Returns:
    - np.ndarray: the perspective-corrected crop of size (max_height, max_width).
    """
    rect = order_points(pts)

    # Measure the edges on the *ordered* corners; unpacking the raw `pts` would
    # mix up width and height whenever the annotation lists corners differently.
    tl, tr, br, bl = rect

    width_1 = np.sqrt(((br[0] - bl[0]) ** 2) + ((br[1] - bl[1]) ** 2))
    width_2 = np.sqrt(((tr[0] - tl[0]) ** 2) + ((tr[1] - tl[1]) ** 2))
    max_width = max(int(width_1), int(width_2))

    height_1 = np.sqrt(((tr[0] - br[0]) ** 2) + ((tr[1] - br[1]) ** 2))
    height_2 = np.sqrt(((tl[0] - bl[0]) ** 2) + ((tl[1] - bl[1]) ** 2))
    max_height = max(int(height_1), int(height_2))

    # Destination corners in the same tl, tr, br, bl order as `rect`.
    dst = np.array([
        [0, 0],
        [max_width, 0],
        [max_width, max_height],
        [0, max_height]], dtype = "float32")

    M = cv2.getPerspectiveTransform(rect, dst)
    warped = cv2.warpPerspective(np.array(image), M, (max_width, max_height))
    return warped

In [None]:
# Fixed seed so the train/validation split is identical after every kernel restart.
SPLIT_SEED = 42
df_train, df_val = train_test_split(df, test_size=0.2, random_state=SPLIT_SEED)

In [None]:
# Concatenate every plate string into one long string.
all_characters = ''.join(df['text'].astype(str))

# Get unique characters
unique_characters = set(all_characters)

# Number of distinct characters that occur in the labels.
print(len(unique_characters))

In [None]:
# Fixed OCR alphabet: digits, lower-case letters, upper-case letters.
# Indices start at 1 because 0 is used as the CTC blank and as the label padding value.
# (A mapping built from the characters seen in the data used to be created here and was
# immediately overwritten by this one, so only the fixed alphabet is kept.)
characters = '0123456789abcdefghijklmnopqrstuvwxyzABCDEFGHIJKLMNOPQRSTUVWXYZ'
char_to_index = {char: idx+1 for idx, char in enumerate(characters)}
index_to_char = {idx: char for char, idx in char_to_index.items()}


# Row labels whose text contains a character outside the alphabet,
# filled in by label_to_indices(label, idx, is_train).
to_remove = {
    'train': [],
    'val': []
}

def label_to_indices(label, idx=None, is_train=True):
    """Encode a plate string as a list of class indices.

    Parameters:
    - label (str): plate text.
    - idx: row label of the sample. When given, an unknown character is not
      added to the vocabulary; instead `idx` is recorded in
      to_remove['train'] or to_remove['val'] and encoding stops there.
    - is_train (bool): selects which to_remove list receives `idx`.

    Returns:
    - list[int]: indices of the characters. When encoding stopped at an
      unknown character, only the characters before it are included.

    Side effects:
    - With idx=None an unknown character is appended to the global
      char_to_index / index_to_char mappings.
    """
    res = []
    for char in label:
        if char not in char_to_index:
            if idx is not None:
                to_remove['train' if is_train else 'val'].append(idx)
                break

            # Existing indices start at 1, so the next free index is max + 1.
            # (len(char_to_index) would reuse the index of the last character.)
            index = max(char_to_index.values(), default=0) + 1
            char_to_index[char] = index
            index_to_char[index] = char
        res.append(char_to_index[char])
    return res
    
def indices_to_label(indices):
    """Decode a sequence of class indices back into a plate string.

    Index 0 (used for label padding and as the CTC blank) is skipped, and every
    element is converted with int(), so padded rows and tensors can be decoded too.

    Parameters:
    - indices: iterable of integer-like class indices.

    Returns:
    - str: the decoded text.
    """
    return ''.join(index_to_char[int(idx)] for idx in indices if int(idx) != 0)

def generate_objectness_from_ground_truth(image_size, ground_truth_box):
    """
    Generate objectness values based on a ground truth box.

    Coordinates are truncated to integers and negative values are clamped to 0,
    so a box that starts outside the image still marks its visible part
    (a negative slice bound would otherwise count from the far edge).

    Parameters:
    - image_size (tuple): Size of the input image (height, width).
    - ground_truth_box (torch.Tensor): Coordinates of the ground truth bounding box [x0, y0, x1, y1].

    Returns:
    - torch.Tensor: Objectness values (1 for positive region, 0 elsewhere).
    """
    objectness = torch.zeros(image_size, dtype=torch.int)
    # Extract coordinates of the ground truth box
    x0, y0, x1, y1 = (max(0, int(coord)) for coord in ground_truth_box)
    # Set the region around the ground truth box to 1 (positive region);
    # upper bounds past the image edge are handled by slicing itself.
    objectness[y0:y1, x0:x1] = 1

    return objectness

class CarDataset(Dataset):
    """Car-plate dataset serving either detection samples or OCR samples.

    Parameters:
    - df: frame with the columns 'file', 'text', 'box' and 'bbox'.
    - image_size: (width, height) that a cropped plate is resized to (OCR mode only).
    - transforms: callable applied to the RGB image array before it is returned.
    - for_detection: when True, __getitem__ returns detection samples.

    Images are read from BASE_DIR; detection images are resized to IMAGE_SIZE_WHOLE.
    """

    def __init__(self, df, image_size, transforms, for_detection=False):
        super(CarDataset, self).__init__()
        self.df = df
        self.image_size = image_size
        self.transform = transforms
        # Snapshot of the reverse vocabulary at construction time.
        self.index_to_char = {idx: char for char, idx in char_to_index.items()}
        self.for_detection = for_detection

    def __len__(self):
        return len(self.df)

    def __getitem__(self, idx):
        """Return one sample.

        Detection mode: (image, [x0, y0, x1, y1] scaled to IMAGE_SIZE_WHOLE, objectness mask).
        OCR mode: (perspective-corrected plate image, list of label indices).

        Raises:
        - FileNotFoundError: when the image file cannot be read.
        """
        col = self.df.iloc[idx]
        image_path = BASE_DIR + col['file']
        car_image = cv2.imread(image_path)
        if car_image is None:
            # cv2.imread signals failure by returning None instead of raising.
            raise FileNotFoundError(f"Could not read image: {image_path}")
        car_image = cv2.cvtColor(car_image, cv2.COLOR_BGR2RGB)

        if self.for_detection:
            new_width, new_height = IMAGE_SIZE_WHOLE
            # ndarray.shape is (height, width, channels).
            original_height, original_width = car_image.shape[:2]

            car_image = cv2.resize(car_image, IMAGE_SIZE_WHOLE, interpolation = cv2.INTER_AREA)

            scale_width = new_width / original_width
            scale_height = new_height / original_height

            x0, y0, x1, y1 = col['bbox']
            scaled_box = [
                float(x0) * scale_width,
                float(y0) * scale_height,
                float(x1) * scale_width,
                float(y1) * scale_height,
            ]

            # The mask is indexed [row, column], hence (height, width).
            objectness = generate_objectness_from_ground_truth((new_height, new_width), scaled_box)

            car_image = self.transform(car_image)
            return (car_image, scaled_box, objectness)

        # OCR mode: rectify the plate, then resize it to the configured size.
        plate = four_point_transform(car_image, col['box'])
        resized = cv2.resize(plate, self.image_size, interpolation = cv2.INTER_AREA)
        plate_tensor = self.transform(resized)
        label_indices = label_to_indices(col['text'])
        return plate_tensor, label_indices

    def collate_fn(self, batch):
        """Collate OCR samples: stack the images and right-pad the labels with 0."""
        images, labels = zip(*batch)

        # Pad labels using pad_sequence
        padded_labels = pad_sequence([torch.LongTensor(item) for item in labels], batch_first=True, padding_value=0)

        # Convert padded_labels to a LongTensor
        padded_labels = padded_labels.type(torch.LongTensor)

        # Images must all have the same shape to be stacked.
        images_tensor = torch.stack(images)

        return images_tensor, padded_labels

    def custom_collate_detect_fn(self, batch):
        """Collate detection samples into (images, float32 boxes, objectness masks)."""
        car_images, bboxes, objectness = zip(*batch)
        car_images = torch.stack(car_images)
        objectness = torch.stack(objectness)
        # Pin float32 so the boxes match the dtype of the model output in the loss.
        bboxes = torch.tensor(bboxes, dtype=torch.float32)
        return car_images, bboxes, objectness

In [None]:
# Preview the training split.
df_train.head()

## Car Detection


In [None]:
# Preprocessing for detection images: tensor conversion, a 3x3 Gaussian blur and
# normalisation with the mean/std given below. The same pipeline is used for both splits.
transform_detect = transforms.Compose([
    transforms.ToTensor(),
#     transforms.RandomRotation(degrees=(-20, 20)),
    transforms.GaussianBlur(kernel_size=3),
# #     transforms.RandomAffine(degrees=5, shear=5, center=IMAGE_SIZE),
    transforms.Normalize(mean=[0.485, 0.456, 0.406],
                         std=[0.229, 0.224, 0.225])
])
# Detection samples are resized to IMAGE_SIZE_WHOLE, so pass that size (not the OCR crop
# size) and name the mode flag explicitly.
train_detect_ds = CarDataset(df_train, IMAGE_SIZE_WHOLE, transform_detect, for_detection=True)
val_detect_ds = CarDataset(df_val, IMAGE_SIZE_WHOLE, transform_detect, for_detection=True)

In [None]:
# Shuffled training batches for detection; the dataset's own collate function stacks
# images, boxes and objectness masks.
data_loader_detect = DataLoader(train_detect_ds, batch_size=BATCH_SIZE, shuffle=True, collate_fn=train_detect_ds.custom_collate_detect_fn)

In [None]:
class CNN(nn.Module):
    """Convolutional plate detector with a box head and an objectness head.

    The backbone is five Conv-BatchNorm-ReLU-MaxPool stages (3 -> 64 -> 128 ->
    256 -> 512 -> 1024 channels); every stage halves the spatial resolution.

    Parameters:
    - num_anchors: number of anchor boxes per spatial position.
    """

    def __init__(self, num_anchors):
        super().__init__()

        # CNN for feature extraction
        widths = [3, 64, 128, 256, 512, 1024]
        stages = []
        for in_channels, out_channels in zip(widths[:-1], widths[1:]):
            stages += [
                nn.Conv2d(in_channels, out_channels, kernel_size=3, stride=1, padding=1),
                nn.BatchNorm2d(out_channels),
                nn.ReLU(inplace=True),
                nn.MaxPool2d(kernel_size=2, stride=2),
            ]
        self.cnn = nn.Sequential(*stages)

        # Box regression head: 4 values per anchor at every feature-map position
        self.bbox_head = nn.Conv2d(1024, num_anchors * 4, kernel_size=3, padding=1)
        # Objectness prediction head: reduced to a single channel per position
        self.objectness_head = nn.Sequential(
            nn.Conv2d(1024, num_anchors * 1, kernel_size=3, padding=1),
            nn.Conv2d(num_anchors * 1, 1, kernel_size=1)
        )

    def forward(self, x):
        """Return (bbox_predictions, objectness_predictions) for a batch of images."""
        features = self.cnn(x)
        return self.bbox_head(features), self.objectness_head(features)

In [None]:
num_anchors = 5  # Number of anchor boxes per spatial position
# Build the detector: its box head emits num_anchors * 4 channels, its objectness head one channel.
model_d = CNN(num_anchors)

In [None]:
# Layer-by-layer summary of the detector for one 220x220 RGB image (the IMAGE_SIZE_WHOLE size).
summary(model=model_d,
        input_size=(1, 3, 220, 220),
        col_names=["input_size", "output_size", "num_params", "trainable"],
        col_width=20,
        row_settings=["var_names"])

In [None]:
def generate_objectness(predicted_boxes, ground_truth_boxes, iou_threshold=0.5):
    """
    Generate objectness values based on IoU between predicted boxes and ground truth boxes.

    Boxes are compared pairwise: predicted_boxes[i] against ground_truth_boxes[i].

    Parameters:
    - predicted_boxes (torch.Tensor): Tensor of predicted bounding boxes in the format [x0, y0, x1, y1].
    - ground_truth_boxes (torch.Tensor): Tensor of ground truth bounding boxes in the format [x0, y0, x1, y1].
    - iou_threshold (float): IoU threshold to determine positive matches.

    Returns:
    - torch.Tensor: Objectness values (1 for positive matches, 0 for negative matches),
      allocated on the same device as `predicted_boxes`.
    """
    batch_size = predicted_boxes.size(0)
    # Allocate on the input's device so the result can be used next to it without a copy.
    objectness = torch.zeros(batch_size, dtype=torch.int, device=predicted_boxes.device)

    for i in range(batch_size):
        # Calculate IoU
        iou = calculate_iou(predicted_boxes[i], ground_truth_boxes[i])

        # Assign objectness value based on IoU threshold
        objectness[i] = 1 if iou > iou_threshold else 0

    return objectness

def calculate_iou(box1, box2):
    """
    Calculate Intersection over Union (IoU) between two bounding boxes.

    Parameters:
    - box1 (torch.Tensor): Coordinates of the first bounding box [x0, y0, x1, y1].
    - box2 (torch.Tensor): Coordinates of the second bounding box [x0, y0, x1, y1].

    Returns:
    - torch.Tensor: IoU value.
    """
    # Corners of the overlap rectangle.
    left = torch.max(box1[0], box2[0])
    top = torch.max(box1[1], box2[1])
    right = torch.min(box1[2], box2[2])
    bottom = torch.min(box1[3], box2[3])

    # A negative extent means the boxes do not overlap along that axis.
    overlap_width = (right - left).clamp(min=0)
    overlap_height = (bottom - top).clamp(min=0)
    overlap_area = overlap_width * overlap_height

    area1 = (box1[2] - box1[0]) * (box1[3] - box1[1])
    area2 = (box2[2] - box2[0]) * (box2[3] - box2[1])
    union_area = area1 + area2 - overlap_area

    # Small epsilon avoids division by zero for degenerate boxes.
    return overlap_area / (union_area + 1e-7)

In [None]:
criterion_bbox = nn.SmoothL1Loss()
criterion_objectness = nn.BCEWithLogitsLoss()
# The shared backbone is optimised together with the box head; otherwise the feature
# extractor would keep its random initial weights, because no optimizer owned it.
optimizer_bbox = torch.optim.Adam(
    list(model_d.cnn.parameters()) + list(model_d.bbox_head.parameters()),
    lr=0.0001, weight_decay=1e-4)
optimizer_objectness = torch.optim.Adam(model_d.objectness_head.parameters(), lr=0.0001, weight_decay=1e-4)
num_epochs = 20
# Number of non-improving epochs tolerated by the training loop before it stops early.
patience = 5

# Training loop
# (train_detect_model switches the model to train mode itself; the OCR `model` does not
# exist yet at this point in the notebook, so it must not be referenced here.)
def train_detect_model(model, train_loader, criterion_box, criterion_objectness, optimizer_box, optimizer_objectness, num_epochs=20):
    """Train the detection model and checkpoint the best epoch.

    Parameters:
    - model: network returning (bbox_predictions, objectness_logits).
    - train_loader: yields (images, boxes, objectness_masks) batches.
    - criterion_box: loss for the box regression.
    - criterion_objectness: loss applied to the raw objectness logits
      (e.g. nn.BCEWithLogitsLoss, which applies the sigmoid itself).
    - optimizer_box / optimizer_objectness: optimizers stepped after every batch.
    - num_epochs: maximum number of epochs.

    Uses the module-level DEVICE and `patience`. The best model (lowest mean
    combined loss per batch) is written to 'detection_model.pth'; training stops
    once more than `patience` consecutive epochs fail to improve.
    """
    model.to(DEVICE)
    best_loss = float('inf')
    curr_patience = 0

    for epoch in range(num_epochs):
        model.train()
        loop = tqdm(train_loader, leave=True, desc=f'Epoch {epoch + 1}/{num_epochs}')

        total_box_loss = 0.0
        total_objectness_loss = 0.0

        for batch_idx, (images, labels, objectness) in enumerate(loop):
            images = images.to(DEVICE)
            labels = labels.to(DEVICE).float()
            objectness = objectness.to(DEVICE)

            batch_size = images.size(0)

            optimizer_box.zero_grad()
            optimizer_objectness.zero_grad()

            # Forward pass
            outputs_box, outputs_objectness = model(images)

            # (N, A*4, H, W) -> (N, H*W*A, 4): move channels last so that each group of
            # four values comes from a single spatial position.
            outputs_box = outputs_box.permute(0, 2, 3, 1).reshape(batch_size, -1, 4)
            box_targets = labels.unsqueeze(1).expand_as(outputs_box)

            # Reduce the full-resolution mask to the head's resolution: a cell is
            # positive when any pixel it covers lies inside the ground-truth box.
            objectness_targets = nn.functional.adaptive_max_pool2d(
                objectness.unsqueeze(1).float(), tuple(outputs_objectness.shape[-2:]))

            box_loss = criterion_box(outputs_box, box_targets)
            objectness_loss = criterion_objectness(outputs_objectness, objectness_targets)

            # Both heads share the backbone graph, so back-propagate the sum once.
            loss = box_loss + objectness_loss
            loss.backward()

            # Update weights
            optimizer_box.step()
            optimizer_objectness.step()

            # Update metrics
            total_box_loss += box_loss.item()
            total_objectness_loss += objectness_loss.item()

            # Running means over the batches seen so far
            loop.set_postfix(loss_bbox=total_box_loss / (batch_idx + 1),
                             loss_objectness=total_objectness_loss / (batch_idx + 1))

        epoch_loss = (total_box_loss + total_objectness_loss) / max(len(train_loader), 1)
        print(f'Epoch {epoch + 1}/{num_epochs}, Loss: {epoch_loss}')

        # Save model checkpoint only if it's the best model so far
        if epoch_loss < best_loss:
            print("Saving")
            curr_patience = 0
            best_loss = epoch_loss
            checkpoint = {
                'epoch': epoch + 1,
                'model_state_dict': model.state_dict(),
                'optimizer_box_state_dict': optimizer_box.state_dict(),
                'optimizer_objectness_state_dict': optimizer_objectness.state_dict(),
                'loss': best_loss,
            }
            torch.save(checkpoint, 'detection_model.pth')
        else:
            curr_patience += 1

        if curr_patience > patience:
            print('Stopping due to patience')
            break

In [None]:
# Train the detector with the default number of epochs (num_epochs is not passed).
train_detect_model(model_d, data_loader_detect, criterion_bbox, criterion_objectness, optimizer_bbox, optimizer_objectness)

## OCR Plate

In [None]:
# Record every row whose plate text contains a character outside the OCR alphabet.
# The lists are cleared first so that re-running this cell does not accumulate duplicates.
to_remove['train'].clear()
to_remove['val'].clear()

# Only the 'text' column is needed, so iterate over it directly rather than building a
# Series per row with iterrows().
for idx, label in df_train['text'].items():
    label_to_indices(label, idx)

for idx, label in df_val['text'].items():
    label_to_indices(label, idx, False)

In [None]:
# Rebind instead of dropping in place: the cell can then be re-run safely, and frames
# already handed to the detection datasets are left untouched.
df_train = df_train.drop(index=to_remove['train'], errors='ignore')
df_val = df_val.drop(index=to_remove['val'], errors='ignore')

Drop index 26805, as it causes an issue when converting it into an image.

In [None]:
# Row 26805 may sit in either split (or already be gone), so drop it from both and
# ignore a missing label instead of raising KeyError.
BAD_ROW_INDEX = 26805
df_train = df_train.drop(index=BAD_ROW_INDEX, errors='ignore')
df_val = df_val.drop(index=BAD_ROW_INDEX, errors='ignore')

In [None]:
# Preprocessing for cropped plates: tensor conversion followed by normalisation with the
# mean/std below. The augmentations are currently disabled (commented out).
transform = transforms.Compose([
    transforms.ToTensor(),
#     transforms.Resize(IMAGE_SIZE),
#     transforms.RandomRotation(degrees=(-20, 20)),
#     transforms.GaussianBlur(kernel_size=3),
# #     transforms.RandomAffine(degrees=5, shear=5, center=IMAGE_SIZE),
    transforms.Normalize(mean=[0.485, 0.456, 0.406], 
                         std=[0.229, 0.224, 0.225])
])
# OCR datasets (for_detection keeps its default of False); both splits share the transform.
train_ds = CarDataset(df_train, IMAGE_SIZE, transform)
val_ds = CarDataset(df_val, IMAGE_SIZE, transform)

In [None]:
# unique_characters = set()

# for _, label in train_ds:
#     unique_characters.update(set(label))

# # Convert the set back to a list if needed
# unique_characters_list = list(unique_characters)

In [None]:
img, label = train_ds[2]
# Undo the Normalize step of `transform` so the plate is displayed with its real colours
# (imshow clips float values outside [0, 1]).
norm_mean = torch.tensor([0.485, 0.456, 0.406]).view(3, 1, 1)
norm_std = torch.tensor([0.229, 0.224, 0.225]).view(3, 1, 1)
img_display = (img * norm_std + norm_mean).clamp(0, 1)
plt.imshow(img_display.permute(1, 2, 0).numpy())
plt.xlabel(indices_to_label(label))
plt.show()

In [None]:
# Shuffled OCR training batches; the dataset's collate function pads the labels with 0.
data_loader = DataLoader(train_ds, batch_size=BATCH_SIZE, shuffle=True, collate_fn=train_ds.collate_fn)

In [None]:
# Pull a single batch for inspection. Note: this rebinds `gg`, which earlier held a PIL image.
gg = next(iter(data_loader))

In [None]:
# Shape of the padded label tensor of that batch.
gg[1].shape

# MODEL

In [None]:
# Number of characters currently in the vocabulary (label_to_indices can add to it).
len(char_to_index)

In [None]:
# Size of the reverse mapping, shown for comparison with len(char_to_index).
len(index_to_char)

In [None]:
class CRNN(nn.Module):
    """CNN + bidirectional LSTM that maps a plate image to per-column class scores.

    Parameters:
    - num_classes: number of output classes per time step.
    - cnn_out_channels: channels produced by the last convolution.
    - rnn_hidden_size: currently unused; the LSTM hidden size is fixed at 512.
      NOTE(review): honouring it would change the default architecture
      (default 256 vs. the 512 in use), so it is only kept for interface
      compatibility.
    - input_height: height in pixels of the input images; used to size the LSTM input.

    The two 2x2 max-pools halve height and width twice, so a (3, H, W) image
    yields W // 4 time steps, each with cnn_out_channels * (H // 4) features.
    """

    def __init__(self, num_classes, cnn_out_channels=512, rnn_hidden_size=256, input_height=50):
        super(CRNN, self).__init__()

        # CNN for feature extraction
        self.cnn = nn.Sequential(
            nn.Conv2d(3, 64, kernel_size=3, stride=1, padding=1),
            nn.ReLU(inplace=True),
            nn.MaxPool2d(kernel_size=2, stride=2),
            nn.Conv2d(64, 128, kernel_size=3, stride=1, padding=1),
            nn.ReLU(inplace=True),
            nn.MaxPool2d(kernel_size=2, stride=2),
            nn.Conv2d(128, 256, kernel_size=3, stride=1, padding=1),
            nn.ReLU(inplace=True),
#             nn.MaxPool2d(kernel_size=(2, 1), stride=(2, 1)),
            nn.Conv2d(256, 512, kernel_size=3, stride=1, padding=1),
            nn.ReLU(inplace=True),
#             nn.MaxPool2d(kernel_size=(2, 1), stride=(2, 1)),

            nn.Conv2d(512, cnn_out_channels, kernel_size=3, stride=1, padding=1),
            nn.ReLU(inplace=True),
#             nn.MaxPool2d(kernel_size=(2, 1), stride=(2, 1))
        )

        # Height of the feature map after the two max-pools (50 -> 25 -> 12 by default).
        feature_height = input_height // 2 // 2

        # RNN for sequence processing; with the defaults the input size is 512 * 12 = 6144.
        self.rnn = nn.LSTM(cnn_out_channels * feature_height, 512, bidirectional=True, batch_first=True, num_layers=1)
#         self.rnn2 = nn.LSTM(1024, 512, bidirectional=True, batch_first=True, num_layers=1)

        # Fully connected layers for classification (512 * 2: both LSTM directions)
        self.fc1 = nn.Linear(512 * 2, 1024)
        self.fc2 = nn.Linear(1024, num_classes)
        self.dropout = nn.Dropout(0.2)

    def forward(self, x):
        """Return raw class scores of shape (batch, width // 4, num_classes)."""
        # CNN
        x = self.cnn(x)

        # Reshape for RNN: every feature-map column becomes one time step
        batch_size, channels, height, width = x.size()
        x = x.permute(0, 3, 1, 2).contiguous()
        x = x.view(batch_size, width, channels * height)
        # RNN
        x, _ = self.rnn(x)
#         x, _ = self.rnn2(x)

        # Fully connected layers
        x = self.fc1(x)
        x = self.dropout(x)
        x = self.fc2(x)

        return x

In [None]:
# One class per vocabulary character plus one for index 0, which the CTC loss uses as blank.
# NOTE(review): label_to_indices can grow char_to_index later; the class count is fixed here.
model = CRNN(len(char_to_index) + 1)

In [None]:
# Layer-by-layer summary of the OCR model for one 50x120 (height x width) RGB plate image.
summary(model=model,
        input_size=(1, 3, 50, 120),
        col_names=["input_size", "output_size", "num_params", "trainable"],
        col_width=20,
        row_settings=["var_names"])

In [None]:
# File the OCR training loop saves its best checkpoint to; the load cell reads it back.
checkpoint_path = "crnn_checkpoint.pth"

In [None]:
# CTC loss with index 0 as the blank symbol (character indices start at 1).
criterion = nn.CTCLoss(blank=0, zero_infinity=False)
optimizer = torch.optim.Adam(model.parameters(), lr=0.0001, weight_decay=1e-4)
# A first assignment `num_epochs = 10` used to sit here and was overwritten before use.
num_epochs = 20
# Number of non-improving epochs tolerated by train_model before it stops early.
patience = 5

# Training loop
model.train()
def train_model(model, train_loader, criterion, optimizer, num_epochs=20):
    """Train the OCR model with a CTC criterion and checkpoint the best epoch.

    Parameters:
    - model: network returning scores of shape (batch, time, classes).
    - train_loader: yields (images, zero-padded label tensor) batches.
    - criterion: CTC loss taking (log_probs, targets, input_lengths, target_lengths).
    - optimizer: optimizer stepped after every batch.
    - num_epochs: maximum number of epochs.

    Uses the module-level DEVICE, `patience` and `checkpoint_path`. The epoch
    with the lowest mean batch loss is saved to `checkpoint_path`; training
    stops once more than `patience` consecutive epochs fail to improve.
    """
    model.train()
    model.to(DEVICE)
    best_loss = float('inf')
    curr_patience = 0

    for epoch in range(num_epochs):
        model.train()
        progress = tqdm(train_loader, leave=True, desc=f'Epoch {epoch + 1}/{num_epochs}')
        running_loss = 0

        for step, (images, labels) in enumerate(progress, start=1):
            images = images.to(DEVICE)
            labels = labels.to(DEVICE)

            optimizer.zero_grad()

            # Forward pass: (N, T, C) scores -> (T, N, C) log-probabilities for CTC
            outputs = model(images)
            log_probs = nn.functional.log_softmax(outputs, dim=2).permute(1, 0, 2)

            # Every sample uses all T time steps; a label's length is its count of
            # non-zero entries because 0 is the padding value.
            input_lengths = torch.full((outputs.size(0),), outputs.size(1), dtype=torch.long)
            target_lengths = torch.sum(labels != 0, dim=1)

            loss = criterion(log_probs, labels.long(), input_lengths, target_lengths)

            # Backward pass and weight update
            loss.backward()
            optimizer.step()

            running_loss += loss.item()
            progress.set_postfix(loss=running_loss / step)

        epoch_loss = running_loss / len(train_loader)

        # Keep the checkpoint only when this epoch is the best so far
        if epoch_loss < best_loss:
            print("Saving")
            curr_patience = 0
            best_loss = epoch_loss
            torch.save({
                'epoch': epoch + 1,
                'model_state_dict': model.state_dict(),
                'optimizer_state_dict': optimizer.state_dict(),
                'loss': best_loss,
            }, checkpoint_path)
        else:
            curr_patience += 1

        if curr_patience > patience:
            print('Stopping due to patience')
            break

        print(f'Epoch {epoch + 1}/{num_epochs}, Loss: {epoch_loss}')

In [None]:
# Train the OCR model with the default number of epochs (num_epochs is not passed).
train_model(model, data_loader, criterion, optimizer)

# Load Checkpoint

In [None]:
import keras

In [None]:
# map_location lets a checkpoint saved on a GPU be restored on a CPU-only machine as well.
checkpoint = torch.load(checkpoint_path, map_location=DEVICE)
model.load_state_dict(checkpoint['model_state_dict'])

In [None]:
# Take validation sample 54 (image and label indices) and move the image to DEVICE.
img1, lbl1 = val_ds[54]
img1 = img1.to(DEVICE)

In [None]:
# Keep the model on the same device as the input, even when the training cell was skipped
# and only the checkpoint was loaded.
model.to(DEVICE)
model.eval()
with torch.no_grad():
    outputt = model(img1.unsqueeze(0)).cpu()

# Most likely class per time step.
best_path = outputt.argmax(2)
print(best_path)
# Raw best path: blanks (0) removed, repeated frames kept.
text_out = ''.join([index_to_char[int(x)] for x in best_path[0] if int(x) != 0])

# Greedy CTC decoding: merge consecutive repeats first, then drop the blank.
# (The model emits raw scores, not the softmax probabilities Keras' ctc_decode expects,
# so the decoding is done directly on the best path.)
collapsed = torch.unique_consecutive(best_path[0])
decoded = ''.join([index_to_char[int(x)] for x in collapsed if int(x) != 0])

print("Actual: ", indices_to_label(lbl1))
print("Predicted: ", text_out)
print("Predicted ctc decode: ", decoded)

### Extract Data from car-plate-detection dataset

In [None]:
def parse_xml(xml_path):
    """Read one Pascal-VOC style annotation file.

    Parameters:
    - xml_path: path (or file object) accepted by ET.parse.

    Returns:
    - tuple: (filename, bboxes), where filename is the text of the <filename>
      element and bboxes holds one {'xmin', 'ymin', 'xmax', 'ymax'} dict of
      ints per <object> element.
    """
    root = ET.parse(xml_path).getroot()
    filename = root.find('filename').text

    # Extract bounding box information
    coord_tags = ('xmin', 'ymin', 'xmax', 'ymax')
    bboxes = [
        {tag: int(obj.find(f'bndbox/{tag}').text) for tag in coord_tags}
        for obj in root.findall('object')
    ]

    return filename, bboxes
data = {'file_name': [], 'bbox': []}
# os.listdir returns bare file names, so each one is joined with the annotation directory
# before parsing. Sorting keeps the row order stable between runs.
for file in sorted(os.listdir(BASE_DIR_CPD_ANNO)):
    if not file.endswith('.xml'):
        continue
    filename, bboxes = parse_xml(os.path.join(BASE_DIR_CPD_ANNO, file))

    # One row per annotated box
    for bbox in bboxes:
        data['file_name'].append(filename)
        data['bbox'].append([bbox['xmin'], bbox['ymin'], bbox['xmax'], bbox['ymax']])

# Will be used for testing as this is just for detection not reading
df_test = pd.DataFrame(data) # Pascal voc format for bbox (Xmin, Ymin, Xmax, Ymax)