In [None]:
# %% [markdown]
# ## Imports
import os
import pandas as pd
import numpy as np
import torch
import torch.nn as nn
import torchvision.transforms as transforms
from torchvision.datasets import ImageFolder
from sklearn.model_selection import train_test_split, KFold
from torch.utils.data import DataLoader, Dataset, Subset
from transformers import ViTForImageClassification
from PIL import Image
import matplotlib.pyplot as plt
from sklearn.metrics import classification_report, confusion_matrix, ConfusionMatrixDisplay
from VIT_Model import VIT
# %% [markdown]
# ## Load Data
# Print the installed PyTorch version so it is recorded alongside the run's output.
print(torch.__version__)

def load_dir_into_df(directory='/home/hdd/OTABEK/VIT_OCR/dataset'):
    """Index an image-folder dataset laid out as ``directory/<class>/<image>``.

    Class sub-directories are visited in sorted order and numbered from 0, so
    the name -> index assignment is stable between runs.

    Args:
        directory: Root folder whose immediate sub-directories are the classes.

    Returns:
        A tuple ``(df, class_dict, all_labels)``:
        ``df`` is a DataFrame with columns ``'img'`` (file path) and
        ``'label'`` (integer class index); ``class_dict`` maps class name to
        index; ``all_labels`` lists the class names in index order.

    Raises:
        OSError: If ``directory`` (or a class folder) cannot be listed.
    """
    im_paths_list = []
    im_labels_list = []
    class_dict = {}
    all_labels = []

    # Only real, non-hidden sub-directories count as classes. A stray file in
    # the root would crash the inner listdir, and notebook artefacts such as
    # ``.ipynb_checkpoints`` would otherwise be assigned a label index.
    class_names = [
        name for name in sorted(os.listdir(directory))
        if not name.startswith('.') and os.path.isdir(os.path.join(directory, name))
    ]

    for class_idx, class_name in enumerate(class_names):
        class_dir = os.path.join(directory, class_name)
        class_dict[class_name] = class_idx
        all_labels.append(class_name)
        for fname in sorted(os.listdir(class_dir)):
            impath = os.path.join(class_dir, fname)
            # Skip hidden entries and nested folders: they are not images and
            # would fail later when the dataset tries to open them.
            if fname.startswith('.') or not os.path.isfile(impath):
                continue
            im_paths_list.append(impath)
            im_labels_list.append(class_idx)

    df = pd.DataFrame({'img': im_paths_list, 'label': im_labels_list})
    return df, class_dict, all_labels

# Index the dataset at the default directory: `data_df` has one row per image
# ('img' path, integer 'label'), `class_dict` maps class name -> index, and
# `all_labels` lists the class names in index order.
data_df, class_dict, all_labels = load_dir_into_df()

# %% [markdown]
# ## Custom Dataset Class
class CustomDataset(Dataset):
    """Map-style dataset pairing image file paths with their labels.

    Images are read from disk on access, converted to RGB and, when a
    transform was supplied, passed through it before being returned.
    """

    def __init__(self, image_paths, labels, transform=None):
        # Parallel sequences: labels[i] is returned together with image_paths[i].
        self.image_paths, self.labels = image_paths, labels
        self.transform = transform

    def __len__(self):
        # One sample per stored path.
        return len(self.image_paths)

    def __getitem__(self, idx):
        """Return the ``(image, label)`` pair stored at position ``idx``."""
        path, target = self.image_paths[idx], self.labels[idx]

        # Load image
        rgb = Image.open(path).convert('RGB')
        if not self.transform:
            return rgb, target
        return self.transform(rgb), target

# %% [markdown]
# ## Data Preprocessing
# Training-time preprocessing for character images.
# Horizontal flipping is deliberately NOT part of the augmentation: a mirrored
# glyph is not a valid sample of its class (it is never seen at inference time
# and can resemble a different character), so it only injects label noise.
# A small random rotation is kept to model slightly tilted characters.
transform = transforms.Compose([
    transforms.RandomRotation(10),
    transforms.Resize((224, 224)),
    transforms.ToTensor(),
    # Scale each channel from [0, 1] to [-1, 1].
    transforms.Normalize(mean=[0.5, 0.5, 0.5], std=[0.5, 0.5, 0.5]),
])

# Create dataset
dataset = CustomDataset(data_df['img'].values, data_df['label'].values, transform)

# %% [markdown]
# ## K-Fold Cross Validation
num_epochs = 20
k_folds = 5
SEED = 42  # fixed so fold membership and RNG-driven steps are repeatable across runs

torch.manual_seed(SEED)

# Create KFold object. `shuffle=True` without `random_state` produces different
# folds on every run, which makes per-fold numbers impossible to compare.
kfold = KFold(n_splits=k_folds, shuffle=True, random_state=SEED)

# Validation must be measured on un-augmented images. `dataset` applies random
# augmentation, so a second view over the same paths/labels (same indexing) is
# created with deterministic preprocessing only.
eval_transform = transforms.Compose([
    transforms.Resize((224, 224)),
    transforms.ToTensor(),
    transforms.Normalize(mean=[0.5, 0.5, 0.5], std=[0.5, 0.5, 0.5]),
])
eval_dataset = CustomDataset(dataset.image_paths, dataset.labels, eval_transform)

# The device does not depend on the fold, so pick it once.
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')


def _train_one_epoch(model, loader, criterion, optimizer, device):
    """Run one optimisation pass over ``loader`` and return the mean batch loss."""
    model.train()
    running_loss = 0.0
    for images, labels in loader:
        images, labels = images.to(device), labels.to(device)

        optimizer.zero_grad()
        outputs = model(images).logits
        loss = criterion(outputs, labels)
        loss.backward()
        optimizer.step()

        running_loss += loss.item()
    return running_loss / len(loader)


def _evaluate(model, loader, criterion, device):
    """Score ``model`` on ``loader`` without gradient tracking.

    Returns:
        ``(mean_batch_loss, y_true, y_pred)`` where the two lists hold the
        target and predicted class index of every sample, in loader order.
    """
    model.eval()
    running_loss = 0.0
    y_true, y_pred = [], []
    with torch.no_grad():
        for images, labels in loader:
            images, labels = images.to(device), labels.to(device)
            outputs = model(images).logits
            running_loss += criterion(outputs, labels).item()
            predicted = outputs.argmax(dim=1)
            y_true.extend(labels.cpu().numpy())
            y_pred.extend(predicted.cpu().numpy())
    return running_loss / len(loader), y_true, y_pred


# Store results (pooled over all folds)
all_y_true = []
all_y_pred = []

for fold, (train_idx, val_idx) in enumerate(kfold.split(np.arange(len(dataset)))):
    print(f"Fold {fold + 1}/{k_folds}")

    # Sample elements for the current fold: augmented view for training,
    # deterministic view for validation.
    train_subset = Subset(dataset, train_idx)
    val_subset = Subset(eval_dataset, val_idx)

    # Create data loaders
    train_loader = DataLoader(train_subset, batch_size=32, shuffle=True)
    val_loader = DataLoader(val_subset, batch_size=32, shuffle=False)

    # Load a fresh pretrained Vision Transformer for every fold so no weights
    # leak from one fold's training into the next.
    model = ViTForImageClassification.from_pretrained(
        'google/vit-base-patch16-224',
        num_labels=len(all_labels),
        ignore_mismatched_sizes=True
    )
    model.to(device)

    # Define Loss and Optimizer
    criterion = nn.CrossEntropyLoss()
    optimizer = torch.optim.Adam(model.parameters(), lr=3e-5)

    # Per-epoch loss history of the current fold.
    train_losses = []
    val_losses = []

    for epoch in range(num_epochs):
        avg_train_loss = _train_one_epoch(model, train_loader, criterion, optimizer, device)
        train_losses.append(avg_train_loss)

        # Validation Phase
        avg_val_loss, _, _ = _evaluate(model, val_loader, criterion, device)
        val_losses.append(avg_val_loss)

        print(f"Epoch [{epoch + 1}/{num_epochs}], Train Loss: {avg_train_loss:.4f}, Val Loss: {avg_val_loss:.4f}")

    # Evaluate the Model on this fold's held-out samples. Kept as its own pass
    # so predictions exist even when `num_epochs` is 0.
    _, y_true, y_pred = _evaluate(model, val_loader, criterion, device)
    total = len(y_true)
    correct = int((np.asarray(y_true) == np.asarray(y_pred)).sum())

    # Print accuracy for this fold
    accuracy = 100 * correct / total
    print(f'Accuracy of the model on the validation set for fold {fold + 1}: {accuracy:.2f}%')

    # Store results for classification report and confusion matrix
    all_y_true.extend(y_true)
    all_y_pred.extend(y_pred)

# %% [markdown]
# ## Classification Report
# Pass the complete label set and its names explicitly: rows are then shown by
# class name instead of anonymous indices, and a class that never occurs in the
# pooled fold results still gets a row instead of silently disappearing.
report = classification_report(
    all_y_true,
    all_y_pred,
    labels=np.arange(len(all_labels)),
    target_names=all_labels,
)
print(report)

# %% [markdown]
# ## Confusion Matrix Visualization
# Fix the label set to every known class. Without `labels=`, the matrix only
# covers classes that actually occur in the data, so its size can differ from
# `all_labels` and the display below would fail on the tick-label mismatch.
confusion_mat = confusion_matrix(
    all_y_true, all_y_pred, labels=np.arange(len(all_labels))
)
cm_display = ConfusionMatrixDisplay(confusion_matrix=confusion_mat, display_labels=all_labels)

# Use an explicit, larger figure: the default size is unreadable once there are
# dozens of classes on each axis.
fig, ax = plt.subplots(figsize=(12, 12))
cm_display.plot(ax=ax)
ax.set_title('Confusion Matrix Across K-Folds')
plt.show()
# NOTE: `model` and `optimizer` are the objects left behind by the LAST
# cross-validation fold, i.e. a model trained on (k-1)/k of the data.
# The label mapping is stored next to the weights so that inference code can
# rebuild the classifier head and decode predictions without hard-coding the
# class list. The extra keys are additive; existing readers are unaffected.
torch.save({
    'model_state_dict': model.state_dict(),
    'optimizer_state_dict': optimizer.state_dict(),
    'epoch': num_epochs,
    'class_names': list(all_labels),   # index -> class name
    'class_dict': dict(class_dict),    # class name -> index
}, 'final_model.pth')

print('Final model saved successfully.')

In [None]:
import os
import cv2
import torch
import numpy as np
import matplotlib.pyplot as plt
from transformers import ViTForImageClassification
from torchvision import transforms
from PIL import Image
# Load the trained ViT model for character recognition
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

# `map_location` lets a checkpoint that was written on a GPU machine be loaded
# on a CPU-only one (and vice versa); without it torch tries to restore the
# tensors onto the device they were saved from.
# NOTE(review): torch.load unpickles the file - only load checkpoints you trust.
checkpoint = torch.load('final_model.pth', map_location=device)

# Default index -> character mapping (digits then upper-case letters, 36 classes).
dictionary = {0: '0', 1: '1', 2: '2', 3: '3', 4: '4', 5: '5', 6: '6', 7: '7', 8: '8', 9: '9',
              10: 'A', 11: 'B', 12: 'C', 13: 'D', 14: 'E', 15: 'F', 16: 'G', 17: 'H', 18: 'I', 19: 'J',
              20: 'K', 21: 'L', 22: 'M', 23: 'N', 24: 'O', 25: 'P', 26: 'Q', 27: 'R', 28: 'S', 29: 'T',
              30: 'U', 31: 'V', 32: 'W', 33: 'X', 34: 'Y', 35: 'Z'}  # Update as needed

# If the checkpoint carries its own class names, trust those over the default
# above so the decoder always matches what the model was trained on.
checkpoint_class_names = checkpoint.get('class_names')
if checkpoint_class_names:
    dictionary = dict(enumerate(checkpoint_class_names))

characterRecognition = ViTForImageClassification.from_pretrained(
    'google/vit-base-patch16-224',
    num_labels=len(dictionary),  # must match the classifier head stored in the checkpoint
    ignore_mismatched_sizes=True
)
characterRecognition.load_state_dict(checkpoint['model_state_dict'])
characterRecognition.to(device)
characterRecognition.eval()


# Define preprocessing transform (deterministic: resize, tensor, scale to [-1, 1])
transform = transforms.Compose([
    transforms.Resize((224, 224)),
    transforms.ToTensor(),
    transforms.Normalize(mean=[0.5, 0.5, 0.5], std=[0.5, 0.5, 0.5]),
])

def cnnCharRecognition(img):
    """Classify one cropped character image and return its symbol.

    ``img`` is converted from BGR to RGB, run through the module-level
    ``transform``, and fed to ``characterRecognition`` as a batch of one. The
    index of the highest-scoring class is then looked up in ``dictionary``.
    """
    # OpenCV-style BGR array -> PIL RGB image -> normalised 1 x C x H x W tensor.
    rgb_pil = Image.fromarray(cv2.cvtColor(img, cv2.COLOR_BGR2RGB))
    batch = transform(rgb_pil).unsqueeze(0).to(device)

    # Inference only: no gradients needed.
    with torch.no_grad():
        logits = characterRecognition(batch).logits

    best_class = logits.argmax(dim=1).item()
    return dictionary[best_class]

def auto_canny(image, sigma=0.33):
    """Run Canny edge detection with thresholds derived from the image median.

    The lower and upper thresholds are ``(1 - sigma)`` and ``(1 + sigma)``
    times the median pixel value, clamped to ``[0, 255]`` and truncated to int.

    Returns whatever ``cv2.Canny`` produces for those thresholds.
    """
    median_val = np.median(image)
    low_thresh = int(max(0, (1.0 - sigma) * median_val))
    high_thresh = int(min(255, (1.0 + sigma) * median_val))
    return cv2.Canny(image, low_thresh, high_thresh)

def opencvReadPlate(img, min_area_ratio=0.015, max_area_ratio=0.09, show=True):
    """Segment a licence-plate image into characters and read them left to right.

    The plate is binarised with an inverted adaptive threshold, edges are
    extracted with ``auto_canny`` and external contours are taken as character
    candidates. A candidate is kept when its bounding box covers a plausible
    share of the plate and is taller than wide; each kept box is classified
    with ``cnnCharRecognition``.

    Args:
        img: BGR plate image (as returned by ``cv2.imread``). Accepted boxes
            are drawn onto this array in place.
        min_area_ratio: Smallest accepted bounding-box area as a fraction of
            the whole image area (inclusive).
        max_area_ratio: Upper bound for that fraction (exclusive).
        show: When true, display the annotated plate with matplotlib.

    Returns:
        The recognised characters joined into one string, ordered by the x
        position of their bounding boxes; ``""`` if the image is empty or no
        candidate passes the filters.
    """
    # An empty image has zero area, which would divide by zero below.
    if img is None or img.size == 0:
        return ""

    charList = []

    # Crops are taken from an untouched copy. Drawing the boxes on the same
    # array that is cropped later lets rectangle pixels of earlier boxes bleed
    # into neighbouring character crops and disturb the classifier.
    clean = img.copy()

    gray = cv2.cvtColor(clean, cv2.COLOR_BGR2GRAY)
    thresh_inv = cv2.adaptiveThreshold(gray, 255, cv2.ADAPTIVE_THRESH_MEAN_C, cv2.THRESH_BINARY_INV, 39, 1)
    edges = auto_canny(thresh_inv)

    # findContours returns (contours, hierarchy) in OpenCV 4.x but
    # (image, contours, hierarchy) in 3.x; [-2] is the contour list in both.
    ctrs = cv2.findContours(edges.copy(), cv2.RETR_EXTERNAL, cv2.CHAIN_APPROX_SIMPLE)[-2]

    # Left-to-right reading order.
    boxes = sorted((cv2.boundingRect(ctr) for ctr in ctrs), key=lambda box: box[0])
    img_area = img.shape[0] * img.shape[1]

    for x, y, w, h in boxes:
        area_ratio = (w * h) / img_area

        # Filter out non-character contours: wrong size relative to the plate...
        if not (min_area_ratio <= area_ratio < max_area_ratio):
            continue
        # ...or wrong shape (characters are taller than wide, at most 3:1).
        if not ((h > 1.2 * w) and (3 * w >= h)):
            continue

        # Recognize character
        char_img = clean[y:y + h, x:x + w]
        charList.append(cnnCharRecognition(char_img))
        cv2.rectangle(img, (x, y), (x + w, y + h), (90, 0, 255), 2)

    # Display the result with bounding boxes
    if show:
        plt.imshow(cv2.cvtColor(img, cv2.COLOR_BGR2RGB))
        plt.axis('off')
        plt.show()

    licensePlate = "".join(charList)
    return licensePlate

# Load a test image
test_image_path = "/home/hdd/OTABEK/VIT_OCR/plate1.PNG"  # Replace with your license plate image path
plate_img = cv2.imread(test_image_path)

# A None result is treated as "image could not be loaded"; otherwise the plate
# is segmented and its characters are recognised.
if plate_img is None:
    print("Error: Could not load the image.")
else:
    license_text = opencvReadPlate(plate_img)
    print("Recognized License Plate Text:", license_text)

