In [None]:
# ===============================
# IMPORTS
# ===============================
import pandas as pd
import matplotlib.pyplot as plt
import timeit
from collections import Counter
import csv
import string

import xgboost as xgb
from sklearn.metrics import classification_report, confusion_matrix, roc_curve, auc
from sklearn.preprocessing import label_binarize

import cv2
import numpy as np
import os
import imutils
from imutils.contours import sort_contours

import seaborn as sns
import time

from sklearn.model_selection import RandomizedSearchCV, StratifiedKFold
from scipy.stats import randint, uniform

from sklearn.preprocessing import LabelEncoder
from tqdm.auto import tqdm

In [None]:

# ===============================
# SETTINGS
# ===============================
from pandas import DataFrame


img_size = 32
max_train = 500
max_val = 100

max_images = (max_train + max_val)

train_dir = "dataset/train_v2/train"
test_dir = "dataset/test_v2/test"
val_dir   = "dataset/validation_v2/validation"

train_csv_path = "dataset/written_name_train_v2.csv"
test_csv_path = "dataset/written_name_test_v2.csv"
val_csv_path   = "dataset/written_name_validation_v2.csv"

# ===============================
# LOAD CSV LABELS
# ===============================
train_csv: DataFrame = pd.read_csv(train_csv_path)
test_csv: DataFrame = pd.read_csv(test_csv_path)
val_csv: DataFrame   = pd.read_csv(val_csv_path)

# Ensure labels are strings
train_csv["IDENTITY"] = train_csv["IDENTITY"].astype(str)
test_csv["IDENTITY"] = test_csv["IDENTITY"].astype(str)
val_csv["IDENTITY"]   = val_csv["IDENTITY"].astype(str)

train_file_to_label = dict(zip(train_csv['FILENAME'], train_csv['IDENTITY']))
test_file_to_label = dict(zip(test_csv['FILENAME'], test_csv['IDENTITY']))
val_file_to_label   = dict(zip(val_csv['FILENAME'], val_csv['IDENTITY']))

In [None]:
# ===============================
# DATASET EXPLORATION
# ===============================
print("\nTrain CSV\n",train_csv.head())
print("\nTest CSV\n",test_csv.head())
print("\nValidation CSV\n",val_csv.head())

# Counter for letters
letter_counts = Counter()

with open(train_csv_path, newline="", encoding="utf-8") as f:
    reader = csv.reader(f)
    for row in reader:
        # Join all columns into one string
        text = " ".join(row).lower()

        # Keep only alphabet letters
        for ch in text:
            if ch in string.ascii_lowercase:
                letter_counts[ch] += 1

# Print results sorted alphabetically
for letter in string.ascii_lowercase:
    print(f"{letter}: {letter_counts[letter]}")


In [None]:
# ===============================
# MANUAL LETTER SPLITTER
# ===============================

def get_bounding_boxes(img_path: str, show_plts = False):
    if not os.path.exists(img_path):
        print("❌ Error: File not found at", img_path)
        return [], None

    image = cv2.imread(img_path)
    if image is None:
        print("❌ Error: Could not read the image. Check file format or path.")
        return [], None
    
    if show_plts:
        print("✅ Loaded image shape:", image.shape)


        plt.figure(figsize=(8,6))
        plt.imshow(cv2.cvtColor(image, cv2.COLOR_BGR2RGB))
        plt.axis('off')
        plt.title("Original Image")
        plt.show()

    gray = cv2.cvtColor(image, cv2.COLOR_BGR2GRAY)

    # Threshold + dilation
    # Flip image to be black on white instead of white on black 
    # Fill holes in contours
    _, thresh1 = cv2.threshold(gray, 127, 255, cv2.THRESH_BINARY_INV)
    dilated = cv2.dilate(thresh1, kernel=filter_kernel, iterations=2)

    if show_plts:
        plt.figure(figsize=(8,6))
        plt.imshow(cv2.cvtColor(dilated, cv2.COLOR_BGR2RGB))
        plt.axis('off')
        plt.title("Dilated Image")
        plt.show()

    # Find contours
    cnts = cv2.findContours(dilated.copy(), cv2.RETR_EXTERNAL, cv2.CHAIN_APPROX_SIMPLE)
    cnts = imutils.grab_contours(cnts)

    if len(cnts) == 0:
        if show_plts:
            print("Detected 0 letters: ", img_path)
        return image, []

    cnts = sort_contours(cnts, method="left-to-right")[0]

    letters = []

    for i, c in enumerate(cnts):
        if cv2.contourArea(c) < 10:
            continue
        x, y, w, h = cv2.boundingRect(c)

        # add buffer area
        w += 2
        h += 2

        x -= 1
        y -= 1

        # Extract ROI
        roi = gray[y:y+h, x:x+w]

        # Safety check
        if roi.size == 0 or roi.shape[0] == 0 or roi.shape[1] == 0:
            if show_plts:
                print(f"⚠️ Skipping empty ROI at contour {i} -> x={x}, y={y}, w={w}, h={h}")
            continue
        if show_plts:
            print(f"➡️ Contour {i}: ROI shape = {roi.shape}")

        # Draw bounding box
        cv2.rectangle(image, (x, y), (x + w, y + h), (0, 255, 0), 2)

        # Preprocess ROI for CNN
        thresh = cv2.threshold(roi, 0, 255, cv2.THRESH_BINARY_INV | cv2.THRESH_OTSU)[1]
        thresh = cv2.resize(thresh, (img_size, img_size), interpolation=cv2.INTER_CUBIC)
        thresh = thresh.astype("float32") / 255.0
        # thresh = np.expand_dims(thresh, axis=-1)  # (32, 32, 1)
        # thresh = np.expand_dims(thresh, axis=0)   # (1, 32, 32, 1)
        # thresh = np.squeeze(thresh)               # ()

        letters.append(thresh)
    return image, letters

def plot_processed_images(images, img_size=32):
    """
    Plots a list of preprocessed images as subplots.

    Parameters:
        images: list or array of preprocessed images, shape can be (N, H, W, 1) or (N, H, W)
        img_size: size of the images (for reshaping if needed)
    """
    # Ensure images are 2D for plotting
    processed_images = []
    for im in images:
        if im.ndim == 4:  # (1, H, W, 1)
            processed_images.append(im[0, :, :, 0])
        elif im.ndim == 3 and im.shape[-1] == 1:  # (H, W, 1)
            processed_images.append(im[:, :, 0])
        else:
            processed_images.append(im)

    n = len(processed_images)
    cols = 6
    rows = (n // cols) + int(n % cols != 0)

    plt.figure(figsize=(3*cols, 3*rows))
    for i, roi in enumerate(processed_images):
        plt.subplot(rows, cols, i+1)
        plt.imshow(roi, cmap='gray')
        plt.axis('off')
    plt.suptitle("Preprocessed ROIs", fontsize=16)
    plt.tight_layout()
    plt.show()  

filter_kernel = cv2.getStructuringElement(cv2.MORPH_RECT, (2,2))

example_image = os.path.join(r"dataset\test_v2\test", "TEST_0001.jpg")
image, counturs = get_bounding_boxes(example_image, show_plts=True)

plt.figure(figsize=(8,6))
plt.imshow(cv2.cvtColor(image, cv2.COLOR_BGR2RGB)) # type: ignore
plt.axis('off')
plt.title("Letter bounding boxes")
plt.show()

plot_processed_images(images=counturs)


In [None]:
# ===============================
# BENCHMARK
# ===============================

example_image = os.path.join(r"dataset\test_v2\test", "TEST_0001.jpg")

def wrapper():
    get_bounding_boxes(example_image)
time = timeit.timeit(wrapper, number=50)
print("Time to run: ",time)
print("Time for all images: ",time * (max_train + max_val))


In [None]:
# ===============================
# LOGISTIC REGRESSION PREP
# ===============================
def logistic_image_prep(file_name,folder_path,file_label, debug=False):
    example_image = os.path.join(folder_path, file_name)
    image, counturs = get_bounding_boxes(example_image, show_plts=debug)

    if debug:
        plt.figure(figsize=(8,6))
        plt.imshow(cv2.cvtColor(image, cv2.COLOR_BGR2RGB)) # type: ignore
        plt.axis('off')
        plt.show()

        plot_processed_images(images=counturs)

    if debug:
        print(train_file_to_label[file_name])

    if len(file_label) != len(counturs): # type: ignore
        if debug:
            print("Mismatch in counturs and label")
        return 1
    return [[img, label] for img, label in zip(counturs, file_label)]  # type: ignore 
            # (letter[], letterLabel)



In [None]:

# ===============================
# CREATE TRAINING SET
# ===============================
print("Creating Training Sets")
errors = 0
images = []
for idx, file in train_csv.iterrows():
    file_label = file["IDENTITY"]
    file_name = file["FILENAME"]
    res = logistic_image_prep(file_name=file_name,folder_path=r"dataset\train_v2\train",file_label=file_label)
    if res != 1:
        images.extend(res)
    else:
        errors += 1

    if len(images) >= max_train:
        print("Rows Processed: ", idx)
        print("% of full dataset: ", round(idx / len(train_csv)*100,2)) # type: ignore
        break

print("Errors: ", errors)
print("Images:",len(images))

arr = np.array(images, dtype=object)   # shape (3, 2)


# Split into two arrays (columns)
x_train = np.stack(arr[:, 0], axis=0)    # type: ignore
y_train = arr[:, 1]   # second column

print(x_train.shape)


In [None]:
# ===============================
# CREATE VALIDATION SET
# ===============================
print("Creating validation Sets")
errors = 0
images = []
for idx, file in val_csv.iterrows():
    file_label = file["IDENTITY"]
    file_name = file["FILENAME"]
    res = logistic_image_prep(file_name=file_name,folder_path=r"dataset\validation_v2\validation",file_label=file_label)
    if res != 1:
        images.extend(res)
    else:
        errors += 1

    if len(images) >= max_val:
        print("Rows Processed: ", idx)
        print("% of full dataset: ", round(idx / len(test_csv)*100,2)) # type: ignore
        break

print("Errors: ", errors)
print("Images:",len(images))

arr = np.array(images, dtype=object)   # shape (3, 2)

# Split into two arrays (columns)
x_val = np.stack(arr[:, 0], axis=0) # type: ignore
y_val = arr[:, 1]   # second column

print(x_train.shape)


In [None]:
# Flatten each 28x28 image into a 784-length vector
x_train_flat = x_train.reshape(x_train.shape[0], -1)
x_val_flat = x_val.reshape(x_val.shape[0], -1)

# Normalize pixel values to range [0, 1]
x_train_flat = x_train_flat.astype('float32') / 255.0
x_val_flat = x_val_flat.astype('float32') / 255.0

In [58]:
# Encode labels consistently across train + val
le = LabelEncoder()
le.fit(list(y_train) + list(y_val))

y_train_enc = le.transform(y_train) - 1
y_val_enc  = le.transform(y_val) - 1 





print("Unique encoded labels:", np.unique(y_train_enc))
print("Number of unique classes:", len(np.unique(y_train_enc)))

Unique encoded labels: [ 0  1  2  3  4  5  6  7  8  9 10 11 12 13 14 15 16 17 18 19 20 21 22 23
 24 25 26]
Number of unique classes: 27


In [59]:

param_distributions = {
    "n_estimators": randint(200, 600),
    "max_depth": randint(3, 10),
    "learning_rate": uniform(0.01, 0.2),
    "subsample": uniform(0.6, 0.4),
    "colsample_bytree": uniform(0.6, 0.4),
    "gamma": uniform(0, 0.5),
    "reg_lambda": uniform(0.5, 2.0)
}


cv = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)



In [60]:
xgb_model = xgb.XGBClassifier(
    objective="multi:softprob",
    num_class=len(np.unique(y_train_enc)),
    eval_metric="mlogloss",
    tree_method="gpu_hist",     # ⚡ GPU training
    predictor="gpu_predictor",  # ⚡ GPU inference
    use_label_encoder=False,
    verbosity=1,                # Show training progress per fold
    random_state=42
)



In [61]:
from sklearn.model_selection import ParameterSampler, cross_val_score
from tqdm.auto import tqdm
import numpy as np
import pandas as pd

def tqdm_random_search_cv(
    estimator,
    param_distributions,
    n_iter,
    X,
    y,
    cv,
    scoring,
    refit=True,
    random_state=None,
    n_jobs=-1,
):
    """
    Safe, tqdm-based RandomizedSearchCV replacement for modern scikit-learn.
    """
    results = []
    best_score = -np.inf
    best_params = None

    # Sample random hyperparameter combinations
    sampler = list(ParameterSampler(param_distributions, n_iter=n_iter, random_state=random_state))

    pbar = tqdm(sampler, desc="Random Search Progress", leave=True)

    for params in pbar:
        # Apply parameters to the model
        model = estimator.set_params(**params)

        # Cross-validation scoring
        scores = cross_val_score(model, X, y, cv=cv, scoring=scoring, n_jobs=n_jobs)
        mean_score = np.mean(scores)
        std_score = np.std(scores)

        results.append({
            "params": params,
            "mean_score": mean_score,
            "std_score": std_score
        })

        if mean_score > best_score:
            best_score = mean_score
            best_params = params

        # Update progress bar
        pbar.set_postfix({"Best Acc": f"{best_score:.4f}"})

    pbar.close()

    # Convert results to DataFrame for easy inspection
    results_df = pd.DataFrame(results).sort_values("mean_score", ascending=False)

    best_estimator = None
    if refit:
        best_estimator = estimator.set_params(**best_params)
        best_estimator.fit(X, y)

    return best_estimator, best_params, best_score, results_df


In [62]:
best_model, best_params, best_score, search_results = tqdm_random_search_cv(
    estimator=xgb_model,
    param_distributions=param_distributions,
    n_iter=30,
    X=x_train_flat,
    y=y_train_enc,
    cv=cv,
    scoring="accuracy",
    refit=True,
    random_state=42,
    n_jobs=-1
)


1 fits failed out of a total of 5.
The score on these train-test partitions for these parameters will be set to nan.
If these failures are not expected, you can try to debug them by setting error_score='raise'.

Below are more details about the failures:
--------------------------------------------------------------------------------
1 fits failed with the following error:
Traceback (most recent call last):
  File "c:\Users\thomw\LaptopBackup\UNI\PRML\PRML_Digit_Recognition\.venv\Lib\site-packages\sklearn\model_selection\_validation.py", line 859, in _fit_and_score
    estimator.fit(X_train, y_train, **fit_params)
  File "c:\Users\thomw\LaptopBackup\UNI\PRML\PRML_Digit_Recognition\.venv\Lib\site-packages\xgboost\core.py", line 729, in inner_f
    return func(**kwargs)
           ^^^^^^^^^^^^^^
  File "c:\Users\thomw\LaptopBackup\UNI\PRML\PRML_Digit_Recognition\.venv\Lib\site-packages\xgboost\sklearn.py", line 1641, in fit
    raise ValueError(
ValueError: Invalid classes inferred from 

TypeError: xgboost.sklearn.XGBModel.set_params() argument after ** must be a mapping, not NoneType

In [None]:
# ================================================================
# 5️⃣ Display best parameters and performance
# ================================================================

print("\n✅ Best Parameters Found:")
for key, val in random_search.best_params_.items():
    print(f"{key:20s}: {val}")

print(f"\nBest Cross-Validation Accuracy: {random_search.best_score_:.4f}")

# ================================================================
# 6️⃣ Evaluate best model on validation set
# ================================================================

best_model = random_search.best_estimator_
val_accuracy = best_model.score(x_val_flat, y_val_enc)
print(f"\nValidation Accuracy (Best Model): {val_accuracy:.4f}")

# ================================================================
# 7️⃣ Visualize feature importances
# ================================================================

plt.figure(figsize=(8, 6))
xgb.plot_importance(best_model, importance_type="gain", max_num_features=15)
plt.title("Top 15 Feature Importances (Gain)")
plt.show()

# ================================================================
# 8️⃣ Heatmap of hyperparameter impact (importance)
# ================================================================

# Extract all tested parameter sets + mean test scores
results_df = pd.DataFrame(random_search.cv_results_)

# Select relevant columns for visualization
cols = [c for c in results_df.columns if c.startswith('param_')] + ['mean_test_score']
params_df = results_df[cols].copy()

# Normalize continuous hyperparameters for correlation analysis
for col in params_df.columns:
    if params_df[col].dtype.kind in "fi":  # float/int columns
        params_df[col] = params_df[col].astype(float)

# Compute correlations between params and score
corr = params_df.corr(numeric_only=True)['mean_test_score'].sort_values(ascending=False)

# Display correlations
print("\n🔍 Hyperparameter correlation with mean CV score:\n")
print(corr)

# Plot as heatmap
plt.figure(figsize=(8, 5))
sns.heatmap(corr.to_frame(), annot=True, cmap="coolwarm", cbar=False, fmt=".2f")
plt.title("Correlation of Hyperparameters with CV Accuracy")
plt.ylabel("Hyperparameter")
plt.xlabel("Correlation with Accuracy")
plt.tight_layout()
plt.show()

print("\n✅ Hyperparameter tuning with GPU and cross-validation complete.")


In [None]:
evals_result = best_model.evals_result()

# --- Plot logloss ---
plt.figure(figsize=(10, 4))
plt.subplot(1, 2, 1)
plt.plot(evals_result['validation_0']['mlogloss'], label='Train Logloss')
plt.plot(evals_result['validation_1']['mlogloss'], label='Val Logloss')
plt.xlabel('Iteration')
plt.ylabel('Log Loss')
plt.title('Training vs Validation Log Loss')
plt.legend()
plt.grid(True)

# --- Plot classification error ---
plt.subplot(1, 2, 2)
plt.plot(evals_result['validation_0']['merror'], label='Train Error')
plt.plot(evals_result['validation_1']['merror'], label='Val Error')
plt.xlabel('Iteration')
plt.ylabel('Error Rate')
plt.title('Training vs Validation Error')
plt.legend()
plt.grid(True)

plt.tight_layout()
plt.show()


In [None]:
y_pred = clf.predict(x_val_flat)
cm = confusion_matrix(y_val_enc, y_pred)
print("\nClassification Report:\n", classification_report(y_val_enc, y_pred,target_names=le.classes_))

In [None]:
y_pred_proba = est.predict_proba(x_val_flat)
# Get list of classes from the label encoder
classes = le.classes_

# Binarize the labels (one-hot encode)
y_val_bin = label_binarize(y_val_enc, classes=np.arange(len(classes)))
n_classes = y_val_bin.shape[1]

fpr = dict()
tpr = dict()
roc_auc = dict()

for i in range(n_classes):
    fpr[i], tpr[i], _ = roc_curve(y_val_bin[:, i], y_pred_proba[:, i])
    roc_auc[i] = auc(fpr[i], tpr[i])

# Micro-average (considers all classes at once)
fpr["micro"], tpr["micro"], _ = roc_curve(y_val_bin.ravel(), y_pred_proba.ravel())
roc_auc["micro"] = auc(fpr["micro"], tpr["micro"])

# Macro-average (average of all class AUCs)
all_fpr = np.unique(np.concatenate([fpr[i] for i in range(n_classes)]))
mean_tpr = np.zeros_like(all_fpr)
for i in range(n_classes):
    mean_tpr += np.interp(all_fpr, fpr[i], tpr[i])
mean_tpr /= n_classes

fpr["macro"] = all_fpr
tpr["macro"] = mean_tpr
roc_auc["macro"] = auc(fpr["macro"], tpr["macro"])

plt.figure(figsize=(8, 6))
for i, label in enumerate(classes):
    plt.plot(fpr[i], tpr[i], lw=1.5, label=f'Class {label} (AUC = {roc_auc[i]:.3f})')
plt.plot([0, 1], [0, 1], 'k--', label='Chance')
plt.xlabel('False Positive Rate')
plt.ylabel('True Positive Rate')
plt.title('ROC Curves per Class')
plt.legend(loc='lower right', fontsize='small')
plt.grid(True)
plt.show()

plt.figure(figsize=(8, 6))
plt.plot(fpr["micro"], tpr["micro"],
         label=f'Micro-average (AUC = {roc_auc["micro"]:.3f})',
         color='deeppink', linestyle='-', linewidth=2)
plt.plot(fpr["macro"], tpr["macro"],
         label=f'Macro-average (AUC = {roc_auc["macro"]:.3f})',
         color='navy', linestyle='--', linewidth=2)
plt.plot([0, 1], [0, 1], 'k--', label='Chance')
plt.xlabel('False Positive Rate')
plt.ylabel('True Positive Rate')
plt.title('Micro vs Macro ROC Curves')
plt.legend(loc='lower right')
plt.grid(True)
plt.show()

In [None]:
y_pred_decoded = le.inverse_transform(y_pred)

correct_indices = np.where(y_pred_decoded == y_val)[0]
incorrect_indices = np.where(y_pred_decoded != y_val)[0]

# Visualize first 10 correct and incorrect predictions
plt.figure(figsize=(12, 3.5))

# First row: correct predictions
for i, idx in enumerate(correct_indices[:10]):
    plt.subplot(2, 10, i+1)
    plt.imshow(x_val[idx], cmap='gray')
    plt.title(f"P:{y_pred_decoded[idx]}\nT:{y_val[idx]}", color='green', fontsize=8)
    plt.axis('off')

# Second row: incorrect predictions
for i, idx in enumerate(incorrect_indices[:10]):
    plt.subplot(2, 10, 10 + i + 1)
    plt.imshow(x_val[idx], cmap='gray')
    plt.title(f"P:{y_pred_decoded[idx]}\nT:{y_val[idx]}", color='red', fontsize=8)
    plt.axis('off')

plt.suptitle("Top Row: Correct Predictions | Bottom Row: Incorrect Predictions", fontsize=14)
plt.show()

In [None]:
import joblib
joblib.dump(clf, "XGBoost2.pkl")
#joblib save us