In [1]:
# Cell 1 — Imports & paths (edit nothing if your paths are as stated)
import os
from pathlib import Path
import random
import shutil
from tqdm import tqdm
from PIL import Image, ImageEnhance, ImageOps
import numpy as np
import tensorflow as tf
from tensorflow import keras
from tensorflow.keras import layers
import pandas as pd
from sklearn.metrics import classification_report, accuracy_score

# === EDIT IF NEEDED ===
# NOTEBOOK_ROOT is the single place to change the project location; every
# other path is derived from it. Setting the optional DIT5411_NOTEBOOK_ROOT
# environment variable overrides the default without editing this cell.
NOTEBOOK_ROOT = Path(os.environ.get(
    "DIT5411_NOTEBOOK_ROOT",
    r"C:\Users\TIK03\Documents\GitHub\DIT5411-HoYiTik\Assgnment",
))
DATA_ROOT = NOTEBOOK_ROOT / "data" / "characters"         # raw dataset (read only)
OUTPUT_ROOT = NOTEBOOK_ROOT / "processed_data"            # will be created
MODELS_DIR = NOTEBOOK_ROOT / "saved_models"
REPORTS_DIR = NOTEBOOK_ROOT / "reports"
# =======================

# Warn early about a wrong root: the data-preparation cell iterates DATA_ROOT
# and would otherwise fail only after the output folders have been created.
if not DATA_ROOT.is_dir():
    print("WARNING: DATA_ROOT does not exist or is not a directory:", DATA_ROOT)

for out_dir in (OUTPUT_ROOT, MODELS_DIR, REPORTS_DIR):
    out_dir.mkdir(parents=True, exist_ok=True)

print("DATA_ROOT:", DATA_ROOT)
print("OUTPUT_ROOT:", OUTPUT_ROOT)


DATA_ROOT: C:\Users\TIK03\Documents\GitHub\DIT5411-HoYiTik\Assgnment\data\characters
OUTPUT_ROOT: C:\Users\TIK03\Documents\GitHub\DIT5411-HoYiTik\Assgnment\processed_data


In [2]:
# Cell 2 — Configuration & helper augmentation functions
# All tunable knobs live here; adjust them to the available hardware.

# --- data preparation ---
IMG_SIZE = (64, 64)           # every image is resized to 64x64 (grayscale)
TRAIN_SAMPLES_PER_CLASS = 40  # first 40 files of a class seed the training set (per assignment)
TARGET_PER_CLASS = 200        # augment until each class has at least this many training images

# --- training ---
BATCH_SIZE = 128
EPOCHS = 12                   # raise when more time / GPU is available
FORCE_NUM_CLASSES = None      # e.g. 13065 to force the output width; None = use the detected class count
AUTOTUNE = tf.data.AUTOTUNE

# --- reproducibility ---
SEED = 42
# Seed Python, NumPy and TensorFlow, in that order.
for seed_fn in (random.seed, np.random.seed, tf.random.set_seed):
    seed_fn(SEED)

# Image augmentation helpers using PIL (keeps dependencies minimal)
def random_augment_pil(img: Image.Image):
    """Return a randomly augmented grayscale copy of ``img`` sized ``IMG_SIZE``.

    The following steps are applied in order, each with freshly drawn
    parameters from the ``random`` module:

    1. rotation by an angle in [-20, 20] degrees,
    2. affine translate (up to 12% of width/height) and shear,
    3. zoom about the centre by a factor in [0.85, 1.15],
    4. contrast jitter (50% chance, factor in [0.8, 1.2]),
    5. brightness jitter (50% chance, factor in [0.85, 1.15]).

    Areas uncovered by a transform are filled with white (255).

    Args:
        img: Source PIL image. It is converted to mode 'L' first; the
            caller's image object is not modified.

    Returns:
        A new mode 'L' PIL image of size ``IMG_SIZE``.
    """
    # Work in 8-bit grayscale from the start so the fill value 255 always
    # means "white", whatever mode the caller passed in.
    img = img.convert('L')

    # Random rotation
    angle = random.uniform(-20, 20)
    img = img.rotate(angle, resample=Image.BILINEAR, expand=False, fillcolor=255)

    # Random affine (translate / shear)
    max_shift = 0.12  # fraction of width/height
    tx = random.uniform(-max_shift, max_shift) * img.width
    ty = random.uniform(-max_shift, max_shift) * img.height
    shear = random.uniform(-8, 8)
    img = img.transform(
        img.size,
        Image.AFFINE,
        (1, -shear/100.0, tx, shear/100.0, 1, ty),
        resample=Image.BILINEAR,
        fillcolor=255
    )

    # Random zoom about the centre. The scaled image is pasted, centred, onto
    # a white canvas of the pre-zoom size: zoom > 1 is cropped by the canvas
    # bounds (negative paste offsets are clipped), zoom < 1 leaves a white
    # border. Fitting the resized image to its own size instead would cancel
    # the zoom once the image is brought back to IMG_SIZE.
    zoom = random.uniform(0.85, 1.15)
    base_w, base_h = img.size
    new_w = max(1, int(round(base_w * zoom)))
    new_h = max(1, int(round(base_h * zoom)))
    scaled = img.resize((new_w, new_h), resample=Image.BILINEAR)
    canvas = Image.new('L', (base_w, base_h), 255)
    canvas.paste(scaled, ((base_w - new_w) // 2, (base_h - new_h) // 2))
    # Guarantee the output size even if the input was not already IMG_SIZE.
    img = ImageOps.fit(canvas, IMG_SIZE, method=Image.BILINEAR)

    # Random contrast/brightness
    if random.random() < 0.5:
        enhancer = ImageEnhance.Contrast(img)
        img = enhancer.enhance(random.uniform(0.8, 1.2))
    if random.random() < 0.5:
        enhancer = ImageEnhance.Brightness(img)
        img = enhancer.enhance(random.uniform(0.85, 1.15))

    # final convert to 'L' (grayscale)
    img = img.convert('L')
    return img

def load_image_as_pil(path):
    """Open ``path`` as an 8-bit grayscale PIL image fitted to ``IMG_SIZE``.

    Images whose mean pixel value is below 128 are inverted first, as an
    attempt to end up with a light background. The result is then
    centre-fitted to ``IMG_SIZE`` with bilinear resampling.
    """
    gray = Image.open(path).convert('L')
    if np.mean(gray) < 128:
        # mostly dark picture: flip it so the background becomes light
        gray = ImageOps.invert(gray)
    return ImageOps.fit(gray, IMG_SIZE, method=Image.BILINEAR)


In [3]:
# Cell 3 — Build train/test directories and perform augmentation until >= TARGET_PER_CLASS per class
# Directory layout created:
# OUTPUT_ROOT/train/<class_name>/*.png
# OUTPUT_ROOT/test/<class_name>/*.png

IMAGE_SUFFIXES = ('.png', '.jpg', '.jpeg')

train_dir = OUTPUT_ROOT / "train"
test_dir = OUTPUT_ROOT / "test"
# clean previous processed dirs if exist (uncomment to force fresh)
# shutil.rmtree(OUTPUT_ROOT, ignore_errors=True)
train_dir.mkdir(parents=True, exist_ok=True)
test_dir.mkdir(parents=True, exist_ok=True)


def _is_image(path):
    """Return True if ``path`` is a regular file with a supported image suffix."""
    return path.is_file() and path.suffix.lower() in IMAGE_SUFFIXES


def _copy_if_missing(src, dst):
    """Copy ``src`` to ``dst`` unless ``dst`` already exists (keeps re-runs cheap)."""
    if not dst.exists():
        shutil.copy2(src, dst)


def _prepare_class(class_name, files):
    """Split one class into train/test folders and augment the training part.

    The first TRAIN_SAMPLES_PER_CLASS files (sorted order) are copied to the
    training folder as ``seed_XXX``; the remaining files go to the test folder
    under their original names. Augmented images generated from the training
    seeds are then added until the training folder holds TARGET_PER_CLASS
    files. Test images are never used as augmentation sources.

    Args:
        class_name: Name of the class sub-folder to create.
        files: Image paths belonging to the class.

    Returns:
        Number of files in the class training folder afterwards (0 if the
        class has no images; no folders are created in that case).
    """
    files = sorted(files)
    if not files:
        return 0

    seed_train = files[:TRAIN_SAMPLES_PER_CLASS]
    seed_test = files[TRAIN_SAMPLES_PER_CLASS:]

    class_train_dir = train_dir / class_name
    class_test_dir = test_dir / class_name
    class_train_dir.mkdir(parents=True, exist_ok=True)
    class_test_dir.mkdir(parents=True, exist_ok=True)

    for held_out in seed_test:
        _copy_if_missing(held_out, class_test_dir / held_out.name)
    for idx, seed in enumerate(seed_train):
        _copy_if_missing(seed, class_train_dir / f"seed_{idx:03d}{seed.suffix}")

    count = len(list(class_train_dir.glob("*")))
    missing = TARGET_PER_CLASS - count
    # Decode the seeds only when something is left to generate, so re-running
    # the cell over an already processed class is cheap.
    if missing > 0 and seed_train:
        seed_images = [load_image_as_pil(p) for p in seed_train]
        next_idx = count
        while missing > 0:
            out_path = class_train_dir / f"aug_{next_idx:04d}.png"
            next_idx += 1
            if out_path.exists():
                # never overwrite an earlier augmentation; take the next free name
                continue
            random_augment_pil(random.choice(seed_images)).save(out_path)
            count += 1
            missing -= 1

    if count < TARGET_PER_CLASS:
        print(f"Warning: class {class_name} only reached {count} training images (target {TARGET_PER_CLASS}).")
    return count


# discover class folders
class_dirs = [p for p in sorted(DATA_ROOT.iterdir()) if p.is_dir()]
print(f"Found {len(class_dirs)} class folders under DATA_ROOT.")

classes = [p.name for p in class_dirs]

# If no class directories (maybe files in root), try a fallback: treat files named like "<char>_xxx.png"
if not classes:
    groups = {}
    for im in sorted(DATA_ROOT.iterdir()):
        if _is_image(im):
            # group by prefix before underscore
            groups.setdefault(im.name.split('_')[0], []).append(im)
    classes = sorted(groups)
    # in fallback mode class_dirs holds (class_name, files) pairs instead of Paths
    class_dirs = [(k, groups[k]) for k in classes]

print("Preparing train/test split and augmentation. This can take time depending on dataset size.")

if all(isinstance(p, Path) for p in class_dirs):
    # normal layout: one sub-folder per class
    for p in tqdm(class_dirs, desc="Processing classes"):
        _prepare_class(p.name, [f for f in p.iterdir() if _is_image(f)])
else:
    # fallback grouping case (if data was files with prefixes)
    for class_name, files in tqdm(class_dirs, desc="Processing grouping fallback"):
        _prepare_class(class_name, files)

print("Finished data processing.")


Found 13065 class folders under DATA_ROOT.
Preparing train/test split and augmentation. This can take time depending on dataset size.


Processing classes: 100%|██████████| 13065/13065 [38:29<00:00,  5.66it/s]

Finished data processing.





In [4]:
# Cell 4 — Build tf.data datasets from processed directories
# Loader options shared by both splits, kept in one place so they cannot drift.
_common_ds_kwargs = dict(
    labels="inferred",
    label_mode="int",
    color_mode="grayscale",
    batch_size=BATCH_SIZE,
    image_size=IMG_SIZE,
)

train_ds = tf.keras.preprocessing.image_dataset_from_directory(
    train_dir,
    shuffle=True,
    seed=SEED,
    **_common_ds_kwargs
)

test_ds = tf.keras.preprocessing.image_dataset_from_directory(
    test_dir,
    shuffle=False,
    **_common_ds_kwargs
)

# Determine classes and num_classes
class_names = train_ds.class_names
NUM_CLASSES = len(class_names)
print("Detected classes:", NUM_CLASSES)

# Labels are inferred independently for each directory, so a differing set of
# class folders would make train and test label indices disagree silently.
if test_ds.class_names != class_names:
    raise ValueError(
        "train and test directories contain different class folders; "
        "label indices would not match between the two datasets."
    )

if FORCE_NUM_CLASSES is not None:
    # A width below the detected class count would one-hot encode the excess
    # labels as all-zero rows, silently corrupting the targets.
    if FORCE_NUM_CLASSES < NUM_CLASSES:
        raise ValueError(
            f"FORCE_NUM_CLASSES={FORCE_NUM_CLASSES} is smaller than the "
            f"{NUM_CLASSES} classes found on disk."
        )
    print(f"Forcing num classes to {FORCE_NUM_CLASSES} (assignment requirement).")
    NUM_CLASSES = FORCE_NUM_CLASSES

# normalize and prepare pipeline
def normalize_img(image, label):
    """Scale an image batch by 1/255 and one-hot encode its labels.

    Args:
        image: Image tensor; cast to float32 and divided by 255
            (pixel values presumably in [0, 255] — verify against the loader).
        label: Integer class indices.

    Returns:
        ``(scaled_image, one_hot_label)`` where the one-hot depth is
        ``NUM_CLASSES``.
    """
    scaled = tf.cast(image, tf.float32) / 255.0
    encoded = tf.one_hot(label, NUM_CLASSES)
    return scaled, encoded

# Normalise both splits in parallel, then prefetch.
train_ds = train_ds.map(normalize_img, num_parallel_calls=AUTOTUNE)
train_ds = train_ds.prefetch(AUTOTUNE)

test_ds = test_ds.map(normalize_img, num_parallel_calls=AUTOTUNE)
test_ds = test_ds.prefetch(AUTOTUNE)

# Sanity check: pull a single batch and show the tensor shapes.
first_batch = train_ds.take(1)
for images, labels in first_batch:
    print("Batch shape:", images.shape, labels.shape)


Found 2613000 files belonging to 13065 classes.
Found 162550 files belonging to 13065 classes.
Detected classes: 13065
Batch shape: (128, 64, 64, 1) (128, 13065)


In [5]:
# Cell 5 — Model definitions: Baseline MLP, small CNN, deeper CNN
def build_mlp(input_shape, num_classes):
    """Build and compile the baseline fully connected classifier.

    Architecture: Flatten -> Dense(1024) -> Dropout(0.4) -> Dense(512)
    -> Dropout(0.3) -> softmax over ``num_classes``.

    Args:
        input_shape: Shape of one input sample (without the batch axis).
        num_classes: Width of the softmax output layer.

    Returns:
        A compiled ``keras.Sequential`` model (Adam, categorical
        cross-entropy, accuracy metric).
    """
    stack = [layers.Input(shape=input_shape), layers.Flatten()]
    # (units, dropout rate) for each hidden stage
    for units, drop_rate in ((1024, 0.4), (512, 0.3)):
        stack.append(layers.Dense(units, activation='relu'))
        stack.append(layers.Dropout(drop_rate))
    stack.append(layers.Dense(num_classes, activation='softmax'))

    mlp = keras.Sequential(stack)
    mlp.compile(
        optimizer='adam',
        loss='categorical_crossentropy',
        metrics=['accuracy'],
    )
    return mlp

def build_small_cnn(input_shape, num_classes):
    """Build and compile a two-stage convolutional classifier.

    Architecture: [Conv2D(32) -> MaxPool] -> [Conv2D(64) -> MaxPool]
    -> Dropout(0.25) -> Flatten -> Dense(256) -> Dropout(0.4)
    -> softmax over ``num_classes``.

    Args:
        input_shape: Shape of one input sample (without the batch axis).
        num_classes: Width of the softmax output layer.

    Returns:
        A compiled ``keras.Sequential`` model (Adam, categorical
        cross-entropy, accuracy metric).
    """
    stack = [layers.Input(shape=input_shape)]
    # convolutional feature extractor: one conv + pool per filter count
    for filters in (32, 64):
        stack.append(layers.Conv2D(filters, 3, padding='same', activation='relu'))
        stack.append(layers.MaxPool2D(2))
    # classifier head
    stack.extend([
        layers.Dropout(0.25),
        layers.Flatten(),
        layers.Dense(256, activation='relu'),
        layers.Dropout(0.4),
        layers.Dense(num_classes, activation='softmax'),
    ])

    cnn = keras.Sequential(stack)
    cnn.compile(
        optimizer='adam',
        loss='categorical_crossentropy',
        metrics=['accuracy'],
    )
    return cnn

def build_deeper_cnn(input_shape, num_classes):
    """Build and compile the deeper convolutional classifier (functional API).

    Three convolutional stages (2x Conv32, 2x Conv64, 1x Conv128), each
    followed by 2x2 max pooling; the first two stages also end with batch
    normalisation. The head is Flatten -> Dense(512) -> Dropout(0.5)
    -> softmax over ``num_classes``.

    Args:
        input_shape: Shape of one input sample (without the batch axis).
        num_classes: Width of the softmax output layer.

    Returns:
        A compiled ``keras.Model`` (Adam with learning rate 1e-3,
        categorical cross-entropy, accuracy metric).
    """
    # (filters, number of conv layers, batch-norm after pooling?)
    stages = ((32, 2, True), (64, 2, True), (128, 1, False))

    inputs = layers.Input(shape=input_shape)
    features = inputs
    for filters, n_convs, use_batchnorm in stages:
        for _ in range(n_convs):
            features = layers.Conv2D(filters, 3, padding='same', activation='relu')(features)
        features = layers.MaxPool2D(2)(features)
        if use_batchnorm:
            features = layers.BatchNormalization()(features)

    head = layers.Flatten()(features)
    head = layers.Dense(512, activation='relu')(head)
    head = layers.Dropout(0.5)(head)
    outputs = layers.Dense(num_classes, activation='softmax')(head)

    deep_cnn = keras.Model(inputs, outputs)
    deep_cnn.compile(
        optimizer=keras.optimizers.Adam(learning_rate=1e-3),
        loss='categorical_crossentropy',
        metrics=['accuracy'],
    )
    return deep_cnn

# Model input: the image size with a single (grayscale) channel axis appended
input_shape = (*IMG_SIZE, 1)
print("Input shape:", input_shape, "Num classes:", NUM_CLASSES)


Input shape: (64, 64, 1) Num classes: 13065


In [None]:
# Cell 6 — Training helper that trains a model and returns history + evaluation
def train_and_evaluate(model_fn, model_name, epochs=EPOCHS):
    """Build, train and evaluate one model; return it with its history and accuracy.

    The model is trained on ``train_ds``. ``test_ds`` is used both as the
    validation set during training (checkpoint selection, LR schedule, early
    stopping) and for the final evaluation, so the reported accuracy comes
    from the same data that selected the checkpoint.

    Args:
        model_fn: Callable ``(input_shape, num_classes) -> compiled model``.
        model_name: Used for log output and as the checkpoint file stem
            (``MODELS_DIR/<model_name>.h5``).
        epochs: Maximum number of training epochs.

    Returns:
        Tuple ``(model, history, acc)``: the model carrying the evaluated
        weights, the ``fit`` history object, and the accuracy on ``test_ds``.
    """
    print("\n\n========= Training:", model_name, "=========")
    # Several models are trained back to back; release the Keras global state
    # left by the previous one so memory use does not keep growing.
    keras.backend.clear_session()
    model = model_fn(input_shape, NUM_CLASSES)
    model.summary()

    ckpt_path = MODELS_DIR / f"{model_name}.h5"
    callbacks = [
        keras.callbacks.ModelCheckpoint(str(ckpt_path), monitor='val_accuracy', save_best_only=True, save_weights_only=False),
        keras.callbacks.ReduceLROnPlateau(monitor='val_loss', factor=0.5, patience=3, min_lr=1e-6),
        keras.callbacks.EarlyStopping(monitor='val_loss', patience=6, restore_best_weights=True)
    ]

    history = model.fit(
        train_ds,
        validation_data=test_ds,
        epochs=epochs,
        callbacks=callbacks
    )

    # Evaluate best saved model (highest val_accuracy). If no checkpoint file
    # exists, keep the in-memory weights instead of failing.
    if ckpt_path.exists():
        model.load_weights(str(ckpt_path))
    else:
        print(f"Warning: no checkpoint found at {ckpt_path}; evaluating the in-memory weights.")
    loss, acc = model.evaluate(test_ds, verbose=2)
    print(f"{model_name} test accuracy: {acc:.4f}")

    # Return model, history, and numeric accuracy
    return model, history, acc

# Train models sequentially (be careful: this will take time)
results = []
models_to_run = [
    (build_mlp, "mlp_baseline"),
    (build_small_cnn, "small_cnn"),
    (build_deeper_cnn, "deeper_cnn")
]
summary_path = REPORTS_DIR / "model_accuracies.csv"

for fn, name in models_to_run:
    model, hist, acc = train_and_evaluate(fn, name)
    results.append({"model": name, "accuracy": float(acc)})
    # Save history CSV
    hist_df = pd.DataFrame(hist.history)
    hist_df.to_csv(REPORTS_DIR / f"{name}_history.csv", index=False)
    # Rewrite the summary after every model so that an interrupted run still
    # keeps the accuracies of the models that already finished.
    pd.DataFrame(results).to_csv(summary_path, index=False)

# Save results summary (also covers the case where no model was run)
pd.DataFrame(results).to_csv(summary_path, index=False)
print("All done. Summaries at:", REPORTS_DIR)






Epoch 1/12
[1m20415/20415[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 91ms/step - accuracy: 5.6677e-05 - loss: 9.4844



[1m20415/20415[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1956s[0m 96ms/step - accuracy: 4.8220e-05 - loss: 9.4834 - val_accuracy: 6.7671e-05 - val_loss: 9.4783 - learning_rate: 0.0010
Epoch 2/12
[1m12054/20415[0m [32m━━━━━━━━━━━[0m[37m━━━━━━━━━[0m [1m11:51[0m 85ms/step - accuracy: 4.0799e-05 - loss: 9.4813

In [None]:
# Cell 7 — Classification report on test set (samples only)
# This will compute predicted labels and show overall accuracy and a short classification report
print("Building predictions for test dataset ... (this may take time for large class counts)")

# Reload the test split with plain integer labels (no one-hot) and without
# shuffling. It is read in batches of BATCH_SIZE: issuing one predict call per
# image is far slower than predicting a whole batch at a time.
test_ds_int = tf.keras.preprocessing.image_dataset_from_directory(
    test_dir,
    labels="inferred",
    label_mode="int",
    color_mode="grayscale",
    batch_size=BATCH_SIZE,
    image_size=IMG_SIZE,
    shuffle=False
)

# load best model (choose deepest as example)
best_model_path = MODELS_DIR / "deeper_cnn.h5"
if best_model_path.exists():
    best_model = keras.models.load_model(str(best_model_path))
else:
    # fallback to the last trained model, if the training cell ran in this kernel
    try:
        best_model = model
    except NameError:
        raise RuntimeError(
            f"No saved model at {best_model_path} and no trained model in memory; "
            "run the training cell first."
        ) from None

# collect true labels and predictions (may be large)
y_true = []
y_pred = []
for img, label in tqdm(test_ds_int, desc="Predicting"):
    # same scaling as the training pipeline
    img_norm = tf.cast(img, tf.float32) / 255.0
    preds = best_model.predict_on_batch(img_norm)
    y_pred.extend(np.argmax(preds, axis=-1).tolist())
    y_true.extend(label.numpy().tolist())

# Source files in dataset order (the dataset is not shuffled); left empty when
# the loader does not expose them.
file_paths = list(getattr(test_ds_int, "file_paths", []))

# global accuracy
acc = accuracy_score(y_true, y_pred)
print("Test accuracy (computed):", acc)

# print a short classification report for up to first 30 classes to avoid huge outputs
# (bounded by len(class_names) too, so labels and target_names always have equal length)
n_display = min(30, NUM_CLASSES, len(class_names))
target_names = class_names[:n_display]
report = classification_report(y_true, y_pred, labels=list(range(n_display)), target_names=target_names, zero_division=0)
print("Classification report (first classes):\n", report)

# Save predictions CSV
pred_df = pd.DataFrame({"true": y_true, "pred": y_pred})
pred_df.to_csv(REPORTS_DIR / "test_predictions.csv", index=False)
print("Predictions saved to:", REPORTS_DIR / "test_predictions.csv")
