<a href="https://colab.research.google.com/github/PCBZ/CS6140/blob/main/HW4/HW4_Problem5.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
import kagglehub
import os
import numpy as np
import matplotlib.pyplot as plt
from sklearn.preprocessing import StandardScaler
from sklearn.decomposition import PCA
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.metrics import accuracy_score, confusion_matrix

def read_idx_file(file_path):
    """Load an MNIST-style IDX file into a NumPy array.

    The first four bytes are read as a big-endian integer "magic number"
    that selects the layout of the rest of the file:

    * 2051 -- image file: three more big-endian 32-bit header fields
      (image count, rows, cols) followed by one unsigned byte per pixel.
    * 2049 -- label file: one more header field (label count) followed by
      one unsigned byte per label.

    Args:
        file_path: Path of the IDX file to read.

    Returns:
        For an image file, a uint8 array of shape
        ``(num_images, rows * cols)`` (each image flattened into one row).
        For a label file, a 1-D uint8 array of labels.

    Raises:
        ValueError: If the magic number is neither 2051 nor 2049, or if the
            number of payload bytes does not match the counts declared in
            the header (truncated or padded file).
    """

    def read_u32(stream):
        # Header fields are 4-byte big-endian unsigned integers.
        return int.from_bytes(stream.read(4), byteorder='big')

    with open(file_path, 'rb') as f:
        magic_number = read_u32(f)

        if magic_number == 2051:
            num_images = read_u32(f)
            rows = read_u32(f)
            cols = read_u32(f)

            data = np.frombuffer(f.read(), dtype=np.uint8)
            expected_size = num_images * rows * cols
            # Check explicitly so a corrupt file yields a clear message
            # instead of an opaque reshape error.
            if data.size != expected_size:
                raise ValueError(
                    f"Image file declares {num_images} images of "
                    f"{rows}x{cols} ({expected_size} bytes) but contains "
                    f"{data.size} bytes of pixel data"
                )
            return data.reshape(num_images, rows * cols)

        elif magic_number == 2049:
            num_labels = read_u32(f)
            labels = np.frombuffer(f.read(), dtype=np.uint8)
            # Without this check a truncated label file would be accepted
            # silently and only fail later, when labels and images are
            # paired up.
            if labels.size != num_labels:
                raise ValueError(
                    f"Label file declares {num_labels} labels but contains "
                    f"{labels.size}"
                )
            return labels
        else:
            raise ValueError("Invalid magic number")

if __name__ == "__main__":
    # ---- Data loading ----------------------------------------------------
    # The value returned by kagglehub is used below as the directory that
    # holds the four MNIST IDX files.
    path = kagglehub.dataset_download("hojjatk/mnist-dataset")

    # Training split.
    train_images_path = os.path.join(path, "train-images.idx3-ubyte")
    train_labels_path = os.path.join(path, "train-labels.idx1-ubyte")
    X_train = read_idx_file(train_images_path)
    y_train = read_idx_file(train_labels_path)

    # Held-out test split.
    test_images_path = os.path.join(path, "t10k-images.idx3-ubyte")
    test_labels_path = os.path.join(path, "t10k-labels.idx1-ubyte")
    X_test = read_idx_file(test_images_path)
    y_test = read_idx_file(test_labels_path)

    print("X_train shape:", X_train.shape)
    print("y_train shape:", y_train.shape)
    print("X_test shape:", X_test.shape)
    print("y_test shape:", y_test.shape)

    # ---- Preprocessing ---------------------------------------------------
    # Map the raw byte pixel values into [0, 1].
    X_train, X_test = X_train / 255.0, X_test / 255.0

    # Standardize every pixel column; the statistics are learned from the
    # training split only and then re-used for the test split.
    scaler = StandardScaler()
    scaler.fit(X_train)
    X_train = scaler.transform(X_train)
    X_test = scaler.transform(X_test)

    # Reduce to 30 principal components, again fitted on training data only.
    pca = PCA(n_components=30)
    X_train_pca = pca.fit_transform(X_train)
    X_test_pca = pca.transform(X_test)

    # ---- Model -----------------------------------------------------------
    gb_params = {
        "n_estimators": 100,
        "learning_rate": 0.5,
        "max_depth": 3,
        "verbose": 1,  # print per-iteration training loss while fitting
    }
    model = GradientBoostingClassifier(**gb_params)
    model.fit(X_train_pca, y_train)

    # ---- Evaluation ------------------------------------------------------
    y_pred = model.predict(X_test_pca)

    accuracy = accuracy_score(y_test, y_pred)
    print("Accuracy:", accuracy)

    cm = confusion_matrix(y_test, y_pred)
    print("Confusion Matrix:")
    print(cm)

    # Accuracy restricted to the test samples whose true label is `digit`.
    print("\nPer-class performance analysis:")
    for digit in range(10):
        digit_mask = y_test == digit
        if not digit_mask.any():
            # No test sample carries this label; nothing to report.
            continue

        digit_accuracy = accuracy_score(y_test[digit_mask], y_pred[digit_mask])
        print(f"Digit {digit}:")
        print(f"  - Accuracy: {digit_accuracy:.4f}")



X_train shape: (60000, 784)
y_train shape: (60000,)
X_test shape: (10000, 784)
y_test shape: (10000,)
      Iter       Train Loss   Remaining Time 
         1           1.1302           22.74m
         2           0.8649           22.52m
         3           0.7177           22.26m
         4           0.6202           22.12m
         5           0.5528           22.19m
         6           0.4995           21.98m
         7           0.4569           21.76m
         8           0.4223           21.47m
         9           0.3991           21.24m
        10           0.3760           20.92m
        20           0.2576           18.66m
        30           0.2082           16.37m
        40           0.2236           14.07m
        50           0.2151           11.74m
        60           0.1948            9.41m
        70           0.1667            7.07m
        80           0.1529            4.71m
        90           0.1409            2.36m
       100           0.1303            0.0