<a href="https://colab.research.google.com/github/PCBZ/CS6140/blob/main/HW4/HW4_Problem5.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [6]:
import numpy as np
from sklearn.tree import DecisionTreeRegressor

class GradientBoostingClassifier:
    """
    Multiclass gradient boosting classifier using regression trees as weak learners.

    Every boosting round fits one ``DecisionTreeRegressor`` per class to the
    softmax residuals (one-hot target minus current probability) and adds the
    tree's prediction, scaled by ``learning_rate``, to that class's raw score.

    Labels are encoded internally as column indices into ``classes_`` (the
    sorted unique training labels), so they do not have to be the integers
    ``0..K-1``; ``predict`` maps the winning column back to the original label.

    Parameters:
        n_estimators: number of boosting rounds (each round fits one tree per class).
        learning_rate: shrinkage factor applied to every tree's prediction.
        max_depth: maximum depth of each regression tree.
    """

    def __init__(self, n_estimators=100, learning_rate=0.1, max_depth=2):
        self.n_estimators = n_estimators
        self.learning_rate = learning_rate
        self.max_depth = max_depth
        self.trees = []

    @staticmethod
    def _softmax(scores):
        """Return the row-wise softmax of a 2-D score array."""
        # Subtracting the row maximum leaves the result unchanged but keeps
        # np.exp from overflowing on large scores.
        exp_scores = np.exp(scores - np.max(scores, axis=1, keepdims=True))
        return exp_scores / np.sum(exp_scores, axis=1, keepdims=True)

    def fit(self, X, y):
        """
        Fit the model to the training data.

        Parameters:
            X: 2-D array of shape (n_samples, n_features).
            y: sequence of n_samples class labels (any values np.unique can sort).

        Raises:
            ValueError: if ``y`` does not contain exactly one label per row of ``X``.
        """
        # Encode labels as indices into the sorted unique label set so that
        # arbitrary label values (e.g. {3, 7} or strings) are handled.
        self.classes_, y_indices = np.unique(y, return_inverse=True)
        y_indices = y_indices.ravel()
        self.n_classes_ = len(self.classes_)

        n_samples, _ = X.shape
        if y_indices.shape[0] != n_samples:
            raise ValueError(
                f"X has {n_samples} samples but y has {y_indices.shape[0]} labels"
            )

        # Raw scores start at zero, i.e. a uniform distribution over classes.
        class_scores = np.zeros((n_samples, self.n_classes_))
        self.initial_predictions = class_scores.copy()
        self.trees = []

        # Row i is the one-hot vector of sample i's encoded label.
        y_one_hot = np.eye(self.n_classes_)[y_indices]

        for _ in range(self.n_estimators):
            # Residuals are computed once per round from the scores at the
            # start of the round, then every class gets its own tree.
            residuals = y_one_hot - self._softmax(class_scores)

            tree_list = []
            for j in range(self.n_classes_):
                tree = DecisionTreeRegressor(max_depth=self.max_depth)
                tree.fit(X, residuals[:, j])
                tree_list.append(tree)

                class_scores[:, j] += self.learning_rate * tree.predict(X)

            self.trees.append(tree_list)

    def predict_proba(self, X):
        """
        Predict the class probabilities for the given data.

        Returns an array of shape (n_samples, n_classes_) whose column ``j``
        is the probability of ``classes_[j]``.
        """
        n_samples, _ = X.shape

        # Same zero starting point as in fit; the trees hold everything learned.
        class_scores = np.zeros((n_samples, self.n_classes_))

        for tree_list in self.trees:
            for j in range(self.n_classes_):
                class_scores[:, j] += self.learning_rate * tree_list[j].predict(X)

        return self._softmax(class_scores)

    def predict(self, X):
        """
        Predict class for given data.

        Returns the original training labels (entries of ``classes_``), not
        column indices.
        """
        probabilities = self.predict_proba(X)
        return self.classes_[np.argmax(probabilities, axis=1)]



In [7]:
class PCA:
    """
    Principal component analysis computed from the SVD of the centered data.

    After ``fit``, ``mean_`` holds the per-feature mean of the training data
    and ``components_`` holds the first ``n_components`` rows of the
    right-singular-vector matrix returned by ``np.linalg.svd``.
    """

    def __init__(self, n_components):
        self.n_components = n_components
        # Both are populated by fit().
        self.mean_ = None
        self.components_ = None

    def fit(self, X):
        """Learn the feature means and the leading principal axes of X."""
        self.mean_ = np.mean(X, axis=0)
        # Only the third SVD factor is needed; its rows are the axes.
        right_vectors = np.linalg.svd(X - self.mean_, full_matrices=False)[2]
        self.components_ = right_vectors[:self.n_components]

    def transform(self, X):
        """Center X with the fitted mean and project it onto the fitted axes."""
        return np.dot(X - self.mean_, self.components_.T)

    def fit_transform(self, X):
        """Fit on X, then return X projected onto the fitted axes."""
        self.fit(X)
        return self.transform(X)

In [8]:
import kagglehub
import os
import numpy as np
import matplotlib.pyplot as plt
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import accuracy_score, confusion_matrix


def read_idx_file(file_path):
    """
    Read an IDX file of unsigned bytes (images or labels).

    The first four bytes are a big-endian magic number that selects the layout:

    * 2051 - images: three big-endian 32-bit counts (images, rows, cols)
      followed by the pixel bytes. Returned as a uint8 array of shape
      (num_images, rows * cols), one flattened image per row.
    * 2049 - labels: one big-endian 32-bit count followed by the label bytes.
      Returned as a 1-D uint8 array.

    The returned array is a read-only view over the bytes read from the file.

    Raises:
        ValueError: if the magic number is neither 2051 nor 2049, or if the
            number of payload bytes disagrees with the counts in the header
            (e.g. a truncated download).
    """
    with open(file_path, 'rb') as f:
        magic_number = int.from_bytes(f.read(4), byteorder='big')

        if magic_number == 2051:
            num_images = int.from_bytes(f.read(4), byteorder='big')
            rows = int.from_bytes(f.read(4), byteorder='big')
            cols = int.from_bytes(f.read(4), byteorder='big')

            data = np.frombuffer(f.read(), dtype=np.uint8)
            expected_size = num_images * rows * cols
            if data.size != expected_size:
                raise ValueError(
                    f"Image file header declares {num_images} images of "
                    f"{rows}x{cols} ({expected_size} bytes) but "
                    f"{data.size} bytes were found"
                )
            return data.reshape(num_images, rows * cols)

        if magic_number == 2049:
            num_labels = int.from_bytes(f.read(4), byteorder='big')
            labels = np.frombuffer(f.read(), dtype=np.uint8)
            # Without this check a truncated label file would silently yield
            # fewer labels than there are images.
            if labels.size != num_labels:
                raise ValueError(
                    f"Label file header declares {num_labels} labels but "
                    f"{labels.size} were found"
                )
            return labels

        raise ValueError("Invalid magic number")

if __name__ == "__main__":
    # Download the dataset and locate its four IDX files.
    path = kagglehub.dataset_download("hojjatk/mnist-dataset")

    train_images_path = os.path.join(path, "train-images.idx3-ubyte")
    train_labels_path = os.path.join(path, "train-labels.idx1-ubyte")

    test_images_path = os.path.join(path, "t10k-images.idx3-ubyte")
    test_labels_path = os.path.join(path, "t10k-labels.idx1-ubyte")

    X_train = read_idx_file(train_images_path)
    y_train = read_idx_file(train_labels_path)
    X_test = read_idx_file(test_images_path)
    y_test = read_idx_file(test_labels_path)

    print("X_train shape:", X_train.shape)
    print("y_train shape:", y_train.shape)
    print("X_test shape:", X_test.shape)
    print("y_test shape:", y_test.shape)

    # Bring pixel values into [0, 1], then standardize each feature using
    # statistics learned from the training split only.
    scaler = StandardScaler()
    X_train = scaler.fit_transform(X_train / 255.0)
    X_test = scaler.transform(X_test / 255.0)

    # Reduce to 30 dimensions; the projection is fitted on the training split.
    pca = PCA(n_components=30)
    X_train_pca = pca.fit_transform(X_train)
    X_test_pca = pca.transform(X_test)

    # Train the boosted-tree classifier on the reduced features.
    model = GradientBoostingClassifier(n_estimators=100, learning_rate=0.5, max_depth=3)
    model.fit(X_train_pca, y_train)

    y_pred = model.predict(X_test_pca)

    # Overall accuracy and confusion matrix on the test split.
    accuracy = accuracy_score(y_test, y_pred)
    print("Accuracy:", accuracy)

    cm = confusion_matrix(y_test, y_pred)
    print("Confusion Matrix:")
    print(cm)

    # Accuracy restricted to the test samples whose true label is each digit.
    print("\nPer-class performance analysis:")
    for digit in range(10):
        digit_mask = y_test == digit
        if not digit_mask.any():
            # No test samples of this digit: nothing to report.
            continue

        digit_accuracy = accuracy_score(y_test[digit_mask], y_pred[digit_mask])

        print(f"Digit {digit}:")
        print(f"  - Accuracy: {digit_accuracy:.4f}")



X_train shape: (60000, 784)
y_train shape: (60000,)
X_test shape: (10000, 784)
y_test shape: (10000,)
Accuracy: 0.8868
Confusion Matrix:
[[ 933    0    2    2    1   21   12    2    5    2]
 [   0 1103    5    3    1    2    4    0   17    0]
 [  15    4  871   28   16    6   20   13   57    2]
 [   1    8   22  859    2   57    1   18   35    7]
 [   1    4    6    1  860    2   21   11    5   71]
 [  17    2    5   41   15  756   11    7   28   10]
 [  19    4    9    1   11   22  887    0    5    0]
 [   2   14   20    5   12    0    0  910    9   56]
 [  16    5   15   40   11   41    3   12  813   18]
 [   9   10    4   15   49    7    0   30    9  876]]

Per-class performance analysis:
Digit 0:
  - Accuracy: 0.9520
Digit 1:
  - Accuracy: 0.9718
Digit 2:
  - Accuracy: 0.8440
Digit 3:
  - Accuracy: 0.8505
Digit 4:
  - Accuracy: 0.8758
Digit 5:
  - Accuracy: 0.8475
Digit 6:
  - Accuracy: 0.9259
Digit 7:
  - Accuracy: 0.8852
Digit 8:
  - Accuracy: 0.8347
Digit 9:
  - Accuracy: 0.8682