<a href="https://colab.research.google.com/github/PCBZ/CS6140/blob/main/HW4/HW4_Problem2.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [5]:
import numpy as np
from sklearn.tree import DecisionTreeRegressor
from sklearn.metrics import accuracy_score
from sklearn.datasets import fetch_20newsgroups
from sklearn.feature_extraction.text import TfidfVectorizer


class GradientBoostingClassifier:
    """
    Gradient Boosting Classifier using decision trees as weak learners.
    """
    def __init__(self, n_estimators=100, learning_rate=0.1, max_depth=3, min_samples_split=2, min_samples_leaf=1):
        self.n_estimators = n_estimators
        self.learning_rate = learning_rate
        self.max_depth = max_depth
        self.min_samples_split = min_samples_split
        self.min_samples_leaf = min_samples_leaf
        self.trees = []

    def fit(self, X, y):
        """
        Fit the model to the training data.
        """
        self.classes_ = np.unique(y)
        self.n_classes_ = len(self.classes_)

        n_samples, _ = X.shape

        # Initial predictions
        class_scores = np.zeros((n_samples, self.n_classes_))
        self.initial_predictions = class_scores.copy()
        self.trees = []

        # Create one-hot encoding using np.eye
        y_one_hot = np.eye(self.n_classes_)[y]

        for i in range(self.n_estimators):
            # Convert scores to probabilities using softmax
            exp_scores = np.exp(class_scores - np.max(class_scores, axis=1, keepdims=True))
            probabilities = exp_scores / np.sum(exp_scores, axis=1, keepdims=True)

            # Calculate residuals for all classes
            residuals = y_one_hot - probabilities

            tree_list = []
            for j in range(self.n_classes_):
                # Fit a decision tree to the residuals
                tree = DecisionTreeRegressor(max_depth=self.max_depth, min_samples_split=self.min_samples_split, min_samples_leaf=self.min_samples_split)
                tree.fit(X, residuals[:, j])
                tree_list.append(tree)

                # Update the predictions
                class_scores[:, j] += self.learning_rate * tree.predict(X)

            self.trees.append(tree_list)

    def _softmax(self, F):
        """
        Compute softmax probabilities from raw scores.
        """
        # Numerical stability: subtract max
        F_stable = F - np.max(F, axis=1, keepdims=True)
        exp_F = np.exp(F_stable)
        return exp_F / np.sum(exp_F, axis=1, keepdims=True)

    def predict_proba(self, X):
        """
        Predict the class probabilities for the given data.
        """
        n_samples, _ = X.shape

        # Initialize with zeros (since we're not using initial log-odds anymore)
        class_scores = np.zeros((n_samples, self.n_classes_))

        # Accumulate predictions from all trees
        for tree_list in self.trees:
            for j in range(self.n_classes_):
                class_scores[:, j] += self.learning_rate * tree_list[j].predict(X)

        # Convert to probabilities using softmax
        exp_scores = np.exp(class_scores - np.max(class_scores, axis=1, keepdims=True))
        probabilities = exp_scores / np.sum(exp_scores, axis=1, keepdims=True)

        return probabilities

    def predict(self, X):
        """
        Predict class for given data.
        """
        probabilities = self.predict_proba(X)
        return np.argmax(probabilities, axis=1)

def fetch_data():
    """
    Fetch the 20 Newsgroups dataset, vectorize the text using TF-IDF,
    and split into train and test sets.
    """
    categories = [
        'comp.graphics',
        'comp.os.ms-windows.misc',
        'comp.sys.ibm.pc.hardware',
        'comp.sys.mac.hardware',
        'rec.autos',
        'rec.motorcycles',
        'rec.sport.baseball',
        'rec.sport.hockey',
    ]

    newsgroups_train = fetch_20newsgroups(subset='train', categories=categories, remove=('headers', 'footers', 'quotes'))
    X_train_text, y_train = newsgroups_train.data, newsgroups_train.target

    newsgroups_test = fetch_20newsgroups(subset='test', categories=categories, remove=('headers', 'footers', 'quotes'))
    X_test_text, y_test = newsgroups_test.data, newsgroups_test.target

    # TF-IDF vectorization
    vectorizer = TfidfVectorizer(max_features=2000)

    X_train = vectorizer.fit_transform(X_train_text)
    X_test = vectorizer.transform(X_test_text)

    return X_train, y_train, X_test, y_test, newsgroups_train.target_names


def find_best_parameters(data):
    """
    Find the best parameters for the model.
    """
    print("\n" + "="*50)
    print("Searching for Best Parameters")
    print("="*50)

    X_train, y_train, X_test, y_test, target_names = data

    param_grid = [
        {'n_estimators': 200, 'learning_rate': 0.1, 'max_depth': 3},
    ]

    results = []
    best_test_acc = 0
    best_params = None
    best_model = None

    # Test params
    for idx, params in enumerate(param_grid):
        print(f"\n[{idx+1}/{len(param_grid)}] Testing: {params}")

        # train model
        model = GradientBoostingClassifier(**params)
        model.fit(X_train, y_train)

        # Evaluate
        train_pred = model.predict(X_train)
        test_pred = model.predict(X_test)

        train_acc = accuracy_score(y_train, train_pred)
        test_acc = accuracy_score(y_test, test_pred)


        result = {
            'params': params,
            'train_acc': train_acc,
            'test_acc': test_acc,
            'gap': train_acc - test_acc,
            'model': model
        }
        results.append(result)

        print(f"  Train: {train_acc:.4f}, Test: {test_acc:.4f}, Gap: {train_acc - test_acc:.4f}")

        # Update
        if test_acc > best_test_acc:
            best_test_acc = test_acc
            best_params = params
            best_model = model

    print("\n" + "="*50)
    print("Best Model Analysis")
    print("="*50)
    print(f"Best parameters: {best_params}")
    print(f"Best test accuracy: {best_test_acc:.4f}")


    test_pred = best_model.predict(X_test)
    print("\nPer-class accuracy (best model):")
    for i in range(best_model.n_classes_):
        mask = y_test == i
        if np.sum(mask) > 0:
            class_acc = accuracy_score(y_test[mask], test_pred[mask])
            class_name = target_names[i]
            print(f"  {class_name}: {class_acc:.2%}")


if __name__ == '__main__':
    # Load complete dataset
    data = fetch_data()

    find_best_parameters(data)


Searching for Best Parameters

[1/1] Testing: {'n_estimators': 200, 'learning_rate': 0.1, 'max_depth': 3}
  Train: 0.7597, Test: 0.6584, Gap: 0.1013

Best Model Analysis
Best parameters: {'n_estimators': 200, 'learning_rate': 0.1, 'max_depth': 3}
Best test accuracy: 0.6584

Per-class accuracy (best model):
  comp.graphics: 81.23%
  comp.os.ms-windows.misc: 58.38%
  comp.sys.ibm.pc.hardware: 64.54%
  comp.sys.mac.hardware: 60.78%
  rec.autos: 62.12%
  rec.motorcycles: 56.03%
  rec.sport.baseball: 68.01%
  rec.sport.hockey: 75.69%
