<a href="https://colab.research.google.com/github/PCBZ/CS6140/blob/main/HW4/HW4_Problem2.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [10]:
from re import X
import numpy as np
from sklearn.tree import DecisionTreeRegressor
import os
import urllib.request
import zipfile
from scipy.sparse import csr_matrix
from sklearn.metrics import accuracy_score
import shutil


class GradientBoostingClassifier:
    """
    Gradient Boosting Classifier using decision trees as weak learners.

    Multi-class problems are handled one-vs-rest: each class owns a sequence
    of regression trees, and every tree is fitted to the residual
    ``(y == class) - sigmoid(score)`` of that class's current raw score.

    Parameters:
        n_estimators: number of boosting rounds (one tree per class per round).
        learning_rate: factor applied to every tree's prediction before it is
            added to the class score.
        max_depth: ``max_depth`` passed to each DecisionTreeRegressor.
        min_sample_split: passed to DecisionTreeRegressor as
            ``min_samples_split``.

    Attributes set by ``fit``:
        classes_: sorted unique labels seen in ``y``.
        n_classes_: number of distinct labels.
        initial_predictions: per-class prior log-odds used as starting score.
        trees: ``trees[j]`` is the list of fitted trees for ``classes_[j]``.
    """
    def __init__(self, n_estimators=100, learning_rate=0.1, max_depth=3, min_sample_split=2):
        self.n_estimators = n_estimators
        self.learning_rate = learning_rate
        self.max_depth = max_depth
        self.min_sample_split = min_sample_split
        self.trees = []

    @staticmethod
    def _sigmoid(scores):
        """Map raw scores to (0, 1) with the logistic function."""
        return 1 / (1 + np.exp(-scores))

    def fit(self, X, y):
        """
        Fit the model to the training data.

        Parameters:
            X: 2-D feature matrix with one row per sample (anything exposing
                ``.shape`` that DecisionTreeRegressor accepts).
            y: labels, one per row of ``X``; any array-like of comparable
                labels (they do not have to be 0..K-1).
        """
        # Coerce so that `y == label` is always an element-wise comparison,
        # even when a plain Python list is passed in.
        y = np.asarray(y)

        self.classes_ = np.unique(y)
        self.n_classes_ = len(self.classes_)

        n_samples = X.shape[0]

        # Raw (pre-sigmoid) score of every sample for every class
        class_scores = np.zeros((n_samples, self.n_classes_))

        # Start every class at its prior log-odds. The comparison must use the
        # actual label value, not the column index, otherwise label sets other
        # than 0..K-1 would yield all-zero priors and targets.
        self.initial_predictions = np.zeros(self.n_classes_)
        for j, label in enumerate(self.classes_):
            p = np.mean(y == label)
            # 1e-8 keeps the log finite when a prior is exactly 0 or 1
            self.initial_predictions[j] = np.log((p + 1e-8) / (1 - p + 1e-8))
            class_scores[:, j] = self.initial_predictions[j]

        self.trees = [[] for _ in range(self.n_classes_)]

        for _ in range(self.n_estimators):
            # For each class, fit a tree
            for j, label in enumerate(self.classes_):
                # Binary one-vs-rest labels for this class
                y_bin = (y == label).astype(int)

                # Current predicted probability of belonging to this class
                y_pred_prob = self._sigmoid(class_scores[:, j])

                # Residuals the next tree has to explain
                residuals = y_bin - y_pred_prob

                # Fit a decision tree to the residuals
                tree = DecisionTreeRegressor(max_depth=self.max_depth, min_samples_split=self.min_sample_split)
                tree.fit(X, residuals)

                # Update the running scores with the shrunken tree output
                class_scores[:, j] += self.learning_rate * tree.predict(X)

                self.trees[j].append(tree)

    def predict_proba(self, X):
        """
        Predict the class probabilities for the given data.

        Returns:
            Array of shape (n_samples, n_classes_); column ``j`` belongs to
            ``classes_[j]``. Each column is an independent sigmoid of that
            class's score, and rows are then rescaled to sum to 1.
        """
        n_samples = X.shape[0]

        # Initialize every row with the per-class prior scores
        class_scores = np.zeros((n_samples, self.n_classes_))
        class_scores += self.initial_predictions

        # Accumulate the shrunken contribution of every tree
        for j, class_trees in enumerate(self.trees):
            for tree in class_trees:
                class_scores[:, j] += self.learning_rate * tree.predict(X)

        # Convert to probabilities
        probabilities = self._sigmoid(class_scores)

        # Normalize so each row sums to 1
        probabilities /= np.sum(probabilities, axis=1, keepdims=True)

        return probabilities

    def predict(self, X):
        """
        Predict class for given data.

        Returns:
            Array of predicted labels taken from ``classes_`` (for labels
            0..K-1 this equals the argmax column index).
        """
        probabilities = self.predict_proba(X)
        return self.classes_[np.argmax(probabilities, axis=1)]

def load_8newsgroup_data():
    """
    Simplified 8newsgroup dataset loader

    Downloads the 8newsgroup archive into a temporary directory, reads the
    label-id -> category-name mapping from a ``data_settings.txt`` file and
    parses the train/test ``feature_matrix.txt`` files into sparse matrices.

    Returns:
        Tuple ``(X_train, y_train, X_test, y_test, category_names)``:
        the X values are scipy CSR matrices sharing one column count, the y
        values are NumPy arrays of the integer labels read from the files, and
        ``category_names`` lists the category names ordered by label id
        (empty when no mapping could be read).

    Raises:
        FileNotFoundError: if the extracted archive lacks a train or test
            ``feature_matrix.txt``.
    """
    import urllib.request
    import zipfile
    import tempfile
    import re

    url = 'https://www.khoury.northeastern.edu/home/vip/teach/MLcourse/data/8newsgroup.zip'

    # Parses lines such as: intId=8496,extId=8496,intLabel=6,extLabel=space
    label_pattern = re.compile(r'intLabel=(\d+),extLabel=(\w+)')

    def load_file(file_path):
        """Load sparse data in TREC format ("label idx:val idx:val ...")."""
        labels, rows, cols, values = [], [], [], []

        with open(file_path, 'r') as f:
            for line in f:
                parts = line.split()
                if not parts:
                    continue
                # Number rows by samples read so far (not by raw line number)
                # so that blank lines cannot shift rows away from their labels.
                row = len(labels)
                labels.append(int(parts[0]))
                for feat in parts[1:]:
                    if ':' in feat:
                        idx, val = feat.split(':')
                        rows.append(row)
                        cols.append(int(idx))
                        values.append(float(val))

        return labels, (values, (rows, cols))

    with tempfile.TemporaryDirectory() as temp_dir:
        # Download
        zip_path = os.path.join(temp_dir, '8newsgroup.zip')
        urllib.request.urlretrieve(url, zip_path)

        # Extract
        with zipfile.ZipFile(zip_path, 'r') as zip_ref:
            zip_ref.extractall(temp_dir)

        # Locate the settings and feature files. Directory names are matched
        # relative to temp_dir so the randomly generated temp directory name
        # can never satisfy the 'train' / 'test' checks by accident.
        data_settings_files = []
        train_file = test_file = None
        for root, _dirs, files in os.walk(temp_dir):
            rel_root = os.path.relpath(root, temp_dir)

            if 'data_settings.txt' in files and ('train.trec' in rel_root or 'test.trec' in rel_root):
                data_settings_files.append(os.path.join(root, 'data_settings.txt'))

            if 'feature_matrix.txt' in files:
                if 'train' in rel_root:
                    train_file = os.path.join(root, 'feature_matrix.txt')
                elif 'test' in rel_root:
                    test_file = os.path.join(root, 'feature_matrix.txt')

        # Try to read category mappings, preferring the train.trec copy.
        # os.walk order is arbitrary, so sort train paths to the front
        # (False sorts before True; the sort is stable).
        data_settings_files.sort(
            key=lambda path: 'train.trec' not in os.path.relpath(path, temp_dir)
        )

        categories = {}
        if data_settings_files:
            with open(data_settings_files[0], 'r') as f:
                for line in f:
                    match = label_pattern.search(line)
                    if match:
                        categories[int(match.group(1))] = match.group(2)

        category_names = [categories[label_id] for label_id in sorted(categories)]

        # Fail with a clear message instead of a TypeError from open(None)
        if train_file is None or test_file is None:
            raise FileNotFoundError(
                "feature_matrix.txt for the train and/or test split was not "
                "found in the 8newsgroup archive"
            )

        # Load train and test data
        y_train, train_data = load_file(train_file)
        y_test, test_data = load_file(test_file)

        # Both matrices must share one column count, so size them by the
        # largest feature index seen in either split.
        train_max_feature = max(train_data[1][1], default=0)
        test_max_feature = max(test_data[1][1], default=0)
        n_features = max(train_max_feature, test_max_feature) + 1  # +1 because indices start at 0

        # Convert to sparse matrices
        X_train = csr_matrix(train_data, shape=(len(y_train), n_features))
        X_test = csr_matrix(test_data, shape=(len(y_test), n_features))

        return X_train, np.array(y_train), X_test, np.array(y_test), category_names

def find_best_parameters(data):
    """
    Find the best parameters for the model.

    Trains one GradientBoostingClassifier per entry of the parameter grid,
    prints train/test accuracy for each, then reports the configuration with
    the highest test accuracy together with its per-class test accuracy.

    Parameters:
        data: tuple ``(X_train, y_train, X_test, y_test, target_names)`` in
            the layout returned by load_8newsgroup_data().

    NOTE(review): the winning configuration is chosen on the test split, so
    the reported "best test accuracy" is an optimistic estimate; a separate
    validation split would give an unbiased number.
    """
    print("\n" + "="*50)
    print("Searching for Best Parameters")
    print("="*50)

    X_train, y_train, X_test, y_test, target_names = data

    # Define parameters combine (disabled entries are kept for reference)
    param_grid = [
        # {'n_estimators': 100, 'learning_rate': 0.1, 'max_depth': 2, 'min_sample_split': 2},
        # {'n_estimators': 100, 'learning_rate': 0.1, 'max_depth': 4, 'min_sample_split': 2},
        # {'n_estimators': 100, 'learning_rate': 0.1, 'max_depth': 7, 'min_sample_split': 2},
        # {'n_estimators': 100, 'learning_rate': 0.1, 'max_depth': 7, 'min_sample_split': 20},
        # {'n_estimators': 150, 'learning_rate': 0.1, 'max_depth': 2, 'min_sample_split': 2},
        # {'n_estimators': 150, 'learning_rate': 0.1, 'max_depth': 5, 'min_sample_split': 2},
        {'n_estimators': 100, 'learning_rate': 0.1, 'max_depth': 5, 'min_sample_split': 20},
        {'n_estimators': 150, 'learning_rate': 0.1, 'max_depth': 6, 'min_sample_split': 5},
    ]

    best_test_acc = 0
    best_params = None
    best_model = None
    best_test_pred = None

    # Test params
    for idx, params in enumerate(param_grid):
        print(f"\n[{idx+1}/{len(param_grid)}] Testing: {params}")

        # train model
        model = GradientBoostingClassifier(**params)
        model.fit(X_train, y_train)

        # Evaluate
        train_pred = model.predict(X_train)
        test_pred = model.predict(X_test)

        train_acc = accuracy_score(y_train, train_pred)
        test_acc = accuracy_score(y_test, test_pred)

        print(f"  Train: {train_acc:.4f}, Test: {test_acc:.4f}, Gap: {train_acc - test_acc:.4f}")

        # Update. The first model is always accepted so that best_model is
        # set even if every configuration scores 0 on the test split.
        if best_model is None or test_acc > best_test_acc:
            best_test_acc = test_acc
            best_params = params
            best_model = model
            # Keep the predictions so the best model is not evaluated twice
            best_test_pred = test_pred

    print("\n" + "="*50)
    print("Best Model Analysis")
    print("="*50)
    print(f"Best parameters: {best_params}")
    print(f"Best test accuracy: {best_test_acc:.4f}")

    print("\nPer-class accuracy (best model):")
    # NOTE(review): assumes labels are the integers 0..n_classes_-1 and that
    # target_names is indexed by label id - confirm against the data loader.
    for i in range(best_model.n_classes_):
        mask = y_test == i
        if np.sum(mask) > 0:
            class_acc = accuracy_score(y_test[mask], best_test_pred[mask])
            # Fall back to the numeric id when no name is available for it
            class_name = target_names[i] if i < len(target_names) else str(i)
            print(f"  {class_name}: {class_acc:.2%}")


if __name__ == '__main__':
    # Script entry point: fetch the complete dataset once and hand it
    # straight to the parameter search.
    find_best_parameters(load_8newsgroup_data())


Searching for Best Parameters

[1/2] Testing: {'n_estimators': 100, 'learning_rate': 0.1, 'max_depth': 5, 'min_sample_split': 20}
  Train: 0.8574, Test: 0.7939, Gap: 0.0635

[2/2] Testing: {'n_estimators': 150, 'learning_rate': 0.1, 'max_depth': 6, 'min_sample_split': 5}
  Train: 0.9073, Test: 0.8255, Gap: 0.0817

Best Model Analysis
Best parameters: {'n_estimators': 150, 'learning_rate': 0.1, 'max_depth': 6, 'min_sample_split': 5}
Best test accuracy: 0.8255

Per-class accuracy (best model):
  religion: 80.99%
  computer: 95.30%
  forsale: 72.31%
  autos: 76.57%
  sports: 85.68%
  med: 48.99%
  space: 69.54%
  politics: 74.19%
