<a href="https://colab.research.google.com/github/PCBZ/CS6140/blob/main/HW4/HW4_Problem2.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [18]:
from re import X
import numpy as np
from sklearn.tree import DecisionTreeRegressor
import os
import urllib.request
import zipfile
from scipy.sparse import csr_matrix
from sklearn.metrics import accuracy_score
import shutil
# import matplotlib.pyplot as plt
# import seaborn as sns
import time


class GradientBoostingClassifier:
    """
    Gradient Boosting Classifier using decision trees as weak learners.

    Multi-class targets are handled one-vs-rest: each class gets its own
    additive ensemble of regression trees, fitted to the residuals between
    the binary class indicator and the sigmoid of the class's current score.

    Args:
        n_estimators: Number of boosting rounds, i.e. trees fitted per class.
        learning_rate: Factor applied to every tree's prediction before it is
            added to the class score.
        max_depth: ``max_depth`` given to each ``DecisionTreeRegressor``.
        min_sample_split: ``min_samples_split`` given to each tree.
    """
    def __init__(self, n_estimators=100, learning_rate=0.1, max_depth=3, min_sample_split=2):
        self.n_estimators = n_estimators
        self.learning_rate = learning_rate
        self.max_depth = max_depth
        self.min_sample_split = min_sample_split
        # After fit(): trees[j] holds the trees of class j in boosting order.
        self.trees = []

    def fit(self, X, y):
        """
        Fit the model to the training data.

        Args:
            X: Feature matrix with one row per sample. Only ``X.shape`` is
                read here; everything else is delegated to the trees.
            y: Class label of every sample.

        Returns:
            The fitted estimator itself.
        """
        y = np.asarray(y)
        self.classes_ = np.unique(y)
        self.n_classes_ = len(self.classes_)

        n_samples = X.shape[0]

        # One 0/1 indicator vector per class. The comparison uses the actual
        # label value (not the column index) so that label sets other than
        # 0..K-1 are handled correctly. Built once: it does not change
        # between boosting rounds.
        indicators = [(y == label).astype(int) for label in self.classes_]

        # Start every class score at the log-odds of the class frequency;
        # the epsilons keep the logarithm finite for p == 0 or p == 1.
        class_scores = np.zeros((n_samples, self.n_classes_))
        self.initial_predictions = np.zeros(self.n_classes_)
        for j, y_bin in enumerate(indicators):
            p = np.mean(y_bin)
            self.initial_predictions[j] = np.log((p + 1e-8) / (1 - p + 1e-8))
            class_scores[:, j] = self.initial_predictions[j]

        self.trees = [[] for _ in range(self.n_classes_)]

        for _ in range(self.n_estimators):
            # For each class, fit a tree
            for j, y_bin in enumerate(indicators):
                # Calculate the predict probability using sigmoid
                y_pred_prob = 1 / (1 + np.exp(-class_scores[:, j]))

                # Residuals = negative gradient of the logistic loss
                residuals = y_bin - y_pred_prob

                # Fit a decision tree to the residuals
                tree = DecisionTreeRegressor(max_depth=self.max_depth, min_samples_split=self.min_sample_split)
                tree.fit(X, residuals)

                # Update the predictions
                class_scores[:, j] += self.learning_rate * tree.predict(X)

                self.trees[j].append(tree)

        return self

    def predict_proba(self, X):
        """
        Predict the class probabilities for the given data.

        Each class score is passed through a sigmoid and the rows are then
        rescaled to sum to one (this is a renormalisation of independent
        one-vs-rest probabilities, not a softmax).

        Args:
            X: Feature matrix with one row per sample.

        Returns:
            Array of shape ``(n_samples, n_classes_)``; column ``j`` belongs
            to ``classes_[j]``.
        """
        n_samples = X.shape[0]

        # Initialize with initial predictions (one identical row per sample)
        base_scores = np.asarray(self.initial_predictions, dtype=float)
        class_scores = np.tile(base_scores, (n_samples, 1))

        # Accumulate predictions
        for j in range(self.n_classes_):
            for tree in self.trees[j]:
                class_scores[:, j] += self.learning_rate * tree.predict(X)

        # Convert to probabilities
        probabilities = 1 / (1 + np.exp(-class_scores))

        # Normalize
        probabilities /= np.sum(probabilities, axis=1, keepdims=True)

        return probabilities

    def predict(self, X):
        """
        Predict class for given data.

        Returns:
            Array with one entry of ``classes_`` per row of ``X`` (the label
            whose probability is highest).
        """
        probabilities = self.predict_proba(X)
        # Map the winning column back to its label value.
        return self.classes_[np.argmax(probabilities, axis=1)]


def download_and_extract_data(url='https://www.khoury.northeastern.edu/home/vip/teach/MLcourse/data/8newsgroup.zip',
                              data_dir='./data'):
    """
    Fetch the 8newsgroup archive and unpack it into ``data_dir``.

    An existing ``data_dir`` is deleted recursively first, so every call
    works on a freshly downloaded copy. Download and extraction errors are
    reported on stdout and then re-raised unchanged.

    Args:
        url: Location of the zip archive.
        data_dir: Directory that receives the extracted files.

    Returns:
        ``data_dir``, unchanged.
    """
    # Start from a clean slate: drop whatever a previous run left behind.
    if os.path.exists(data_dir):
        print(f"Removing existing data directory: {data_dir}")
        shutil.rmtree(data_dir)
    os.makedirs(data_dir)

    archive_path = os.path.join(data_dir, '8newsgroup.zip')

    # Step 1: download
    print(f"Downloading dataset from {url}...")
    try:
        urllib.request.urlretrieve(url, archive_path)
        print("Download completed!")
    except Exception as download_error:
        print(f"Error downloading file: {download_error}")
        raise

    # Step 2: unpack, then discard the archive to save space
    print("Extracting dataset...")
    try:
        with zipfile.ZipFile(archive_path, 'r') as archive:
            archive.extractall(data_dir)
        print("Extraction completed!")

        os.remove(archive_path)
        print("Removed zip file.")
    except Exception as extract_error:
        print(f"Error extracting file: {extract_error}")
        raise

    return data_dir


def load_sparse_data(file_path):
    """
    Load sparse matrix data from file.
    Format: label feature_id:value feature_id:value ...

    Blank lines are ignored and do not consume a row index, so the row ids
    always run from 0 to ``len(labels) - 1``. Tokens without a ``:`` are
    skipped.

    Args:
        file_path: Path of the text file to read.

    Returns:
        ``(labels, (values, (rows, cols)))`` where ``labels`` is a list of
        ints (one per data row) and ``values``/``rows``/``cols`` are parallel
        lists describing the non-zero entries in coordinate form.

    Raises:
        ValueError: If a label, feature id or value cannot be converted.
    """
    labels = []
    rows = []
    cols = []
    values = []

    with open(file_path, 'r') as f:
        for line in f:
            parts = line.split()
            if not parts:  # Skip empty lines
                continue

            # Number rows by accepted samples, not by physical file line;
            # otherwise a blank line would leave a gap in the row ids.
            row = len(labels)
            labels.append(int(parts[0]))

            for feat_val in parts[1:]:
                feat_id, sep, val = feat_val.partition(':')
                if not sep:  # Not a feature:value pair
                    continue
                rows.append(row)
                cols.append(int(feat_id))
                values.append(float(val))

    return labels, (values, (rows, cols))


def _read_int_config(config_file):
    """Parse the ``key=value`` lines of *config_file* into a dict of ints."""
    config = {}
    with open(config_file, 'r') as f:
        for raw_line in f:
            line = raw_line.strip()
            if not line or line.startswith('#'):
                continue
            key, sep, val = line.partition('=')
            if not sep:
                continue
            try:
                config[key.strip()] = int(val)
            except ValueError:
                # Only integer settings are used by the loader; anything else
                # is skipped rather than aborting the whole data load.
                continue
    return config


def load_and_prepare_data():
    """
    Load 8newsgroup dataset.

    Downloads a fresh copy of the archive, locates the ``train.trec`` and
    ``test.trec`` directories, reads the optional ``config.txt`` and builds
    CSR feature matrices for both splits.

    Returns:
        Dict with the keys ``X_train``, ``y_train``, ``X_test``, ``y_test``
        (CSR matrices / label arrays), ``vectorizer`` (always ``None``) and
        ``target_names`` (list of category names).

    Raises:
        ValueError: If the training directory or the training feature file
            cannot be found.
    """
    print("Loading 8newsgroup dataset...")

    # Feature count used when config.txt is missing or lacks 'numFeatures'.
    default_n_features = 1754

    # Download and extract (always fresh)
    data_path = download_and_extract_data()

    # Based on the typical structure, the files should be in:
    # data/train.trec/ and data/test.trec/
    train_dir = os.path.join(data_path, 'train.trec')
    test_dir = os.path.join(data_path, 'test.trec')

    # Check if directories exist
    if not os.path.exists(train_dir):
        print(f"\nTrain directory not found at: {train_dir}")
        print("Checking alternative locations...")

        # The archive may wrap everything in an extra folder; take the first
        # directory that contains 'train.trec' and assume 'test.trec' sits
        # next to it.
        for root, dirs, _files in os.walk(data_path):
            if 'train.trec' in dirs:
                train_dir = os.path.join(root, 'train.trec')
                test_dir = os.path.join(root, 'test.trec')
                print(f"Found directories at: {root}")
                break

    if not os.path.exists(train_dir):
        raise ValueError(f"Could not find training directory. Expected at: {train_dir}")

    print("\nUsing directories:")
    print(f"  Train: {train_dir}")
    print(f"  Test: {test_dir}")

    # Load config
    config_file = os.path.join(train_dir, 'config.txt')
    if os.path.exists(config_file):
        config = _read_int_config(config_file)
        n_features = config.get('numFeatures', default_n_features)
        print(f"\nLoaded config: {config}")
    else:
        print(f"Warning: config.txt not found at {config_file}")
        n_features = default_n_features

    # Load training data
    print("\nLoading training data...")
    train_file = os.path.join(train_dir, 'feature_matrix.txt')
    if not os.path.exists(train_file):
        raise ValueError(f"Training data file not found: {train_file}")

    y_train, train_data = load_sparse_data(train_file)
    X_train = csr_matrix(train_data, shape=(len(y_train), n_features))
    y_train = np.array(y_train)

    # Load test data
    print("Loading test data...")
    test_file = os.path.join(test_dir, 'feature_matrix.txt')
    y_test, test_data = load_sparse_data(test_file)
    X_test = csr_matrix(test_data, shape=(len(y_test), n_features))
    y_test = np.array(y_test)

    # Get category names
    categories = ['religion', 'graphics', 'windows', 'ibm.hardware',
                  'mac.hardware', 'windows.x', 'forsale', 'autos']

    print("\nDataset loaded successfully!")
    print(f"  Training: {X_train.shape[0]} samples")
    print(f"  Test: {X_test.shape[0]} samples")
    print(f"  Features: {n_features}")
    print(f"  Classes: {len(categories)}")
    print(f"  Sparsity: {1 - X_train.nnz / (X_train.shape[0] * n_features):.1%}")

    return {
        'X_train': X_train,
        'y_train': y_train,
        'X_test': X_test,
        'y_test': y_test,
        'vectorizer': None,
        'target_names': categories
    }

def run_training_and_evaluate(data):
    """
    Train one fixed-configuration model and print its accuracy figures.

    Args:
        data: Dict providing ``X_train``, ``y_train``, ``X_test``, ``y_test``
            and ``target_names``.

    Prints overall train/test accuracy followed by the test accuracy of
    every class that occurs in the test labels. Returns nothing.
    """
    banner = "=" * 50
    print("\n" + banner)
    print("Starting Complete Training Process")
    print(banner)

    X_train = data['X_train']
    y_train = data['y_train']
    X_test = data['X_test']
    y_test = data['y_test']
    target_names = data['target_names']

    # Create model and train
    model = GradientBoostingClassifier(n_estimators=200, learning_rate=0.05, max_depth=5)
    model.fit(X_train, y_train)

    # Predict
    y_pred = model.predict(X_test)

    train_accuracy = accuracy_score(y_train, model.predict(X_train))
    test_accuracy = accuracy_score(y_test, y_pred)
    print(f"Training accuracy: {train_accuracy:.2%}")
    print(f"Test accuracy: {test_accuracy:.2%}")

    # Per-class accuracy
    print("\nPer-class accuracy (Test set):")
    for class_idx in range(model.n_classes_):
        in_class = y_test == class_idx
        if not np.any(in_class):
            # Class absent from the test labels: nothing to report.
            continue
        class_acc = accuracy_score(y_test[in_class], y_pred[in_class])
        if class_idx < len(target_names):
            class_name = target_names[class_idx]
        else:
            class_name = f"Class {class_idx}"
        print(f"  {class_name}: {class_acc:.2%}")

    # Optional confusion-matrix plot, disabled together with the
    # matplotlib/seaborn imports at the top of the file:
    #
    # cm = confusion_matrix(y_test, y_pred)
    #
    # plt.figure(figsize=(10, 8))
    #
    # sns.heatmap(cm, annot=True, fmt='d', cmap='Blues',
    #             xticklabels=data['target_names'],
    #             yticklabels=data['target_names'])
    #
    # plt.title('Confusion Matrix - Gradient Boosting Classifier')
    # plt.xlabel('Predicted Label')
    # plt.ylabel('True Label')
    # plt.tight_layout()
    #
    # plt.show()

def find_best_parameters(data, param_grid=None):
    """
    Find the best parameters for the model.

    Every configuration in ``param_grid`` is trained on the training split
    and scored on both splits; the one with the highest test accuracy wins
    (the earliest configuration wins ties).

    Args:
        data: Dict providing ``X_train``, ``y_train``, ``X_test``, ``y_test``
            and ``target_names``.
        param_grid: Optional list of keyword-argument dicts for
            ``GradientBoostingClassifier``. ``None`` selects the built-in
            default grid.

    Returns:
        ``(best_model, best_params, results)``. ``results`` has one dict per
        configuration with the keys ``params``, ``train_acc``, ``test_acc``,
        ``gap``, ``train_time`` and ``model``. If the grid is empty the
        result is ``(None, None, [])``.
    """
    banner = "=" * 50
    print("\n" + banner)
    print("Searching for Best Parameters")
    print(banner)

    # Define parameters combine
    if param_grid is None:
        param_grid = [
            {'n_estimators': 150, 'learning_rate': 0.1, 'max_depth': 10, 'min_sample_split': 2},
            # {'n_estimators': 150, 'learning_rate': 0.1, 'max_depth': 8, 'min_sample_split': 20}
        ]

    results = []
    # Start below any reachable accuracy so that the first evaluated model is
    # always recorded, even when its test accuracy is exactly 0.
    best_test_acc = float('-inf')
    best_params = None
    best_model = None
    best_test_pred = None

    # Test params
    for idx, params in enumerate(param_grid):
        print(f"\n[{idx+1}/{len(param_grid)}] Testing: {params}")

        # train model
        model = GradientBoostingClassifier(**params)
        start_time = time.time()
        model.fit(data['X_train'], data['y_train'])
        train_time = time.time() - start_time

        # Evaluate
        train_pred = model.predict(data['X_train'])
        test_pred = model.predict(data['X_test'])

        train_acc = accuracy_score(data['y_train'], train_pred)
        test_acc = accuracy_score(data['y_test'], test_pred)

        results.append({
            'params': params,
            'train_acc': train_acc,
            'test_acc': test_acc,
            'gap': train_acc - test_acc,
            'train_time': train_time,
            'model': model
        })

        print(f"  Train: {train_acc:.4f}, Test: {test_acc:.4f}, Gap: {train_acc - test_acc:.4f}")
        print(f"  Training time: {train_time:.1f}s")

        # Update
        if test_acc > best_test_acc:
            best_test_acc = test_acc
            best_params = params
            best_model = model
            # Keep the predictions so the per-class report below does not
            # have to run the whole ensemble a second time.
            best_test_pred = test_pred

    if best_model is None:
        # Empty grid: there is nothing to summarise or analyse.
        print("\nNo parameter configurations were evaluated.")
        return None, None, results

    print("\n" + banner)
    print("Results Summary")
    print(banner)

    # Sort top 5 accuracy
    results_sorted = sorted(results, key=lambda x: x['test_acc'], reverse=True)

    print("\nTop 5 configurations by test accuracy:")
    for i, r in enumerate(results_sorted[:5]):
        print(f"\n{i+1}. {r['params']}")
        print(f"   Test: {r['test_acc']:.4f}, Train: {r['train_acc']:.4f}, Gap: {r['gap']:.4f}")

    print("\n" + banner)
    print("Best Model Analysis")
    print(banner)
    print(f"Best parameters: {best_params}")
    print(f"Best test accuracy: {best_test_acc:.4f}")

    target_names = data['target_names']
    print("\nPer-class accuracy (best model):")
    for i in range(best_model.n_classes_):
        mask = data['y_test'] == i
        if np.sum(mask) > 0:
            class_acc = accuracy_score(data['y_test'][mask], best_test_pred[mask])
            # Fall back to a generic name when there are more classes than
            # names, as run_training_and_evaluate does.
            class_name = target_names[i] if i < len(target_names) else f"Class {i}"
            print(f"  {class_name}: {class_acc:.2%}")

    return best_model, best_params, results

if __name__ == '__main__':
    # Script entry point: fetch and parse the dataset, report its size, then
    # run the parameter search and print the winning configuration.
    data = load_and_prepare_data()

    train_features = data['X_train']
    train_labels = np.unique(data['y_train'])

    print("\nDataset summary:")
    print(f"  Training samples: {train_features.shape[0]}")
    print(f"  Test samples: {data['X_test'].shape[0]}")
    print(f"  Feature dimension: {train_features.shape[1]}")
    print(f"  Number of classes: {len(train_labels)}")
    print(f"  Classes: {train_labels}")

    # run_training_and_evaluate(data)
    best_model, best_params, results = find_best_parameters(data)

    banner = "=" * 50
    print("\n" + banner)
    print("Best Model Summary")
    print(banner)
    print(f"Best parameters: {best_params}")

Loading 8newsgroup dataset...
Removing existing data directory: ./data
Downloading dataset from https://www.khoury.northeastern.edu/home/vip/teach/MLcourse/data/8newsgroup.zip...
Download completed!
Extracting dataset...
Extraction completed!
Removed zip file.

Using directories:
  Train: ./data/train.trec
  Test: ./data/test.trec

Loaded config: {'numClasses': 8, 'numDataPoints': 11314, 'numFeatures': 1754}

Loading training data...
Loading test data...

Dataset loaded successfully!
  Training: 11314 samples
  Test: 7532 samples
  Features: 1754
  Classes: 8
  Sparsity: 98.9%

Dataset summary:
  Training samples: 11314
  Test samples: 7532
  Feature dimension: 1754
  Number of classes: 8
  Classes: [0 1 2 3 4 5 6 7]

Searching for Best Parameters

[1/1] Testing: {'n_estimators': 150, 'learning_rate': 0.1, 'max_depth': 10, 'min_sample_split': 2}
  Train: 0.9547, Test: 0.8456, Gap: 0.1092
  Training time: 232.4s

Results Summary

Top 5 configurations by test accuracy:

1. {'n_estimators