<a href="https://colab.research.google.com/github/PCBZ/CS6140/blob/main/HW4/HW4_Problem3.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [26]:
import numpy as np
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import classification_report, confusion_matrix
from scipy.sparse import csr_matrix
import pandas as pd
import os
import shutil
import urllib.request
import zipfile

def _read_category_names(settings_path):
    """Return category names ordered by integer label id.

    Each relevant line of the settings file looks like
    ``intId=8496,extId=8496,intLabel=6,extLabel=space``; lines that do not
    contain an ``intLabel=...,extLabel=...`` pair are ignored.
    """
    import re

    label_pattern = re.compile(r'intLabel=(\d+),extLabel=(\w+)')
    categories = {}
    with open(settings_path, 'r') as f:
        for line in f:
            match = label_pattern.search(line)
            if match:
                categories[int(match.group(1))] = match.group(2)
    return [categories[label_id] for label_id in sorted(categories)]


def _load_sparse_file(file_path):
    """Parse ``label idx:val idx:val ...`` lines into labels and COO triplets.

    Returns ``(labels, (values, (rows, cols)))``, the second element being
    directly usable as the first argument of ``csr_matrix``.
    """
    labels, rows, cols, values = [], [], [], []
    with open(file_path, 'r') as f:
        for line in f:
            parts = line.split()
            if not parts:
                continue
            # Number rows by accepted samples, not by physical line, so blank
            # lines cannot push row indices out of step with the labels.
            row = len(labels)
            labels.append(int(parts[0]))
            for feat in parts[1:]:
                idx, sep, val = feat.partition(':')
                if not sep:
                    continue  # token is not an index:value pair
                rows.append(row)
                cols.append(int(idx))
                values.append(float(val))
    return labels, (values, (rows, cols))


def load_8newsgroup_data(url='https://www.khoury.northeastern.edu/home/vip/teach/MLcourse/data/8newsgroup.zip'):
    """
    Simplified 8newsgroup dataset loader.

    Downloads the zip archive at ``url`` into a temporary directory, extracts
    it, and reads the train/test ``feature_matrix.txt`` files plus the
    category names from ``data_settings.txt``.

    Args:
        url: Location of the dataset archive. Defaults to the course-hosted
            8newsgroup.zip.

    Returns:
        Tuple ``(X_train, y_train, X_test, y_test, category_names)`` where the
        X values are ``csr_matrix`` instances sharing the same number of
        columns, the y values are numpy arrays of integer labels, and
        ``category_names`` is a list ordered by integer label id (empty when no
        settings file is found).

    Raises:
        FileNotFoundError: If the archive has no train or no test
            ``feature_matrix.txt``.
    """
    import tempfile

    with tempfile.TemporaryDirectory() as temp_dir:
        # Download
        zip_path = os.path.join(temp_dir, '8newsgroup.zip')
        urllib.request.urlretrieve(url, zip_path)

        # Extract
        with zipfile.ZipFile(zip_path, 'r') as zip_ref:
            zip_ref.extractall(temp_dir)

        train_settings = test_settings = None
        train_file = test_file = None
        for root, _dirs, files in os.walk(temp_dir):
            # Match on the path inside the archive only: the temporary
            # directory prefix may itself contain "train" or "test".
            rel_root = os.path.relpath(root, temp_dir)

            if 'data_settings.txt' in files:
                settings_path = os.path.join(root, 'data_settings.txt')
                if 'train.trec' in rel_root:
                    train_settings = train_settings or settings_path
                elif 'test.trec' in rel_root:
                    test_settings = test_settings or settings_path

            if 'feature_matrix.txt' in files:
                if 'train' in rel_root:
                    train_file = os.path.join(root, 'feature_matrix.txt')
                elif 'test' in rel_root:
                    test_file = os.path.join(root, 'feature_matrix.txt')

        # Prefer the train.trec mapping; fall back to test.trec.
        settings_file = train_settings or test_settings
        category_names = _read_category_names(settings_file) if settings_file else []

        missing = [name for name, path in (('train', train_file), ('test', test_file)) if path is None]
        if missing:
            raise FileNotFoundError(
                f"feature_matrix.txt not found for: {', '.join(missing)}"
            )

        # Load train and test data
        y_train, train_data = _load_sparse_file(train_file)
        y_test, test_data = _load_sparse_file(test_file)

        # Both matrices must share a column count, so size them by the largest
        # feature index seen in either split (+1 because indices start at 0).
        train_cols = train_data[1][1]
        test_cols = test_data[1][1]
        n_features = max(max(train_cols, default=0), max(test_cols, default=0)) + 1

        # Convert to sparse matrices
        X_train = csr_matrix(train_data, shape=(len(y_train), n_features))
        X_test = csr_matrix(test_data, shape=(len(y_test), n_features))

        return X_train, np.array(y_train), X_test, np.array(y_test), category_names


def main(n_top_features=200):
    """Select features with an L1 model, then train and evaluate an L2 model.

    Fits an L1-penalised logistic regression on the full feature set, keeps
    the ``n_top_features`` features with the largest summed absolute
    coefficient across classes, trains an L2-penalised logistic regression on
    those features, and prints per-class and overall test accuracy.

    Args:
        n_top_features: Number of highest-weighted features to keep.

    Raises:
        ValueError: If ``n_top_features`` is smaller than 1.
    """
    if n_top_features < 1:
        raise ValueError("n_top_features must be a positive integer")

    # Load data
    X_train, y_train, X_test, y_test, categories = load_8newsgroup_data()

    # Using strong L1 regularization
    l1_model = LogisticRegression(penalty='l1', solver='saga', C=0.15, max_iter=2000, random_state=42, tol=0.001)
    l1_model.fit(X_train, y_train)

    # Rank features by total absolute weight over all classes; argsort is
    # ascending, so the tail of the ordering holds the strongest features.
    feature_weight = np.abs(l1_model.coef_).sum(axis=0)
    top_indices = np.argsort(feature_weight)[-n_top_features:]

    X_train_selected = X_train[:, top_indices]
    X_test_selected = X_test[:, top_indices]

    l2_model = LogisticRegression(penalty='l2', solver='lbfgs', C=1.0, max_iter=1000, random_state=42, tol=0.0001)
    l2_model.fit(X_train_selected, y_train)

    y_pred = l2_model.predict(X_test_selected)

    cm = confusion_matrix(y_test, y_pred)
    class_totals = cm.sum(axis=1)
    # A class that is predicted but never occurs in y_test has a zero row sum;
    # report 0 for it instead of dividing by zero.
    class_accuracies = np.divide(
        cm.diagonal(),
        class_totals,
        out=np.zeros(len(class_totals), dtype=float),
        where=class_totals > 0,
    )

    # Show per-class accuracy
    print("\nAccuracy per class:")
    print("-"*60)
    for i, (category, accuracy) in enumerate(zip(categories, class_accuracies)):
        print(f"Class {i} ({category:<15}): {accuracy:6.2%}")

    # Overall accuracy, reusing the predictions already computed above
    overall_accuracy = float(np.mean(y_pred == y_test))
    print("-"*60)
    print(f"Overall Accuracy: {overall_accuracy:.2%}")

# Run the pipeline only when this file is executed as a script, not when it is imported.
if __name__ == "__main__":
    main()


Accuracy per class:
------------------------------------------------------------
Class 0 (religion       ): 86.16%
Class 1 (computer       ): 95.08%
Class 2 (forsale        ): 69.74%
Class 3 (autos          ): 80.23%
Class 4 (sports         ): 89.45%
Class 5 (med            ): 59.09%
Class 6 (space          ): 79.19%
Class 7 (politics       ): 82.76%
------------------------------------------------------------
Overall Accuracy: 86.02%
