<a href="https://colab.research.google.com/github/PCBZ/CS6140/blob/main/HW4/HW4_Problem3.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [12]:
import numpy as np
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import confusion_matrix
import pandas as pd
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.datasets import fetch_20newsgroups

def fetch_data():
    """
    Fetch the 20 Newsgroups dataset, vectorize the text using TF-IDF,
    and split into train and test sets.
    """
    categories = [
        'soc.religion.christian',
        'comp.graphics',
        'misc.forsale',
        'rec.autos',
        'sci.space',
        'sci.med',
        'rec.sport.hockey',
        'talk.politics.guns'
    ]

    newsgroups_train = fetch_20newsgroups(subset='train', categories=categories, remove=('headers', 'footers', 'quotes'))
    X_train_text, y_train = newsgroups_train.data, newsgroups_train.target

    newsgroups_test = fetch_20newsgroups(subset='test', categories=categories, remove=('headers', 'footers', 'quotes'))
    X_test_text, y_test = newsgroups_test.data, newsgroups_test.target

    # TF-IDF vectorization
    vectorizer = TfidfVectorizer(max_features=5000, stop_words='english', ngram_range=(1, 2), min_df=2, max_df=0.95, sublinear_tf=True)

    X_train = vectorizer.fit_transform(X_train_text)
    X_test = vectorizer.transform(X_test_text)

    return X_train, y_train, X_test, y_test, newsgroups_train.target_names


def main():
    # Load data
    X_train, y_train, X_test, y_test, categories = fetch_data()

    # Using strong L1 regularization
    l1_model = LogisticRegression(penalty='l1', solver='saga', C=0.5, max_iter=2000, tol=0.001)
    l1_model.fit(X_train, y_train)

    # Get top 200 features
    top_200_indices = np.argsort(np.sum(np.abs(l1_model.coef_), axis=0))[-200:]

    X_train_selected = X_train[:, top_200_indices]
    X_test_selected = X_test[:, top_200_indices]

    l2_model = LogisticRegression(penalty='l2', solver='lbfgs', C=1.0, max_iter=1000, tol=0.0001)
    l2_model.fit(X_train_selected, y_train)

    y_pred = l2_model.predict(X_test_selected)

    cm = confusion_matrix(y_test, y_pred)
    class_accuracies = cm.diagonal() / cm.sum(axis=1)

    # Show per-class accuracy
    print("\nAccuracy per class:")
    print("-"*60)
    for i, (category, accuracy) in enumerate(zip(categories, class_accuracies)):
        correct = cm[i, i]
        total = cm[i].sum()
        print(f"Class {i} ({category:<15}): {accuracy:6.2%}")

    # Overall accuracy
    overall_accuracy = l2_model.score(X_test_selected, y_test)
    print("-"*60)
    print(f"Overall Accuracy: {overall_accuracy:.2%}")

if __name__ == "__main__":
    main()


Accuracy per class:
------------------------------------------------------------
Class 0 (comp.graphics  ): 80.21%
Class 1 (misc.forsale   ): 76.15%
Class 2 (rec.autos      ): 61.36%
Class 3 (rec.sport.hockey): 83.46%
Class 4 (sci.med        ): 65.40%
Class 5 (sci.space      ): 78.68%
Class 6 (soc.religion.christian): 77.89%
Class 7 (talk.politics.guns): 65.66%
------------------------------------------------------------
Overall Accuracy: 73.67%
