In [None]:
# Import Libraries

In [1]:
import numpy as np
import pandas as pd
import string
import matplotlib.pyplot as plt
import seaborn as sns

from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from sklearn.naive_bayes import MultinomialNB
from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC
from sklearn.metrics import accuracy_score, confusion_matrix, classification_report

from sklearn.datasets import fetch_20newsgroups
from sklearn.decomposition import PCA

import nltk
from nltk.corpus import stopwords
nltk.download('stopwords')


[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\Admin\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


True

In [2]:
# Load Dataset (Binary Classification)

In [3]:
categories = ['sci.space', 'rec.sport.hockey']

data = fetch_20newsgroups(subset='all', categories=categories)

df = pd.DataFrame({
    "text": data.data,
    "label": data.target
})

df.head()


Unnamed: 0,text,label
0,From: mccall@mksol.dseg.ti.com (fred j mccall ...,1
1,From: epritcha@s.psych.uiuc.edu ( Evan Pritcha...,0
2,From: baalke@kelvin.jpl.nasa.gov (Ron Baalke)\...,1
3,From: mse@cc.bellcore.com (25836-michael evenc...,0
4,From: apanjabi@guvax.acc.georgetown.edu\nSubje...,0


In [6]:
#  Text Cleaning

In [5]:
stop_words = set(stopwords.words('english'))

def clean_text(text):
    text = text.lower()
    text = text.translate(str.maketrans('', '', string.punctuation))
    words = text.split()
    words = [w for w in words if w not in stop_words]
    return " ".join(words)

df['clean_text'] = df['text'].apply(clean_text)


In [7]:
#  Train-Test Split

In [8]:
X_train, X_test, y_train, y_test = train_test_split(
    df['clean_text'], df['label'], test_size=0.2, random_state=42
)


In [9]:
# CountVectorizer + Models

In [11]:
cv = CountVectorizer(max_features=5000)

X_train_cv = cv.fit_transform(X_train)
X_test_cv = cv.transform(X_test)

models = {
    "Naive Bayes": MultinomialNB(),
    "Logistic Regression": LogisticRegression(max_iter=1000),
    "SVM": SVC()
}

print("= CountVectorizer Results =\n")

for name, model in models.items():
    model.fit(X_train_cv, y_train)
    y_pred = model.predict(X_test_cv)
    
    print(f"Model: {name}")
    print("Accuracy:", accuracy_score(y_test, y_pred))
    print("Confusion Matrix:\n", confusion_matrix(y_test, y_pred))
    print(classification_report(y_test, y_pred))
    print("-"*50)


= CountVectorizer Results =

Model: Naive Bayes
Accuracy: 0.9949748743718593
Confusion Matrix:
 [[201   1]
 [  1 195]]
              precision    recall  f1-score   support

           0       1.00      1.00      1.00       202
           1       0.99      0.99      0.99       196

    accuracy                           0.99       398
   macro avg       0.99      0.99      0.99       398
weighted avg       0.99      0.99      0.99       398

--------------------------------------------------
Model: Logistic Regression
Accuracy: 0.992462311557789
Confusion Matrix:
 [[201   1]
 [  2 194]]
              precision    recall  f1-score   support

           0       0.99      1.00      0.99       202
           1       0.99      0.99      0.99       196

    accuracy                           0.99       398
   macro avg       0.99      0.99      0.99       398
weighted avg       0.99      0.99      0.99       398

--------------------------------------------------
Model: SVM
Accuracy: 0.97738

In [12]:
# TF-IDF + Models (Bonus Comparison)

In [16]:
tfidf = TfidfVectorizer(max_features=5000)

X_train_tfidf = tfidf.fit_transform(X_train)
X_test_tfidf = tfidf.transform(X_test)

print("-  - TF-IDF Results -  -\n")

for name, model in models.items():
    model.fit(X_train_tfidf, y_train)
    y_pred = model.predict(X_test_tfidf)
    
    print(f"Model: {name}")
    print("Accuracy:", accuracy_score(y_test, y_pred))
    print("-"*50)


-  - TF-IDF Results -  -

Model: Naive Bayes
Accuracy: 0.9949748743718593
--------------------------------------------------
Model: Logistic Regression
Accuracy: 0.9974874371859297
--------------------------------------------------
Model: SVM
Accuracy: 0.9974874371859297
--------------------------------------------------


In [14]:
# Top Important Words (Logistic Regression)

In [15]:
feature_names = tfidf.get_feature_names_out()
log_model = LogisticRegression(max_iter=1000)
log_model.fit(X_train_tfidf, y_train)

coefs = log_model.coef_[0]
top_positive = np.argsort(coefs)[-10:]
top_negative = np.argsort(coefs)[:10]

print("Top words for Class 1:")
print([feature_names[i] for i in top_positive])

print("\nTop words for Class 0:")
print([feature_names[i] for i in top_negative])


Top words for Class 1:
['nasa', 'henry', 'launch', 'earth', 'sky', 'pat', 'shuttle', 'orbit', 'moon', 'space']

Top words for Class 0:
['game', 'hockey', 'team', 'nhl', 'games', 'play', 'players', 'espn', 'go', 'pittsburgh']
