In [None]:
from sklearn.datasets import fetch_20newsgroups
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import classification_report, accuracy_score
import matplotlib.pyplot as plt

In [None]:
# Restrict the 20 Newsgroups corpus to three topics; headers, footers and
# quoted replies are removed from every post by the loader.
categories = ['rec.sport.baseball', 'sci.space', 'comp.graphics']
data = fetch_20newsgroups(
    subset='all',
    categories=categories,
    remove=('headers', 'footers', 'quotes'),
)
X, y = data.data, data.target  # documents and their target labels

In [None]:
# Split into train-test (70% / 30%, fixed seed for reproducibility).
# stratify=y keeps the class proportions identical in both splits, so the
# per-class metrics reported below are not skewed by an uneven random split.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.3, random_state=42, stratify=y
)

# Store accuracy scores for plotting.
# These lists are (re)created here, so re-running this cell starts the
# comparison from scratch.
model_names = []
accuracies = []

# 1. BoW
# Raw token counts; the vocabulary is learned from the training split only and
# then reused to encode the test split.
print("\n--- Bag of Words ---")
bow_vectorizer = CountVectorizer()
X_train_bow = bow_vectorizer.fit_transform(X_train)
X_test_bow = bow_vectorizer.transform(X_test)

# max_iter is raised to 1000 to give the solver more iterations to converge.
clf_bow = LogisticRegression(max_iter=1000)
clf_bow.fit(X_train_bow, y_train)
y_pred_bow = clf_bow.predict(X_test_bow)
acc_bow = accuracy_score(y_test, y_pred_bow)
model_names.append("BoW")
accuracies.append(acc_bow)

# Per-class precision / recall / F1 on the held-out split.
print(classification_report(y_test, y_pred_bow, target_names=data.target_names))

In [None]:
# 2. TF-IDF
# The vectorizer is fitted on the training documents only; the test documents
# are encoded with that already-fitted vectorizer.
print("\n--- TF-IDF ---")
tfidf_vectorizer = TfidfVectorizer()
X_train_tfidf, X_test_tfidf = (
    tfidf_vectorizer.fit_transform(X_train),
    tfidf_vectorizer.transform(X_test),
)

In [None]:
# Fit logistic regression on the TF-IDF features and score it on the test split.
clf_tfidf = LogisticRegression(max_iter=1000)
clf_tfidf.fit(X_train_tfidf, y_train)
y_pred_tfidf = clf_tfidf.predict(X_test_tfidf)
acc_tfidf = accuracy_score(y_test, y_pred_tfidf)

# Record the score idempotently: re-running this cell overwrites the existing
# "TF-IDF" entry instead of appending a duplicate to the comparison lists.
if "TF-IDF" in model_names:
    accuracies[model_names.index("TF-IDF")] = acc_tfidf
else:
    model_names.append("TF-IDF")
    accuracies.append(acc_tfidf)

In [None]:
# Per-class precision / recall / F1 for the TF-IDF model on the test split.
print(
    classification_report(
        y_true=y_test,
        y_pred=y_pred_tfidf,
        target_names=data.target_names,
    )
)

In [None]:
# 3. N-Gram (Bigrams)
# ngram_range=(2, 2) makes every feature a pair of adjacent tokens (no unigrams).
# As with the other vectorizers, fitting uses the training documents only.
print("\n--- N-Gram (Bigrams) ---")
ngram_vectorizer = CountVectorizer(ngram_range=(2, 2))
X_train_ngram, X_test_ngram = (
    ngram_vectorizer.fit_transform(X_train),
    ngram_vectorizer.transform(X_test),
)

In [None]:
# Fit logistic regression on the bigram counts and score it on the test split.
clf_ngram = LogisticRegression(max_iter=1000)
clf_ngram.fit(X_train_ngram, y_train)
y_pred_ngram = clf_ngram.predict(X_test_ngram)
acc_ngram = accuracy_score(y_test, y_pred_ngram)

# Record the score idempotently: re-running this cell overwrites the existing
# "N-Gram (2-grams)" entry instead of appending a duplicate to the comparison lists.
if "N-Gram (2-grams)" in model_names:
    accuracies[model_names.index("N-Gram (2-grams)")] = acc_ngram
else:
    model_names.append("N-Gram (2-grams)")
    accuracies.append(acc_ngram)

In [None]:
# Per-class precision / recall / F1 for the bigram model on the test split.
print(
    classification_report(
        y_true=y_test,
        y_pred=y_pred_ngram,
        target_names=data.target_names,
    )
)

In [None]:
# Plotting the comparison: one bar per recorded model, annotated with its accuracy.
fig, ax = plt.subplots(figsize=(8, 5))
bars = ax.bar(model_names, accuracies, color=['skyblue', 'lightgreen', 'orange'])
ax.set_title("Model Accuracy Comparison")
ax.set_ylabel("Accuracy")
# Leave headroom above 1.0 so the value label of a near-perfect score stays
# inside the axes instead of running past the top edge.
ax.set_ylim(0, 1.05)
# Anchor each label to the centre of its own bar rather than to the list index,
# so labels stay aligned with the bars wherever the categorical axis puts them.
for bar, acc in zip(bars, accuracies):
    ax.text(bar.get_x() + bar.get_width() / 2, acc + 0.01, f"{acc:.2f}", ha='center')
ax.grid(axis='y', linestyle='--', alpha=0.7)
fig.tight_layout()
plt.show()