In [3]:
from time import perf_counter, time

import pandas as pd
from sklearn.datasets import fetch_20newsgroups
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score
from sklearn.naive_bayes import MultinomialNB
from sklearn.svm import LinearSVC
from sklearn.tree import DecisionTreeClassifier

# Load a deliberately small slice of 20 Newsgroups: only two categories.
categories = ['alt.atheism', 'comp.graphics']

# Both splits are fetched with identical options; only `subset` differs.
# `remove` asks the loader to strip headers, footers and quoted replies
# from each post before returning it.
_fetch_options = {
    'categories': categories,
    'remove': ('headers', 'footers', 'quotes'),
}
newsgroups_train = fetch_20newsgroups(subset='train', **_fetch_options)
newsgroups_test = fetch_20newsgroups(subset='test', **_fetch_options)

# Configure vectorizers
# Both vectorizers share the same settings (vocabulary capped at 500 features,
# English stop words dropped) so that the only difference between the two
# feature sets is raw counts versus TF-IDF weighting.
vectorizers = {
    'Count': CountVectorizer(max_features=500, stop_words='english'),
    'TF-IDF': TfidfVectorizer(max_features=500, stop_words='english')
}

# Configure classifiers
# Every estimator that draws random numbers gets a fixed `random_state`, so
# repeated runs of the benchmark produce the same accuracy table.
# MultinomialNB has no random component and takes no seed.
classifiers = {
    'NB': MultinomialNB(),
    'LR': LogisticRegression(max_iter=100, solver='liblinear',
                             random_state=0),
    'SVM': LinearSVC(max_iter=100, tol=0.01, random_state=0),
    'DecisionTree': DecisionTreeClassifier(max_depth=50, min_samples_split=5,
                                           random_state=0)
}

# Benchmark every (vectorizer, classifier) pairing.
# Each dict appended to `results` becomes one row of the printed table.
results = []

for vec_name, vectorizer in vectorizers.items():
    # The vectorizer is fitted on the training documents only; the test
    # documents are transformed with that already-fitted vectorizer.
    X_train = vectorizer.fit_transform(newsgroups_train.data)
    X_test = vectorizer.transform(newsgroups_test.data)

    for clf_name, clf in classifiers.items():
        # Time only fit(): the value is reported as training time, so the
        # prediction step is kept outside the timed region. perf_counter()
        # is used because it is a monotonic, high-resolution clock intended
        # for measuring short intervals.
        start_time = perf_counter()
        clf.fit(X_train, newsgroups_train.target)
        train_time = perf_counter() - start_time

        y_pred = clf.predict(X_test)
        accuracy = accuracy_score(newsgroups_test.target, y_pred)

        # Values are stored pre-formatted so the table prints with a fixed
        # number of decimals.
        results.append({
            'Model': f'{clf_name}-{vec_name}',
            'Accuracy': f'{accuracy:.3f}',
            'Time': f'{train_time:.2f}s'
        })

# Print results
print("\nBenchmark Results:")
print(pd.DataFrame(results))


Benchmark Results:
                 Model Accuracy   Time
0             NB-Count    0.914  0.01s
1             LR-Count    0.904  0.02s
2            SVM-Count    0.888  0.00s
3   DecisionTree-Count    0.853  0.03s
4            NB-TF-IDF    0.900  0.01s
5            LR-TF-IDF    0.907  0.00s
6           SVM-TF-IDF    0.912  0.00s
7  DecisionTree-TF-IDF    0.842  0.04s
