In [1]:
!pip install scikit-learn



We will use the `fetch_20newsgroups` function from scikit-learn to download the data, and then split it into training and testing sets by slicing the shuffled documents. We will also use the `TfidfVectorizer` class to transform the raw text into numerical features that represent the term frequency-inverse document frequency (TF-IDF) of each word in each document. TF-IDF is a common way to measure how important a word is to a document relative to the whole corpus.

In [2]:
# Import libraries

from sklearn.datasets import fetch_20newsgroups
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.svm import LinearSVC
from sklearn.naive_bayes import MultinomialNB
from sklearn.ensemble import RandomForestClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.metrics import accuracy_score, confusion_matrix, classification_report

# Fetch every post from four newsgroup categories, shuffled with a fixed seed

categories = ['alt.atheism', 'comp.graphics', 'rec.sport.baseball', 'sci.med']
newsgroups = fetch_20newsgroups(
    subset='all',
    categories=categories,
    shuffle=True,
    random_state=42,
)

# The first 3000 shuffled posts are the training set; the remainder is held out for testing
X_train, X_test = newsgroups.data[:3000], newsgroups.data[3000:]
y_train, y_test = newsgroups.target[:3000], newsgroups.target[3000:]

# Fit the TF-IDF vectorizer on the training texts only, then apply the same
# fitted vectorizer to the test texts

vectorizer = TfidfVectorizer(max_features=1000, stop_words='english')
X_train_tfidf = vectorizer.fit_transform(X_train)
X_test_tfidf = vectorizer.transform(X_test)

Now we are ready to train and evaluate our models. We will compare four different algorithms: a linear support vector machine (SVM), multinomial naive Bayes (MNB), a random forest, and k-nearest neighbors (KNN).

In [3]:
def fit_and_report(model, label):
    """Fit a classifier on the training features and report its test accuracy.

    The data is read from the notebook namespace: ``X_train_tfidf`` / ``y_train``
    are used for fitting and ``X_test_tfidf`` / ``y_test`` for scoring, so the
    cell that builds those variables must have been run first.

    Parameters
    ----------
    model : estimator
        Object exposing ``fit(X, y)`` and ``predict(X)``; it is fitted in place.
    label : str
        Name used as the prefix of the printed accuracy line.

    Returns
    -------
    tuple
        ``(y_pred, accuracy)``: the predictions for the test set and the value
        returned by ``accuracy_score(y_test, y_pred)``.
    """
    model.fit(X_train_tfidf, y_train)
    y_pred = model.predict(X_test_tfidf)
    accuracy = accuracy_score(y_test, y_pred)
    print(f'{label} accuracy: {accuracy:.2f}')
    return y_pred, accuracy


# Each model keeps its own model / prediction / accuracy variables so that
# later cells can still refer to them by name.

# Train and evaluate SVM
svm = LinearSVC(random_state=42)
y_pred_svm, acc_svm = fit_and_report(svm, 'SVM')

# Train and evaluate MNB
mnb = MultinomialNB()
y_pred_mnb, acc_mnb = fit_and_report(mnb, 'MNB')

# Train and evaluate random forest
rf = RandomForestClassifier(random_state=42)
y_pred_rf, acc_rf = fit_and_report(rf, 'Random forest')

# Train and evaluate KNN
knn = KNeighborsClassifier(n_neighbors=6)
y_pred_knn, acc_knn = fit_and_report(knn, 'KNN')

SVM accuracy: 0.94
MNB accuracy: 0.93
Random forest accuracy: 0.92
KNN accuracy: 0.89


  mode, _ = stats.mode(_y[neigh_ind, k], axis=1)


Here is the output:

SVM accuracy: 0.94
MNB accuracy: 0.93
Random forest accuracy: 0.92
KNN accuracy: 0.89