In [1]:
from sklearn.datasets import fetch_20newsgroups
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import train_test_split
from sklearn.naive_bayes import MultinomialNB
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix

In [2]:
# Fetch the full 20 Newsgroups corpus (subset='all'), asking the loader to
# strip headers, footers and quoted replies from each post.
newsgroups = fetch_20newsgroups(
    remove=('headers', 'footers', 'quotes'),
    subset='all',
)

In [3]:
# Hold out 20% of the posts for evaluation; the fixed random_state makes the
# shuffle (and therefore the split) repeatable across runs.
X_train, X_test, y_train, y_test = train_test_split(
    newsgroups.data,
    newsgroups.target,
    random_state=42,
    test_size=0.2,
)

In [4]:
# Build TF-IDF features: English stop words are dropped and the vocabulary is
# capped at 5000 terms.
vectorizer = TfidfVectorizer(
    max_features=5000,
    stop_words='english',
)

# The vectorizer is fitted on the training texts only; the test texts are then
# transformed with that already-fitted vectorizer.
X_train_tfidf = vectorizer.fit_transform(X_train)
X_test_tfidf = vectorizer.transform(X_test)

In [5]:
# Fit a Multinomial Naive Bayes classifier (default settings) on the
# TF-IDF training matrix and its labels.
classifier = MultinomialNB()
classifier.fit(X=X_train_tfidf, y=y_train)

In [6]:
# Predict a newsgroup label for every held-out document
y_pred = classifier.predict(X=X_test_tfidf)

In [7]:
# Overall accuracy of the predictions against the held-out labels
accuracy = accuracy_score(y_true=y_test, y_pred=y_pred)
print(f'Accuracy: {accuracy}')

Accuracy: 0.6835543766578249


In [8]:
# Print the per-class classification report, with rows labelled by
# newsgroup name instead of integer class id.
print('\nClassification Report:')
report_text = classification_report(
    y_test,
    y_pred,
    target_names=newsgroups.target_names,
)
print(report_text)


Classification Report:
                          precision    recall  f1-score   support

             alt.atheism       0.59      0.46      0.52       151
           comp.graphics       0.58      0.63      0.61       202
 comp.os.ms-windows.misc       0.61      0.63      0.62       195
comp.sys.ibm.pc.hardware       0.54      0.69      0.61       183
   comp.sys.mac.hardware       0.76      0.62      0.69       205
          comp.windows.x       0.81      0.79      0.80       215
            misc.forsale       0.75      0.69      0.72       193
               rec.autos       0.70      0.68      0.69       196
         rec.motorcycles       0.42      0.73      0.53       168
      rec.sport.baseball       0.83      0.79      0.81       211
        rec.sport.hockey       0.90      0.87      0.88       198
               sci.crypt       0.77      0.75      0.76       201
         sci.electronics       0.73      0.57      0.64       202
                 sci.med       0.81      0.80      

In [10]:
# Print the raw confusion matrix for the test set.
# Per scikit-learn's convention, rows are true classes and columns are
# predicted classes.
print('\nConfusion Matrix:')
conf_mat = confusion_matrix(y_true=y_test, y_pred=y_pred)
print(conf_mat)


Confusion Matrix:
[[ 70   0   3   0   0   1   0   1   6   2   1   2   0   4   4  34   7   9
    4   3]
 [  1 128  17  14   2  11   3   2  10   0   0   2   2   2   4   3   0   1
    0   0]
 [  1  17 123  23   5  10   0   1   9   1   0   3   1   0   0   1   0   0
    0   0]
 [  0  10  19 127  12   3   4   1   1   0   0   0   3   2   1   0   0   0
    0   0]
 [  0   6  10  24 128   1   6   2  12   0   0   4   5   1   3   2   1   0
    0   0]
 [  0  21  12   2   1 169   2   1   3   0   0   0   0   1   1   2   0   0
    0   0]
 [  0   4   2  21   7   1 134   6   3   1   0   3   4   0   5   1   1   0
    0   0]
 [  1   2   0   3   0   4   4 133  24   0   1   0   5   3   5   3   5   1
    2   0]
 [  0   3   0   1   0   0   6  13 123   6   1   1   3   2   3   2   3   0
    1   0]
 [  1   1   0   0   0   0   2   1  16 167   8   2   0   2   2   6   0   3
    0   0]
 [  1   0   0   0   0   1   0   3   9   4 173   2   1   2   0   1   0   1
    0   0]
 [  1   4   3   0   1   3   1   1   4   1   1 