In [15]:
from sklearn.datasets import fetch_20newsgroups
from sklearn.feature_extraction.text import CountVectorizer, TfidfTransformer
from sklearn.naive_bayes import MultinomialNB
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix
import numpy as np

# Restrict the corpus to four newsgroups so this is a 4-class problem.
categories = ['alt.atheism', 'soc.religion.christian', 'comp.graphics', 'sci.med']

# Fetch the predefined train and test splits for those categories.
# random_state is spelled out (42 is the loader's documented default) so the
# shuffled order, and therefore which document is "first", is visibly pinned.
twenty_train = fetch_20newsgroups(
    subset='train', categories=categories, shuffle=True, random_state=42
)
twenty_test = fetch_20newsgroups(
    subset='test', categories=categories, shuffle=True, random_state=42
)

# Sanity-check that both splits actually loaded.
# Note: target_names comes back in the loader's own order, not the order of
# the `categories` list above.
print("Training samples:", len(twenty_train.data))
print("Testing samples:", len(twenty_test.data))
print("Categories:", twenty_train.target_names)

# Show the first training document verbatim. The original split the text on
# "\n" and immediately re-joined it with "\n", which yields the same string.
print("\nFirst training document:")
print(twenty_train.data[0])
# target holds the integer label; it indexes into twenty_train.target_names.
print("Category:", twenty_train.target[0])

# --- Feature extraction (training split) ---------------------------------
# Learn the vocabulary from the training documents and produce the sparse
# document-term count matrix.
count_vect = CountVectorizer()
X_train_tf = count_vect.fit_transform(twenty_train.data)

# Fit the TF-IDF weighting on the training counts and re-weight them.
tfidf_transformer = TfidfTransformer()
X_train_tfidf = tfidf_transformer.fit_transform(X_train_tf)

# --- Model ---------------------------------------------------------------
# Multinomial Naive Bayes on the TF-IDF features; fit() returns the
# estimator itself, so construction and training are chained.
model = MultinomialNB().fit(X_train_tfidf, twenty_train.target)

# --- Feature extraction (test split) -------------------------------------
# Only transform() here: the test documents must be projected onto the
# vocabulary and IDF weights learned from the training split, never refit.
X_test_tf = count_vect.transform(twenty_test.data)
X_test_tfidf = tfidf_transformer.transform(X_test_tf)

# --- Inference -----------------------------------------------------------
predicted = model.predict(X_test_tfidf)

# Evaluate the predictions against the held-out labels. Each f-string
# reproduces the exact text the multi-argument print() calls emitted
# (label, a single separating space, then the value's str() form).
print(f"\nAccuracy: {accuracy_score(twenty_test.target, predicted)}")

# Per-class precision / recall / F1, labelled with the newsgroup names.
print(
    "\nClassification Report:\n "
    f"{classification_report(twenty_test.target, predicted, target_names=twenty_test.target_names)}"
)

# Confusion matrix of true labels against predicted labels.
print(f"\nConfusion Matrix:\n {confusion_matrix(twenty_test.target, predicted)}")


Training samples: 2257
Testing samples: 1502
Categories: ['alt.atheism', 'comp.graphics', 'sci.med', 'soc.religion.christian']

First training document:
From: sd345@city.ac.uk (Michael Collier)
Subject: Converting images to HP LaserJet III?
Nntp-Posting-Host: hampton
Organization: The City University
Lines: 14

Does anyone know of a good way (standard PC application/PD utility) to
convert tif/img/tga files into LaserJet III format.  We would also like to
do the same, converting to HPGL (HP plotter) files.

Please email any response.

Is this the correct group?

Thanks in advance.  Michael.
-- 
Michael Collier (Programmer)                 The Computer Unit,
Email: M.P.Collier@uk.ac.city                The City University,
Tel: 071 477-8000 x3769                      London,
Fax: 071 477-8565                            EC1V 0HB.

Category: 1

Accuracy: 0.8362183754993342

Classification Report:
                         precision    recall  f1-score   support

           alt.atheism      