In [1]:
import pandas as pd
from sklearn.datasets import fetch_20newsgroups
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.naive_bayes import MultinomialNB
from sklearn.metrics import accuracy_score, precision_score, recall_score, classification_report

# --- Configuration -------------------------------------------------------
# A single seed is used for both the dataset shuffle and the train/test split.
SEED = 42
TEST_FRACTION = 0.2  # share of documents held out for evaluation

# --- Load the four newsgroup categories ----------------------------------
# NOTE(review): no `remove=` argument is passed to the loader, so the posts
# are used exactly as the loader returns them. Consider
# remove=('headers', 'footers', 'quotes') if metadata should not be visible
# to the classifier -- confirm against the scikit-learn docs before changing,
# since it will alter every metric below.
categories = ['alt.atheism', 'talk.religion.misc', 'comp.graphics', 'sci.space']
newsgroups = fetch_20newsgroups(
    subset='all',
    categories=categories,
    shuffle=True,
    random_state=SEED,
)

# Two-column frame: raw post text and its integer class id.
df = pd.DataFrame({'text': newsgroups.data, 'label': newsgroups.target})
print("Total Instances of Dataset: ", len(df))

# --- Hold out a test set -------------------------------------------------
texts, labels = df['text'], df['label']
X_train, X_test, y_train, y_test = train_test_split(
    texts,
    labels,
    test_size=TEST_FRACTION,
    random_state=SEED,
)

# --- Bag-of-words features -----------------------------------------------
# The vocabulary is learned from the training split only; the test split is
# projected onto that same vocabulary.
vectorizer = CountVectorizer()
X_train_counts = vectorizer.fit_transform(X_train)
X_test_counts = vectorizer.transform(X_test)

# --- Fit multinomial Naive Bayes and predict -----------------------------
clf = MultinomialNB().fit(X_train_counts, y_train)  # fit() returns the estimator
y_pred = clf.predict(X_test_counts)

# --- Evaluate ------------------------------------------------------------
# Precision and recall are averaged across classes, weighted by class support.
weighted = {'average': 'weighted'}
accuracy = accuracy_score(y_test, y_pred)
precision = precision_score(y_test, y_pred, **weighted)
recall = recall_score(y_test, y_pred, **weighted)

print(
    f'Accuracy: {accuracy}',
    f'Precision: {precision}',
    f'Recall: {recall}',
    sep='\n',
)

# Per-class breakdown, labelled with the newsgroup names.
report = classification_report(y_test, y_pred, target_names=newsgroups.target_names)
print("\nClassification Report:\n")
print(report)

# --- Classify an unseen document -----------------------------------------
# The new text must go through the already-fitted vectorizer so its columns
# line up with the features the classifier was trained on.
new_document = ["God does not exist. There is no proof of any deity."]
new_document_counts = vectorizer.transform(new_document)
predicted_label = clf.predict(new_document_counts)
predicted_category = newsgroups.target_names[predicted_label[0]]
print(f'Classified label: {predicted_category}')


  from pandas.core.computation.check import NUMEXPR_INSTALLED
  from pandas.core import (


Total Instances of Dataset:  3387
Accuracy: 0.9365781710914455
Precision: 0.9374763408495319
Recall: 0.9365781710914455

Classification Report:

                    precision    recall  f1-score   support

       alt.atheism       0.87      0.94      0.90       155
     comp.graphics       0.98      0.98      0.98       195
         sci.space       0.96      0.96      0.96       201
talk.religion.misc       0.91      0.83      0.87       127

          accuracy                           0.94       678
         macro avg       0.93      0.93      0.93       678
      weighted avg       0.94      0.94      0.94       678

Classified label: alt.atheism
