In [22]:

#1. Import required libraries

import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from sklearn.datasets import fetch_20newsgroups
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from sklearn.naive_bayes import BernoulliNB, MultinomialNB
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix


In [6]:

#2. Import 20 News Group dataset from scikit-learn datasets
#
# Both the train and the test split are fetched for the same three categories,
# asking scikit-learn to remove the 'headers', 'footers' and 'quotes' parts.

categories = [
    "alt.atheism",
    "comp.graphics",
    "sci.space",
]  # Selecting 3 categories

news_train = fetch_20newsgroups(
    subset="train",
    categories=categories,
    remove=("headers", "footers", "quotes"),
)
news_test = fetch_20newsgroups(
    subset="test",
    categories=categories,
    remove=("headers", "footers", "quotes"),
)


In [4]:
#3. Load 20 News Group train subset
#
# Pull the documents and their targets out of the training bunch.

X_train = news_train.data
y_train = news_train.target


In [23]:
#4. Load 20 News Group test subset
#
# Pull the documents and their targets out of the test bunch.

X_test = news_test.data
y_test = news_test.target


In [24]:
#5. Print all target labels (the category names of the training bunch)

print(
    "Target Labels:",
    news_train.target_names,
)


Target Labels: ['alt.atheism', 'comp.graphics', 'sci.space']


In [None]:
#6. Prepare a subset of categories (alt.atheism, comp.graphics, sci.space)
#This is already done in Step 2, where we selected these categories.



In [8]:

#7. Load 20 News Group train subset with two categories (alt.atheism, comp.graphics)


# Smaller category list used only for this subset of the training split.
categories_subset = [
    "alt.atheism",
    "comp.graphics",
]
news_train_subset = fetch_20newsgroups(
    subset="train",
    categories=categories_subset,
    remove=("headers", "footers", "quotes"),
)


In [9]:

#9. Print new training set target names (Labels)

print(
    "Subset Target Labels:",
    news_train_subset.target_names,
)


Subset Target Labels: ['alt.atheism', 'comp.graphics']


In [10]:

#10. Print news training data of the 5th article
#
# Index 4 is the 5th element because indexing starts at 0.

print(
    "5th Training Article:\n",
    X_train[4],
)


5th Training Article:
 
I don't think you're going to be able to see the differences from a sphere
unless they are greatly exaggerated.  Even the equatorial bulge is only
about 1 part in 300 -- you'd never notice a 1mm error in a 30cm globe --
and the other deviations from spherical shape are much smaller.


In [12]:
#11. Print shape of data and targets
#
# X_train / y_train and X_test / y_test are the variables assigned in Steps 3
# and 4. The test split is intentionally NOT fetched or reassigned again here:
# doing so only duplicated Steps 2 and 4 and hid the fact that this cell had
# been executed before the test split was loaded.

# Number of raw documents in each split
print("Train Data Shape:", len(X_train), "Test Data Shape:", len(X_test))

# Shape of the matching target arrays; the first dimension should equal the
# document counts printed above.
print("Train Target Shape:", np.shape(y_train), "Test Target Shape:", np.shape(y_test))


Train Data Shape: 1657 Test Data Shape: 1102


In [25]:
#12. Print training set filenames
#
# Only the first five entries are shown to keep the output short.

print(
    "Training Filenames (First 5):",
    news_train.filenames[:5],
)


Training Filenames (First 5): ['/home/user/scikit_learn_data/20news_home/20news-bydate-train/sci.space/60869'
 '/home/user/scikit_learn_data/20news_home/20news-bydate-train/comp.graphics/38633'
 '/home/user/scikit_learn_data/20news_home/20news-bydate-train/alt.atheism/53534'
 '/home/user/scikit_learn_data/20news_home/20news-bydate-train/comp.graphics/38516'
 '/home/user/scikit_learn_data/20news_home/20news-bydate-train/sci.space/61210']


In [26]:
#13. Convert text data into numerical format using CountVectorizer
#
# The vectorizer is fitted on the training documents only; the test documents
# are then transformed with that already-fitted vectorizer.

vectorizer = CountVectorizer(stop_words="english")

X_train_counts = vectorizer.fit_transform(X_train)  # fit + transform (train)
X_test_counts = vectorizer.transform(X_test)        # transform only (test)


In [31]:

#14. Train the Bernoulli Naïve Bayes model
#
# binarize=0.0 is the scikit-learn default, written out here to make explicit
# that the count features are thresholded at 0 before fitting.

bnb = BernoulliNB(binarize=0.0)
bnb.fit(X_train_counts, y_train)


In [16]:

#15. Convert test data into numeric format using CountVectorizer
#
# Same call as in Step 13: the vectorizer fitted on the training documents is
# reused, it is not fitted again on the test documents.

X_test_counts = vectorizer.transform(
    X_test,
)


In [17]:

#16. Predict target labels for the test set
#
# Uses the BernoulliNB model from Step 14 on the count features of the test set.

y_pred_bnb = bnb.predict(
    X_test_counts,
)


In [18]:
#17. Find accuracy score on the test set
#
# Compares the BernoulliNB predictions with the test targets.

bnb_accuracy = accuracy_score(
    y_true=y_test,
    y_pred=y_pred_bnb,
)
print(
    "BernoulliNB Accuracy:",
    bnb_accuracy,
)


BernoulliNB Accuracy: 0.7141560798548094


In [19]:

#18. Use TfidfVectorizer instead of CountVectorizer and train MultinomialNB
#
# As in Step 13, the vectorizer is fitted on the training documents only and
# then reused to transform the test documents.

tfidf_vectorizer = TfidfVectorizer(stop_words="english")

X_train_tfidf = tfidf_vectorizer.fit_transform(X_train)  # fit + transform (train)
X_test_tfidf = tfidf_vectorizer.transform(X_test)        # transform only (test)

# Fit the multinomial model on the TF-IDF features of the training set.
mnb = MultinomialNB()
mnb.fit(X_train_tfidf, y_train)


In [29]:
#19. Find test set accuracy for MultinomialNB
#
# Predict on the TF-IDF features of the test set, then score the predictions
# against the test targets.

y_pred_mnb = mnb.predict(X_test_tfidf)

mnb_accuracy = accuracy_score(
    y_true=y_test,
    y_pred=y_pred_mnb,
)
print(
    "MultinomialNB Accuracy:",
    mnb_accuracy,
)


MultinomialNB Accuracy: 0.8729582577132486


In [27]:

#20. Repeat the TF-IDF + MultinomialNB experiment without the stop-word filter (no stop_words argument, so stop words are kept) and compare


# No stop_words argument is passed here, unlike the vectorizers in Steps 13
# and 18, so no English stop-word list is applied.
# NOTE(review): the "_no_stopwords" suffix and the printed label therefore mean
# "without stop-word filtering", not "with stop words removed" - confirm this
# is the intended comparison.
tfidf_vectorizer_no_stopwords = TfidfVectorizer()

# Fit on the training documents only, then reuse the fitted vectorizer on test.
X_train_tfidf_no_stopwords = tfidf_vectorizer_no_stopwords.fit_transform(
    X_train
)
X_test_tfidf_no_stopwords = tfidf_vectorizer_no_stopwords.transform(
    X_test
)

# Train a fresh MultinomialNB on these features.
mnb_no_stopwords = MultinomialNB()
mnb_no_stopwords.fit(X_train_tfidf_no_stopwords, y_train)

# Predict and score on the test set.
y_pred_no_stopwords = mnb_no_stopwords.predict(X_test_tfidf_no_stopwords)
mnb_accuracy_no_stopwords = accuracy_score(
    y_true=y_test,
    y_pred=y_pred_no_stopwords,
)
print(
    "MultinomialNB Accuracy without stopwords:",
    mnb_accuracy_no_stopwords,
)


MultinomialNB Accuracy without stopwords: 0.8620689655172413
