In [2]:
# 1. Import required libraries
import pandas as pd
import matplotlib.pyplot as plt
from sklearn.datasets import fetch_20newsgroups
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from sklearn.naive_bayes import BernoulliNB, MultinomialNB
from sklearn.metrics import accuracy_score

In [3]:
# 3. Fetch the complete (all categories) training split of 20 Newsgroups
newsgroups_train = fetch_20newsgroups(subset="train")

In [4]:
# 4. Fetch the complete (all categories) test split of 20 Newsgroups
newsgroups_test = fetch_20newsgroups(subset="test")

In [5]:
# 5. Show the label names of the full training split.
# A single print with sep="\n" emits the heading, the label list and the
# trailing blank lines exactly as three consecutive print calls would.
print("All target labels:", newsgroups_train.target_names, "\n", sep="\n")

All target labels:
['alt.atheism', 'comp.graphics', 'comp.os.ms-windows.misc', 'comp.sys.ibm.pc.hardware', 'comp.sys.mac.hardware', 'comp.windows.x', 'misc.forsale', 'rec.autos', 'rec.motorcycles', 'rec.sport.baseball', 'rec.sport.hockey', 'sci.crypt', 'sci.electronics', 'sci.med', 'sci.space', 'soc.religion.christian', 'talk.politics.guns', 'talk.politics.mideast', 'talk.politics.misc', 'talk.religion.misc']




In [6]:
# 6. Restrict the experiment to three newsgroups
categories = [
    "alt.atheism",
    "comp.graphics",
    "sci.space",
]

In [7]:
# 7. Fetch the training split limited to the selected `categories`
train_subset = fetch_20newsgroups(
    subset="train",
    categories=categories,
)

In [8]:
# 8. Fetch the test split limited to the selected `categories`
test_subset = fetch_20newsgroups(
    subset="test",
    categories=categories,
)

In [9]:
# 9. Show the label names present in the three-category training subset
print("Subset target labels:", train_subset.target_names, "\n", sep="\n")

Subset target labels:
['alt.atheism', 'comp.graphics', 'sci.space']




In [10]:
# 10. Show the raw text of the 5th training article (zero-based index 4)
print("5th article text:", train_subset.data[4], "\n", sep="\n")

5th article text:
From: henry@zoo.toronto.edu (Henry Spencer)
Subject: Re: TRUE "GLOBE", Who makes it?
Organization: U of Toronto Zoology
Lines: 12

In article <bill.047m@xpresso.UUCP> bill@xpresso.UUCP (Bill Vance) writes:
>It has been known for quite a while that the earth is actually more pear
>shaped than globular/spherical.  Does anyone make a "globe" that is accurate
>as to actual shape, landmass configuration/Long/Lat lines etc.?

I don't think you're going to be able to see the differences from a sphere
unless they are greatly exaggerated.  Even the equatorial bulge is only
about 1 part in 300 -- you'd never notice a 1mm error in a 30cm globe --
and the other deviations from spherical shape are much smaller.
-- 
SVR4 resembles a high-speed collision   | Henry Spencer @ U of Toronto Zoology
between SVR3 and SunOS.    - Dick Dunn  |  henry@zoo.toronto.edu  utzoo!henry





In [11]:
# 11. Report how many training articles there are and the shape of the
#     target array that labels them.
print(
    f"Training data shape: {len(train_subset.data)} articles",
    f"Training targets shape: {train_subset.target.shape}",
    "\n",
    sep="\n",
)

Training data shape: 1657 articles
Training targets shape: (1657,)




In [12]:
# 12. Show where the first five training articles are stored on disk
print("First 5 training set filenames:", train_subset.filenames[:5], "\n", sep="\n")


First 5 training set filenames:
['/home/varikuppalashirishaa/scikit_learn_data/20news_home/20news-bydate-train/sci.space/60869'
 '/home/varikuppalashirishaa/scikit_learn_data/20news_home/20news-bydate-train/comp.graphics/38633'
 '/home/varikuppalashirishaa/scikit_learn_data/20news_home/20news-bydate-train/alt.atheism/53534'
 '/home/varikuppalashirishaa/scikit_learn_data/20news_home/20news-bydate-train/comp.graphics/38516'
 '/home/varikuppalashirishaa/scikit_learn_data/20news_home/20news-bydate-train/sci.space/61210']




In [13]:
# 13. Turn the training articles into a numeric matrix with CountVectorizer.
#     fit_transform both fits the vectorizer on the training text and
#     returns the transformed training matrix.
count_vectorizer = CountVectorizer()
X_train_counts = count_vectorizer.fit_transform(
    raw_documents=train_subset.data,
)

In [14]:
# 14. Train a Bernoulli Naive Bayes classifier on the count features
bernoulli_nb = BernoulliNB()
bernoulli_nb.fit(X=X_train_counts, y=train_subset.target)

In [15]:
# 15. Vectorize the test articles with the same `count_vectorizer` that was
#     fitted on the training text (transform only; it is not refitted here).
X_test_counts = count_vectorizer.transform(raw_documents=test_subset.data)

In [16]:
# 16. Predict a label for every test article with the BernoulliNB model
y_pred = bernoulli_nb.predict(X=X_test_counts)

In [17]:
# 17. Compare predictions with the true test labels and report accuracy
accuracy = accuracy_score(y_true=test_subset.target, y_pred=y_pred)
print(f"BernoulliNB with CountVectorizer accuracy: {accuracy:.4f}", "\n", sep="\n")

BernoulliNB with CountVectorizer accuracy: 0.8530




In [18]:
# 18. Second pipeline: TfidfVectorizer features fed to MultinomialNB.
#     The vectorizer is fitted on the training text only.
tfidf_vectorizer = TfidfVectorizer()
X_train_tfidf = tfidf_vectorizer.fit_transform(
    raw_documents=train_subset.data,
)

multinomial_nb = MultinomialNB()
multinomial_nb.fit(X=X_train_tfidf, y=train_subset.target)

In [19]:

# 19. Evaluate the TF-IDF + MultinomialNB pipeline on the test subset.
#     The test text goes through the already-fitted `tfidf_vectorizer`.
X_test_tfidf = tfidf_vectorizer.transform(raw_documents=test_subset.data)
y_pred_tfidf = multinomial_nb.predict(X=X_test_tfidf)
accuracy_tfidf = accuracy_score(
    y_true=test_subset.target,
    y_pred=y_pred_tfidf,
)
print(f"MultinomialNB with TfidfVectorizer accuracy: {accuracy_tfidf:.4f}", "\n", sep="\n")

MultinomialNB with TfidfVectorizer accuracy: 0.9474




In [20]:
# 20. Repeat the TF-IDF + MultinomialNB experiment, this time passing
#     stop_words="english" to the vectorizer.

# Features: fit on the training text, then reuse the fitted vectorizer for test.
tfidf_vectorizer_nostop = TfidfVectorizer(stop_words="english")
X_train_tfidf_nostop = tfidf_vectorizer_nostop.fit_transform(
    raw_documents=train_subset.data,
)
X_test_tfidf_nostop = tfidf_vectorizer_nostop.transform(
    raw_documents=test_subset.data,
)

# Model: fit() hands back the fitted estimator, so construction and
# training can be written as one expression.
multinomial_nb_nostop = MultinomialNB().fit(
    X_train_tfidf_nostop,
    train_subset.target,
)

# Evaluation on the test subset.
y_pred_tfidf_nostop = multinomial_nb_nostop.predict(X_test_tfidf_nostop)
accuracy_tfidf_nostop = accuracy_score(
    y_true=test_subset.target,
    y_pred=y_pred_tfidf_nostop,
)
print(f"MultinomialNB with TfidfVectorizer (no stopwords) accuracy: {accuracy_tfidf_nostop:.4f}")

MultinomialNB with TfidfVectorizer (no stopwords) accuracy: 0.9555
