# 1. Import required libraries (Already done above)

In [44]:
import pandas as pd
from sklearn.datasets import fetch_20newsgroups
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from sklearn.naive_bayes import BernoulliNB, MultinomialNB
from sklearn.metrics import accuracy_score

# 2. Load the complete 20 Newsgroups dataset (train and test combined) from scikit-learn

In [45]:
# subset='all' asks for the training and test articles together.
# NOTE(review): `news_data` is not referenced by any later cell shown here.
news_data = fetch_20newsgroups(subset='all')

# 3. Load the 20 Newsgroups train subset

In [47]:
# No `categories` argument is given, so articles from every newsgroup are kept.
train_data = fetch_20newsgroups(subset='train')

# 4. Load the 20 Newsgroups test subset

In [48]:
# Full test split, again without any category filter.
test_data = fetch_20newsgroups(subset='test')

# 5. Print all target labels

In [24]:
# `target_names` lists the newsgroup names; the integer targets index into it.
print("All Target Labels:", train_data.target_names)

All Target Labels: ['alt.atheism', 'comp.graphics', 'comp.os.ms-windows.misc', 'comp.sys.ibm.pc.hardware', 'comp.sys.mac.hardware', 'comp.windows.x', 'misc.forsale', 'rec.autos', 'rec.motorcycles', 'rec.sport.baseball', 'rec.sport.hockey', 'sci.crypt', 'sci.electronics', 'sci.med', 'sci.space', 'soc.religion.christian', 'talk.politics.guns', 'talk.politics.mideast', 'talk.politics.misc', 'talk.religion.misc']


# 6. Prepare subset of categories

In [49]:
# Newsgroups kept for the smaller, three-class classification task.
categories = [
    'alt.atheism',
    'comp.graphics',
    'sci.space',
]

# 7. Load the 20 Newsgroups train subset restricted to the selected categories

In [50]:
# Training articles restricted to the newsgroups listed in `categories`.
train_subset = fetch_20newsgroups(
    subset='train',
    categories=categories,
)

# 8. Load the 20 Newsgroups test subset restricted to the selected categories

In [51]:
# Test articles restricted to the newsgroups listed in `categories`.
test_subset = fetch_20newsgroups(
    subset='test',
    categories=categories,
)

# 9. Print new training set target names

In [52]:
# Sanity check: only the requested newsgroups should be listed.
print("Subset Target Labels:", train_subset.target_names)

Subset Target Labels: ['alt.atheism', 'comp.graphics', 'sci.space']


# 10. Print news training data of the 5th article

In [53]:
# Index 4 is the fifth article because Python indexing is zero-based.
print("5th Article:", train_subset.data[4])

5th Article: From: henry@zoo.toronto.edu (Henry Spencer)
Subject: Re: TRUE "GLOBE", Who makes it?
Organization: U of Toronto Zoology
Lines: 12

In article <bill.047m@xpresso.UUCP> bill@xpresso.UUCP (Bill Vance) writes:
>It has been known for quite a while that the earth is actually more pear
>shaped than globular/spherical.  Does anyone make a "globe" that is accurate
>as to actual shape, landmass configuration/Long/Lat lines etc.?

I don't think you're going to be able to see the differences from a sphere
unless they are greatly exaggerated.  Even the equatorial bulge is only
about 1 part in 300 -- you'd never notice a 1mm error in a 30cm globe --
and the other deviations from spherical shape are much smaller.
-- 
SVR4 resembles a high-speed collision   | Henry Spencer @ U of Toronto Zoology
between SVR3 and SunOS.    - Dick Dunn  |  henry@zoo.toronto.edu  utzoo!henry



# 11. Print shape of data and targets

In [54]:
# The two lengths must agree: one target label per training article.
n_train_docs = len(train_subset.data)
n_train_targets = len(train_subset.target)

print("Training Data Shape:", n_train_docs)
print("Training Target Shape:", n_train_targets)

Training Data Shape: 1657
Training Target Shape: 1657


# 12. Print training set filenames (available as `train_subset.filenames` on the object returned by fetch_20newsgroups)

# 13. Convert train data to numerical format using CountVectorizer

In [32]:
# Learn the vocabulary from the training articles and encode them as
# per-document token counts in a single pass.
vectorizer = CountVectorizer()
X_train_counts = vectorizer.fit_transform(train_subset.data)

# 14. Train model using BernoulliNB

In [55]:
# Bernoulli naive Bayes models each feature as present/absent; with its default
# `binarize=0.0` it thresholds the counts to 0/1 itself, so raw counts can be passed.
clf = BernoulliNB()
clf.fit(X_train_counts, train_subset.target)

# 15. Convert test data to numerical format

In [56]:
# Use transform (not fit_transform) so the test articles are encoded with the
# vocabulary already learned from the training articles.
X_test_counts = vectorizer.transform(test_subset.data)

# 16. Predict target labels for test set

In [57]:
# Predicted class index for every test article.
y_pred = clf.predict(X_test_counts)

# 17. Find accuracy score on test set

In [58]:
# Fraction of test articles whose predicted newsgroup matches the true one.
accuracy = accuracy_score(y_true=test_subset.target, y_pred=y_pred)
print("BernoulliNB Accuracy:", accuracy)

BernoulliNB Accuracy: 0.852994555353902


# 18. Use TfidfVectorizer and MultinomialNB

In [59]:
# TF-IDF features: the vocabulary and IDF weights are learned from the training
# articles only, then reused unchanged to encode the test articles.
tfidf_vectorizer = TfidfVectorizer()
X_train_tfidf = tfidf_vectorizer.fit_transform(train_subset.data)
X_test_tfidf = tfidf_vectorizer.transform(test_subset.data)

# fit() returns the fitted estimator, so construction and training can be chained.
clf_tfidf = MultinomialNB().fit(X_train_tfidf, train_subset.target)
y_pred_tfidf = clf_tfidf.predict(X_test_tfidf)

# 19. Find test set accuracy

In [60]:
# Fraction of test articles classified correctly by the TF-IDF + MultinomialNB model.
accuracy_tfidf = accuracy_score(y_true=test_subset.target, y_pred=y_pred_tfidf)
print("MultinomialNB Accuracy with TF-IDF:", accuracy_tfidf)

MultinomialNB Accuracy with TF-IDF: 0.9473684210526315


# 20. Remove English stop words and repeat the process

In [61]:
# TF-IDF + multinomial naive Bayes again, this time dropping scikit-learn's
# built-in English stop-word list from the vocabulary.
vectorizer_stopwords = TfidfVectorizer(stop_words='english')
X_train_tfidf_sw = vectorizer_stopwords.fit_transform(train_subset.data)
X_test_tfidf_sw = vectorizer_stopwords.transform(test_subset.data)

# fit() returns the fitted estimator, so construction and training can be chained.
clf_tfidf_sw = MultinomialNB().fit(X_train_tfidf_sw, train_subset.target)
y_pred_tfidf_sw = clf_tfidf_sw.predict(X_test_tfidf_sw)

# Fraction of test articles classified correctly once stop words are excluded.
accuracy_tfidf_sw = accuracy_score(y_true=test_subset.target, y_pred=y_pred_tfidf_sw)
print("MultinomialNB Accuracy without Stopwords:", accuracy_tfidf_sw)

MultinomialNB Accuracy without Stopwords: 0.9555353901996371
