In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from sklearn.datasets import fetch_20newsgroups
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from sklearn.naive_bayes import BernoulliNB, MultinomialNB
from sklearn.metrics import accuracy_score

In [2]:
# Load the complete 20 Newsgroups corpus (subset='all').
# shuffle=True with a fixed random_state keeps the document order reproducible.
# NOTE(review): `news_data` is not referenced by any later cell visible here —
# confirm it is still needed before keeping this extra load.
news_data = fetch_20newsgroups(
    subset='all',
    shuffle=True,
    random_state=42,
)

In [3]:
# Unfiltered training split; the next cells use it to list every available
# newsgroup name before `news_train` is reloaded with a category filter.
news_train = fetch_20newsgroups(
    subset='train',
    shuffle=True,
    random_state=42,
)

In [4]:
# Unfiltered test split, loaded with the same shuffle seed as the training split.
# NOTE(review): in the visible cells this object is replaced by the
# category-filtered load before it is ever read — confirm it is needed.
news_test = fetch_20newsgroups(
    subset='test',
    shuffle=True,
    random_state=42,
)

In [5]:
# Show the newsgroup labels present in the currently loaded training split.
print(news_train["target_names"])

['alt.atheism', 'comp.graphics', 'comp.os.ms-windows.misc', 'comp.sys.ibm.pc.hardware', 'comp.sys.mac.hardware', 'comp.windows.x', 'misc.forsale', 'rec.autos', 'rec.motorcycles', 'rec.sport.baseball', 'rec.sport.hockey', 'sci.crypt', 'sci.electronics', 'sci.med', 'sci.space', 'soc.religion.christian', 'talk.politics.guns', 'talk.politics.mideast', 'talk.politics.misc', 'talk.religion.misc']


In [6]:
# Newsgroups kept for the classification experiment; passed as the
# `categories` filter when the train/test splits are reloaded.
categories = [
    'alt.atheism',
    'comp.graphics',
    'sci.space',
]

In [7]:
# Reload the training split restricted to the selected `categories`.
# This rebinds `news_train`, replacing the unfiltered split loaded earlier.
# NOTE(review): no `remove=` argument is passed, so headers, quoted replies and
# signatures stay in each document; the scikit-learn docs suggest
# remove=('headers', 'footers', 'quotes') for a more realistic evaluation —
# confirm which behaviour is wanted.
news_train = fetch_20newsgroups(
    subset='train',
    categories=categories,
    shuffle=True,
    random_state=42,
)

In [8]:
# Reload the test split with the same `categories` filter and shuffle seed
# used for the training split. This rebinds `news_test`.
news_test = fetch_20newsgroups(
    subset='test',
    categories=categories,
    shuffle=True,
    random_state=42,
)

In [9]:
# Sanity check: the filtered split should list only the selected newsgroups.
print(news_train["target_names"])

['alt.atheism', 'comp.graphics', 'sci.space']


In [10]:
# Display one raw training document (index 4) to see what the text looks like.
print(news_train["data"][4])

From: henry@zoo.toronto.edu (Henry Spencer)
Subject: Re: TRUE "GLOBE", Who makes it?
Organization: U of Toronto Zoology
Lines: 12

In article <bill.047m@xpresso.UUCP> bill@xpresso.UUCP (Bill Vance) writes:
>It has been known for quite a while that the earth is actually more pear
>shaped than globular/spherical.  Does anyone make a "globe" that is accurate
>as to actual shape, landmass configuration/Long/Lat lines etc.?

I don't think you're going to be able to see the differences from a sphere
unless they are greatly exaggerated.  Even the equatorial bulge is only
about 1 part in 300 -- you'd never notice a 1mm error in a 30cm globe --
and the other deviations from spherical shape are much smaller.
-- 
SVR4 resembles a high-speed collision   | Henry Spencer @ U of Toronto Zoology
between SVR3 and SunOS.    - Dick Dunn  |  henry@zoo.toronto.edu  utzoo!henry



In [11]:
# Number of documents and number of labels in the training split;
# the two values are printed side by side for comparison.
print(*(len(part) for part in (news_train.data, news_train.target)))

1657 1657


In [12]:
# Show where the training documents live on disk.
# NOTE(review): the printed value contains absolute local paths — consider
# clearing this output before sharing the notebook.
print(news_train["filenames"])

['/home/user/scikit_learn_data/20news_home/20news-bydate-train/sci.space/60869'
 '/home/user/scikit_learn_data/20news_home/20news-bydate-train/comp.graphics/38633'
 '/home/user/scikit_learn_data/20news_home/20news-bydate-train/alt.atheism/53534'
 ...
 '/home/user/scikit_learn_data/20news_home/20news-bydate-train/sci.space/60915'
 '/home/user/scikit_learn_data/20news_home/20news-bydate-train/sci.space/60176'
 '/home/user/scikit_learn_data/20news_home/20news-bydate-train/sci.space/60929']


In [13]:
# Bag-of-words features: fit the vectorizer on the training documents and
# transform those same documents in a single call.
vectorizer = CountVectorizer()
X_train_counts = vectorizer.fit_transform(raw_documents=news_train.data)

In [14]:
# Bernoulli Naive Bayes trained on the count features.
# NOTE(review): with its documented default `binarize=0.0`, BernoulliNB
# thresholds the counts to presence/absence — confirm that is intended.
clf = BernoulliNB()
clf.fit(X_train_counts, news_train.target)

In [15]:
# Vectorize the test documents with the already-fitted vectorizer
# (transform only — it is not refitted on the test documents).
X_test_counts = vectorizer.transform(raw_documents=news_test.data)

In [16]:
# Predicted labels for the test documents from the Bernoulli NB model.
y_pred = clf.predict(X=X_test_counts)

In [17]:
# Accuracy of the Bernoulli NB predictions against the test labels.
print(accuracy_score(y_true=news_test.target, y_pred=y_pred))

0.852994555353902


In [18]:
# TF-IDF features: fit on the training documents, then reuse the fitted
# vectorizer to transform the test documents.
tfidf_vectorizer = TfidfVectorizer()
X_train_tfidf = tfidf_vectorizer.fit_transform(raw_documents=news_train.data)
X_test_tfidf = tfidf_vectorizer.transform(raw_documents=news_test.data)

# Multinomial Naive Bayes on the TF-IDF matrix, followed by test predictions.
clf_tfidf = MultinomialNB()
clf_tfidf.fit(X_train_tfidf, news_train.target)
y_pred_tfidf = clf_tfidf.predict(X=X_test_tfidf)

In [19]:
# Accuracy of the TF-IDF + Multinomial NB predictions against the test labels.
print(accuracy_score(y_true=news_test.target, y_pred=y_pred_tfidf))

0.9473684210526315


In [20]:
# Same TF-IDF + Multinomial NB experiment, this time using the vectorizer's
# built-in English stop-word list.
# NOTE(review): this cell rebinds `tfidf_vectorizer`, `X_train_tfidf`,
# `X_test_tfidf`, `clf_tfidf` and `y_pred_tfidf`, so the objects from the
# previous TF-IDF cell are no longer reachable afterwards — confirm that
# overwriting them is intended.
tfidf_vectorizer = TfidfVectorizer(stop_words='english')
X_train_tfidf = tfidf_vectorizer.fit_transform(raw_documents=news_train.data)
X_test_tfidf = tfidf_vectorizer.transform(raw_documents=news_test.data)

clf_tfidf = MultinomialNB()
clf_tfidf.fit(X_train_tfidf, news_train.target)
y_pred_tfidf = clf_tfidf.predict(X=X_test_tfidf)

# Report test accuracy for the stop-word-filtered model.
print(accuracy_score(y_true=news_test.target, y_pred=y_pred_tfidf))

0.9555353901996371
