In [545]:
from sklearn.datasets import fetch_20newsgroups
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfTransformer
from sklearn.naive_bayes import MultinomialNB
from sklearn.pipeline import Pipeline
import numpy as np
from sklearn import metrics
from sklearn.svm import LinearSVC

In [546]:
# Restrict the corpus to four newsgroups so the notebook stays quick to run.
categories = [
    'alt.atheism',
    'soc.religion.christian',
    'comp.graphics',
    'sci.med',
]

# Load the training split; shuffling uses a fixed seed so reruns see the same order.
twenty_train = fetch_20newsgroups(
    data_home=None,
    subset='train',
    categories=categories,
    shuffle=True,
    random_state=42,
    remove=(),
    download_if_missing=True,
)

# Display the label names of the loaded split.
twenty_train.target_names

['alt.atheism', 'comp.graphics', 'sci.med', 'soc.religion.christian']

In [547]:
# Learn the vocabulary from the training documents and build the
# document-term count matrix in a single pass.
count_vect = CountVectorizer()
X_train_counts = count_vect.fit_transform(raw_documents=twenty_train.data)

# Display the dimensions of the count matrix.
X_train_counts.shape

(2257, 35788)

In [548]:
# Fit the tf-idf transformer on the raw counts, then re-weight those same counts.
tfidf_transformer = TfidfTransformer().fit(X_train_counts)
X_train_tfidf = tfidf_transformer.transform(X_train_counts)

# Display the dimensions of the re-weighted matrix.
X_train_tfidf.shape

(2257, 35788)

In [549]:
# Disabled scratch cell: every line below is commented out, so nothing in this
# cell executes. It sketches fitting a classifier without a Pipeline.
# NOTE(review): before re-enabling, check the inputs -- the first line fits on
# the raw `twenty_train.data`, the predict call is given raw
# `twenty_test.data`, and the test fetch does not pass `categories`.
#clf = MultinomialNB().fit(twenty_train.data, twenty_train.target)
#clf = LinearSVC().fit(X_train_tfidf, twenty_train.target)
#twenty_test = fetch_20newsgroups(subset='test', shuffle=True)
#predicted = clf.predict(twenty_test.data)
#np.mean(predicted == twenty_test.target)
#print(clf)

In [550]:
# Chain stop-word-filtered token counts, tf-idf weighting and a multinomial
# Naive Bayes classifier, then fit the whole chain on the raw training text.
text_clf = Pipeline(
    steps=[
        ('vect', CountVectorizer(stop_words='english')),
        ('tfidf', TfidfTransformer()),
        ('clf', MultinomialNB()),
    ]
).fit(twenty_train.data, twenty_train.target)

# Load the test split for the same categories, shuffled with the same seed.
twenty_test = fetch_20newsgroups(
    subset='test',
    categories=categories,
    shuffle=True,
    random_state=42,
)
data_test = twenty_test.data

# Predict on the raw test documents and report the fraction of exact label matches.
predicted = text_clf.predict(data_test)
print("Accuracy with Multinomial Naive Bayes: ", (predicted == twenty_test.target).mean())

Accuracy with Multinomial Naive Bayes:  0.8894806924101198


In [551]:
# Per-class precision / recall / F1 for the Naive Bayes predictions.
nb_report = metrics.classification_report(
    twenty_test.target,
    predicted,
    target_names=twenty_test.target_names,
)
print("Results for Multinomial Naive Bayes:\n", nb_report)

Results for Multinomial Naive Bayes:
                         precision    recall  f1-score   support

           alt.atheism       0.97      0.72      0.83       319
         comp.graphics       0.95      0.95      0.95       389
               sci.med       0.95      0.88      0.92       396
soc.religion.christian       0.76      0.97      0.85       398

           avg / total       0.90      0.89      0.89      1502



In [552]:
# Same feature steps as the Naive Bayes pipeline, with a linear SVM
# (hinge loss, L2 penalty, fixed seed) as the final estimator.
# NOTE(review): max_iter=5 is a very small iteration budget for LinearSVC and
# the solver may stop before converging -- confirm it is intentional. It is
# left unchanged here because raising it would change the recorded scores.
text_clf = Pipeline([
    ('vect', CountVectorizer(stop_words='english')),
    ('tfidf', TfidfTransformer()),
    ('clf', LinearSVC(loss='hinge', penalty='l2', random_state=42, max_iter=5)),
])
text_clf = text_clf.fit(twenty_train.data, twenty_train.target)

# Evaluate on the `twenty_test` split already loaded by the Naive Bayes cell.
# The original re-fetched it with the identical subset, categories and seed,
# which only repeated the load without changing the data.
predicted = text_clf.predict(twenty_test.data)
print("Accuracy with Linear Support Vector Classification: ", np.mean(predicted == twenty_test.target))

Accuracy with Linear Support Vector Classification:  0.9234354194407457


In [553]:
# Per-class precision / recall / F1 for the LinearSVC predictions.
svc_report = metrics.classification_report(
    twenty_test.target,
    predicted,
    target_names=twenty_test.target_names,
)
print("Results for LinearSVC:\n", svc_report)

Results for LinearSVC:
                         precision    recall  f1-score   support

           alt.atheism       0.96      0.82      0.88       319
         comp.graphics       0.89      0.98      0.94       389
               sci.med       0.95      0.91      0.93       396
soc.religion.christian       0.90      0.96      0.93       398

           avg / total       0.93      0.92      0.92      1502

