# Text Classification with Naive Bayes

In [2]:
import numpy as np
import matplotlib.pyplot as plt
import sklearn as sk
from sklearn.datasets import fetch_20newsgroups
%pylab inline

# Load the 20 Newsgroups dataset with subset="all"; the following cells read
# news.data, news.target and news.target_names from this object.
news = fetch_20newsgroups(subset="all")



    

Populating the interactive namespace from numpy and matplotlib


In [3]:
# news.data holds a list of raw text contents rather than a numpy matrix;
# show the first post together with its entry in news.target.
print(news.data[0], news.target[0])

From: Mamatha Devineni Ratnam <mr47+@andrew.cmu.edu>
Subject: Pens fans reactions
Organization: Post Office, Carnegie Mellon, Pittsburgh, PA
Lines: 12
NNTP-Posting-Host: po4.andrew.cmu.edu



I am sure some bashers of Pens fans are pretty confused about the lack
of any kind of posts about the recent Pens massacre of the Devils. Actually,
I am  bit puzzled too and a bit relieved. However, I am going to put an end
to non-PIttsburghers' relief with a bit of praise for the Pens. Man, they
are killing those Devils worse than I thought. Jagr just showed you why
he is much better than his regular season stats. He is also a lot
fo fun to watch in the playoffs. Bowman should let JAgr have a lot of
fun in the next couple of games since the Pens are going to beat the pulp out of Jersey anyway. I was very disappointed not to see the Islanders lose the final
regular season game.          PENS RULE!!!

 10


In [6]:
# Report the number of posts and the container types of the targets and of
# the target names.
print(
    len(news.data),
    type(news.target),
    type(news.target_names),
)

18846 <class 'numpy.ndarray'> <class 'list'>


In [7]:
# Split the corpus into a training part and a testing part by position:
# the first SPLIT_PERC of the samples is used for training, the rest for testing.
SPLIT_PERC = 0.75  # fraction of the samples assigned to the training set

split_size = int(SPLIT_PERC * len(news.data))

# Texts and targets are cut at the same index so they stay aligned.
X_train, X_test = news.data[:split_size], news.data[split_size:]
y_train, y_test = news.target[:split_size], news.target[split_size:]

print(len(X_train), len(X_test))

14134 4712


In [25]:
# Training Naive Bayes classifiers with two different text vectorizers (raw counts and TF-IDF)

from sklearn.naive_bayes import MultinomialNB
from sklearn.pipeline import Pipeline
from sklearn.feature_extraction.text import TfidfVectorizer, HashingVectorizer, CountVectorizer

# Pipeline 1: CountVectorizer features fed into multinomial Naive Bayes.
clf_1 = Pipeline(
    [
        ("vect", CountVectorizer()),
        ("clf", MultinomialNB()),
    ]
)

# Pipeline 2: same classifier, but with TfidfVectorizer features.
clf_2 = Pipeline(
    [
        ("vect", TfidfVectorizer()),
        ("clf", MultinomialNB()),
    ]
)


In [26]:
#defining a function that takes a classifier and performs the K-fold cross-validation over the specified X and y values

from sklearn.model_selection import cross_val_score, KFold
from scipy.stats import sem

def evaluate_cross_validation(clf, X, y, K=5):
    """Run shuffled K-fold cross-validation and print the resulting scores.

    Parameters
    ----------
    clf : estimator
        Anything accepted by ``cross_val_score``; this notebook passes
        ``Pipeline`` objects.
    X : sequence
        Samples to cross-validate on.
    y : sequence
        Targets aligned with ``X``.
    K : int, default 5
        Number of folds. It is forwarded to ``KFold`` so the caller's choice
        is actually honoured.

    Returns
    -------
    None
        The per-fold scores, their mean and the standard error of the mean
        are printed rather than returned.
    """
    # Shuffle before splitting; the fixed random_state keeps the folds
    # identical from one run to the next.
    cv = KFold(n_splits=K, shuffle=True, random_state=0)

    # No explicit scoring is given, so the score used is the one returned by
    # the estimator's own score method (accuracy).
    scores = cross_val_score(clf, X, y, cv=cv)

    print(scores)
    print(f"Mean score: {np.mean(scores)} +/- {sem(scores)}")


# Cross-validate both baseline pipelines on the whole corpus with 5 folds.
clfs = [clf_1, clf_2]

for clf in clfs:
    evaluate_cross_validation(clf, X=news.data, y=news.target, K=5)

[0.85782493 0.85725657 0.84664367 0.85911382 0.8458477 ]
Mean score: 0.853337340146793 +/- 0.0029134723097208348
[0.84482759 0.85990979 0.84558238 0.85990979 0.84213319]
Mean score: 0.8504725482840962 +/- 0.003895171759717566


In [29]:
# TF-IDF pipeline with a custom token pattern: a token is a run of at least
# three characters drawn from lowercase letters, digits, "_", "-" and ".",
# with at least one letter that is neither its first nor its last character.
clf_4 = Pipeline(
    [
        (
            "vect",
            TfidfVectorizer(token_pattern=r"\b[a-z0-9_\-\.]+[a-z][a-z0-9_\-\.]+\b"),
        ),
        ("clf", MultinomialNB()),
    ]
)

In [31]:
# Cross-validate the custom-token-pattern pipeline on the whole corpus (K = 5).
evaluate_cross_validation(clf_4, news.data, news.target, 5)

[0.86100796 0.8718493  0.86203237 0.87291059 0.8588485 ]
Mean score: 0.8653297422150406 +/- 0.002928572778614428


In [40]:
"""using stop_words: this argument allows us to pass a list of words we do not want to take into account,
such as too frequent words, or words we do not a priori expect to provide information about the particular topic"""

def get_stop_words(path="stopwords_en.txt"):
    """Load stop words from a text file that holds one word per line.

    Parameters
    ----------
    path : str, default "stopwords_en.txt"
        File to read. The default is just a placeholder name for the
        stop-word text file used by this notebook.

    Returns
    -------
    set of str
        Every line with its surrounding whitespace stripped. Blank lines are
        skipped so the empty string never ends up in the result.

    Raises
    ------
    OSError
        If the file cannot be opened.
    """
    # The context manager closes the file handle even when reading fails;
    # a bare open(...).readlines() leaves the handle open.
    with open(path, "r") as stop_words_file:
        stripped_lines = (line.strip() for line in stop_words_file)
        return {word for word in stripped_lines if word}

In [41]:
# TfidfVectorizer documents stop_words as "english", a list or None, so the
# set returned by get_stop_words() is converted to a (sorted) list before use.
clf_5 = Pipeline(
    [
        (
            "vect",
            TfidfVectorizer(
                stop_words=sorted(get_stop_words()),
                token_pattern=r"\b[a-z0-9_\-\.]+[a-z][a-z0-9_\-\.]+\b",
            ),
        ),
        ("clf", MultinomialNB()),
    ]
)

evaluate_cross_validation(clf_5, news.data, news.target, 5)

[0.86100796 0.8718493  0.86203237 0.87291059 0.8588485 ]
Mean score: 0.8653297422150406 +/- 0.002928572778614428


In [51]:
"experimenting with some alpha parameter"

# Values tried for the alpha parameter of MultinomialNB.
alph = [0.01, 0.1, 0.3, 0.7]

# Read the stop-word file once instead of on every iteration. The set is
# converted to a list because TfidfVectorizer documents stop_words as
# "english", a list or None.
stop_words_list = sorted(get_stop_words())

for alpha in alph:
    clf_7 = Pipeline(
        [
            (
                "vect",
                TfidfVectorizer(
                    stop_words=stop_words_list,
                    token_pattern=r"\b[a-z0-9_\-\.]+[a-z][a-z0-9_\-\.]+\b",
                ),
            ),
            ("clf", MultinomialNB(alpha=alpha)),
        ]
    )

    # Results are printed in the same order as the values in `alph`.
    evaluate_cross_validation(clf_7, news.data, news.target, 5)
    

[0.9198939  0.919342   0.91748474 0.92491377 0.91775007]
Mean score: 0.9198768960520454 +/- 0.0013398190194147679
[0.90742706 0.91138233 0.90342266 0.91483152 0.90368798]
Mean score: 0.9081503089914722 +/- 0.0022121334442564903
[0.89230769 0.89546299 0.88882993 0.89572831 0.88644203]
Mean score: 0.8917541890319816 +/- 0.0018251788140663436
[0.86976127 0.88033961 0.87052269 0.87927832 0.86786946]
Mean score: 0.8735542710918965 +/- 0.002595188239430988


In [50]:
"""Now we will define a helper function that trains the model on the entire training set and evaluates the accuracy on
both the training and the testing sets. It will also print a classification report (precision and recall for every class)
and the corresponding confusion matrix:"""

from sklearn import metrics

# Pipeline trained on the training split with alpha = 0.01. The stop-word set
# is converted to a sorted list because TfidfVectorizer documents stop_words
# as "english", a list or None.
clf_8 = Pipeline(
    [
        (
            "vect",
            TfidfVectorizer(
                stop_words=sorted(get_stop_words()),
                token_pattern=r"\b[a-z0-9_\-\.]+[a-z][a-z0-9_\-\.]+\b",
            ),
        ),
        ("clf", MultinomialNB(alpha=0.01)),
    ]
)

def train_and_evaluate(clf, X_train, X_test, y_train, y_test):
    """Fit ``clf`` on the training data and print an evaluation summary.

    Parameters
    ----------
    clf : classifier
        Object exposing ``fit``, ``score`` and ``predict``; this notebook
        passes a ``Pipeline``.
    X_train, y_train : sequences
        Samples and targets used for fitting.
    X_test, y_test : sequences
        Held-out samples and targets used for evaluation.

    Returns
    -------
    None
        Training accuracy, testing accuracy, the classification report and
        the confusion matrix are printed.
    """
    clf.fit(X_train, y_train)

    print(f"Accuracy on training set: {clf.score(X_train, y_train)} ")

    # Predict the test set a single time and reuse the predictions for the
    # accuracy, the report and the confusion matrix. Calling clf.score on the
    # test data as well would run the whole prediction a second time.
    y_pred = clf.predict(X_test)

    print(f"Accuracy on testing set: {metrics.accuracy_score(y_test, y_pred)} ")

    print("Classification Report:")

    print(metrics.classification_report(y_test, y_pred))

    print(f"Confusion Matrix:{metrics.confusion_matrix(y_test, y_pred)}")


# Fit clf_8 on the training split and print its evaluation on the testing split.
train_and_evaluate(clf_8, X_train, X_test, y_train, y_test)

Accuracy on training set: 0.9966746851563606 
Accuracy on testing set: 0.9159592529711376 
Classification Report:
              precision    recall  f1-score   support

           0       0.95      0.88      0.91       216
           1       0.83      0.84      0.83       246
           2       0.91      0.83      0.87       274
           3       0.81      0.86      0.83       235
           4       0.87      0.90      0.89       231
           5       0.89      0.91      0.90       225
           6       0.88      0.80      0.84       248
           7       0.93      0.93      0.93       275
           8       0.96      0.98      0.97       226
           9       0.97      0.94      0.96       250
          10       0.97      1.00      0.98       257
          11       0.96      0.98      0.97       261
          12       0.90      0.91      0.91       216
          13       0.94      0.96      0.95       257
          14       0.94      0.96      0.95       246
          15       0.