In [2]:
#Loading the data set - training data.
import sklearn
from sklearn.datasets import fetch_20newsgroups
twenty_train = fetch_20newsgroups(subset='train', shuffle=True)

In [3]:
#checking the target names (categories) and some data files by following commands.
twenty_train.target_names #prints all the categories

['alt.atheism',
 'comp.graphics',
 'comp.os.ms-windows.misc',
 'comp.sys.ibm.pc.hardware',
 'comp.sys.mac.hardware',
 'comp.windows.x',
 'misc.forsale',
 'rec.autos',
 'rec.motorcycles',
 'rec.sport.baseball',
 'rec.sport.hockey',
 'sci.crypt',
 'sci.electronics',
 'sci.med',
 'sci.space',
 'soc.religion.christian',
 'talk.politics.guns',
 'talk.politics.mideast',
 'talk.politics.misc',
 'talk.religion.misc']

In [4]:
print("\n".join(twenty_train.data[0].split("\n")[:3])) 
#prints first line of the first data file

From: lerxst@wam.umd.edu (where's my thing)
Subject: WHAT car is this!?
Nntp-Posting-Host: rac3.wam.umd.edu


In [5]:
# Extracting features from text files
from sklearn.feature_extraction.text import CountVectorizer
count_vect = CountVectorizer()
X_train_counts = count_vect.fit_transform(twenty_train.data)
X_train_counts.shape

(11314, 130107)

In [6]:
# TF-IDF
from sklearn.feature_extraction.text import TfidfTransformer
tfidf_transformer = TfidfTransformer()
X_train_tfidf = tfidf_transformer.fit_transform(X_train_counts)
X_train_tfidf.shape

(11314, 130107)

In [7]:
# Machine Learning
# Training Naive Bayes (NB) classifier on training data.
from sklearn.naive_bayes import MultinomialNB
clf = MultinomialNB().fit(X_train_tfidf, twenty_train.target)

In [8]:
# Building a pipeline: We can write less code and do all of the above, by building a pipeline as follows:
# The names ‘vect’ , ‘tfidf’ and ‘clf’ are arbitrary but will be used later.
# We will be using the 'text_clf' going forward.
from sklearn.pipeline import Pipeline

text_clf = Pipeline([('vect', CountVectorizer()), ('tfidf', TfidfTransformer()), ('clf', MultinomialNB())])

text_clf = text_clf.fit(twenty_train.data, twenty_train.target)

In [9]:
# Performance of NB Classifier
import numpy as np
twenty_test = fetch_20newsgroups(subset='test', shuffle=True)
predicted = text_clf.predict(twenty_test.data)
np.mean(predicted == twenty_test.target)


0.7738980350504514

In [10]:
#finding precision
from sklearn.metrics import precision_score
precision_score(twenty_test.target, predicted, average="macro")

0.8255310124210137

In [20]:
#building confusion matrix
sklearn.metrics.multilabel_confusion_matrix(twenty_test.target, predicted)

array([[[7172,   41],
        [ 153,  166]],

       [[7084,   59],
        [ 137,  252]],

       [[7081,   57],
        [ 136,  258]],

       [[6991,  149],
        [  87,  305]],

       [[7097,   50],
        [  87,  298]],

       [[7100,   37],
        [  97,  298]],

       [[7122,   20],
        [ 119,  271]],

       [[7070,   66],
        [  32,  364]],

       [[7109,   25],
        [  27,  371]],

       [[7105,   30],
        [  40,  357]],

       [[7086,   47],
        [  12,  387]],

       [[6874,  262],
        [  13,  383]],

       [[7093,   46],
        [ 158,  235]],

       [[7111,   25],
        [ 104,  292]],

       [[7072,   66],
        [  43,  351]],

       [[6633,  501],
        [   6,  392]],

       [[6979,  189],
        [  23,  341]],

       [[7130,   26],
        [  32,  344]],

       [[7216,    6],
        [ 181,  129]],

       [[7280,    1],
        [ 216,   35]]])

In [12]:
# Training Support Vector Machines - SVM and calculating its performance

from sklearn.linear_model import SGDClassifier
text_clf_svm = Pipeline([('vect', CountVectorizer()), ('tfidf', TfidfTransformer()),
                         ('clf-svm', SGDClassifier(loss='hinge', penalty='l2',
                                            alpha=1e-3, max_iter=12, random_state=42))])

text_clf_svm = text_clf_svm.fit(twenty_train.data, twenty_train.target)
predicted_svm = text_clf_svm.predict(twenty_test.data)
np.mean(predicted_svm == twenty_test.target)

0.8240839086563994

In [13]:
#finding precision for SVM
precision_score(twenty_test.target, predicted_svm, average="macro")

0.8263675044574992

In [14]:
#building confusion matrix for SVM
sklearn.metrics.multilabel_confusion_matrix(twenty_test.target, predicted_svm)

array([[[7129,   84],
        [  93,  226]],

       [[7079,   64],
        [ 120,  269]],

       [[7015,  123],
        [  86,  308]],

       [[7046,   94],
        [ 130,  262]],

       [[7074,   73],
        [  66,  319]],

       [[7078,   59],
        [  92,  303]],

       [[7075,   67],
        [  40,  350]],

       [[7104,   32],
        [  42,  354]],

       [[7103,   31],
        [  13,  385]],

       [[7089,   46],
        [  39,  358]],

       [[7080,   53],
        [   3,  396]],

       [[7063,   73],
        [  15,  381]],

       [[7088,   51],
        [ 151,  242]],

       [[7087,   49],
        [  56,  340]],

       [[7064,   74],
        [  15,  379]],

       [[7009,  125],
        [  22,  376]],

       [[7022,  146],
        [  28,  336]],

       [[7120,   36],
        [  26,  350]],

       [[7196,   26],
        [ 136,  174]],

       [[7262,   19],
        [ 152,   99]]])

In [15]:
# Grid Search
# Here, we are creating a list of parameters for which we would like to do performance tuning. 
# All the parameters name start with the classifier name (remember the arbitrary name we gave). 
# E.g. vect__ngram_range; here we are telling to use unigram and bigrams and choose the one which is optimal.

from sklearn.model_selection import GridSearchCV
parameters = {'vect__ngram_range': [(1, 1), (1, 2)], 'tfidf__use_idf': (True, False), 'clf__alpha': (1e-2, 1e-3)}

In [16]:
# Next, we create an instance of the grid search by passing the classifier, parameters 
# and n_jobs=-1 which tells to use multiple cores from user machine.

gs_clf = GridSearchCV(text_clf, parameters, n_jobs=-1)
gs_clf = gs_clf.fit(twenty_train.data, twenty_train.target)

In [25]:
# To see the best mean score and the params

print(gs_clf.best_score_)
gs_clf.best_params_



0.9157684864695698


{'clf__alpha': 0.001, 'tfidf__use_idf': True, 'vect__ngram_range': (1, 2)}

In [26]:
#Accuracy of NB after hyperparameter tuning
predicted_gs_nb = gs_clf.predict(twenty_test.data)
np.mean(predicted_gs_nb == twenty_test.target)


0.8361656930430165

In [27]:
#Precision of NB after hyperparameter tuning
precision_score(twenty_test.target, predicted_gs_nb, average="macro")

0.8351706629293817

In [18]:
# Similarly doing grid search for SVM
from sklearn.model_selection import GridSearchCV
parameters_svm = {'vect__ngram_range': [(1, 1), (1, 2)], 
                  'tfidf__use_idf': (True, False),'clf-svm__alpha': (1e-2, 1e-3)}

gs_clf_svm = GridSearchCV(text_clf_svm, parameters_svm, n_jobs=-1)
gs_clf_svm = gs_clf_svm.fit(twenty_train.data, twenty_train.target)


print(gs_clf_svm.best_score_)
gs_clf_svm.best_params_

0.9051618841994754


{'clf-svm__alpha': 0.001, 'tfidf__use_idf': True, 'vect__ngram_range': (1, 2)}

In [28]:
#Accuracy of SVM after hyperparameter tuning
predicted_gs_svm = gs_clf_svm.predict(twenty_test.data)
np.mean(predicted_gs_svm == twenty_test.target)

0.8351035581518853

In [29]:
#Precision of SVM after hyperparameter tuning
precision_score(twenty_test.target, predicted_gs_svm, average="macro")

0.8384991589890454

In [19]:
# NLTK
# Removing stop words
from sklearn.pipeline import Pipeline
text_clf = Pipeline([('vect', CountVectorizer(stop_words='english')), ('tfidf', TfidfTransformer()), 
                     ('clf', MultinomialNB())])

In [20]:
# Stemming Code

import nltk
nltk.download('stopwords')

from nltk.stem.snowball import SnowballStemmer
stemmer = SnowballStemmer("english", ignore_stopwords=True)

class StemmedCountVectorizer(CountVectorizer):
    def build_analyzer(self):
        analyzer = super(StemmedCountVectorizer, self).build_analyzer()
        return lambda doc: ([stemmer.stem(w) for w in analyzer(doc)])
    
stemmed_count_vect = StemmedCountVectorizer(stop_words='english')

text_mnb_stemmed = Pipeline([('vect', stemmed_count_vect), ('tfidf', TfidfTransformer()), 
                             ('mnb', MultinomialNB(fit_prior=False))])

text_mnb_stemmed = text_mnb_stemmed.fit(twenty_train.data, twenty_train.target)

predicted_mnb_stemmed = text_mnb_stemmed.predict(twenty_test.data)

np.mean(predicted_mnb_stemmed == twenty_test.target)

[nltk_data] Downloading package stopwords to
[nltk_data]     /home/sunayna/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


0.8167817312798725

In [21]:
#precision of NB with stemming
precision_score(twenty_test.target, predicted_mnb_stemmed, average="macro")

0.8335586735879094

In [22]:
#Accuracy of SVM with stemming
text_svm_stemmed = Pipeline([('vect', CountVectorizer()), ('tfidf', TfidfTransformer()),
                         ('clf-svm', SGDClassifier(loss='hinge', penalty='l2',alpha=1e-3, 
                                                   max_iter=12, random_state=42))])
text_svm_stemmed = text_svm_stemmed.fit(twenty_train.data, twenty_train.target)

predicted_svm_stemmed = text_svm_stemmed.predict(twenty_test.data)

np.mean(predicted_svm_stemmed == twenty_test.target)

0.8240839086563994

In [23]:
#precision of SVM with stemming
precision_score(twenty_test.target, predicted_svm_stemmed, average="macro")

0.8263675044574992