### Lab 7: Naive Bayes and K-NN classifiers

##### In this lab we will work with a subset of the 20 newsgroup data that was mentioned during the Naive Bayes discussion in class. We will select only four of 10 newsgroups to work with. 

##### First we load the data and take a look at it.

In [20]:
# load training data

from sklearn.datasets import fetch_20newsgroups

categories = ['alt.atheism', 'soc.religion.christian','comp.graphics', 'sci.med', 'sci.space',
              'misc.forsale', 'rec.autos', 'rec.motorcycles', 'rec.sport.baseball', 'rec.sport.hockey']

# read the data in twenty_train.data and twenty_train.target
twenty_train = fetch_20newsgroups(subset='train', categories=categories, shuffle=True, random_state=42)

# print the class names
print (twenty_train.target_names, "\n")

# print the class names for the first 25 articles
for t in twenty_train.target[:25]:
    print(twenty_train.target[t], twenty_train.target_names[t])
    
# print the first article
print("\n", twenty_train.data[:1])

['alt.atheism', 'comp.graphics', 'misc.forsale', 'rec.autos', 'rec.motorcycles', 'rec.sport.baseball', 'rec.sport.hockey', 'sci.med', 'sci.space', 'soc.religion.christian'] 

6 rec.sport.hockey
6 soc.religion.christian
2 misc.forsale
0 rec.sport.baseball
5 sci.space
6 alt.atheism
6 rec.sport.hockey
2 misc.forsale
0 rec.sport.baseball
6 rec.sport.hockey
6 soc.religion.christian
8 rec.motorcycles
9 comp.graphics
6 rec.sport.hockey
0 rec.sport.baseball
5 sci.space
2 misc.forsale
9 comp.graphics
0 rec.sport.baseball
0 rec.sport.baseball
9 comp.graphics
5 rec.autos
5 rec.autos
6 soc.religion.christian
9 comp.graphics

 ["From: huot@cray.com (Tom Huot)\nSubject: Re: Ulf and all...\nLines: 29\nNntp-Posting-Host: pittpa.cray.com\nOrganization: Cray Research Inc.\nX-Newsreader: TIN [version 1.1 PL8]\n\nRichard Wernick (richard@amc.com) wrote:\n: You should be ashamed to call yourself an Ulf Samuelson fan. Anybody who plays\n: the way he does, does not belong in the NHL. There have been cheap sh

##### Convert the data to word counts and see how many times the word 'algorithm' appears

In [21]:
# import and use CountVectorizer
from sklearn.feature_extraction.text import CountVectorizer

count_vect = CountVectorizer()
X_train_counts = count_vect.fit_transform(twenty_train.data) 

count_vect.vocabulary_.get(u'algorithm')

10285

##### Run Naive Bayes

In [22]:
# Import Multinomial NB (this a good Naive Bayes classifier for text) and other libraries to help with analysis

from sklearn.naive_bayes import MultinomialNB
import numpy as np
from sklearn import metrics

# fit classifier using word counts
clf_1 = MultinomialNB().fit(X_train_counts, twenty_train.target)

# load the test data set and convert to word counts
twenty_test = fetch_20newsgroups(subset='test', categories=categories, shuffle=True, random_state=42)

# note that I am using .transform instead of .fit_transform. this keeps the columns the same as the training set
X_test_counts = count_vect.transform(twenty_test.data)

# make predictions on test data
predicted = clf_1.predict(X_test_counts)

# print accuracy
print (np.mean(predicted == twenty_test.target)) 

# print precision and recall statistics
print(metrics.classification_report(twenty_test.target, predicted,
    target_names=twenty_test.target_names))

# print confusion matrix
print(metrics.confusion_matrix(twenty_test.target, predicted))

0.922342621259
                        precision    recall  f1-score   support

           alt.atheism       0.91      0.87      0.89       319
         comp.graphics       0.90      0.91      0.90       389
          misc.forsale       0.95      0.85      0.90       390
             rec.autos       0.87      0.95      0.91       396
       rec.motorcycles       0.97      0.95      0.96       398
    rec.sport.baseball       0.97      0.93      0.95       397
      rec.sport.hockey       0.95      0.97      0.96       399
               sci.med       0.95      0.86      0.90       396
             sci.space       0.92      0.94      0.93       394
soc.religion.christian       0.87      0.97      0.92       398

           avg / total       0.92      0.92      0.92      3876

[[276   1   1   2   1   1   2   2   3  30]
 [  5 354   2   5   0   2   2   5  11   3]
 [  0  16 333  24   2   1   3   4   6   1]
 [  0   2   6 378   3   1   2   0   4   0]
 [  0   0   3  12 380   0   0   0   1   2]

##### Convert the data to a TF-IDF representation and run Naive Bayes

In [23]:
from sklearn.feature_extraction.text import TfidfTransformer

tfidf_transformer = TfidfTransformer()

X_train_tfidf = tfidf_transformer.fit_transform(X_train_counts)

clf_2 = MultinomialNB().fit(X_train_tfidf, twenty_train.target)

X_test_tfidf = tfidf_transformer.transform(X_test_counts)

# make predictions on test data
predicted = clf_2.predict(X_test_tfidf)

# print accuracy
print (np.mean(predicted == twenty_test.target)) 

# print precision and recall statistics
print(metrics.classification_report(twenty_test.target, predicted,
    target_names=twenty_test.target_names))

# print confusion matrix
print(metrics.confusion_matrix(twenty_test.target, predicted))

0.868937048504
                        precision    recall  f1-score   support

           alt.atheism       0.97      0.55      0.70       319
         comp.graphics       0.93      0.83      0.88       389
          misc.forsale       0.96      0.83      0.89       390
             rec.autos       0.88      0.94      0.91       396
       rec.motorcycles       0.94      0.94      0.94       398
    rec.sport.baseball       0.94      0.90      0.92       397
      rec.sport.hockey       0.92      0.97      0.94       399
               sci.med       0.94      0.77      0.85       396
             sci.space       0.92      0.91      0.91       394
soc.religion.christian       0.58      0.98      0.73       398

           avg / total       0.90      0.87      0.87      3876

[[175   1   0   0   2   1   1   6   3 130]
 [  1 322   2   9   3   7   2   1  10  32]
 [  0  10 322  20   5   4   5   6   4  14]
 [  0   3   5 373   3   2   3   1   3   3]
 [  0   0   2  12 375   0   0   0   0   9]

##### Create a pipeline for Naive Bayes for TF-IDF and rerun experiments

In [24]:
from sklearn.pipeline import Pipeline
clf_3 = Pipeline([('vect', CountVectorizer()),
                  ('tfidf', TfidfTransformer()),
                  ('clf', MultinomialNB()),
])

clf_3.fit(twenty_train.data, twenty_train.target)

# make predictions on test data
predicted = clf_3.predict(twenty_test.data)

# print accuracy
print (np.mean(predicted == twenty_test.target)) 

# print precision and recall statistics
print(metrics.classification_report(twenty_test.target, predicted,
    target_names=twenty_test.target_names))

# print confusion matrix
print(metrics.confusion_matrix(twenty_test.target, predicted))

0.868937048504
                        precision    recall  f1-score   support

           alt.atheism       0.97      0.55      0.70       319
         comp.graphics       0.93      0.83      0.88       389
          misc.forsale       0.96      0.83      0.89       390
             rec.autos       0.88      0.94      0.91       396
       rec.motorcycles       0.94      0.94      0.94       398
    rec.sport.baseball       0.94      0.90      0.92       397
      rec.sport.hockey       0.92      0.97      0.94       399
               sci.med       0.94      0.77      0.85       396
             sci.space       0.92      0.91      0.91       394
soc.religion.christian       0.58      0.98      0.73       398

           avg / total       0.90      0.87      0.87      3876

[[175   1   0   0   2   1   1   6   3 130]
 [  1 322   2   9   3   7   2   1  10  32]
 [  0  10 322  20   5   4   5   6   4  14]
 [  0   3   5 373   3   2   3   1   3   3]
 [  0   0   2  12 375   0   0   0   0   9]

##### Create a pipeline for Naive Bayes for word_counts and rerun experiments

In [25]:
# insert code here

from sklearn.pipeline import Pipeline
clf_4 = Pipeline([('vect', CountVectorizer()),
                  ('clf', MultinomialNB()),                 
])

clf_4.fit(twenty_train.data, twenty_train.target)

# make predictions on test data
predicted = clf_4.predict(twenty_test.data)

# print accuracy
print (np.mean(predicted == twenty_test.target)) 

# print precision and recall statistics
print(metrics.classification_report(twenty_test.target, predicted,
    target_names=twenty_test.target_names))

# print confusion matrix
print(metrics.confusion_matrix(twenty_test.target, predicted))

print ('\n',len(twenty_train.data))
print ('\n[samples,features]',X_train_counts.shape)


0.922342621259
                        precision    recall  f1-score   support

           alt.atheism       0.91      0.87      0.89       319
         comp.graphics       0.90      0.91      0.90       389
          misc.forsale       0.95      0.85      0.90       390
             rec.autos       0.87      0.95      0.91       396
       rec.motorcycles       0.97      0.95      0.96       398
    rec.sport.baseball       0.97      0.93      0.95       397
      rec.sport.hockey       0.95      0.97      0.96       399
               sci.med       0.95      0.86      0.90       396
             sci.space       0.92      0.94      0.93       394
soc.religion.christian       0.87      0.97      0.92       398

           avg / total       0.92      0.92      0.92      3876

[[276   1   1   2   1   1   2   2   3  30]
 [  5 354   2   5   0   2   2   5  11   3]
 [  0  16 333  24   2   1   3   4   6   1]
 [  0   2   6 378   3   1   2   0   4   0]
 [  0   0   3  12 380   0   0   0   1   2]

##### Create and test a nearest-neighbor classfier using word counts and 5 neighbors (default)

In [26]:
# k-means
from sklearn import neighbors

clf_5=neighbors.KNeighborsClassifier(n_neighbors = 5) # don't need argument here but do later

# we create an instance of Neighbours Classifier and fit the data.
clf_5.fit(X_train_counts, twenty_train.target)

predicted = clf_5.predict(X_test_counts)

print (np.mean(predicted == twenty_test.target)) 

print(metrics.classification_report(twenty_test.target, predicted,
    target_names=twenty_test.target_names))

print(metrics.confusion_matrix(twenty_test.target, predicted))

0.453818369453
                        precision    recall  f1-score   support

           alt.atheism       0.46      0.53      0.49       319
         comp.graphics       0.25      0.50      0.34       389
          misc.forsale       0.72      0.48      0.58       390
             rec.autos       0.35      0.40      0.37       396
       rec.motorcycles       0.74      0.49      0.59       398
    rec.sport.baseball       0.46      0.36      0.40       397
      rec.sport.hockey       0.50      0.52      0.51       399
               sci.med       0.53      0.26      0.35       396
             sci.space       0.84      0.36      0.50       394
soc.religion.christian       0.38      0.66      0.48       398

           avg / total       0.53      0.45      0.46      3876

[[168  23   6  16   4  13   9   9   2  69]
 [ 19 195  15  36   8  15  20   7   8  66]
 [  6 125 187  22   3   6  12   4   1  24]
 [ 19  75   9 158  14  28  29  15   3  46]
 [ 11  47   9  48 195  15  22  11   0  40]

##### Create and test a nearest-neighbor classfier using IF-IDF vectors and 5 neighbors (default)

In [27]:
# insert code here

from sklearn import neighbors

clf_6=neighbors.KNeighborsClassifier(n_neighbors = 5) # don't need argument here but do later

#TF-IDF 
tfidf_transformer = TfidfTransformer()

X_train_tfidf = tfidf_transformer.fit_transform(X_train_counts)

X_test_tfidf = tfidf_transformer.transform(X_test_counts)

# we create an instance of Neighbours Classifier and fit the data.
clf_6.fit(X_train_tfidf, twenty_train.target)

predicted = clf_6.predict(X_test_tfidf)

print (np.mean(predicted == twenty_test.target)) 

print(metrics.classification_report(twenty_test.target, predicted,
    target_names=twenty_test.target_names))

print(metrics.confusion_matrix(twenty_test.target, predicted))


0.764705882353
                        precision    recall  f1-score   support

           alt.atheism       0.52      0.85      0.64       319
         comp.graphics       0.74      0.73      0.73       389
          misc.forsale       0.76      0.66      0.71       390
             rec.autos       0.80      0.79      0.79       396
       rec.motorcycles       0.84      0.83      0.83       398
    rec.sport.baseball       0.82      0.78      0.80       397
      rec.sport.hockey       0.86      0.88      0.87       399
               sci.med       0.89      0.54      0.67       396
             sci.space       0.86      0.77      0.81       394
soc.religion.christian       0.71      0.84      0.77       398

           avg / total       0.78      0.76      0.77      3876

[[270   2   5   1   0   1   0   7   5  28]
 [ 35 285  13  14   9   8   5   3   9   8]
 [ 11  31 256  17  21  17  10   6  11  10]
 [ 16  17  15 313  11   6   7   2   2   7]
 [ 17   3   9  18 329   2   4   2   2  12]

##### Using TF-IDF vectors, write code that uses cross-validation to select the number of neighbors

In [28]:
# insert code here 1

from sklearn.model_selection import KFold
from sklearn.linear_model import Ridge

# number of folds
k = 10

X = X_train_tfidf
y =  twenty_train.target

kf = KFold(n_splits=k, shuffle=True, random_state=None)

best_rss = float("infinity")
for i in range(1,10):
    clf_7=neighbors.KNeighborsClassifier(n_neighbors = i)
    rss_sum=0

    
    # compute 10-fold cv RSS
    for train_index, test_index in kf.split(X_train_tfidf):
        X_tr, X_te = X[train_index], X[test_index]
        y_tr, y_te = y[train_index], y[test_index]
        clf_7.fit(X_tr,y_tr)
        y_pred = clf_7.predict(X_te)
        rss = sum((y_pred-y_te)**2)
        rss_sum = rss_sum + rss
        
    if (rss_sum < best_rss):
        best_neighbour = i
        best_rss = rss_sum
        
print("Selected number of neighbours = ", best_neighbour)
# print (y_pred)

Selected number of neighbours =  1


##### Evaluate the performance of k-nn using the selected number of neighbors

In [31]:
# insert code here

from sklearn import neighbors

clf_8=neighbors.KNeighborsClassifier(n_neighbors = best_neighbour) # don't need argument here but do later

#TF-IDF 
tfidf_transformer = TfidfTransformer()

X_train_tfidf = tfidf_transformer.fit_transform(X_train_counts)

X_test_tfidf = tfidf_transformer.transform(X_test_counts)

# we create an instance of Neighbours Classifier and fit the data.
clf_8.fit(X_train_tfidf, twenty_train.target)

predicted = clf_8.predict(X_test_tfidf)

print (np.mean(predicted == twenty_test.target)) 

print(metrics.classification_report(twenty_test.target, predicted,
    target_names=twenty_test.target_names))

print(metrics.confusion_matrix(twenty_test.target, predicted))

0.776315789474
                        precision    recall  f1-score   support

           alt.atheism       0.64      0.84      0.73       319
         comp.graphics       0.71      0.71      0.71       389
          misc.forsale       0.76      0.64      0.70       390
             rec.autos       0.82      0.75      0.78       396
       rec.motorcycles       0.83      0.86      0.84       398
    rec.sport.baseball       0.79      0.79      0.79       397
      rec.sport.hockey       0.83      0.86      0.85       399
               sci.med       0.84      0.67      0.75       396
             sci.space       0.82      0.85      0.83       394
soc.religion.christian       0.74      0.80      0.77       398

           avg / total       0.78      0.78      0.78      3876

[[267   3   5   1   3   3   0   8   7  22]
 [ 17 276  18  15   8   9  10   8  16  12]
 [  4  31 250  12  17  26  16   5  16  13]
 [  8  18  16 298  17   7   8  10  11   3]
 [ 12   8   2  14 341   4   3   7   3   4]