## Lab 8: Support Vector Machines and Grid Search

#### Let's start by loading the spambase data to work with.

In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

spambase = pd.read_csv('spambase.csv')

print("Spambase loaded.")

Spambase loaded.


#### Train a support vectore classifier using the default kernel and C value

In [2]:
from sklearn import svm
from sklearn.model_selection import train_test_split
from sklearn import metrics

# first we need to prepare the data

# Copy spambase across and drop spam_class (we don't include the class feature in the training data)
X = spambase
X = X.drop('spam_class', axis=1)

# Set y as the spam column, we need to wrap it in the dataframe to stop it being series 
y_df = pd.DataFrame(spambase.spam_class)

# flatten y into a 1-D array
y = np.ravel(y_df)

# now split the data into training data and test data (80/20 split)
# random_state = k sets a specific random seed
# since random_state is fixed, this call will always produce the same split
# if you leave out the random_state value, you will get a different random split when you run it agaimn
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=0)


#cut and pste from here 
# create our model and fit it to our training data
clf = svm.SVC(kernel='rbf',C=1) # defaults
clf.fit(X_train, y_train) 

# make predictions on test data
predicted = clf.predict(X_test)

# print accuracy
print (np.mean(predicted == y_test)) 

# print precision and recall statistics
print(metrics.classification_report(y_test, predicted))

# print confusion matrix
print(metrics.confusion_matrix(y_test, predicted))

0.833876221498
             precision    recall  f1-score   support

          0       0.86      0.86      0.86       538
          1       0.80      0.80      0.80       383

avg / total       0.83      0.83      0.83       921

[[463  75]
 [ 78 305]]


#### Experiment with parameter values

In [None]:
# try out other kernels ('linear' 'poly' 'sigmoid') and other values for C
# clf = svm.SVC(kernel='linear',C=3) # defaults
# clf.fit(X_train, y_train) 

# # make predictions on test data
# predicted = clf.predict(X_test)

# # print accuracy
# print (np.mean(predicted == y_test)) 

# # print precision and recall statistics
# print(metrics.classification_report(y_test, predicted))

# # print confusion matrix
# print(metrics.confusion_matrix(y_test, predicted))


clf = svm.SVC(kernel='poly',C=2) # defaults
clf.fit(X_train, y_train) 

# make predictions on test data
predicted = clf.predict(X_test)

# print accuracy
print (np.mean(predicted == y_test)) 

# print precision and recall statistics
print(metrics.classification_report(y_test, predicted))

# print confusion matrix
print(metrics.confusion_matrix(y_test, predicted))



# clf = svm.SVC(kernel='sigmoid',C=3) # defaults
# clf.fit(X_train, y_train) 

# # make predictions on test data
# predicted = clf.predict(X_test)

# # print accuracy
# print (np.mean(predicted == y_test)) 

# # print precision and recall statistics
# print(metrics.classification_report(y_test, predicted))

# # print confusion matrix
# print(metrics.confusion_matrix(y_test, predicted))


#### Use GridSearchCV to select parameters

In [3]:
# This takes a while to run.

from sklearn.model_selection import GridSearchCV

parameters = {'kernel':('linear', 'rbf'), 'C':[1, 10]}

svc = svm.SVC()

clf = GridSearchCV(svc, parameters, n_jobs=-1) # n_jobs -> number of parallel jobs
                                               # -1 -> whatever the architecture allows

clf.fit(X_train, y_train)

print(clf.best_params_,"\n")

# make predictions on test data
predicted = clf.predict(X_test)

# print accuracy
print (np.mean(predicted == y_test)) 

# print precision and recall statistics
print(metrics.classification_report(y_test, predicted))

# print confusion matrix
print(metrics.confusion_matrix(y_test, predicted))

{'C': 1, 'kernel': 'linear'} 

0.914223669924
             precision    recall  f1-score   support

          0       0.92      0.93      0.93       538
          1       0.91      0.89      0.90       383

avg / total       0.91      0.91      0.91       921

[[503  35]
 [ 44 339]]


##### Use the following documentation (and any other documentation you wish to use to implement a multi-class SVM using the data from last week's lab. Only, use the following four newsgroups (rather than all ten): 'rec.autos', 'rec.motorcycles', 'rec.sport.baseball', 'rec.sport.hockey'

http://scikit-learn.org/stable/modules/svm.html

In [12]:
# insert code here

from sklearn.datasets import fetch_20newsgroups
from sklearn.feature_extraction.text import CountVectorizer, TfidfTransformer




# load newsgroup train and test data
# categories = ['rec.autos', 'rec.motorcycles', 'rec.sport.baseball', 'rec.sport.hockey']

categories = ['rec.autos', 'rec.motorcycles', 'rec.sport.baseball', 'rec.sport.hockey']

# read the data in twenty_train.data and twenty_train.target
twenty_train = fetch_20newsgroups(subset='train', categories=categories, shuffle=True, random_state=42)

# load the test data set and convert to word counts
twenty_test = fetch_20newsgroups(subset='test', categories=categories, shuffle=True, random_state=42)


# print the class names
print (twenty_train.target_names, "\n")
print (twenty_test.target_names, "\n")

# print the class names for the first 25 articles
for t in twenty_train.target[:25]:
    print(twenty_train.target[t], twenty_train.target_names[t])
    
# print the first article
# print("\n", twenty_train.data[:1])




['rec.autos', 'rec.motorcycles', 'rec.sport.baseball', 'rec.sport.hockey'] 

['rec.autos', 'rec.motorcycles', 'rec.sport.baseball', 'rec.sport.hockey'] 

0 rec.autos
2 rec.sport.hockey
0 rec.autos
0 rec.sport.baseball
0 rec.sport.baseball
0 rec.sport.baseball
0 rec.sport.baseball
0 rec.autos
2 rec.sport.hockey
3 rec.motorcycles
3 rec.motorcycles
0 rec.sport.baseball
2 rec.sport.hockey
0 rec.sport.baseball
0 rec.autos
0 rec.autos
0 rec.autos
0 rec.sport.baseball
0 rec.sport.baseball
2 rec.sport.hockey
3 rec.motorcycles
0 rec.autos
0 rec.sport.baseball
0 rec.autos
0 rec.autos


In [13]:
from sklearn import svm

# transform data into to tfidf vectors


count_vect = CountVectorizer()

X_train_counts = count_vect.fit_transform(twenty_train.data) 
# note that I am using .transform instead of .fit_transform. this keeps the columns the same as the training set
X_test_counts = count_vect.transform(twenty_test.data)


tfidf_transformer = TfidfTransformer()
X_train_tfidf = tfidf_transformer.fit_transform(X_train_counts)

# clf_2 = MultinomialNB().fit(X_train_tfidf, twenty_train.target)

X_test_tfidf = tfidf_transformer.transform(X_test_counts)


X = X_train_tfidf
y =  twenty_train.target

lin_clf = svm.LinearSVC(multi_class='ovr')

lin_clf.fit(X,y)
# clf= svm.SVC(kernel='liner', decision_function_shape = 'ovo')
# make predictions on test data
predicted = lin_clf.predict(X_test_tfidf)

# print accuracy
print (np.mean(predicted == twenty_test.target)) 

# print precision and recall statistics
print(metrics.classification_report(twenty_test.target, predicted,
    target_names=twenty_test.target_names))

# print confusion matrix
print(metrics.confusion_matrix(twenty_test.target, predicted))

# fit and evaluate a multi-class svm


0.969182389937
                    precision    recall  f1-score   support

         rec.autos       0.96      0.98      0.97       396
   rec.motorcycles       0.98      0.96      0.97       398
rec.sport.baseball       0.97      0.95      0.96       397
  rec.sport.hockey       0.97      0.98      0.98       399

       avg / total       0.97      0.97      0.97      1590

[[388   5   3   0]
 [ 14 382   2   0]
 [  3   2 379  13]
 [  1   1   5 392]]
