In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
%matplotlib inline

In [2]:
spammail=pd.read_csv('spambase.csv')

In [3]:
X=spammail.drop('spam',axis=1)

In [4]:
y=spammail['spam']

In [5]:
from sklearn.model_selection import train_test_split
X_train,X_test,y_train,y_test=train_test_split(X,y,test_size=0.20)

In [6]:
#Feature scaling-standardizing features by removing mean and scaling to unit variance
from sklearn.preprocessing import StandardScaler
scaler_x=StandardScaler()
X_Train=scaler_x.fit_transform(X_train)
X_Test=scaler_x.transform(X_test)

In [7]:
tuned_parameters_quad = [{'kernel':['poly'],'degree':[2],'C':[1,10,100,1000,10000,30000,50000]}]
tuned_parameters_linear=[{'kernel':['linear'],'C':[1,2]}]
tuned_parameters_rbf=[{'kernel':['rbf'],'C':[1,10,100,1000,10000,100000],'gamma':['scale','auto']}]

In [8]:
#SVM model to predict if a mail is spam or non spam
#In order to vary regulation parameter C and decide an optimal value, we are using an exhaustive grid search
#C has been given the values of 1,10,100,1000 and 10000

def svmmodel(tuned_parameters):
  from sklearn.svm import SVC
  from sklearn.model_selection import GridSearchCV
  from sklearn.metrics import confusion_matrix
  svclassifier = GridSearchCV(SVC(), param_grid=tuned_parameters, scoring='accuracy',verbose=10,n_jobs=-1)
  svclassifier.fit(X_train, y_train)
  print('Scores:')
  means = svclassifier.cv_results_['mean_test_score']
  stds = svclassifier.cv_results_['std_test_score']
  for mean, std, params in zip(means, stds, svclassifier.cv_results_['params']):
    print("%0.3f (+/-%0.03f) for %r" % (mean, std * 2, params))
    print()
  print('Best score:')
  print(svclassifier.best_params_)
  y_true_test, y_predtest = y_test, svclassifier.predict(X_test)
  y_true_train, y_predtrain = y_train, svclassifier.predict(X_train)
  cfmatrixtrain=confusion_matrix(y_true_train,y_predtrain)
  cfmatrixtest=confusion_matrix(y_true_test,y_predtest)
  cfmetrics(cfmatrixtrain,cfmatrixtest)

In [9]:
def cfmetrics(cfmatrixtrain,cfmatrixtest):  
  print('confusion matrix for training data:')
  print(cfmatrixtrain)
  TN=cfmatrixtrain[0][0]
  FN=cfmatrixtrain[1][0]
  TP=cfmatrixtrain[1][1]
  FP=cfmatrixtrain[0][1]
  accuracy_train=(TN+TP)/(TN+TP+FN+FP)
  precision_train=(TP)/(TP+FP)
  recall_train=TP/(TP+FN)
  print('Training accuracy')
  print(accuracy_train)
  print('Training precision')
  print(precision_train)
  print('Training recall')
  print(recall_train)
  print('confusion matrix for test data:')
  print(cfmatrixtest)
  TN=cfmatrixtest[0][0]
  FN=cfmatrixtest[1][0]
  TP=cfmatrixtest[1][1]
  FP=cfmatrixtest[0][1]
  accuracy_test=(TN+TP)/(TN+TP+FN+FP)
  precision_test=(TP)/(TP+FP)
  recall_test=TP/(TP+FN)
  print('Test accuracy')
  print(accuracy_test)
  print('Test precision')
  print(precision_test)
  print('Test recall')
  print(recall_test)

In [10]:
#Linear kernel function for SVM
svmmodel(tuned_parameters_linear)

Fitting 5 folds for each of 2 candidates, totalling 10 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 4 concurrent workers.
[Parallel(n_jobs=-1)]: Done   5 out of  10 | elapsed: 38.1min remaining: 38.1min
[Parallel(n_jobs=-1)]: Done   7 out of  10 | elapsed: 52.3min remaining: 22.4min
[Parallel(n_jobs=-1)]: Done  10 out of  10 | elapsed: 59.3min finished


Scores:
0.926 (+/-0.009) for {'C': 1, 'kernel': 'linear'}

0.925 (+/-0.012) for {'C': 2, 'kernel': 'linear'}

Best score:
{'C': 1, 'kernel': 'linear'}
confusion matrix for training data:
[[2143  101]
 [ 146 1290]]
Training accuracy
0.9328804347826087
Training precision
0.9273903666427031
Training recall
0.8983286908077994
confusion matrix for test data:
[[516  28]
 [ 34 343]]
Test accuracy
0.9326818675352877
Test precision
0.9245283018867925
Test recall
0.9098143236074271


In [11]:
#Quadratic kernel function for SVM
svmmodel(tuned_parameters_quad)

Fitting 5 folds for each of 7 candidates, totalling 35 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 4 concurrent workers.
[Parallel(n_jobs=-1)]: Done   5 tasks      | elapsed:    6.8s
[Parallel(n_jobs=-1)]: Done  10 tasks      | elapsed:   23.5s
[Parallel(n_jobs=-1)]: Done  17 tasks      | elapsed:   56.8s
[Parallel(n_jobs=-1)]: Done  24 tasks      | elapsed:  1.6min
[Parallel(n_jobs=-1)]: Done  32 out of  35 | elapsed:  4.6min remaining:   25.9s
[Parallel(n_jobs=-1)]: Done  35 out of  35 | elapsed:  6.8min finished


Scores:
0.670 (+/-0.013) for {'C': 1, 'degree': 2, 'kernel': 'poly'}

0.678 (+/-0.024) for {'C': 10, 'degree': 2, 'kernel': 'poly'}

0.708 (+/-0.019) for {'C': 100, 'degree': 2, 'kernel': 'poly'}

0.770 (+/-0.023) for {'C': 1000, 'degree': 2, 'kernel': 'poly'}

0.833 (+/-0.024) for {'C': 10000, 'degree': 2, 'kernel': 'poly'}

0.854 (+/-0.025) for {'C': 30000, 'degree': 2, 'kernel': 'poly'}

0.867 (+/-0.025) for {'C': 50000, 'degree': 2, 'kernel': 'poly'}

Best score:
{'C': 50000, 'degree': 2, 'kernel': 'poly'}
confusion matrix for training data:
[[2182   62]
 [ 403 1033]]
Training accuracy
0.873641304347826
Training precision
0.94337899543379
Training recall
0.7193593314763231
confusion matrix for test data:
[[530  14]
 [ 93 284]]
Test accuracy
0.8838219326818675
Test precision
0.9530201342281879
Test recall
0.753315649867374


In [12]:
#RBF kernel function for SVM
svmmodel(tuned_parameters_rbf)

Fitting 5 folds for each of 12 candidates, totalling 60 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 4 concurrent workers.
[Parallel(n_jobs=-1)]: Done   5 tasks      | elapsed:    9.6s
[Parallel(n_jobs=-1)]: Done  10 tasks      | elapsed:   12.3s
[Parallel(n_jobs=-1)]: Done  17 tasks      | elapsed:   16.9s
[Parallel(n_jobs=-1)]: Done  24 tasks      | elapsed:   20.3s
[Parallel(n_jobs=-1)]: Done  33 tasks      | elapsed:   25.9s
[Parallel(n_jobs=-1)]: Done  42 tasks      | elapsed:   33.7s
[Parallel(n_jobs=-1)]: Done  53 tasks      | elapsed:   54.9s
[Parallel(n_jobs=-1)]: Done  60 out of  60 | elapsed:  1.1min remaining:    0.0s
[Parallel(n_jobs=-1)]: Done  60 out of  60 | elapsed:  1.1min finished


Scores:
0.706 (+/-0.025) for {'C': 1, 'gamma': 'scale', 'kernel': 'rbf'}

0.821 (+/-0.025) for {'C': 1, 'gamma': 'auto', 'kernel': 'rbf'}

0.732 (+/-0.015) for {'C': 10, 'gamma': 'scale', 'kernel': 'rbf'}

0.845 (+/-0.020) for {'C': 10, 'gamma': 'auto', 'kernel': 'rbf'}

0.811 (+/-0.020) for {'C': 100, 'gamma': 'scale', 'kernel': 'rbf'}

0.840 (+/-0.019) for {'C': 100, 'gamma': 'auto', 'kernel': 'rbf'}

0.897 (+/-0.008) for {'C': 1000, 'gamma': 'scale', 'kernel': 'rbf'}

0.835 (+/-0.016) for {'C': 1000, 'gamma': 'auto', 'kernel': 'rbf'}

0.925 (+/-0.009) for {'C': 10000, 'gamma': 'scale', 'kernel': 'rbf'}

0.835 (+/-0.016) for {'C': 10000, 'gamma': 'auto', 'kernel': 'rbf'}

0.933 (+/-0.012) for {'C': 100000, 'gamma': 'scale', 'kernel': 'rbf'}

0.835 (+/-0.016) for {'C': 100000, 'gamma': 'auto', 'kernel': 'rbf'}

Best score:
{'C': 100000, 'gamma': 'scale', 'kernel': 'rbf'}
confusion matrix for training data:
[[2165   79]
 [ 113 1323]]
Training accuracy
0.9478260869565217
Training precis

In [13]:
#RBF kernel function for SVM
svmmodel(tuned_parameters_rbf)

Fitting 5 folds for each of 12 candidates, totalling 60 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 4 concurrent workers.
[Parallel(n_jobs=-1)]: Done   5 tasks      | elapsed:    3.8s
[Parallel(n_jobs=-1)]: Done  10 tasks      | elapsed:    6.4s
[Parallel(n_jobs=-1)]: Done  17 tasks      | elapsed:   11.3s
[Parallel(n_jobs=-1)]: Done  24 tasks      | elapsed:   14.7s
[Parallel(n_jobs=-1)]: Done  33 tasks      | elapsed:   20.5s
[Parallel(n_jobs=-1)]: Done  42 tasks      | elapsed:   28.1s
[Parallel(n_jobs=-1)]: Done  53 tasks      | elapsed:   49.3s
[Parallel(n_jobs=-1)]: Done  60 out of  60 | elapsed:   58.5s remaining:    0.0s
[Parallel(n_jobs=-1)]: Done  60 out of  60 | elapsed:   58.5s finished


Scores:
0.706 (+/-0.025) for {'C': 1, 'gamma': 'scale', 'kernel': 'rbf'}

0.821 (+/-0.025) for {'C': 1, 'gamma': 'auto', 'kernel': 'rbf'}

0.732 (+/-0.015) for {'C': 10, 'gamma': 'scale', 'kernel': 'rbf'}

0.845 (+/-0.020) for {'C': 10, 'gamma': 'auto', 'kernel': 'rbf'}

0.811 (+/-0.020) for {'C': 100, 'gamma': 'scale', 'kernel': 'rbf'}

0.840 (+/-0.019) for {'C': 100, 'gamma': 'auto', 'kernel': 'rbf'}

0.897 (+/-0.008) for {'C': 1000, 'gamma': 'scale', 'kernel': 'rbf'}

0.835 (+/-0.016) for {'C': 1000, 'gamma': 'auto', 'kernel': 'rbf'}

0.925 (+/-0.009) for {'C': 10000, 'gamma': 'scale', 'kernel': 'rbf'}

0.835 (+/-0.016) for {'C': 10000, 'gamma': 'auto', 'kernel': 'rbf'}

0.933 (+/-0.012) for {'C': 100000, 'gamma': 'scale', 'kernel': 'rbf'}

0.835 (+/-0.016) for {'C': 100000, 'gamma': 'auto', 'kernel': 'rbf'}

Best score:
{'C': 100000, 'gamma': 'scale', 'kernel': 'rbf'}
confusion matrix for training data:
[[2165   79]
 [ 113 1323]]
Training accuracy
0.9478260869565217
Training precis

In [14]:
tuned_parameters_linear=[{'kernel':['linear'],'C':[0.1,1]}]
svmmodel(tuned_parameters_linear)

Fitting 5 folds for each of 2 candidates, totalling 10 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 4 concurrent workers.
[Parallel(n_jobs=-1)]: Done   5 out of  10 | elapsed:  3.2min remaining:  3.2min
[Parallel(n_jobs=-1)]: Done   7 out of  10 | elapsed: 15.4min remaining:  6.6min
[Parallel(n_jobs=-1)]: Done  10 out of  10 | elapsed: 21.4min finished


Scores:
0.921 (+/-0.019) for {'C': 0.1, 'kernel': 'linear'}

0.926 (+/-0.009) for {'C': 1, 'kernel': 'linear'}

Best score:
{'C': 1, 'kernel': 'linear'}
confusion matrix for training data:
[[2143  101]
 [ 146 1290]]
Training accuracy
0.9328804347826087
Training precision
0.9273903666427031
Training recall
0.8983286908077994
confusion matrix for test data:
[[516  28]
 [ 34 343]]
Test accuracy
0.9326818675352877
Test precision
0.9245283018867925
Test recall
0.9098143236074271
