# HW #6: Evaluating SVM
---

In this HW, you'll explore several data sets with SVM classifiers and compare them to logistic regression classifiers.  

Use markdown cells to explain your observation.  

We will use two data sets:

**Breast Cancer**

    breast_cancer.csv

**Car Evaluation**

    car.csv


In [0]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

from sklearn.svm import SVC
from sklearn.linear_model import LogisticRegression
from sklearn import linear_model, datasets
from sklearn.model_selection import cross_val_score
from sklearn.metrics import confusion_matrix

plt.style.use('fivethirtyeight')

from ipywidgets import *
from IPython.display import display

%matplotlib inline
%config InlineBackend.figure_format = 'retina'

In [2]:
#1 Code to read csv file into colaboratory:
!pip install -U -q PyDrive
from pydrive.auth import GoogleAuth
from pydrive.drive import GoogleDrive
from google.colab import auth
from oauth2client.client import GoogleCredentials

# 1. Authenticate and create the PyDrive client.
auth.authenticate_user()
gauth = GoogleAuth()
gauth.credentials = GoogleCredentials.get_application_default()
drive = GoogleDrive(gauth)

[?25l[K    1% |▎                               | 10kB 18.0MB/s eta 0:00:01[K    2% |▋                               | 20kB 1.7MB/s eta 0:00:01[K    3% |█                               | 30kB 2.5MB/s eta 0:00:01[K    4% |█▎                              | 40kB 1.7MB/s eta 0:00:01[K    5% |█▋                              | 51kB 2.1MB/s eta 0:00:01[K    6% |██                              | 61kB 2.5MB/s eta 0:00:01[K    7% |██▎                             | 71kB 2.8MB/s eta 0:00:01[K    8% |██▋                             | 81kB 3.2MB/s eta 0:00:01[K    9% |███                             | 92kB 3.6MB/s eta 0:00:01[K    10% |███▎                            | 102kB 2.7MB/s eta 0:00:01[K    11% |███▋                            | 112kB 2.8MB/s eta 0:00:01[K    12% |████                            | 122kB 3.9MB/s eta 0:00:01[K    13% |████▎                           | 133kB 3.9MB/s eta 0:00:01[K    14% |████▋                           | 143kB 7.3MB/s eta 0:00:01[

In [0]:
#2. Get the file
#make sure you upload all your data files to your Google drive and change share->Advanced->change->anyone with the link can view
downloaded = drive.CreateFile({'id':'1kxV5c1FgaUM-KFoHVAM7-np6LnpIFZxQ'}) # replace the id with id of file you want to access
downloaded.GetContentFile('breast_cancer.csv')


downloaded = drive.CreateFile({'id':'1ILf9_rzNvU9ip4_tyQx3oVSrMciMg_sm'}) # replace the id with id of file you want to access
downloaded.GetContentFile('car.csv')

### 1) Load the breast cancer data.

- Are there any missing values? If so, impute or clean the data set.
- Select a classification target and predictors.

In [0]:
df = pd.read_csv('breast_cancer.csv', na_values='?')

In [5]:
# A: take a sample of sample to see how the data is columns and values
df.head(10)

Unnamed: 0,Sample_code_number,Clump_Thickness,Uniformity_of_Cell_Size,Uniformity_of_Cell_Shape,Marginal_Adhesion,Single_Epithelial_Cell_Size,Bare_Nuclei,Bland_Chromatin,Normal_Nucleoli,Mitoses,Class
0,1000025,5,1,1,1,2,1.0,3,1,1,2
1,1002945,5,4,4,5,7,10.0,3,2,1,2
2,1015425,3,1,1,1,2,2.0,3,1,1,2
3,1016277,6,8,8,1,3,4.0,3,7,1,2
4,1017023,4,1,1,3,2,1.0,3,1,1,2
5,1017122,8,10,10,8,7,10.0,9,7,1,4
6,1018099,1,1,1,1,2,10.0,3,1,1,2
7,1018561,2,1,2,1,2,1.0,3,1,1,2
8,1033078,2,1,1,1,2,1.0,1,1,5,2
9,1033078,4,2,1,1,2,1.0,2,1,1,2


In [6]:
# to check the number of rows and columns
df.shape

(699, 11)

In [7]:
# get the value counts for target class to see how many types of target are present
df['Class'].value_counts()

2    458
4    241
Name: Class, dtype: int64

In [8]:
# encoded the target class 2 -> 0 and 4 -> 1
from sklearn.preprocessing import LabelEncoder
df['Class'] = LabelEncoder().fit_transform(df['Class'])
df.head(5)

Unnamed: 0,Sample_code_number,Clump_Thickness,Uniformity_of_Cell_Size,Uniformity_of_Cell_Shape,Marginal_Adhesion,Single_Epithelial_Cell_Size,Bare_Nuclei,Bland_Chromatin,Normal_Nucleoli,Mitoses,Class
0,1000025,5,1,1,1,2,1.0,3,1,1,0
1,1002945,5,4,4,5,7,10.0,3,2,1,0
2,1015425,3,1,1,1,2,2.0,3,1,1,0
3,1016277,6,8,8,1,3,4.0,3,7,1,0
4,1017023,4,1,1,3,2,1.0,3,1,1,0


In [9]:
# check for null values if present
df.isnull().sum()

Sample_code_number              0
Clump_Thickness                 0
Uniformity_of_Cell_Size         0
Uniformity_of_Cell_Shape        0
Marginal_Adhesion               0
Single_Epithelial_Cell_Size     0
Bare_Nuclei                    16
Bland_Chromatin                 0
Normal_Nucleoli                 0
Mitoses                         0
Class                           0
dtype: int64

In [10]:
# clean the dataset
df.dropna(inplace=True)
print(df.isnull().sum())
print(df.shape)

Sample_code_number             0
Clump_Thickness                0
Uniformity_of_Cell_Size        0
Uniformity_of_Cell_Shape       0
Marginal_Adhesion              0
Single_Epithelial_Cell_Size    0
Bare_Nuclei                    0
Bland_Chromatin                0
Normal_Nucleoli                0
Mitoses                        0
Class                          0
dtype: int64
(683, 11)


In [11]:
# checking the correlation values with target class
df.corr()['Class'].sort_values(ascending=False)[1:]

Bare_Nuclei                    0.822696
Uniformity_of_Cell_Shape       0.821891
Uniformity_of_Cell_Size        0.820801
Bland_Chromatin                0.758228
Normal_Nucleoli                0.718677
Clump_Thickness                0.714790
Marginal_Adhesion              0.706294
Single_Epithelial_Cell_Size    0.690958
Mitoses                        0.423448
Sample_code_number            -0.084701
Name: Class, dtype: float64

In [0]:
# dropped the column Sample_code_number as it has negative correlation with the target.  
X = df.drop(['Class', 'Sample_code_number'],axis=1)
y = df['Class']

### 2) Build an SVM classifier on the data.

For details on the SVM classifier, [click here](http://scikit-learn.org/stable/modules/generated/sklearn.svm.SVC.html).

- Initialize and train a linear SVM. What's the average accuracy score with a five-fold cross-validation?
- Repeat using a radial basis function (RBF) classifier. Compare the scores. Which one is better?
- Print a confusion matrix and classification report for your best model using training and testing data.

**[View Classification report and Confusion matrix coding example here](http://joshlawman.com/metrics-classification-report-breakdown-precision-recall-f1/)**

In [13]:
# A: To avoid future warnings
import warnings
warnings.simplefilter(action='ignore', category=FutureWarning)

from sklearn.model_selection import train_test_split

# split the data into 2 parts test and train with 30% test data.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=42)


svc = SVC(kernel = 'linear')        # linear
print( "SVM with linear kernel:",cross_val_score(svc, X_train, y_train, cv=5, scoring='accuracy').mean())

rbf_svc = SVC (kernel = 'rbf')           # rbf
print("SVM with rbf kernel:",cross_val_score(rbf_svc, X_train, y_train, cv=5, scoring='accuracy').mean())


SVM with linear kernel: 0.9644503979019714
SVM with rbf kernel: 0.9519060860915175


### SVM with Linear kernel is better which has 96.45 % accuracy.

In [0]:
from sklearn.model_selection import cross_val_predict
# predicting target values for test data
y_pred = cross_val_predict(svc,X_test,y_test)


In [15]:
from sklearn.metrics import classification_report
# classification report for test target when compared to predicted values.
print(classification_report(y_test,y_pred))

              precision    recall  f1-score   support

           0       0.99      0.97      0.98       127
           1       0.95      0.99      0.97        78

   micro avg       0.98      0.98      0.98       205
   macro avg       0.97      0.98      0.97       205
weighted avg       0.98      0.98      0.98       205



In [16]:
from sklearn.metrics import confusion_matrix
# confusion matrix for test data
confusion_df = pd.DataFrame(confusion_matrix(y_test,y_pred),
             columns=["Predicted Class " + str(class_name) for class_name in [0,1]],
             index = ["Class " + str(class_name) for class_name in [0,1]])

print(confusion_df)

         Predicted Class 0  Predicted Class 1
Class 0                123                  4
Class 1                  1                 77


#### 2.A) Are there more false positives or false negatives? Is this good or bad?

### A:  There are more false positive when compared to false negatives. There are 4 False Positives and 1 False negative predicted for test data set , False positives are good for this case as on the further screening we can detect it as incorrect result but False negatives can cause lot of trouble as if the patient has cancer and diagnosed as not having will not be undergoing any further screening and will have a risk of increase or worsen the health condition.

### 3) Compare SVM and logistic regression using cars' dataset.

You should work on the following:

- With default hyperparameters, compare the performance of SVM vs. Logistic Regression
- What choice of kernel would give SVM the best performance

Use the following to discuss performance
- The mean value of cross-validate scores.
- Examine confusion matrices and classification reports.


In [0]:
car = pd.read_csv('car.csv')

In [18]:
# A: get sample data to examine data
car.sample(10)

Unnamed: 0,buying,maint,doors,persons,lug_boot,safety,acceptability
1245,med,low,4,2,med,low,unacc
1456,low,high,3,more,big,med,acc
3,vhigh,vhigh,2,2,med,low,unacc
1521,low,med,2,4,small,low,unacc
1192,med,low,2,2,med,med,unacc
793,high,low,3,4,small,med,unacc
1626,low,low,2,2,big,low,unacc
409,vhigh,low,5more,2,med,med,unacc
560,high,high,2,more,small,high,unacc
212,vhigh,high,5more,more,med,high,unacc


In [19]:
# to see the number of columns and rows
car.shape

(1728, 7)

In [20]:
# to check for null values
car.isnull().sum()

buying           0
maint            0
doors            0
persons          0
lug_boot         0
safety           0
acceptability    0
dtype: int64

In [0]:
from sklearn.preprocessing import LabelEncoder
y = LabelEncoder().fit_transform(car['acceptability'])
X = pd.get_dummies(car.drop('acceptability', axis=1))

from sklearn.model_selection import train_test_split

# split the data into 2 parts test and train with 30% test data.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=42)

In [22]:
# to see the counts of target variable.
pd.Series(y).value_counts()

2    1210
0     384
1      69
3      65
dtype: int64

In [23]:
# comparing accuracy for logistic and default svm
logreg = LogisticRegression()
print("Logistic Regression:",cross_val_score(logreg, X_train, y_train, cv=5, scoring='accuracy').mean())

svc = SVC()
print( "SVM with default  kernel:",cross_val_score(svc, X_train, y_train, cv=5, scoring='accuracy').mean())

Logistic Regression: 0.8693193412940369
SVM with default  kernel: 0.8800599666857991


### SVM with default parameters  and default kernel is better when compared to logistic regression with 88 % accuracy.

In [24]:
# compaing accuracy scores for different kernels of svm
linear_svc = SVC(kernel = 'linear')        # linear
print( "SVM with linear kernel:",cross_val_score(svc, X_train, y_train, cv=5, scoring='accuracy').mean())


poly_svc = SVC(kernel = 'poly')        #  polys
print("SVM with poly kernel:",cross_val_score(poly_svc, X_train, y_train, cv=5, scoring='accuracy').mean())

rbf_svc = SVC (kernel = 'rbf')           # rbf
print("SVM with rbf kernel:",cross_val_score(rbf_svc, X_train, y_train, cv=5, scoring='accuracy').mean())

SVM with linear kernel: 0.8800599666857991
SVM with poly kernel: 0.7047227629986379
SVM with rbf kernel: 0.8800599666857991


###Both Linear kernel and rbf kernel gives data the best performance with 88% accuracy.

In [0]:
from sklearn.model_selection import cross_val_predict
# predicting target values for test data with all kernels
y_pred_linear = cross_val_predict(linear_svc,X_test,y_test)
y_pred_poly = cross_val_predict(poly_svc,X_test,y_test)
y_pred_rbf = cross_val_predict(rbf_svc,X_test,y_test)

In [26]:
from sklearn.metrics import classification_report
# classification report for test target when compared to predicted values of linear svm kernel.
print(classification_report(y_test,y_pred_linear))

              precision    recall  f1-score   support

           0       0.77      0.84      0.80       118
           1       0.62      0.53      0.57        19
           2       0.97      0.95      0.96       358
           3       0.88      0.88      0.88        24

   micro avg       0.90      0.90      0.90       519
   macro avg       0.81      0.80      0.80       519
weighted avg       0.91      0.90      0.90       519



In [34]:
from sklearn.metrics import confusion_matrix
# confusion matrix for test data of linear svm kernel.
confusion_df = pd.DataFrame(confusion_matrix(y_test,y_pred_linear),
             columns=["Predicted Class " + str(class_name) for class_name in [0,1,2,3]],
             index = ["Class " + str(class_name) for class_name in [0,1,2,3]])

print(confusion_df)

         Predicted Class 0  Predicted Class 1  Predicted Class 2  \
Class 0                 99                  6                 12   
Class 1                  7                 10                  0   
Class 2                 19                  0                339   
Class 3                  3                  0                  0   

         Predicted Class 3  
Class 0                  1  
Class 1                  2  
Class 2                  0  
Class 3                 21  


In [28]:
from sklearn.metrics import classification_report
# classification report for test target when compared to predicted values of polynomial svm kernel.
print(classification_report(y_test,y_pred_poly))

              precision    recall  f1-score   support

           0       0.00      0.00      0.00       118
           1       0.00      0.00      0.00        19
           2       0.69      1.00      0.82       358
           3       0.00      0.00      0.00        24

   micro avg       0.69      0.69      0.69       519
   macro avg       0.17      0.25      0.20       519
weighted avg       0.48      0.69      0.56       519



  'precision', 'predicted', average, warn_for)


In [29]:
from sklearn.metrics import confusion_matrix
# confusion matrix for test data of polynomial svm kernel.
confusion_df = pd.DataFrame(confusion_matrix(y_test,y_pred_poly),
             columns=["Predicted Class " + str(class_name) for class_name in [0,1,2,3]],
             index = ["Class " + str(class_name) for class_name in [0,1,2,3]])

print(confusion_df)

         Predicted Class 0  Predicted Class 1  Predicted Class 2  \
Class 0                  0                  0                118   
Class 1                  0                  0                 19   
Class 2                  0                  0                358   
Class 3                  0                  0                 24   

         Predicted Class 3  
Class 0                  0  
Class 1                  0  
Class 2                  0  
Class 3                  0  


In [30]:
from sklearn.metrics import classification_report
# classification report for test target when compared to predicted values of rbf svm kernel.
print(classification_report(y_test,y_pred_rbf))

              precision    recall  f1-score   support

           0       0.59      0.91      0.71       118
           1       0.00      0.00      0.00        19
           2       0.97      0.91      0.94       358
           3       0.00      0.00      0.00        24

   micro avg       0.83      0.83      0.83       519
   macro avg       0.39      0.45      0.41       519
weighted avg       0.80      0.83      0.81       519



  'precision', 'predicted', average, warn_for)


In [31]:
from sklearn.metrics import confusion_matrix
# confusion matrix for test data of rbf svm kernel.
confusion_df = pd.DataFrame(confusion_matrix(y_test,y_pred_rbf),
             columns=["Predicted Class " + str(class_name) for class_name in [0,1,2,3]],
             index = ["Class " + str(class_name) for class_name in [0,1,2,3]])

print(confusion_df)

         Predicted Class 0  Predicted Class 1  Predicted Class 2  \
Class 0                107                  0                 11   
Class 1                 19                  0                  0   
Class 2                 32                  0                326   
Class 3                 24                  0                  0   

         Predicted Class 3  
Class 0                  0  
Class 1                  0  
Class 2                  0  
Class 3                  0  


### When compared with mean values of accuracy in cross validataion both linear and rbf performed same but when seeing the classification reports and confusion matrices , linear kernel did well(predicted better as the precision is better and recall score is also better compared to rbf, rbf have zero for class1 and 3 which is very unusual) when compared to rbf kernel.