# Support Vector Machines

In [24]:
import os

import pandas as pd
from sklearn.svm import SVC
from sklearn.metrics import accuracy_score, confusion_matrix, roc_curve, auc
from sklearn.model_selection import train_test_split

In [2]:
os.chdir("/Users/Dippies/CODE PACKT - EML - FINAL/Chapter 3/Support Vector Machines")
os.getcwd()

'/Users/Dippies/CODE PACKT - EML - FINAL/Chapter 3/Support Vector Machines'

In [None]:
os.chdir(".../Chapter 3/Support Vector Machines")
os.getcwd()

In [4]:
df_bankdata = pd.read_csv("bank.csv")

In [5]:
df_bankdata.shape

(4521, 17)

In [6]:
df_bankdata.dtypes

age           int64
job          object
marital      object
education    object
default      object
balance       int64
housing      object
loan         object
contact      object
day           int64
month        object
duration      int64
campaign      int64
pdays         int64
previous      int64
poutcome     object
y            object
dtype: object

In [7]:
df_bankdata.corr()

Unnamed: 0,age,balance,day,duration,campaign,pdays,previous
age,1.0,0.08382,-0.017853,-0.002367,-0.005148,-0.008894,-0.003511
balance,0.08382,1.0,-0.008677,-0.01595,-0.009976,0.009437,0.026196
day,-0.017853,-0.008677,1.0,-0.024629,0.160706,-0.094352,-0.059114
duration,-0.002367,-0.01595,-0.024629,1.0,-0.068382,0.01038,0.01808
campaign,-0.005148,-0.009976,0.160706,-0.068382,1.0,-0.093137,-0.067833
pdays,-0.008894,0.009437,-0.094352,0.01038,-0.093137,1.0,0.577562
previous,-0.003511,0.026196,-0.059114,0.01808,-0.067833,0.577562,1.0


In [8]:
df_bankdata.isnull().sum()

age          0
job          0
marital      0
education    0
default      0
balance      0
housing      0
loan         0
contact      0
day          0
month        0
duration     0
campaign     0
pdays        0
previous     0
poutcome     0
y            0
dtype: int64

#### Perform standardization on numeric variables

In [9]:
print("Total number of class labels: {}".format(df_bankdata.shape[0]))
print("Number of people opted for Term Deposit: {}".format(df_bankdata[df_bankdata.y == 'yes'].shape[0]))
print("Number of people not opted for Term Deposit: {}".format(df_bankdata[df_bankdata.y == 'no'].shape[0]))

Total number of class labels: 4521
Number of people opted for Term Deposit: 521
Number of people not opted for Term Deposit: 4000


In [10]:
# We convert our target class to 1 & 0
df_bankdata['y'] = (df_bankdata['y']=='yes').astype(int)

In [11]:
# Using select_dtypes() to select only the non-numeric type variable
column_type = ['object']
df_bank_data_category_cols = df_bankdata.select_dtypes(column_type)

# This will give you the names of the non-numeric variables
category_column_names = df_bank_data_category_cols.columns.values.tolist()
category_column_names

['job',
 'marital',
 'education',
 'default',
 'housing',
 'loan',
 'contact',
 'month',
 'poutcome']

In [13]:
for each_col in category_column_names:
    dummy_var = pd.get_dummies(df_bank_data_category_cols[each_col], prefix=each_col)
    df_joindata = df_bankdata.join(dummy_var)
    df_joindata.drop([each_col], axis=1, inplace=True)
    df_bankdata = df_joindata

In [14]:
# Separate features & response variable
X=df_bankdata.iloc[:, :-1]
Y=df_bankdata['y']

In [15]:
X.shape

(4521, 51)

In [19]:
X_train, X_test, Y_train, Y_test = train_test_split(X, Y, test_size=0.2, random_state=1)

In [20]:
Y_train.shape

(3616,)

In [19]:
Y_train.dtypes

dtype('int64')

#### Using default rbf kernel

In [21]:
svc_model = SVC(kernel='rbf') 
svc_model.fit(X_train, Y_train)

train_predictedvalues=svc_model.predict(X_train)
test_predictedvalues=svc_model.predict(X_test)

In [26]:
print('Train Accuracy Score:')
print(accuracy_score(Y_train,train_predictedvalues))

print('Test Accuracy Score:')
print(accuracy_score(Y_test,test_predictedvalues))

Train Accuracy Score:
0.99889380531
Test Accuracy Score:
0.872928176796


#### With Polynomial kernel

In [None]:
from sklearn.svm import SVC
from sklearn import metrics

svc_model = SVC(kernel='poly') 
svc_model.fit(X_train, Y_train)

train_predictedvalues=svc_model.predict(X_train)
test_predictedvalues=svc_model.predict(X_test)

print('Train Accuracy Score:')
print(metrics.accuracy_score(Y_train,train_predictedvalues))

print('Test Accuracy Score:')
print(metrics.accuracy_score(Y_test,test_predictedvalues))

#### Using linear kernel

In [22]:
from sklearn.svm import SVC
from sklearn import metrics

svc_model = SVC(kernel='linear') 
svc_model.fit(X_train, Y_train)

train_predictedvalues=svc_model.predict(X_train)
test_predictedvalues=svc_model.predict(X_test)

print('Train Accuracy Score:')
print(metrics.accuracy_score(Y_train,train_predictedvalues))

print('Test Accuracy Score:')
print(metrics.accuracy_score(Y_test,test_predictedvalues))

Train Accuracy Score:
0.997234513274
Test Accuracy Score:
0.994475138122
