In [23]:
import numpy as np
import pandas as pd
from ISLP import load_data
from ISLP.models import ModelSpec as MS
import sklearn.model_selection as skm
from sklearn.svm import SVC

In [44]:
# Part A: Initialization
OJ = load_data("OJ")
OJ['Store7'] = OJ.Store7.map(dict(Yes=1,No=0))
model = MS(OJ.columns.drop('Purchase'), intercept=False)
D = model.fit_transform(OJ)
feature_names = list(D.columns)
X = np.asarray(D)

In [46]:
X_train, X_test, y_train, y_test = skm.train_test_split(X,
                                                    OJ["Purchase"],
                                                    test_size=270,
                                                    random_state=0)

In [47]:
# Part B
# Fit Support vector classifier
svm_linear = SVC(C=0.01, # Tolerance for missclassification
                 kernel='linear')
svm_linear.fit(X_train, y_train)

In [52]:
svm_linear.n_support_

array([302, 300], dtype=int32)

In [58]:
# Part C: train and test error
y_hat = svm_linear.predict(X_train)
print("Train error" , np.mean(np.abs(y_hat != y_train)))

Train error 0.28375


In [59]:
y_hat_test = svm_linear.predict(X_test)
print("Test error", np.mean(np.abs(y_hat_test != y_test)))

Test error 0.32222222222222224


In [61]:
# Part D
# Cross val to find best C
kfold = skm.KFold(5,
                  random_state=0,
                  shuffle=True)
grid = skm.GridSearchCV(svm_linear,
                        {'C':[0.001,0.01,0.1,1,5,10]},
                         refit=True,
                        cv=kfold,
                        scoring='accuracy')
grid.fit(X_test, y_test)
grid.best_params_

{'C': 5}

In [64]:
# Part E: Train Test error
best_ = grid.best_estimator_
y_hat_ = best_.predict(X_train)
print("Train error", np.mean(np.abs(y_hat_!=y_train)))

Train error 0.1775


In [66]:
y_hat_test_ = best_.predict(X_test)
print("Test error", np.mean(np.abs(y_hat_test_ != y_test)))

Test error 0.15555555555555556


In [68]:
# Part F: Doing the same with radial kernel

svm_rbf = SVC(C=0.01,
              kernel='rbf')
# Cross val to find best C
kfold = skm.KFold(5,
                  random_state=0,
                  shuffle=True)
grid = skm.GridSearchCV(svm_rbf,
                        {'C':[0.001,0.01,0.1,1,5,10]},
                         refit=True,
                        cv=kfold,
                        scoring='accuracy')
grid.fit(X_test, y_test)
grid.best_params_

{'C': 0.001}

In [69]:
best_ = grid.best_estimator_
y_hat_ = best_.predict(X_train)
print("Train error", np.mean(np.abs(y_hat_!=y_train)))

Train error 0.375


In [70]:
y_hat_test_ = best_.predict(X_test)
print("Test error", np.mean(np.abs(y_hat_test_ != y_test)))

Test error 0.43333333333333335


In [None]:
# Part G : Doing the same with quadratic kernel
svm_poly = SVC(C=0.01,
              kernel='poly',
              degree=100)
# Cross val to find best C
kfold = skm.KFold(5,
                  random_state=0,
                  shuffle=True)
grid = skm.GridSearchCV(svm_poly,
                        {'C':[0.001,0.01,0.1,1,5,10]},
                         refit=True,
                        cv=kfold,
                        scoring='accuracy')
grid.fit(X_test, y_test)
grid.best_params_

In [75]:
best_ = grid.best_estimator_
y_hat_ = best_.predict(X_train)
print("Train error", np.mean(np.abs(y_hat_!=y_train)))

Train error 0.32625


In [76]:
y_hat_test_ = best_.predict(X_test)
print("Test error", np.mean(np.abs(y_hat_test_ != y_test)))

Test error 0.337037037037037
