# SVM Tree Worksheet

In [1]:
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split
import matplotlib.pyplot as plt
from sklearn import svm
from sklearn.metrics import accuracy_score
from sklearn.svm import SVC # "Support vector classifier"
from sklearn.model_selection import GridSearchCV
from sklearn.ensemble import RandomForestClassifier

#### Load pima indians diabetes dataset below

In [4]:
url = "https://raw.githubusercontent.com/jbrownlee/Datasets/master/pima-indians-diabetes.data.csv"
names = ['preg', 'plas', 'pres', 'skin', 'test', 'mass', 'pedi', 'age', 'class']

data = pd.read_csv(url, names = names)
data.head()

Unnamed: 0,preg,plas,pres,skin,test,mass,pedi,age,class
0,6,148,72,35,0,33.6,0.627,50,1
1,1,85,66,29,0,26.6,0.351,31,0
2,8,183,64,0,0,23.3,0.672,32,1
3,1,89,66,23,94,28.1,0.167,21,0
4,0,137,40,35,168,43.1,2.288,33,1


#### Split data into test and train set with the test_size being 25 %

In [13]:
x = data.iloc[:,:-1]
y = data.iloc[:,-1]

x_train, x_test, y_train, y_test = train_test_split(x, y, test_size = 0.25, random_state = 0)

y.head()

0    1
1    0
2    1
3    0
4    1
Name: class, dtype: int64

#### Create a model with sklearn's SVC and a high gamma value (perhaps 1.0)

In [14]:
model = SVC(gamma = 1.0)

model.fit(x_train, y_train)

SVC(C=1.0, cache_size=200, class_weight=None, coef0=0.0,
  decision_function_shape='ovr', degree=3, gamma=1.0, kernel='rbf',
  max_iter=-1, probability=False, random_state=None, shrinking=True,
  tol=0.001, verbose=False)

#### Print accuracy of test data and training data (the model should be trained/'fit' on the training data)

In [15]:
y_test_predict = model.predict(x_test)

accuracy_score(y_test, y_test_predict)

0.6770833333333334

#### Create a model with sklearn's SVC and a low gamma value (perhaps .001)

In [22]:
model = SVC(gamma = 0.001)

model.fit(x_train, y_train)

SVC(C=1.0, cache_size=200, class_weight=None, coef0=0.0,
  decision_function_shape='ovr', degree=3, gamma=0.001, kernel='rbf',
  max_iter=-1, probability=False, random_state=None, shrinking=True,
  tol=0.001, verbose=False)

#### Print accuracy of training data and test data

In [23]:
y_train_predict = model.predict(x_train)
print(accuracy_score(y_train, y_train_predict))

y_test_predict = model.predict(x_test)
print(accuracy_score(y_test, y_test_predict))

0.8402777777777778
0.7552083333333334


##### Explain the accuracy results below. 

Your answer here <br>
Too high of a gamma may result in a sort of overfitting. Relatively lower gamma is more accurate for test, although less accurate for training.

## GridSearchCV

There are several parameters to tune. Instead of tuning the parameters one by one, GridSearchCV does an exhaustive search over provided parameters. <br><br>

###Use gamma, C and decision_function_shape as parameters and GridSearchCV to find the best parameters with kernel='rbf'<br>
Don't know what decision_function_shape is, look at the SVC documentation <br>
Don't know how to use GridSearchCV, google it!<br><br>

In [25]:
from sklearn import svm, grid_search
def svc_param_selection(X, y):
    Cs = [0.001, 0.01, 0.1, 1, 10]
    gammas = [0.001, 0.01, 0.1, 1]
    param_grid = {'C': Cs, 'gamma' : gammas}
    grid_search = GridSearchCV(svm.SVC(kernel='rbf'), param_grid)
    grid_search.fit(X, y)
    grid_search.best_params_
    return grid_search.best_params_

svc_param_selection(x_train, y_train)

{'C': 1, 'gamma': 0.001}

### Using the optimal parameters you found, print the accuracy

In [30]:
model = SVC(C = 1, gamma = 0.001)
model.fit(x_train, y_train)

y_train_predict = model.predict(x_train)
print("training:", accuracy_score(y_train, y_train_predict))

y_test_predict = model.predict(x_test)
print("test:", accuracy_score(y_test, y_test_predict))

training: 0.8402777777777778
test: 0.7552083333333334


### Create an ensemble that includes svm and random forest (use your code from the decision trees notebook)
### Use predict_proba to get probabilities and decide a method to comebine the predictions 

In [40]:
from sklearn.ensemble import VotingClassifier

def random_forest_param_selection(X, y):
    n_estimators = [10,20,30,40,50,60,70,80,90,100]
    max_depths = [3,4,5,6,7,8,9,10]
    param_grid = {'n_estimators': n_estimators, 'max_depth' : max_depths}
    grid_search = GridSearchCV(RandomForestClassifier(), param_grid)
    grid_search.fit(X, y)
    grid_search.best_params_
    return grid_search.best_params_

#random_forest_param_selection(x_train, y_train)
#{'max_depth': 4, 'n_estimators': 40}

model2 = RandomForestClassifier(max_depth = 4, n_estimators = 40)
model2.fit(x_train, y_train)

#y_train_predict2 = model2.predict(x_train)
#print("training:", accuracy_score(y_train, y_train_predict2))
#y_test_predict2 = model2.predict(x_test)
#print("test:", accuracy_score(y_test, y_test_predict2))

ensemble = VotingClassifier([('svc', model), ('rf', model2)])
ensemble.fit(x_train, y_train)

y_test_predict3 = ensemble.predict(x_test)
print("test:", accuracy_score(y_test, y_test_predict3))

test: 0.7604166666666666


  if diff:
