# CROSS VALIDATION & GRID SEARCH

## 0. import lib

In [12]:
import numpy as np
import matplotlib.pyplot as plt
from sklearn.datasets import load_iris
from sklearn.model_selection import train_test_split

# draw plots inline
%matplotlib inline

## 1. load data

In [13]:
# Load the iris dataset and pull out the feature matrix and label vector.
iris = load_iris()
X, y = iris.data, iris.target

# Hold out a quarter of the samples for testing; random_state pins the
# shuffle so the split is the same on every run.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=1/4,
    random_state=0,
)

# Report the size of the full set and of each split.
print('sample：{}，train：{}，test：{}'.format(len(X), len(X_train), len(X_test)))

sample：150，train：112，test：38


## 2. cross validate

In [17]:
from sklearn.neighbors import KNeighborsClassifier
from sklearn.model_selection import cross_val_score

# Candidate neighbor counts, and the mean CV accuracy collected for each
# (cv_scores[i] corresponds to k_range[i]).
k_range = [1, 5, 9, 15]
cv_scores = []

for k in k_range:
    knn = KNeighborsClassifier(n_neighbors=k)
    # 5-fold cross-validation on the training split only; X_test / y_test
    # are not touched in this loop.
    scores = cross_val_score(knn, X_train, y_train, cv=5)
    cv_score = scores.mean()
    cv_scores.append(cv_score)
    # NOTE(review): the label says "test", but the value printed is the
    # mean accuracy over the validation folds of the training data.
    print('k={}，acc on test={:.3f}'.format(k, cv_score))

k=1，acc on test=0.947
k=5，acc on test=0.955
k=9，acc on test=0.964
k=15，acc on test=0.964


In [18]:
# Pick the k with the highest mean CV accuracy; np.argmax returns the first
# maximum, so on a tie the earliest entry of k_range wins.
best_idx = np.argmax(cv_scores)
best_k = k_range[best_idx]

# Refit on the whole training split with the chosen k (fit returns the
# estimator itself), then evaluate once on the held-out test split.
best_knn = KNeighborsClassifier(n_neighbors=best_k).fit(X_train, y_train)
test_acc = best_knn.score(X_test, y_test)
print('acc on test：', test_acc)

acc on test： 0.9736842105263158


## 3. Grid Search

In [19]:
from sklearn.model_selection import GridSearchCV
from sklearn.tree import DecisionTreeClassifier

# Hyper-parameter grid: every max_depth / min_samples_leaf pair
# (4 x 4 = 16 candidates) is scored with 5-fold CV accuracy.
parameters = {
    'max_depth': [3, 5, 7, 9],
    'min_samples_leaf': [1, 2, 3, 4],
}

# NOTE(review): the tree is created without a random_state, so the selected
# parameters may not be identical from run to run — confirm whether a fixed
# seed is wanted here.
clf = GridSearchCV(
    estimator=DecisionTreeClassifier(),
    param_grid=parameters,
    cv=5,
    scoring='accuracy',
)

# Search on the training split only; left as the last expression so the
# cell displays the fitted search object.
clf.fit(X_train, y_train)

GridSearchCV(cv=5, error_score='raise',
       estimator=DecisionTreeClassifier(class_weight=None, criterion='gini', max_depth=None,
            max_features=None, max_leaf_nodes=None,
            min_impurity_decrease=0.0, min_impurity_split=None,
            min_samples_leaf=1, min_samples_split=2,
            min_weight_fraction_leaf=0.0, presort=False, random_state=None,
            splitter='best'),
       fit_params=None, iid=True, n_jobs=1,
       param_grid={'max_depth': [3, 5, 7, 9], 'min_samples_leaf': [1, 2, 3, 4]},
       pre_dispatch='2*n_jobs', refit=True, return_train_score=True,
       scoring='accuracy', verbose=0)

In [20]:
# Parameter combination that achieved the best mean cross-validated accuracy.
best_params = clf.best_params_
# NOTE(review): best_score_ is the mean cross-validated score of that
# combination (from the training folds), not a score on X_test, even though
# the label below says "test".
best_cv_score = clf.best_score_

print('optimal param：', best_params)
print('highest test score：', best_cv_score)

optimal param： {'max_depth': 5, 'min_samples_leaf': 2}
highest test score： 0.9553571428571429


In [22]:
# Optimal model: the estimator that the grid search refit with the best
# parameter combination it found.
best_model = clf.best_estimator_

# Evaluate once on the held-out test split. The value is plain accuracy on
# X_test, so it is labelled the same way as the KNN test accuracy above
# rather than as a "highest" score.
print('acc on test：', best_model.score(X_test, y_test))

acc on test： 0.9473684210526315


## 4. Model persistence

### 4.1 pickle

In [23]:
# Persist the tuned model with the standard-library pickle module.
import pickle

# Destination file, relative to the notebook's working directory.
model_path1 = './trained_model1.pkl'

# Serialize to disk; binary mode is required for pickle data, and the
# with-block closes the file even if dumping fails.
with open(model_path1, 'wb') as model_file:
    pickle.dump(best_model, model_file)

In [25]:
# Restore the model from the pickle file written by the previous cell.
# Unpickling can execute arbitrary code, so only load files you trust.
with open(model_path1, 'rb') as model_file:
    model = pickle.load(model_file)

# Sanity check: predict the first test sample (wrapped in a list because
# predict expects a 2-D batch) and compare with its true label.
first_sample = X_test[0, :]
print('pred', model.predict([first_sample]))
print('actual', y_test[0])

pred [2]
actual 2


### 4.2 use joblib

In [26]:
# use joblib
# `sklearn.externals.joblib` was deprecated in scikit-learn 0.21 and removed
# in 0.23, so prefer the standalone joblib package and fall back to the
# vendored copy only on old scikit-learn installs that lack it.
try:
    import joblib
except ImportError:
    from sklearn.externals import joblib

# save to drive
model_path2 = './trained_model2.pkl'
# Left as the last expression so the cell displays the list of written files.
joblib.dump(best_model, model_path2)

['./trained_model2.pkl']

In [28]:
# Restore the model from the joblib file written by the previous cell.
model = joblib.load(model_path2)

# Sanity check: predict the first test sample (wrapped in a list because
# predict expects a 2-D batch) and compare with its true label.
first_sample = X_test[0, :]
print('pred', model.predict([first_sample]))
print('actual', y_test[0])

pred [2]
actual 2
