In [None]:
from sklearn.datasets import load_iris
from sklearn.model_selection import train_test_split
from sklearn.neighbors import KNeighborsClassifier
from sklearn import metrics

In [None]:
# Load the iris dataset object.
iris = load_iris()

# Feature matrix goes in X, the response (target labels) in y.
X, y = iris.data, iris.target

In [None]:
# Split into train and test portions; the fixed random_state keeps the split repeatable.
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=4)

# Fit a KNN classifier with K=5 on the training portion only.
knn = KNeighborsClassifier(n_neighbors=5).fit(X_train, y_train)

# Predict the held-out observations and compare against their true labels.
y_pred = knn.predict(X_test)
accuracy = metrics.accuracy_score(y_true=y_test, y_pred=y_pred)
print(accuracy)

### K-fold cross-validation

In [None]:
# Illustrate how a 25-observation dataset is divided into 5 folds (no shuffling).
from sklearn.model_selection import KFold

fold_splitter = KFold(n_splits=5, shuffle=False)
kf = fold_splitter.split(range(25))

# One header row, then one row per fold listing its training and testing indices.
header_template = '{} {:^61} {}'
row_template = '{:^9} {} {:^25}'
print(header_template.format('Iteration', 'Training set observations', 'Testing set observations'))
for iteration, (train_idx, test_idx) in enumerate(kf, start=1):
    print(row_template.format(iteration, train_idx, str(test_idx)))

In [None]:
from sklearn.model_selection import cross_val_score
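# cross_val_score builds the folds internally: with an integer cv it uses
# StratifiedKFold for classifiers and plain KFold for other estimators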

In [None]:
# Evaluate KNN (K=5) with 10-fold cross-validation; prints one accuracy per fold.
knn = KNeighborsClassifier(n_neighbors=5)
scores = cross_val_score(estimator=knn, X=X, y=y, cv=10, scoring='accuracy')
print(scores)

In [None]:
# Collapse the per-fold accuracies into a single average.
mean_accuracy = scores.mean()
print(mean_accuracy)

### Searching for the optimal value of K for KNN

In [None]:
# Candidate neighbour counts to evaluate: K = 1 through 30.
k_range = range(1, 31)

# Mean 10-fold cross-validated accuracy for each K, in the same order as k_range.
# Built as a comprehension so the search does not overwrite the notebook-level
# `knn` and `scores` variables holding the K=5 results; re-running a cell that
# reads `scores` after this one still reports the K=5 numbers.
k_scores = [
    cross_val_score(
        KNeighborsClassifier(n_neighbors=k), X, y, cv=10, scoring='accuracy'
    ).mean()
    for k in k_range
]

print(k_scores)

In [None]:
import matplotlib.pyplot as plt
%matplotlib inline

plt.plot(k_range, k_scores)
plt.xlabel('Value of K for KNN')
plt.ylabel('Cross-validated Accuracy')

### Cross-validation - model selection

In [None]:
# Re-score the chosen KNN configuration (K=20) with 10-fold cross-validation.
# NOTE(review): K=20 is hard-coded; presumably picked from the accuracy-vs-K results — confirm.
knn = KNeighborsClassifier(n_neighbors=20)
knn_cv_scores = cross_val_score(knn, X, y, cv=10, scoring='accuracy')
print(knn_cv_scores.mean())

In [None]:
# Score logistic regression under the same 10-fold protocol, for comparison with KNN.
# NOTE(review): the model is built with library defaults; check the run for
# convergence warnings on these unscaled features.
from sklearn.linear_model import LogisticRegression

logReg = LogisticRegression()
logreg_cv_scores = cross_val_score(logReg, X, y, cv=10, scoring='accuracy')
print(logreg_cv_scores.mean())

### Cross-validation - feature selection

In [None]:
import pandas as pd
import numpy as np
from sklearn.linear_model import LinearRegression

In [None]:
# Advertising dataset that accompanies the ISL textbook.
# The original www-bcf.usc.edu host has been retired, so the file is loaded from
# the book's current site instead. The first CSV column is used as the row index.
data = pd.read_csv('https://www.statlearning.com/s/Advertising.csv', index_col=0)

# Fail early with a clear message if the file lacks the columns the later cells select.
expected_cols = {'TV', 'radio', 'newspaper', 'sales'}
missing_cols = expected_cols - set(data.columns)
assert not missing_cols, 'Advertising data is missing columns: {}'.format(sorted(missing_cols))

In [None]:
# Use the three listed columns as features and the 'sales' column as the response.
feature_cols = ['TV', 'radio', 'newspaper']
X, y = data[feature_cols], data['sales']

In [None]:
# Score linear regression on all three features with 10-fold cross-validation.
# The 'neg_mean_squared_error' scorer reports negated MSE, one value per fold.
lm = LinearRegression()
scores = cross_val_score(estimator=lm, X=X, y=y, cv=10, scoring='neg_mean_squared_error')
print(scores)

In [None]:
# Negate the scorer output to recover the per-fold MSE values.
mse_scores = np.negative(scores)
print(mse_scores)

In [None]:
# Take the element-wise square root of the per-fold MSE values to get per-fold RMSE
rmse_scores = np.sqrt(mse_scores)
print(rmse_scores)

In [None]:
# Average the per-fold RMSE values into one summary number.
print(np.mean(rmse_scores))

In [None]:
# Repeat the 10-fold cross-validation using only TV and radio (newspaper excluded).
feature_cols = ['TV', 'radio']
X = data[feature_cols]

# Negated per-fold MSE from the scorer -> per-fold RMSE -> mean RMSE.
neg_mse_two_features = cross_val_score(lm, X, y, cv=10, scoring='neg_mean_squared_error')
rmse_two_features = np.sqrt(-neg_mse_two_features)
print(rmse_two_features.mean())