## Training and Testing
#### Clean Data: Ready to Model

In [1]:
# data analysis and wrangling
import pandas as pd
import numpy as np
import random as rnd

# visualization
import seaborn as sns
import matplotlib.pyplot as plt
%matplotlib inline

# machine learning
from sklearn.neighbors import KNeighborsClassifier

In [32]:
from sklearn.datasets import load_iris

iris = load_iris()

# create X (features) and y (response)
data = pd.DataFrame(iris.data)

In [33]:
data['y'] = pd.Series(list(iris.target), index=data.index)
data.head()

Unnamed: 0,0,1,2,3,y
0,5.1,3.5,1.4,0.2,0
1,4.9,3.0,1.4,0.2,0
2,4.7,3.2,1.3,0.2,0
3,4.6,3.1,1.5,0.2,0
4,5.0,3.6,1.4,0.2,0


In [51]:
data.iloc[:5,:4]

Unnamed: 0,0,1,2,3
0,5.1,3.5,1.4,0.2
1,4.9,3.0,1.4,0.2
2,4.7,3.2,1.3,0.2
3,4.6,3.1,1.5,0.2
4,5.0,3.6,1.4,0.2


## Single Train/Test Split

In [None]:
from sklearn.model_selection import train_test_split

# use train/test split with different random_state values
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=4)
X_train.shape, y_train.shape, X_test.shape, y_test.shape

## Train/Validate/Test

In [34]:
def train_validate_test_split(df, train_percent=.6, validate_percent=.2, seed=None):
    np.random.seed(seed)
    perm = np.random.permutation(df.index)
    m = len(df)
    train_end = int(train_percent * m)
    validate_end = int(validate_percent * m) + train_end
    train = df.ix[perm[:train_end]]
    validate = df.ix[perm[train_end:validate_end]]
    test = df.ix[perm[validate_end:]]
    return train, validate, test

In [36]:
train, validate, test = train_validate_test_split(data)

.ix is deprecated. Please use
.loc for label based indexing or
.iloc for positional indexing

See the documentation here:
http://pandas.pydata.org/pandas-docs/stable/indexing.html#deprecate_ix
  import sys


In [42]:
print(train.shape)
print(validate.shape)
print(test.shape)

(90, 5)
(30, 5)
(30, 5)


# K-Nearest Neighbor Model

### K- Nearest Neighbors- Training Error

In [55]:
from sklearn.linear_model import LogisticRegression
from sklearn.cross_validation import cross_val_score
from sklearn import metrics
from sklearn import datasets



In [56]:
#KNN with K=1
knn = KNeighborsClassifier(n_neighbors=1)
print(knn.fit(data.iloc[:,:4], data['y']))
y_pred = knn.predict(data.iloc[:,:4])
metrics.accuracy_score(data['y'], y_pred)

KNeighborsClassifier(algorithm='auto', leaf_size=30, metric='minkowski',
           metric_params=None, n_jobs=1, n_neighbors=1, p=2,
           weights='uniform')


1.0

In [57]:
#KNN with K=20
knn = KNeighborsClassifier(n_neighbors=20)
print(knn.fit(data.iloc[:,:4], data['y']))
y_pred = knn.predict(data.iloc[:,:4])
metrics.accuracy_score(data['y'], y_pred)

KNeighborsClassifier(algorithm='auto', leaf_size=30, metric='minkowski',
           metric_params=None, n_jobs=1, n_neighbors=20, p=2,
           weights='uniform')


0.97999999999999998

## Train vs Test Error

In [None]:
# search for an optimal value of K for KNN
k_range = list(range(1, 30))
k_scores = []
k_test_scores = []
for k in k_range:
    knn = KNeighborsClassifier(n_neighbors=k)
    knn.fit(X_train, y_train)
    y_pred = knn.predict(X_train)
    k_scores.append(metrics.accuracy_score(y_train, y_pred))
    y_test_pred = knn.predict(X_test)
    k_test_scores.append(metrics.accuracy_score(y_test, y_test_pred))
    

df = pd.DataFrame(k_scores, index=k_range, columns=["Scores"])
df1 = pd.DataFrame(k_test_scores, index=k_range, columns=["Scores"])
print(df['Scores'].argmax())
print(df1['Scores'].argmax())

In [None]:
import matplotlib.pyplot as plt
%matplotlib inline

# plot the value of K for KNN (x-axis) versus the cross-validated accuracy (y-axis)
plt.plot(k_range, k_scores, label="Train")
plt.plot(k_range, k_test_scores, label="Train")
plt.xlabel('Value of K for KNN')
plt.ylabel('Accuracy')
plt.legend()

### Cross Validation
Counter Overfitting

In [None]:
knn = KNeighborsClassifier(n_neighbors=6)
print(cross_val_score(knn, X, y, cv=10, scoring='accuracy').mean())

### K- Parameter Tuning + Cross-Validation
Bias-Variance Tradeoff

In [None]:
from sklearn.cross_validation import cross_val_score

In [None]:
# search for an optimal value of K for KNN
k_range = list(range(1, 31))
k_scores = []
for k in k_range:
    knn = KNeighborsClassifier(n_neighbors=k)
    scores = cross_val_score(knn, X, y, cv=10, scoring='accuracy')
    k_scores.append(scores.mean())
df = pd.DataFrame(k_scores, index=k_range, columns=["Scores"])
print(df)
df['Scores'].argmax()

In [None]:
import matplotlib.pyplot as plt
%matplotlib inline

# plot the value of K for KNN (x-axis) versus the cross-validated accuracy (y-axis)
plt.plot(k_range, k_scores)
plt.xlabel('Value of K for KNN')
plt.ylabel('Cross-Validated Accuracy')

## Further Resources
- Dataschool Video and Code: https://github.com/justmarkham/scikit-learn-videos
- Data Science Handbook (O'Reilly): https://github.com/jakevdp/PythonDataScienceHandbook


- Link to PPT: https://docs.google.com/presentation/d/1GKVpWJvuwOtCaWW6X_Mfe0x_4dyPE5KnuGB5_NnOTYo/edit?usp=sharing
