### Model Selection and Validation

In [None]:
from sklearn.datasets import load_iris

# Load the iris dataset bundled with scikit-learn; `iris.data` and
# `iris.target` are read by the next cell.
iris = load_iris()


In [None]:
# Feature matrix and class labels used by the classification examples below.
X, y = iris.data, iris.target

Let's now take a simple classifier, e.g. k-nearest neighbours, and use it to explain the various approaches to model validation.

In [None]:
from sklearn.neighbors import KNeighborsClassifier

# k-nearest-neighbours classifier with k=1: each prediction is the label of
# the single closest training sample. This one estimator object is re-fitted
# by every classification cell that follows.
model = KNeighborsClassifier(n_neighbors=1)


#### 1. Model Validation - Naive Approach

In [None]:
# Naive approach: fit on the full dataset, then predict those very same samples.
y_model = model.fit(X, y).predict(X)

In [None]:
from sklearn.metrics import accuracy_score

# Fraction of samples whose predicted label matches the true label.
accuracy_score(y_true=y, y_pred=y_model)


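We see an accuracy score of 1.0, but this exposes a fundamental flaw of the naive approach:\
<i>the model is trained and evaluated on the same data. A 1-nearest-neighbour model simply memorises its training points, so on that data it will score (close to) 100% no matter how well it generalises.</i>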

#### 2. Model Validation the right way : Use of holdout sets

In [None]:
# Let's create a holdout set
from sklearn.model_selection import train_test_split

# Split the data 50/50; the fixed seed keeps the split reproducible.
X1, X2, y1, y2 = train_test_split(X, y, train_size=0.5, random_state=0)

# Train on the first half only, then score on the unseen second half.
y2_model = model.fit(X1, y1).predict(X2)
accuracy_score(y_true=y2, y_pred=y2_model)


Using a holdout set gives a more realistic estimate of the model's accuracy, as seen above.

#### 3. Model Validation via cross-validation

##### A. Two-fold cross validation 

In [None]:
# Fold 1: train on the first half, predict the second half.
model.fit(X1, y1)
y2_model = model.predict(X2)

# Fold 2: swap the roles of the two halves.
model.fit(X2, y2)
y1_model = model.predict(X1)

# One accuracy value per fold.
(
    accuracy_score(y_true=y1, y_pred=y1_model),
    accuracy_score(y_true=y2, y_pred=y2_model),
)

##### B. n-fold cross-validation, e.g. n=5

In [None]:
from sklearn.model_selection import cross_val_score

# One score per fold, using 5 folds.
scores = cross_val_score(estimator=model, X=X, y=y, cv=5)

# Average score across the folds.
scores.mean()


#### 4. Validation curves

In [None]:
from sklearn.preprocessing import PolynomialFeatures
from sklearn.linear_model import LinearRegression
from sklearn.pipeline import make_pipeline


def polynomial_regression(degree=2, **kwargs):
    """Build a polynomial-regression pipeline.

    The pipeline chains ``PolynomialFeatures(degree)`` with
    ``LinearRegression(**kwargs)``.

    Parameters
    ----------
    degree : int, default 2
        Degree passed to ``PolynomialFeatures``.
    **kwargs
        Forwarded unchanged to ``LinearRegression``.

    Returns
    -------
    The pipeline object returned by ``make_pipeline`` (not yet fitted).
    """
    feature_expansion = PolynomialFeatures(degree)
    linear_model = LinearRegression(**kwargs)
    return make_pipeline(feature_expansion, linear_model)

Creating sample data

In [None]:
import numpy as np


def make_data(N, err=1.0, rseed=1):
    """Generate a reproducible one-feature regression dataset.

    Parameters
    ----------
    N : int
        Number of samples to draw.
    err : float, default 1.0
        Scale of the Gaussian noise added to the targets. Noise is only
        drawn when ``err > 0``.
    rseed : int, default 1
        Seed for the ``numpy.random.RandomState`` generator.

    Returns
    -------
    X : ndarray of shape (N, 1)
        Squared uniform draws from [0, 1).
    y : ndarray of shape (N,)
        ``10 - 1 / (x + 0.1)`` evaluated on the feature, plus optional noise.
    """
    generator = np.random.RandomState(rseed)

    # Features are drawn first so the random stream order stays fixed.
    X = generator.rand(N, 1) ** 2
    y = 10 - 1.0 / (X.ravel() + 0.1)

    # Without a positive noise scale, return the clean targets as they are.
    if not err > 0:
        return X, y

    y += err * generator.randn(N)
    return X, y


# Small dataset (40 samples) used by the first validation-curve example.
X, y = make_data(N=40)

In [None]:
# Peek at the first five rows of the feature matrix.
X[:5]


Visualizing our data

In [None]:
import seaborn as sns
import matplotlib.pyplot as plt
%matplotlib inline
sns.set()  # plot formatting

# Dense evaluation grid as a column vector (500 x 1), extending slightly
# beyond the [0, 1) range of the sampled feature so the curves can be seen
# leaving the data.
X_test = np.linspace(-0.1, 1.1, 500)[:, None]

plt.scatter(X.ravel(), y, color='black')

# Overlay fits of increasing complexity on the same axes.
for degree in [1, 3, 5]:
    y_test = polynomial_regression(degree).fit(X, y).predict(X_test)
    plt.plot(X_test.ravel(), y_test, label='degree={0}'.format(degree))

# Fixed limits keep wild high-degree extrapolation from rescaling the view.
plt.xlim(-0.1, 1.0)
plt.ylim(-2, 12)
plt.xlabel('x')
plt.ylabel('y')
# The trailing semicolon stops the Legend repr being echoed as cell output.
plt.legend(loc='best');


Since polynomials of different degrees fit the same data very differently, we can make progress by visualizing the validation curve\
for this data and model.

In [None]:
# Tip: once `validation_curve` is imported (next cell), run `validation_curve?` to view its docstring.


In [None]:
from sklearn.model_selection import validation_curve

# Candidate polynomial degrees: 0 through 20.
degree = np.arange(21)

# Score every candidate degree with 7-fold cross-validation. Each returned
# array has one row per degree and one column per fold.
train_score, val_score = validation_curve(
    polynomial_regression(),
    X,
    y,
    param_name="polynomialfeatures__degree",
    param_range=degree,
    cv=7,
)

# Summarise each degree by the median score across folds.
plt.plot(degree, np.median(train_score, axis=1), color="blue", label="training score")
plt.plot(degree, np.median(val_score, axis=1), color="red", label="validation score")

plt.legend(loc="best")
plt.ylim(0, 1)
plt.xlabel("degree")
plt.ylabel("score")


From the validation curve above, we see that the optimum trade-off is at degree = 3, i.e. a third-order polynomial.

In [None]:
plt.scatter(X.ravel(), y)

# Remember the limits chosen for the scatter so they can be restored after
# the fitted curve is drawn.
lim = plt.axis()

cubic_model = polynomial_regression(3).fit(X, y)
y_test = cubic_model.predict(X_test)
plt.plot(X_test.ravel(), y_test)

plt.axis(lim)

#### 5. Learning Curves

In [None]:
# Larger sample (200 points) from the same generator.
# NOTE: this rebinds X2 / y2, which earlier held the iris hold-out split.
X2, y2 = make_data(N=200)
plt.scatter(X2.ravel(), y2)


In [None]:
# Same candidate degrees as for the small dataset: 0 through 20.
degree = np.arange(0, 21)

# Repeat the 7-fold validation curve on the 200-sample dataset.
train_score2, val_score2 = validation_curve(
    polynomial_regression(),
    X2,
    y2,
    param_name="polynomialfeatures__degree",
    param_range=degree,
    cv=7,
)

# Solid lines: median scores for the larger dataset.
plt.plot(degree, np.median(train_score2, axis=1), color="blue", label="training score")
plt.plot(degree, np.median(val_score2, axis=1), color="red", label="validation score")

# Faint dashed lines: scores computed earlier for the smaller dataset.
plt.plot(degree, np.median(train_score, axis=1), color="blue", alpha=0.3, linestyle="dashed")
plt.plot(degree, np.median(val_score, axis=1), color="red", alpha=0.3, linestyle="dashed")

plt.legend(loc="lower center")
plt.ylim(0, 1)
plt.xlabel("degree")
plt.ylabel("score")


In the validation curve above, the solid lines show the new results, while the fainter dashed lines show the results for the previous, smaller dataset.\
To compute a learning curve, use scikit-learn's `learning_curve()` function.

In [None]:
from sklearn.model_selection import learning_curve

fig, ax = plt.subplots(nrows=1, ncols=2, figsize=(16, 6))
fig.subplots_adjust(left=0.0625, right=0.95, wspace=0.1)

# One panel per model complexity; both are scored on the 40-sample X, y.
for i, degree in enumerate([2, 9]):
    panel = ax[i]

    # 25 training-set sizes from 30% to 100% of the available training data,
    # each evaluated with 7-fold cross-validation.
    N, train_lc, val_lc = learning_curve(
        polynomial_regression(degree),
        X,
        y,
        cv=7,
        train_sizes=np.linspace(0.3, 1, 25),
    )

    # Mean score across folds for each training-set size.
    panel.plot(N, np.mean(train_lc, axis=1), color="blue", label="training score")
    panel.plot(N, np.mean(val_lc, axis=1), color="red", label="validation score")

    # Dashed reference line at the average of the training and validation
    # scores obtained with the largest training set.
    final_level = np.mean([train_lc[-1], val_lc[-1]])
    panel.hlines(final_level, N[0], N[-1], color="gray", linestyle="dashed")

    panel.set_ylim(0, 1)
    panel.set_xlim(N[0], N[-1])
    panel.set_xlabel("training size")
    panel.set_ylabel("score")
    panel.set_title("degree = {0}".format(degree), size=14)
    panel.legend(loc="best")

The learning curves above are for a low-complexity model (left) and a high-complexity model (right).

#### 6. Validation in Practice: Grid Search