# Using Scikit-Learn to identify the line of business of a loss triangle
We would like to know whether we can use Machine Learning to predict the line of business of a triangle from just the triangle itself.  We will download the *CAS loss reserve database* for Medical Malpractice, Private Passenger Auto, and Workers' Compensation. 

In this notebook we will explore some of the tooling available in scikit-learn to do this, and as we learn about `sklearn` we will simultanously answer this question.

But first, let's use `pandas` to grab the data of the 19 largest carriers for each line of business, and group the remaining carriers as 'Other'.

In [1]:
import pandas as pd
import numpy as np

## Original

In [None]:
# original 
data_url = 'https://www.casact.org/research/reserve_data'
# Read in the data
lobs = ['medmal','ppauto','wkcomp']
data = pd.DataFrame()
columns = ['GRCODE','GRNAME','AccidentYear','DevelopmentYear','DevelopmentLag'
           ,'IncurLoss', 'CumPaidLoss','BulkLoss','EarnedPremDIR'
           ,'EarnedPremCeded','EarnedPremNet', 'Single','PostedReserve97']
for lob in lobs:
    file_url = f'{data_url}/{lob}_pos.csv'
    subset = pd.read_csv(file_url, names=columns, skiprows=1)
    subset['LOB'] = lob
    data = data.append(subset, sort=True)
data = data[data['DevelopmentYear']<=1997].reset_index()


In [5]:
# original
def make_trg(data):
    # Find largest 20 companies by premium size for each LOB
    aggregates = (data[data['DevelopmentYear']==1997].groupby(['LOB','GRNAME']) \
                                                 .sum()['IncurLoss']) \
                                                 .reset_index()
    top_20_by_lob = aggregates.iloc[aggregates.groupby('LOB')['IncurLoss'] \
                              .nlargest(19).index.levels[1]]
    data2 = data.merge(top_20_by_lob, how='left', on=['LOB','GRNAME'])
    data2.loc[data2.iloc[:,-1].isna(),'GRNAME'] = 'Other'
    
    # Create Triangles
    triangles = pd.pivot_table(data2, index=['GRNAME','LOB','AccidentYear'],
                               columns='DevelopmentLag', values='CumPaidLoss',
                               aggfunc='sum')
    
    # Determine LDF Weights
    weight = np.array(~triangles.iloc[:,1:].isna())
    columns = [f'{triangles.columns[num]}-{triangles.columns[num+1]}'
               for num, item in enumerate(triangles.columns[:-1])]

    # Volume-weighted numerator and demoninator
    numerator = (
        (triangles.iloc[:,1:]).reset_index() 
                                     .drop('AccidentYear',axis=1)
                                     .groupby(['GRNAME','LOB'])
                                     .sum(axis=0))
    denominator = (
        (weight*triangles.iloc[:,:-1]).reset_index()
                                      .drop('AccidentYear',axis=1)
                                      .groupby(['GRNAME','LOB'])
                                      .sum(axis=0))
    numerator.columns = denominator.columns = columns

    # Development Patterns
    ldf = (numerator/denominator).fillna(1.0)
    
    return ldf
    

In [6]:
%timeit ldf3 = make_trg(data)

121 ms ± 10.2 ms per loop (mean ± std. dev. of 7 runs, 1 loop each)


In [29]:
# alternatives, including original
def make_trg_2(data):
    '''
    see _alt that this method is fastest
    '''
    aggregates2 = data.query(' DevelopmentYear ==  1997 ').groupby(['LOB','GRNAME'])['IncurLoss'].sum() 
    top_20_by_lob = aggregates2.groupby(level='LOB').apply(lambda x : x.nlargest(19).reset_index(level=0, drop=True))
    
    data_alt2 = data.merge(top_20_by_lob.to_frame(), how='left', left_on=['LOB','GRNAME'], right_index=True)
    data_alt2.loc[data_alt2.loc[:,'IncurLoss_y'].isna(), 'GRNAME'] = 'Other'
    
    # create triangles 
    triangles = pd.pivot_table(data_alt2, index=['GRNAME','LOB','AccidentYear'],
                           columns='DevelopmentLag', values='CumPaidLoss')
    
    # Determine LDF Weights ORIG
    w = pd.DataFrame(np.array([[1 if i+j<9 else 0 for i in range(9)] for j in range(10)]))
    weight = np.tile(w, (int(triangles.shape[0]/10), 1))
    columns = [f'{triangles.columns[num]}-{triangles.columns[num+1]}'
               for num, item in enumerate(triangles.columns[:-1])]

    # Volume-weighted numerator and demoninator mask for denom only; values on num because want index from num 
    ldf = (triangles.iloc[:,1:].groupby(level=['GRNAME','LOB']).sum().values / \
           (weight*triangles.iloc[:,:-1]).groupby(level=['GRNAME','LOB']).sum()).fillna(1.0) 
    return ldf

In [30]:
aa = make_trg_2(data)
np.allclose(a0, aa)



True

In [26]:
%timeit aa = make_trg_2(data)



31.6 ms ± 708 µs per loop (mean ± std. dev. of 7 runs, 10 loops each)


In [7]:
# alternatives, including original
def make_trg_alt(data, method):
        
    aggregates2 = data.query(' DevelopmentYear ==  1997 ').groupby(['LOB','GRNAME'])['IncurLoss'].sum() 
    top_20_by_lob = aggregates2.groupby(level='LOB').apply(lambda x : x.nlargest(19).reset_index(level=0, drop=True))
    
    data_alt2 = data.merge(top_20_by_lob.to_frame(), how='left', left_on=['LOB','GRNAME'], right_index=True)
    data_alt2.loc[data_alt2.loc[:,'IncurLoss_y'].isna(), 'GRNAME'] = 'Other'
    
    # create triangles 
    triangles = pd.pivot_table(data_alt2, index=['GRNAME','LOB','AccidentYear'],
                           columns='DevelopmentLag', values='CumPaidLoss')
    
    # Determine LDF Weights ORIG
#     weight = np.array(~triangles.iloc[:,1:].isna())
    w = pd.DataFrame(np.array([[1 if i+j<9 else 0 for i in range(9)] for j in range(10)]))
    weight = np.tile(w, (int(triangles.shape[0]/10), 1))
    columns = [f'{triangles.columns[num]}-{triangles.columns[num+1]}'
               for num, item in enumerate(triangles.columns[:-1])]

    # Volume-weighted numerator and demoninator; do not need mask here... 
    if method=='original':
        numerator = (
            (triangles.iloc[:,1:]).reset_index() 
                                         .drop('AccidentYear',axis=1)
                                         .groupby(['GRNAME','LOB'])
                                         .sum(axis=0))
        denominator = (
            (weight*triangles.iloc[:,:-1]).reset_index()
                                          .drop('AccidentYear',axis=1)
                                          .groupby(['GRNAME','LOB'])
                                          .sum(axis=0))
    if method=='original_noax':
        numerator = (
            (triangles.iloc[:,1:]).reset_index() 
                                         .drop('AccidentYear',axis=1)
                                         .groupby(['GRNAME','LOB'])
                                         .sum())
        denominator = (
            (weight*triangles.iloc[:,:-1]).reset_index()
                                          .drop('AccidentYear',axis=1)
                                          .groupby(['GRNAME','LOB'])
                                          .sum())
    if method=='original2':
        numerator = (
            (triangles.iloc[:,1:]).reset_index() 
                                         .drop('AccidentYear',axis=1)
                                         .groupby(['GRNAME','LOB'])
                                         .apply(lambda x: np.sum(x.iloc[:, 2:], axis=0)))
        denominator = (
            (weight*triangles.iloc[:,:-1]).reset_index()
                                          .drop('AccidentYear',axis=1)
                                          .groupby(['GRNAME','LOB'])
                                          .apply(lambda x: np.sum(x.iloc[:, 2:], axis=0)))
    if method=='alt1':
        numerator = (weight*triangles.iloc[:,1:]).groupby(level=['GRNAME','LOB']).sum()
        denominator = (weight*triangles.iloc[:,:-1]).groupby(level=['GRNAME','LOB']).sum()
    if method=='alt2':    
        numerator = triangles.iloc[:,1:].groupby(level=['GRNAME','LOB']).apply(sum)
        denominator = (weight*triangles.iloc[:,:-1]).groupby(level=['GRNAME','LOB']).apply(sum)

#     print(numerator.shape, denominator.shape, len(columns))
    numerator.columns = denominator.columns = columns

    # Development Patterns
    ldf = (numerator/denominator).fillna(1.0) 
    return ldf

In [8]:
a0 = make_trg(data)
a1 = make_trg_alt(data, 'original')
a1x = make_trg_alt(data, 'original_noax')
a1s = make_trg_alt(data, 'original2')
a2 = make_trg_alt(data, 'alt1')
a3 = make_trg_alt(data, 'alt2')
np.allclose(a0, a1), np.allclose(a0, a1x),  np.allclose(a0, a2), np.allclose(a0, a3) 

  stacked_values = np.vstack(map(np.asarray, values))


(True, True, True, True)

In [9]:
%timeit a0 = make_trg(data)

153 ms ± 26.9 ms per loop (mean ± std. dev. of 7 runs, 1 loop each)


In [10]:
%timeit a1 = make_trg_alt(data, 'original')

117 ms ± 14.5 ms per loop (mean ± std. dev. of 7 runs, 10 loops each)


In [11]:
%timeit a1 = make_trg_alt(data, 'original_noax')

44.1 ms ± 9.26 ms per loop (mean ± std. dev. of 7 runs, 10 loops each)


In [12]:
%timeit a1 = make_trg_alt(data, 'alt1')

31.3 ms ± 569 µs per loop (mean ± std. dev. of 7 runs, 10 loops each)


In [13]:
%timeit a1 = make_trg_alt(data, 'alt2')

70.5 ms ± 5.77 ms per loop (mean ± std. dev. of 7 runs, 10 loops each)


In [14]:
%timeit a1 = make_trg_alt(data, 'original2')

87.4 ms ± 23 ms per loop (mean ± std. dev. of 7 runs, 10 loops each)


# Contrived example

In [None]:
tdf = pd.DataFrame(np.random.normal(size=(20000,2000)))
tdf.head()

In [None]:
tdf.groupby(lambda x: x // 5000).sum(axis=0)

In [None]:
%timeit tdf.sum(axis=0)

In [None]:
%timeit tdf.sum()

In [None]:
%timeit tdf.loc[:, 0].groupby(lambda x: x // 20000).sum(axis=0)

In [None]:
%timeit tdf.loc[:, 0:1].groupby(lambda x: x // 20000).sum(axis=0)

In [None]:
%timeit tdf.loc[:, 0:1].groupby(lambda x: x // 500).sum(axis=0)

In [None]:
%timeit tdf.loc[:, 0:4].groupby(lambda x: x // 20000).sum(axis=0)

In [None]:
%timeit tdf.loc[:, 0:9].groupby(lambda x: x // 20000).sum(axis=0)

In [None]:
%timeit tdf.loc[:, 0:19].groupby(lambda x: x // 20000).sum(axis=0)

In [None]:
%timeit tdf.loc[:, 0].groupby(lambda x: x // 20000).sum(axis=0)

In [None]:
%timeit tdf.groupby(lambda x: x // 20000).sum(axis=0)

In [None]:
%timeit tdf.loc[:, 0:1].groupby(lambda x: x // 20000).sum()

In [None]:
%timeit tdf.loc[:, 0:1].groupby(lambda x: x // 500).sum()

In [None]:
%timeit tdf.loc[:, 0:4].groupby(lambda x: x // 20000).sum()

In [None]:
%timeit tdf.loc[:, 0:9].groupby(lambda x: x // 20000).sum()

In [None]:
%timeit tdf.loc[:, 0:19].groupby(lambda x: x // 20000).sum()

In [None]:
%timeit tdf.loc[:, 0:5].groupby(lambda x: x // 20000).sum(axis=0)

In [None]:
%timeit tdf.loc[:, 0:5].groupby(lambda x: x // 20000).sum()

In [None]:
%timeit tdf.groupby(lambda x: x // 5000).apply(sum, axis=0)

In [None]:
%timeit tdf.groupby(lambda x: x // 5000).apply(sum)

In [None]:
a1 = tdf.groupby(lambda x: x // 5000).sum(axis=0)
a2 = tdf.groupby(lambda x: x // 5000).apply(sum, axis=0)
np.allclose(a1, a2)

In [None]:
import dis

In [None]:
def t1(tdf):
    tdf.groupby(lambda x: x // 5000).apply(sum, axis=0)

def t2(tdf):
    tdf.groupby(lambda x: x // 5000).apply(sum)

In [None]:
dis.dis(t2)

In [None]:
dis.dis(t1)

In [None]:
def myfun(a):
    return a

def myfunex(a, **kwargs):
    return a

def tester1(a):
    t = 0
    for i in range(a):
        t += myfun(i)

def tester2(a):
    t = 0
    for i in range(a):
        t += myfunex(i, other=12)

In [None]:
dis.dis(tester1)

In [None]:
dis.dis(tester2)

In [None]:
%timeit tester1(100)

In [None]:
%timeit tester2(100)

In [None]:
%timeit myfun(12)

In [None]:
%timeit myfunex(12, extra=144)

In [None]:
import keyword

In [None]:
keyword.kwlist

In [None]:
def count(str_in, what='aeiou '):
    return ''.join([i if i in what else '_' for i in str_in.lower() ])

In [None]:
count('Stephen Mildenhall', 'dhl ')

# Creating Loss Development Factors to be used as features in our Machine Learning Model(s).

Let's write a function that allows us to generate volume-weighted development patterns for each of company/LOB above. 

In [None]:
# Determine LDF Weights
weight = np.array(~triangles.iloc[:,1:].isna())
columns = [f'{triangles.columns[num]}-{triangles.columns[num+1]}'
           for num, item in enumerate(triangles.columns[:-1])]

# Volume-weighted numerator and demoninator
numerator = (
    (weight*triangles.iloc[:,1:]).reset_index() 
                                 .drop('AccidentYear',axis=1)
                                 .groupby(['GRNAME','LOB'])
                                 .sum(axis=0))
denominator = (
    (weight*triangles.iloc[:,:-1]).reset_index()
                                  .drop('AccidentYear',axis=1)
                                  .groupby(['GRNAME','LOB'])
                                  .sum(axis=0))
numerator.columns = denominator.columns = columns

# Development Patterns
ldf = (numerator/denominator).fillna(1.0)
ldf.head(10)

# What is Scikit-Learn?

**Machine Learning in Python**

* Simple and efficient tools for data mining and data analysis
* Accessible to everybody, and reusable in various contexts
* Built on NumPy, SciPy, and matplotlib
* Open source, commercially usable - BSD license


#### scikit-learn covers the majority of supervised and unsupervised ML techniques available today and  is continually expanding
![](https://scikit-learn.org/stable/_static/ml_map.png)

`sklearn` is the defacto standard Machine Learning API for Python.  Other libraries yield to the simplicity of its API. 

![](https://github.com/PirateGrunt/paw_rpm/blob/master/notebooks/assets/one_api.png?raw=true)

* Want to do some Keras Deep learning?  No problem, just use `keras.wrappers.scikit_learn`
* XGBoost anyone?  Use: `xgboost.sklearn`
* Don't want to learn the syntax for the Light GBM? `lightgbm.sklearn` to the rescue.
* Natural langauge processing requires unique functionality, right? Nope, `nltk.classify.scikitlearn`


##### Scikit-learn is a consistent API for all Machine Learning Algorithms

Estimators are the building block of scikit-learn.  Almost everything is an estimator.  All estimators have `fit()` methods. Most have either a `predict()` or `transform()` method. Supervised techniques generally have a `score()` method as well.

The basic ML workflow looks like this:
```python
from sklearn.EstimatorFamily import Estimator
est = Estimator(hyperparameter_1, ... ,hyperparameter_n) # Create a model
est.fit(X_train, y_train) # Fit the model
est.score(X_test, y_test) # Evaluate model efficacy
est.predict(X_test) # Create predictions
```

##### Importing your estimators
`from sklearn.EstimatorFamily import Estimator` is typically how you'd import an estimator.  Some examples are:
``` python
from sklearn.linear_model import RidgeRegression
from sklearn.ensemble import RandomForestRegressor, GradientBoostingClassifier
from sklearn.naive_bayes import GaussianNB
from sklearn.cluster import KMeans
```

In [None]:
# Exercise - Import the support vector classifier and a k-neighbors classifier
# from sklearn
from sklearn.svm import SVC
from sklearn.neighbors import KNeighborsClassifier

##### Hyperparameters of your estimarors (Controlling how your estimator fits)
Instantiating an estimator typically looks like `est = Estimator(hyperparameter_1, ... ,hyperparameter_n)`.
Upon instantition you have the *option* of setting hyperparameters (i.e. parameters whose values are set before the learning process).  All hyperparameters have defaults that may or may not be satisfactory for your particular problem.

Exmaples of setting initial hyperparameters on an estimator:
```python
rr = RidgeRegression(alpha=0.5, fit_intercept=False, normalize=True)
knc = KNeighborsClassifier(n_neighbors=10)
gbc = GradientBoostingClassifier()
```

In [None]:
# Exercise - Override the SVC hyperparameters such that it uses a kernel type
# of a second degree polynomial
SVC(kernel='poly', degree=2)

##### Transformers - a special kind of estimator
Several `sklearn` estimators implement a `transform()` method.  Transformers are typically used to 'transform' your featureset in a way that will improve another algorithms (e.g. regressor, classifier) performance.

Typical examples include:
```python
sklearn.preprocessing.PCA # Principle Components transformation
sklearn.preprocessing.OneHotEncoder # Categorical to dummy transformation
sklearn.preprocessing.StandardScaler # Removing the mean and scaling to unit variance for each feature
sklearn.preprocessing.LabelEncoder # Single-column label to integer tranformation
```

In [None]:
# Exercise - Import and create a labelEncoder transformer named 'le'
from sklearn.preprocessing import LabelEncoder
le = LabelEncoder()

##### Fitting an estimator
To fit an estimator we require data - in most cases the data must be `numpy` arrays that are numeric in nature. However, many of the preprocessing transformers are helpers designed to meet this requirement.

In [None]:
response = ldf.reset_index()['LOB']
# Exercise - Pass 'response'to the fit method of your LabelEncoder() instance
# you created in the previous exercise.
le.fit(response)

##### Mutating the Estimator with fit()
Though it looks like nothing happened, a lot happened under the hood.  Our estimator has seen data can now be applied to new datasets.  Once an estimator is fit, it spin off useful metadata that describes the fit model.  `sklearn` uses a trailing underscore in property names to help users distinguish between hyperparameters and the new metadata.
```python
from sklearn.linear_model import LinearRegression
lr = LinearRegression(fit_intercept=False)
lr.fit(X, y)
print(lr.fit_intercept) # A hyperparameter.  Returns False.
print(lr.coef_) # Trailing underscore denotes the property comes from a 'fit'.  Returns model coefficients.
```

Additionally the predict, transform, and score methods (if applicable) become available.

In [None]:
# Exercise - access the 'classes_' property of our label encoder.
le.classes_

##### Transforming  a simple dataset
With a fit estimator we can create predictions or transformations on any new dataset that has the same number of features as our original data.

In [None]:
# Exercise -  create an array called 'y' that uses your LabelEncoder
# transform() method on 'response'.  Display y.
y = le.transform(response)
y

### Supervised Learning Example - Identifying the line of business of an unlabeled triangle
We've computed the volume weighted development patterns of twenty companies for each line of business, `wkcomp`, `comauto`, and `ppauto` and want to use them to train a Machine Learning model that can identify the appropriate line of business.

Defining this problem more concretely:<br>
The LDFs are our featureset, **X**, and the known line of business is our response, **y**.

`sklearn` generally likes to consume `numpy` arrays.  It does not like mixed datatypes like a `pandas.DataFrame`.  The supervised learning estimators particularly like `numpy` arrays to be strictly numeric.  We've already created a `numpy` array for **y** using `LabelEncoder`.

Fortunately, our LDFs are already numeric, but we just need to convert them to a `numpy` array.  This is done esiest using the `values` attribute of our `DataFrame`.

In [None]:
# Exercise - create a matrix called X that isequal to ldf.values
X = ldf.values

##### Train/Test Split
It is best practice in machine learning to evaluate models on a test set of data.  Since this is covered substantially in other literature, we will not go into the details of why here.  `sklearn` comes with several utilities to split data, but we will explore the simplest one.

```python
from sklearn.model_selection import train_test_split

X_train, X_test, y_train, y_test = \
    train_test_split(X, y, test_size=0.33, random_state=42)
```

`train_test_split` returns a tuple of our features/response split into training and test sets. The `random_state` argument shows up in a lot of places in `sklearn`.  Generally, when there is a stochastic component to the `sklearn` component you are using, `random_state` is there to allow you to set a seed so that your work can be replicated.

In [None]:
# Exercise - using the sample code in the previos cell, split the loss
# reserving data into training and test sets.
from sklearn.model_selection import train_test_split

X_train, X_test, y_train, y_test = \
    train_test_split(X, y, test_size=0.33, random_state=42)

##### Fitting our classifier
Our data is in a numerical format, its been split, and now we are ready to do some Machine Learning.  

Don't forget, when fitting any supervised learning technique, you must specify both your featureset and your response in the `fit` method.

In [None]:
# Exercise - Create a KNeighborsClassifier and fit it to our training data
knn = KNeighborsClassifier()
knn.fit(X_train, y_train)

##### Evaluating our classifier
The `score()` method of all classifiers defaults to an accuracy measure.  For regressors, it will return an R-squared figure.

In [None]:
# Exercise - Use score() to evaluate the accuracy of our KNeighborsClassifier
# on our test set.
knn.score(X_test,y_test)

##### Classifier Confusion Matrix
Another way of looking at a classifier's performance is by way of its `confusion_matrix` which gives a bit more information than our accuracy score.  Specifically, it tells us our false positive and false negative rate.

In [None]:
from sklearn.metrics import confusion_matrix

pd.DataFrame(confusion_matrix(y_test, knn.predict(X_test)),
             index=le.classes_, columns=le.classes_)

##### Try another classifier
Remember the `sklearn` API was designed to make using different algorithms as consistent as possible. 

In [None]:
# Exercise - Fit a LogisticRegression and evaluate its accuracy on the test data.
from sklearn.linear_model import LogisticRegression
model = LogisticRegression() # Swap in the another estimator
model.fit(X_train, y_train) # No changes
print(model.score(X_test,y_test)) # No changes
pd.DataFrame(confusion_matrix(y_test, model.predict(X_test)),
             index=le.classes_, columns=le.classes_)

##### Visual representation of first three Development Factors
By inspection (at least across the first three development ages), it is more difficult to distinguish between `wkcomp` and `ppauto` in line with where our classifiers are least accurate.

In [None]:
import seaborn as sns
%matplotlib inline
g = sns.pairplot(ldf.reset_index()[['LOB','1-2','2-3','3-4']], hue="LOB")

##### Robustness of train_test_split
Let's test the performance of our `KNeighborsClassifier` using a different random state.

In [None]:
X_train, X_test, y_train, y_test = \
    train_test_split(X, y, test_size=0.33, random_state=41)
# Exercise - What is our accuracy when we change our train_test_split
# random_state to 41?
knn = KNeighborsClassifier()
knn.fit(X_train, y_train)
knn.score(X_test,y_test)

##### Cross-validation
* `sklearn` provides a `cross_val_score` to test the accuracy of an estimator across multiple folds painting a truer picture of an estimators' efficacy than a simple train/test split.
* With `cross_val_score`, we don't really need to provide separate train and test sets.  Though, with enough data, it is sometimes instructive to have train/test and holdout 

In [None]:
from sklearn.model_selection import cross_val_score
np.mean(cross_val_score(knn, X, y, cv=5))

##### Improving model accuracy with GridSearchCV

With `GridSearchCV`, we can feed a hyperparameter grid into our estimator to determine an 'optimal' set of hyperparameters to use for our particular business problem.  `GridSearchCV` itself is an estimator and so it has the usual `'fit()` and `predict()` methods any other classifier would.

At a minimum, parameterizing the GridSearchCV estimator we need to specify:
1. The estimator we want to use
2. The hyperparameter searchspace as a dictionary

Optionally, we can also specify:
1. The number of folds to use

In [None]:
from sklearn.model_selection import GridSearchCV

param_grid={'n_neighbors':[1,3,5,7,9,11]}
grid = GridSearchCV(knn, param_grid, cv=5, refit=True)
grid.fit(X, y)
print(f'Best Accuracy Score: {grid.best_score_}')
grid.best_estimator_

A Visual inspection of the cross-validated scores shows support for `n_neighbors=5`

In [None]:
g = sns.pointplot(x=grid.cv_results_['param_n_neighbors'],
                  y=grid.cv_results_['mean_test_score']) \
       .set(xlabel='n_neighbors', ylabel='Accuracy', title='Gridsearch Results')

In [None]:
# Exercise - Explore values of KNeighborsClassifier hyperparameter, p to
# improve the cross-validated score of our estimator
param_grid=dict(n_neighbors=[1,3,5,7,9,11], p=[1,2,3,4,5,6])
grid = GridSearchCV(knn, param_grid, cv=5)
grid.fit(X, y)
print(f'Best Score: {grid.best_score_}')
grid.best_estimator_

In [None]:
# Exercise - Display the confusion matrix of your best estimator from the
# previous exercise on the entire dataset
pd.DataFrame(confusion_matrix(y, grid.best_estimator_.predict(X)),
             index=le.classes_, columns=le.classes_)

### More complex workflows with Pipeline

The authors of `sklearn` recognize that composability of multiple estimators will be necessary to build the best models.  For example, you may want to cluster a feature before feeding it into a Regressor.

The `Pipeline` is useful for chaining one or more transformers together.  Pipelines themselves are estimators and have `fit()`, `predict()`, and `score()` function and can be used with all of the `sklearn` funcitons used for regular estimators including but not limited to: `cross_val_score`, `confusion_martix`

In [None]:
from sklearn.preprocessing import PolynomialFeatures
from sklearn.ensemble import RandomForestRegressor
from sklearn.pipeline import Pipeline
from sklearn.linear_model import Ridge

from sklearn.datasets import load_boston

# Regression Dataset
boston = load_boston()
    
# Polynomial Transformer
poly = PolynomialFeatures(degree=2) 

# Regressor
rf = RandomForestRegressor(n_estimators=10, random_state=42)

# Set up Steps
steps=[('poly', poly), ('rf', rf)]

# Build a Pipeline
pipe = Pipeline(steps=steps) 

# Cross-Validation R-square
np.mean(cross_val_score(pipe, boston['data'], boston['target'], cv=5))

In [None]:
# Exercise - Create a Pipeline with Principle Components Analysis (PCA) as a
# first step and the optimal KNeighbors hyperparameters (n_neighbors=3 and p=3)
from sklearn.decomposition import PCA

steps=[('pca', PCA()),
       ('knn',KNeighborsClassifier(n_neighbors=3, p=3))]

pipe = Pipeline(steps=steps)
np.mean(cross_val_score(pipe, X, y,cv=5))

#### Pipelines and GridSearchCV
Since a `Pipeline` is just another estimator `GridSearchCV` allows the hyperparameter space of all estimators in the pipeline to be gridsearched in one go.  

To avoid hyperparameter name clashes between one estimator and another within a pipeline, `sklearn` uses a double underscore naming convention of the form {estimator_name}__{hyperparameter} for the keys of its parameter grid.


In [None]:
param_grid = dict(rf__n_estimators=[10, 25],
                  rf__max_depth=[10, 15, 20],
                  rf__min_samples_split=[5, 10, 15],
                  poly__degree=[1, 2])

# Set up Steps
steps=[('poly', PolynomialFeatures()), ('rf', RandomForestRegressor())]

# Build a Pipeline
pipe = Pipeline(steps=steps) 

# Hyperparameter grid
param_grid = dict(rf__n_estimators=[10, 25],
                  rf__max_depth=[10, 15, 20],
                  rf__min_samples_split=[5, 10, 15],
                  poly__degree=[1, 2])

#Grid Search
grid = GridSearchCV(pipe, param_grid, cv=5, iid=True)
grid.fit(boston['data'], boston['target'])

print(f'Best R-square: {grid.best_score_}')
grid.best_estimator_

In [None]:
# Exercise - Use Pipeline and GridsearchCV to determine whether we achieve a
# better cross-validated accuracy using PCA and KNeighborsClassifier combined

param_grid=dict(knn__n_neighbors=[1,3,5,7,9,11],
                knn__p=[1,2,3,4,5,6],
                pca__n_components=[3, 5, 7, 9])

pipe = Pipeline(steps=[('pca', PCA()),
                       ('knn',KNeighborsClassifier())])

grid = GridSearchCV(pipe, param_grid, cv=5, refit=True)
grid.fit(X, y)

print(f'Best Score: {grid.best_score_}')
grid.best_estimator_

### Scikit-Learn Recap

* Almost everything is an Estimator.  They all have a `fit` method and depending on the nature of the estimator may also have a `predict`, `score` or `transform` method.
* The API is standardized across estimator
* A transformer is a special type of estimator that trasnforms data for another Estimator
* Cross-validation with Grid Search helps in hyperparameter selection
* Pipelines are useful for composing a chain of Estimators.
* The documentation is a goldmine of information