# Pipelines
*Sequentially apply a list of transforms and a final estimator.*
- Scaling and imputation are examples of *transforms*.
- A classifier is an example of an *estimator*.

In [None]:
import pandas as pd
import numpy as np
from sklearn.neighbors import KNeighborsClassifier
from sklearn.impute import SimpleImputer, KNNImputer
from sklearn.pipeline import Pipeline
from sklearn.model_selection import train_test_split
from sklearn.metrics import confusion_matrix, accuracy_score
from sklearn.preprocessing import StandardScaler, MinMaxScaler
from sklearn.model_selection import cross_val_score
from sklearn.model_selection import GridSearchCV

### The Mammographic Mass dataset from UCI

In [None]:
# Note: Given what 'Shape' and 'Margin' actually mean it is not really valid to treat
# them as numeric/ordinal.

# '?' is the missing-value marker in the CSV, so it is parsed as NaN.
mam_mass = pd.read_csv('MamMass.csv', na_values='?')

# 'BI-RADS' is removed and is not used as a feature below.
mam_mass = mam_mass.drop(columns='BI-RADS')

# pop() removes the target column, so the columns that remain are the features.
# to_numpy() is the pandas-recommended replacement for the legacy .values attribute.
y = mam_mass.pop('Severity').to_numpy()
X = mam_mass.to_numpy()

In [None]:
# Preview the first five rows of the data frame.
mam_mass.head(n=5)

### Two sample missing value imputers from `sklearn`
- `SimpleImputer` replaces missing values with a column statistic (here, the mean of that column).
- `KNNImputer` uses similar instances (nearest neighbours) to estimate missing values.

In [None]:
# Mean imputer, created for illustration only -- it is not used.
imp = SimpleImputer(strategy='mean', missing_values=np.nan)

# Fit the kNN imputer on the complete data set and fill in the missing values.
imp_kNN = KNNImputer(missing_values=np.nan)
Xi = imp_kNN.fit(X).transform(X)

Also scale the data (otherwise the `Age` attribute will dominate the kNN distance calculations).

In [None]:
# Fit a StandardScaler on the imputed data, then apply it to the same data.
bScal = StandardScaler()
bScal.fit(Xi)
XiS = bScal.transform(Xi)

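Making the train-test split after imputation and scaling is not the right way to do things: the imputer and scaler have then already seen the test data, so information leaks from the test set into training.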

In [None]:
# Hold out 20% of the already imputed-and-scaled data for testing.
X_train, X_test, y_train, y_test = train_test_split(
    XiS, y, random_state=1, test_size=0.2
)
# Show the sizes of the two partitions.
(X_train.shape, X_test.shape)

In [None]:
# Train a default k-NN classifier on the training part and predict the held-out part.
knn = KNeighborsClassifier().fit(X_train, y_train)
y_pred = knn.predict(X_test)

# Report hold-out accuracy, then display the confusion matrix.
print("Accuracy: {0:4.2f}".format(accuracy_score(y_true=y_test, y_pred=y_pred)))
confusion_matrix(y_true=y_test, y_pred=y_pred)


## Fit Impute and Scale transforms on Train data only
The right way to do it. 

In [None]:
# Split the raw feature matrix first, before any imputation or scaling is fitted.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, random_state=1, test_size=0.2
)
# Show the sizes of the two partitions.
(X_train.shape, X_test.shape)

In [None]:
# Fit the imputer on the training data only ...
imp_kNN = KNNImputer(missing_values=np.nan).fit(X_train)

# ... then apply that same fitted imputer to both partitions.
Xi_train, Xi_test = imp_kNN.transform(X_train), imp_kNN.transform(X_test)

In [None]:
# Fit the scaler on the imputed training data only ...
bScal = StandardScaler()
bScal.fit(Xi_train)

# ... then apply that same fitted scaler to both partitions.
XiS_train, XiS_test = bScal.transform(Xi_train), bScal.transform(Xi_test)

In [None]:
# Train on the imputed + scaled training data; predict the transformed test data.
knn = KNeighborsClassifier().fit(XiS_train, y_train)
y_pred = knn.predict(XiS_test)

# Report hold-out accuracy, then display the confusion matrix.
print("Accuracy: {0:4.2f}".format(accuracy_score(y_true=y_test, y_pred=y_pred)))
confusion_matrix(y_true=y_test, y_pred=y_pred)


## With Pipelines

In [None]:
# Chain the three stages: impute -> scale -> classify.
kNNpipe = Pipeline([
    ('imputer', KNNImputer(missing_values=np.nan)),
    ('scaler', StandardScaler()),
    ('classifier', KNeighborsClassifier()),
])

In [None]:
# Fitting the pipeline fits every stage on the training data; predict() then
# pushes the test data through the fitted stages.
y_pred = kNNpipe.fit(X_train, y_train).predict(X_test)

# Report hold-out accuracy, then display the confusion matrix.
print("Accuracy: {0:4.2f}".format(accuracy_score(y_true=y_test, y_pred=y_pred)))
confusion_matrix(y_true=y_test, y_pred=y_pred)


## Pipelines & Cross Validation

In [None]:
# Imputation, scaling and classification are bundled into one estimator that is
# handed to cross_val_score as a single unit.
kNNpipe = Pipeline(steps=[
    ('imputer', KNNImputer(missing_values=np.nan)),
    ('scaler', StandardScaler()),
    ('classifier', KNeighborsClassifier()),
])

# 8-fold cross-validation; n_jobs=-1 runs the folds on all available CPU cores.
acc_arr = cross_val_score(kNNpipe, X, y, cv=8, n_jobs=-1)

# cross_val_score returns a NumPy array, so use its vectorised mean() rather
# than a Python-level sum()/len().
print("Accuracy: {0:4.2f}".format(acc_arr.mean()))

The accuracy estimate with pipeline and cross-validation is worse than with hold-out - why?  
The hold-out split is a *lucky* split - change `random_state` and repeat to see this. 

## Data Leaks
A demonstration of how having access to test data during feature selection can inflate accuracy estimates.   
First, feature selection is done outside cross-validation (the wrong way).   
- Testing is 10 x 10-fold cross-validation (10-fold cross-validation repeated 10 times).

In [None]:
from sklearn.feature_selection import SelectKBest, mutual_info_classif
from sklearn.model_selection import RepeatedKFold

In [None]:
hotel_rev = pd.read_csv('HotelRevHelpfulness.csv')

# pop() removes the target column from the frame and returns it.
# to_numpy() is the pandas-recommended replacement for the legacy .values attribute.
y = hotel_rev.pop('reviewHelpfulness').to_numpy()

# 'hotelId' is removed and is not used as a feature.
hotel_rev = hotel_rev.drop(columns='hotelId')

# Every remaining column is used as a feature.
X = hotel_rev.to_numpy()

In [None]:
# Number of features to keep.
kk = 7

# The selector is fitted on ALL of the data here, i.e. outside any cross-validation.
FS_trans = SelectKBest(score_func=mutual_info_classif, k=kk)
FS_trans.fit(X, y)

# Reduce the feature matrix to the selected columns and show its shape.
X_FS = FS_trans.transform(X)
X_FS.shape

In [None]:
# 10-fold cross-validation repeated 10 times, with a fixed seed.
rkf = RepeatedKFold(n_splits=10, n_repeats=10, random_state=42)
knn = KNeighborsClassifier()

# X_FS was reduced by a selector fitted on the full data set, so every test
# fold has already influenced which features were kept.
acc_arr = cross_val_score(knn, X_FS, y, cv=rkf, n_jobs=-1)

# cross_val_score returns a NumPy array, so use its vectorised mean() rather
# than a Python-level sum()/len().
print("Accuracy (feature selection before X-Val): {0:4.2f}".format(acc_arr.mean()))

The right way: feature selection within cross-validation (X-Val), so that the selector is fitted on the training folds only.

In [None]:
# Feature selection is made a pipeline stage, so it is fitted together with
# the classifier whenever the pipeline is fitted.
FSpipe = Pipeline([
    ('fs', SelectKBest(score_func=mutual_info_classif, k=kk)),
    ('classifier', KNeighborsClassifier()),
])
# Show the shape of the full (unreduced) feature matrix.
X.shape

In [None]:
# Cross-validate the whole pipeline on the unreduced feature matrix.
acc_arr = cross_val_score(FSpipe, X, y, cv=rkf, n_jobs=-1)

# The message now uses the same "Accuracy (<setting>): <value>" layout as the
# feature-selection-before-X-Val report (the colon was misplaced).
# cross_val_score returns a NumPy array, so use its vectorised mean().
print("Accuracy (feature selection within X-Val): {0:4.2f}".format(acc_arr.mean()))