# Cross-Validation

- Do **k-fold cross-validation** with independent test set
- Use scikit-learn for **hyperparameter optimization**

In [None]:
!pip install category_encoders==2.*

In [28]:
from category_encoders import OrdinalEncoder
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score
from sklearn.impute import SimpleImputer
from sklearn.model_selection import cross_val_score, validation_curve # k-fold CV
from sklearn.model_selection import GridSearchCV, RandomizedSearchCV # Hyperparameter tuning
from sklearn.pipeline import make_pipeline
from sklearn.tree import DecisionTreeClassifier
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import warnings
import seaborn as sns
warnings.simplefilter(action='ignore', category=FutureWarning)
pd.set_option('display.max_columns', 50)

# I. Wrangle Data

In [85]:
# Use wrangle function to import training and test data, and clean
def wrangle(fm_path, tv_path=None):
    if tv_path:
        df = pd.merge(pd.read_csv(fm_path, 
                                  na_values=[0, -2.000000e-08]),
                      pd.read_csv(tv_path)).set_index('id')
    else:
        df = pd.read_csv(fm_path, 
                         na_values=[0, -2.000000e-08],
                         index_col='id')

    df['date_recorded'] = pd.to_datetime(df['date_recorded'])
    
    # Drop constant columns
    df.drop(columns=['recorded_by'], inplace=True)
    
    # Create age feature
    df['pump_age'] = df['date_recorded'].dt.year - df['construction_year']
    df.drop(columns=['date_recorded','construction_year'], inplace=True)
    
    # Drop HCCCs
    cutoff = 100
    drop_cols = [col for col in df.select_dtypes('object').columns
                 if df[col].nunique() > cutoff]
    df.drop(columns=drop_cols, inplace=True)

    # Drop duplicate columns
    dupe_cols = [col for col in df.head(15).T.duplicated().index
                 if df.head(15).T.duplicated()[col]]
    df.drop(columns=dupe_cols, inplace=True)             


    return df

# Using the above wrangle function to read train_features.csv and train_labels.csv into the DataFrame
df = wrangle(fm_path= 'train_features.csv',
             tv_path= 'train_labels.csv')

# test_features.csv into the DataFrame X_test
X_test = wrangle(fm_path= 'test_features.csv')

# II. Split Data

## Split TV from FM

In [86]:
df.columns

Index(['amount_tsh', 'gps_height', 'longitude', 'latitude', 'num_private',
       'basin', 'region', 'region_code', 'district_code', 'population',
       'public_meeting', 'scheme_management', 'permit', 'extraction_type',
       'extraction_type_class', 'management', 'management_group', 'payment',
       'payment_type', 'water_quality', 'quality_group', 'quantity', 'source',
       'source_type', 'source_class', 'waterpoint_type', 'status_group',
       'pump_age'],
      dtype='object')

In [87]:
df

Unnamed: 0_level_0,amount_tsh,gps_height,longitude,latitude,num_private,basin,region,region_code,district_code,population,public_meeting,scheme_management,permit,extraction_type,extraction_type_class,management,management_group,payment,payment_type,water_quality,quality_group,quantity,source,source_type,source_class,waterpoint_type,status_group,pump_age
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1,Unnamed: 22_level_1,Unnamed: 23_level_1,Unnamed: 24_level_1,Unnamed: 25_level_1,Unnamed: 26_level_1,Unnamed: 27_level_1,Unnamed: 28_level_1
454.0,50.0,2092.0,35.426020,-4.227446,,Internal,Manyara,21,1.0,160.0,True,Water Board,True,gravity,gravity,water board,user-group,pay per bucket,per bucket,soft,good,insufficient,spring,spring,groundwater,communal standpipe,functional,15.0
510.0,,,35.510074,-5.724555,,Internal,Dodoma,1,6.0,,True,VWC,True,india mark ii,handpump,vwc,user-group,never pay,never pay,soft,good,enough,shallow well,shallow well,groundwater,hand pump,functional,
14146.0,,,32.499866,-9.081222,,Lake Rukwa,Mbeya,12,6.0,,True,VWC,False,other,other,vwc,user-group,never pay,never pay,soft,good,enough,shallow well,shallow well,groundwater,other,non functional,
47410.0,,,34.060484,-8.830208,,Rufiji,Mbeya,12,7.0,,True,VWC,True,gravity,gravity,vwc,user-group,pay monthly,monthly,soft,good,insufficient,river,river/lake,surface,communal standpipe,non functional,
1288.0,300.0,1023.0,37.032690,-6.040787,,Wami / Ruvu,Morogoro,5,1.0,120.0,True,VWC,True,other,other,vwc,user-group,pay when scheme fails,on failure,salty,salty,enough,shallow well,shallow well,groundwater,other,non functional,14.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
68525.0,1000.0,327.0,36.367112,-8.774761,,Rufiji,Morogoro,5,4.0,255.0,True,Water Board,True,gravity,gravity,wug,user-group,pay monthly,monthly,soft,good,enough,river,river/lake,surface,communal standpipe,functional,16.0
11980.0,1000.0,1743.0,34.531524,-9.769604,,Lake Nyasa,Iringa,11,5.0,35.0,True,VWC,False,gravity,gravity,vwc,user-group,pay annually,annually,soft,good,seasonal,spring,spring,groundwater,communal standpipe,functional,23.0
35778.0,,-13.0,38.974416,-5.420823,,Pangani,Tanga,4,5.0,1000.0,False,Private operator,False,other,other,private operator,commercial,never pay,never pay,soft,good,insufficient,shallow well,shallow well,groundwater,other,non functional,6.0
49444.0,,,34.316586,-3.107161,,Lake Victoria,Shinyanga,17,6.0,,True,WUG,True,nira/tanira,handpump,wug,user-group,never pay,never pay,soft,good,enough,shallow well,shallow well,groundwater,hand pump,functional,


In [88]:
round(df.isnull().sum()/len(df)*100,2)

amount_tsh               70.14
gps_height               34.25
longitude                 3.02
latitude                  3.02
num_private              98.70
basin                     0.00
region                    0.00
region_code               0.00
district_code             0.04
population               35.87
public_meeting            5.66
scheme_management         6.53
permit                    5.13
extraction_type           0.00
extraction_type_class     0.00
management                0.00
management_group          0.00
payment                   0.00
payment_type              0.00
water_quality             0.00
quality_group             0.00
quantity                  0.00
source                    0.00
source_type               0.00
source_class              0.00
waterpoint_type           0.00
status_group              0.00
pump_age                 34.73
dtype: float64

In [29]:
df.corr()

Unnamed: 0,amount_tsh,gps_height,longitude,latitude,region_code,population,pump_age
amount_tsh,1.0,0.059626,-0.036765,-0.030419,-0.038344,-0.001729,0.021589
gps_height,0.059626,1.0,-0.572844,0.281433,-0.322055,-0.036637,0.023007
longitude,-0.036765,-0.572844,1.0,-0.300245,0.149095,-0.097179,-0.035064
latitude,-0.030419,0.281433,-0.300245,1.0,-0.245281,0.048344,-0.019061
region_code,-0.038344,-0.322055,0.149095,-0.245281,1.0,0.080257,0.020465
population,-0.001729,-0.036637,-0.097179,0.048344,0.080257,1.0,-0.027999
pump_age,0.021589,0.023007,-0.035064,-0.019061,0.020465,-0.027999,1.0


In [40]:
df['payment_type'].value_counts()

never pay     20318
per bucket     7223
monthly        6574
unknown        6520
on failure     3154
annually       2886
other           844
Name: payment_type, dtype: int64

In [39]:
df['payment'].value_counts()

never pay                20318
pay per bucket            7223
pay monthly               6574
unknown                   6520
pay when scheme fails     3154
pay annually              2886
other                      844
Name: payment, dtype: int64

In [4]:
target = 'status_group'
y = df[target]
X = df.drop(columns=target)

# Create train-validation split to compare with train-val process

In [5]:
X_train,y_val,y_train,y_val = train_test_split(X,y,test_size=.2,random_state=42)

# Training-Validation Split

- Since we're doing k-fold CV, there's no need for a validation set.

# III. Establish Baseline

This is a **classification** problem, our baseline will be **accuracy**. 

In [6]:
baseline_Acc = y_train.value_counts(normalize=True).max()
print('Baseline accuracy:',baseline_Acc)

Baseline accuracy: 0.5425489938182296


# IV. Build Models

- `DecisionTreeClassifier`
- `RandomForestClassifier`

In [6]:
model_dt = make_pipeline(OrdinalEncoder(),
                        SimpleImputer(strategy='mean'),
                        DecisionTreeClassifier(random_state=42))
model_dt.fit(X_train,y_train)

Pipeline(steps=[('ordinalencoder',
                 OrdinalEncoder(cols=['basin', 'region', 'public_meeting',
                                      'scheme_management', 'permit',
                                      'extraction_type',
                                      'extraction_type_class', 'management',
                                      'management_group', 'payment',
                                      'payment_type', 'water_quality',
                                      'quality_group', 'quantity', 'source',
                                      'source_type', 'source_class',
                                      'waterpoint_type'],
                                mapping=[{'col': 'basin',
                                          'data_typ...
                                          'data_type': dtype('O'),
                                          'mapping': groundwater    1
surface        2
unknown        3
NaN           -2
dtype: int64},
                           

In [7]:
model_rf = make_pipeline(OrdinalEncoder(),
                         SimpleImputer(strategy='mean'),
                         RandomForestClassifier(n_jobs=-1,
                                                random_state=42))

model_rf.fit(X_train,y_train)

Pipeline(steps=[('ordinalencoder',
                 OrdinalEncoder(cols=['basin', 'region', 'public_meeting',
                                      'scheme_management', 'permit',
                                      'extraction_type',
                                      'extraction_type_class', 'management',
                                      'management_group', 'payment',
                                      'payment_type', 'water_quality',
                                      'quality_group', 'quantity', 'source',
                                      'source_type', 'source_class',
                                      'waterpoint_type'],
                                mapping=[{'col': 'basin',
                                          'data_typ...
                                          'data_type': dtype('O'),
                                          'mapping': groundwater    1
surface        2
unknown        3
NaN           -2
dtype: int64},
                           

### Train_Test_Split scores (can't be compared to CV scores)

# DT Accuracy

In [None]:
training_acc = accuracy_score(y_train,model_dt.predict(X_train))
val_acc = accuracy_score(y_val,model_dt.predict(X_val))

print('Training Accuracy Score:', training_acc)
print('Validation Accuracy Score:', val_acc)

# RF Accuracy

In [None]:
training_acc = accuracy_score(y_train,model_rf.predict(X_train))
val_acc = accuracy_score(y_val,model_rf.predict(X_val))

print('Training Accuracy Score:', training_acc)
print('Validation Accuracy Score:', val_acc)

# CV Score for DT

In [7]:
cv_scores_dt = cross_val_score(model_dt,X,y,cv=5)

NameError: name 'model_dt' is not defined

In [None]:
print('CV score for Decision Tree')
print(cv_scores_dt)
print('-------------------------------------------------------')
print('The mean CV accuracy for DT is:',cv_scores_dt.mean())
print('-------------------------------------------------------')
print('The mean CV std deviation for DT is:',cv_scores_dt.std())

# CV Score for RF

In [None]:
cv_scores_rf = cross_val_score(model_rf,X,y,cv=5, n_jobs=-1)

In [None]:
print('CV score for Random Forest')
print(cv_scores_rf)
print('-------------------------------------------------------')
print('The mean CV accuracy for RF is:',cv_scores_rf.mean())
print('-------------------------------------------------------')
print('The mean CV std deviation for RF is:',cv_scores_rf.std())

**Check cross-validation scores**

![Cross Validation](https://upload.wikimedia.org/wikipedia/commons/4/4b/KfoldCV.gif)

# V. Tune Model

- What are important hyperparameters for `RandomForestClassifier`?
  - `max_depth`: 5-35
  - `n_estimators` 25-100
  - imputation strategy

In [19]:
clf = make_pipeline(OrdinalEncoder(),
                    SimpleImputer(),
                    RandomForestClassifier(random_state=42,n_jobs=-1))

param_grid = {'simpleimputer__strategy':['meadian','mean'],
             'randomforestclassifier__max_depth':range(5,35,5),
             'randomforestclassifier__n_estimators':range(25,200,5),
             'randomforestclassifier__max_samples':np.arange(0.2,1,0.1),
             'randomforestclassifier__max_features':['sqrt','log2'],
             'randomforestclassifier__min_samples_split':np.arange(2, 10, 1),
             'randomforestclassifier__class_weight':['balanced',None,'balanced_subsample'],
             'randomforestclassifier__criterion':['gini','entropy'],
             'randomforestclassifier__warm_start':[False,True],
             'randomforestclassifier__oob_score':[False,True],
             'randomforestclassifier__bootstrap':[False,True]}

**`GridSearch`:** Very thourough, but it can take a long time.

Test out every single combination of hyperparameters that we provide it
**and** it's going to use cross-validation.

In [None]:
model_rfgs = GridSearchCV(clf,
                        param_grid = param_grid,
                        n_jobs=-1,
                        verbose=1)
model_rfgs.fit(X,y)

Fitting 5 folds for each of 107520 candidates, totalling 537600 fits


In [None]:
model_rfgs.best_params_

In [None]:
print('The Best Accuracy score for the grid search is:',model_rfgs.best_score_)

**`RandomizedSearchCV`:** Quicker, less effective but usually good enough.

In [66]:
model_rfrs = RandomizedSearchCV(clf,
                                param_distributions = param_grid,
                               n_iter=40,
                               n_jobs=-1,
                               verbose=1)
model_rfrs.fit(X,y)

Fitting 5 folds for each of 40 candidates, totalling 200 fits


 0.75931729        nan 0.78667467        nan        nan        nan
        nan 0.7912413  0.78520166        nan        nan        nan
        nan 0.74898458        nan        nan 0.7124307         nan
        nan 0.79753359 0.6176476         nan        nan        nan
 0.72150083 0.79305114        nan        nan 0.78903174        nan
        nan 0.78869506 0.78739026        nan]


RandomizedSearchCV(estimator=Pipeline(steps=[('ordinalencoder',
                                              OrdinalEncoder()),
                                             ('simpleimputer', SimpleImputer()),
                                             ('randomforestclassifier',
                                              RandomForestClassifier(n_jobs=-1,
                                                                     random_state=42))]),
                   n_iter=40, n_jobs=-1,
                   param_distributions={'randomforestclassifier__bootstrap': [False,
                                                                              True],
                                        'randomforestclassifier__class_weight': ['balanced',
                                                                                 None,
                                                                                 'balanced_...
                                        'randomforestclassifie

In [67]:
model_rfrs.best_params_

{'simpleimputer__strategy': 'mean',
 'randomforestclassifier__warm_start': False,
 'randomforestclassifier__oob_score': False,
 'randomforestclassifier__n_estimators': 125,
 'randomforestclassifier__min_samples_split': 5,
 'randomforestclassifier__max_samples': 0.6000000000000001,
 'randomforestclassifier__max_features': 'sqrt',
 'randomforestclassifier__max_depth': 25,
 'randomforestclassifier__criterion': 'entropy',
 'randomforestclassifier__class_weight': 'balanced_subsample',
 'randomforestclassifier__bootstrap': True}

In [14]:
print('The Best Accuracy score for the grid search is:',model_rfgs.best_score_)

NameError: name 'model_rfgs' is not defined

In [68]:
print('The Best Accuracy score for the grid search is:',model_rfrs.best_score_)

The Best Accuracy score for the grid search is: 0.7975335894282543


In [9]:
model_rf = make_pipeline(OrdinalEncoder(),
                         SimpleImputer(strategy='mean'),
                         RandomForestClassifier(n_jobs=-1,
                                                random_state=42,
                                                max_depth=25,
                                                max_samples=.5,
                                                n_estimators=95))

model_rf.fit(X_train,y_train)

Pipeline(steps=[('ordinalencoder',
                 OrdinalEncoder(cols=['basin', 'region', 'public_meeting',
                                      'scheme_management', 'permit',
                                      'extraction_type',
                                      'extraction_type_class', 'management',
                                      'management_group', 'payment',
                                      'payment_type', 'water_quality',
                                      'quality_group', 'quantity', 'source',
                                      'source_type', 'source_class',
                                      'waterpoint_type'],
                                mapping=[{'col': 'basin',
                                          'data_typ...
unknown        3
NaN           -2
dtype: int64},
                                         {'col': 'waterpoint_type',
                                          'data_type': dtype('O'),
                                          'map

In [None]:
training_acc = accuracy_score(y_train,model_rf.predict(X_train))
val_acc = accuracy_score(y_val,model_rf.predict(X_val))

print('Training Accuracy Score:', training_acc)
print('Validation Accuracy Score:', val_acc)

# Use your hypter parameters to retrain your model on X and than pass your X_test to make predicitions

# VI. Communicate Results

**Showing Feature Importance**

Plot the feature importance for our `RandomForest` model.

In [None]:
best_estimator = model_rfgs.best_estimator_
importances = best_estimator.named_steps['randomforestclassifier'].feature_importances_
features = X_train.columns
feat_imp = pd.Series(importances, index=features).sort_values()
feat_imp.plot(kind='barh')
plt.xlabel('Reduction in Gini Impurity');

# Make Submission

In [None]:
X_test = X_test[X_train.columns]
y_pred = model_rfgs.predict(X_test)
submission = pd.DataFrame({'status_group':y_pred}, index=X_test.index)
datestamp = pd.Timestamp.now().strftime('%Y-%m-%d_%H%M_')
submission.to_csv(f'{datestamp}submission.csv')

In [None]:
#save model
import pickle
filename = 'accuracy_8065perc'
pickle.dump(model_rfgs,open(filename,'wb'))

#load model
model_rfgs = pickle.load(open(filename,'rb'))