# K-Fold Cross-Validation

### Importing libraries

In [1]:
import numpy as np
import matplotlib.pyplot as plt
import pandas as pd
%matplotlib inline

### Data preparation

In [2]:
# Load the diamonds data and one-hot encode its categorical columns.
data_path = '../data/diamonds.csv'
categorical_columns = ['cut', 'color', 'clarity']
diamonds_raw = pd.read_csv(data_path)
# One get_dummies call replaces each listed column with indicator columns
# prefixed by the column name (cut_*, color_*, clarity_*), appended after the
# remaining columns. drop_first=True omits the first level of each category.
diamonds = pd.get_dummies(diamonds_raw, columns=categorical_columns, drop_first=True)

## Diamonds dataset

### Preparing objects for modelling

In [3]:
from sklearn.preprocessing import RobustScaler

# Split the frame into the target column and the remaining feature columns.
target_name = 'price'
y = diamonds[target_name]
diamond_features = diamonds.drop(columns=[target_name])

# Scale every feature column. The scaler is fitted on all rows, and by default
# fit_transform returns an array rather than a DataFrame.
robust_scaler = RobustScaler()
X = robust_scaler.fit_transform(diamond_features)

# Notice that we are not doing a train-test split: the whole of X and y is
# handed to cross-validation further down.

### Training our model

In [4]:
from sklearn.ensemble import RandomForestRegressor

# Hyper-parameters of the random forest regressor evaluated below.
regressor_params = dict(
    n_estimators=50,
    max_depth=16,
    random_state=123,
    n_jobs=-1,
)
RF = RandomForestRegressor(**regressor_params)

In [5]:
# cross_validate is available from scikit-learn 0.19 onwards; if this import fails,
# upgrade scikit-learn, e.g. with: conda upgrade scikit-learn
from sklearn.model_selection import cross_validate

In [6]:
# 10-fold cross-validation of RF on the full X / y.
#
# scoring maps each result-key suffix to a scikit-learn scorer name. The bare
# 'mean_squared_error' scorer string was deprecated and later removed from
# scikit-learn; the supported scorer is 'neg_mean_squared_error', which reports
# the MSE negated. Keeping 'mean_squared_error' as the dict key preserves the
# 'test_mean_squared_error' / 'train_mean_squared_error' result keys.
#
# return_train_score=True is needed for the train_* entries, which recent
# scikit-learn versions leave out by default.
#
# NOTE(review): an integer cv presumably yields consecutive, unshuffled folds
# for a regressor, and the recorded per-fold scores differ widely. Confirm
# whether the rows are ordered and consider a shuffled KFold.
scores = cross_validate(estimator=RF, X=X, y=y,
                        scoring={'mean_squared_error': 'neg_mean_squared_error',
                                 'r2': 'r2'},
                        cv=10, n_jobs=-1,
                        return_train_score=True)

In [7]:
# Tabulate the cross-validation results and report MSE as a positive number.
scores = pd.DataFrame(scores)
# The MSE columns hold negated errors. MSE is never negative, so taking the
# absolute value restores the sign and, unlike multiplying by -1, gives the
# same table if this cell is run more than once. Only the MSE columns actually
# present are touched, so a missing train_* column does not raise.
mse_columns = [name for name in scores.columns if name.endswith('mean_squared_error')]
for name in mse_columns:
    scores[name] = scores[name].abs()
scores

Unnamed: 0,fit_time,score_time,test_mean_squared_error,test_r2,train_mean_squared_error,train_r2
0,2.704191,0.720918,375539.0,0.538764,148065.528065,0.991526
1,3.141356,0.988628,450604.1,0.672636,150123.441197,0.991437
2,3.756991,1.060821,1429308.0,0.386105,118993.885068,0.993105
3,3.542923,1.004674,2386801.0,0.569107,121708.19462,0.992298
4,3.403554,1.176127,6002576.0,0.653763,84805.13487,0.9901
5,3.73744,0.910923,1376623.0,0.958366,134400.626049,0.990314
6,3.83971,4.791745,24477.21,-0.314355,149193.566169,0.99096
7,5.881141,0.306817,64057.53,-0.214988,149713.173174,0.991024
8,5.870614,0.363968,115613.3,0.304016,156899.220946,0.990759
9,6.064633,0.298291,197635.0,0.396521,154009.67005,0.991083


In [8]:
# Average the per-fold test metrics and report them.
mean_test_mse = scores['test_mean_squared_error'].mean()
mean_test_r2 = scores['test_r2'].mean()
print("Mean test MSE:", round(mean_test_mse))
print("Mean test R-squared:", mean_test_r2)

Mean test MSE: 1242323
Mean test R-squared: 0.39499334944982994


## Credit card default dataset 

### Preparing the data

In [9]:
default = pd.read_csv('../data/credit_card_default.csv', index_col="ID")
# Lower-case every column name first, then rename two of the lower-cased columns.
default = default.rename(columns=str.lower)
default = default.rename(columns={'pay_0':'pay_1','default payment next month':'default'})

# Replace the coded education, sex and marriage columns with 0/1 indicators.
# Base values: female, other_education, not_married
indicator_codes = [
    ('grad_school', 'education', 1),
    ('university', 'education', 2),
    ('high_school', 'education', 3),
    ('male', 'sex', 1),
    ('married', 'marriage', 1),
]
for indicator, source_column, code in indicator_codes:
    default[indicator] = (default[source_column] == code).astype('int')
default = default.drop(columns=['education', 'sex', 'marriage'])

# For the pay_n features, a value > 0 means the customer was delayed in that month,
# so each one is reduced to a 0/1 flag.
pay_features = ['pay_' + str(i) for i in range(1,7)]
for pay_column in pay_features:
    default[pay_column] = default[pay_column].gt(0).astype(int)

### Preparing objects for modelling

In [10]:
# Separate the target column from the feature columns.
target_name = 'default'
y_credit = default[target_name]
credit_features = default.drop(columns=[target_name])
# Keep the column labels, since by default the scaled output below is an array without them.
feature_names = credit_features.columns

# Fit a fresh scaler on the credit features (this rebinds robust_scaler).
robust_scaler = RobustScaler()
X_credit = robust_scaler.fit_transform(credit_features)

In [11]:
from sklearn.ensemble import RandomForestClassifier

# Hyper-parameters of the random forest classifier evaluated below.
classifier_params = dict(
    n_estimators=35,
    max_depth=20,
    random_state=55,
    max_features='sqrt',
    n_jobs=-1,
)
RF_credit = RandomForestClassifier(**classifier_params)

In [12]:
# 10-fold cross-validation of RF_credit, scored on accuracy, precision and recall.
# return_train_score=True is required for the train_* columns shown in the
# results table; recent scikit-learn versions leave them out by default.
scores_credit = cross_validate(estimator=RF_credit, X=X_credit, y=y_credit,
                        scoring=['accuracy','precision','recall'],
                        cv=10, n_jobs=-1,
                        return_train_score=True)

In [13]:
# Tabulate the cross-validation results (timings and metrics) for display.
scores_credit = pd.DataFrame(data=scores_credit)
scores_credit

Unnamed: 0,fit_time,score_time,test_accuracy,test_precision,test_recall,train_accuracy,train_precision,train_recall
0,0.865801,0.32209,0.795068,0.563636,0.326807,0.949554,0.996981,0.774113
1,0.981133,0.315338,0.8014,0.589005,0.338855,0.949924,0.995921,0.776792
2,1.001128,0.315592,0.802732,0.59194,0.35241,0.949517,0.997841,0.773443
3,0.952572,0.317346,0.796734,0.569231,0.334337,0.953109,0.996623,0.79069
4,0.763571,0.317416,0.807333,0.614973,0.346386,0.950111,0.995288,0.778131
5,0.728374,0.315622,0.808333,0.59408,0.423193,0.950926,0.995521,0.781648
6,0.73445,0.316079,0.827943,0.686076,0.408748,0.947743,0.995439,0.767286
7,1.205706,0.588914,0.826275,0.707602,0.365008,0.950743,0.997643,0.779508
8,1.238292,0.590923,0.817272,0.670623,0.340875,0.947261,0.996073,0.764105
9,1.76804,0.314352,0.810937,0.631148,0.348416,0.947261,0.997158,0.76377


In [14]:
# Average of each test metric across the folds.
test_metric_columns = ['test_accuracy', 'test_precision', 'test_recall']
scores_credit[test_metric_columns].mean()

test_accuracy     0.809403
test_precision    0.621831
test_recall       0.358503
dtype: float64

In [15]:
# Sample standard deviation of each test metric across the folds.
scores_credit.loc[:, ['test_accuracy','test_precision','test_recall']].std()

test_accuracy     0.011415
test_precision    0.050435
test_recall       0.032185
dtype: float64