In [51]:
import pandas as pd
import numpy as np
import time
from sklearn.model_selection import KFold, StratifiedKFold, cross_val_score, LeaveOneOut, RepeatedKFold, RepeatedStratifiedKFold
from sklearn.preprocessing import OneHotEncoder, RobustScaler
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.ensemble import RandomForestClassifier

In [4]:
df = pd.read_csv('income_evaluation.txt', na_values='?')
df.head(10)

Unnamed: 0,age,workclass,fnlwgt,education,education-num,marital-status,occupation,relationship,race,sex,capital-gain,capital-loss,hours-per-week,native-country,income
0,39,State-gov,77516,Bachelors,13,Never-married,Adm-clerical,Not-in-family,White,Male,2174,0,40,United-States,<=50K
1,50,Self-emp-not-inc,83311,Bachelors,13,Married-civ-spouse,Exec-managerial,Husband,White,Male,0,0,13,United-States,<=50K
2,38,Private,215646,HS-grad,9,Divorced,Handlers-cleaners,Not-in-family,White,Male,0,0,40,United-States,<=50K
3,53,Private,234721,11th,7,Married-civ-spouse,Handlers-cleaners,Husband,Black,Male,0,0,40,United-States,<=50K
4,28,Private,338409,Bachelors,13,Married-civ-spouse,Prof-specialty,Wife,Black,Female,0,0,40,Cuba,<=50K
5,37,Private,284582,Masters,14,Married-civ-spouse,Exec-managerial,Wife,White,Female,0,0,40,United-States,<=50K
6,49,Private,160187,9th,5,Married-spouse-absent,Other-service,Not-in-family,Black,Female,0,0,16,Jamaica,<=50K
7,52,Self-emp-not-inc,209642,HS-grad,9,Married-civ-spouse,Exec-managerial,Husband,White,Male,0,0,45,United-States,>50K
8,31,Private,45781,Masters,14,Never-married,Prof-specialty,Not-in-family,White,Female,14084,0,50,United-States,>50K
9,42,Private,159449,Bachelors,13,Married-civ-spouse,Exec-managerial,Husband,White,Male,5178,0,40,United-States,>50K


In [5]:
df.isna().sum()

age                0
 workclass         0
 fnlwgt            0
 education         0
 education-num     0
 marital-status    0
 occupation        0
 relationship      0
 race              0
 sex               0
 capital-gain      0
 capital-loss      0
 hours-per-week    0
 native-country    0
 income            0
dtype: int64

In [6]:
df.fillna('missing', inplace=True)

In [7]:
df.isna().sum()

age                0
 workclass         0
 fnlwgt            0
 education         0
 education-num     0
 marital-status    0
 occupation        0
 relationship      0
 race              0
 sex               0
 capital-gain      0
 capital-loss      0
 hours-per-week    0
 native-country    0
 income            0
dtype: int64

In [8]:
X = df.drop(' income', axis=1)
y = df[' income']

In [9]:
X.shape

(32561, 14)

In [10]:
y.value_counts()

 <=50K    24720
 >50K      7841
Name:  income, dtype: int64

In [11]:
num_cols = X.select_dtypes(include=np.number)
num_cols

Unnamed: 0,age,fnlwgt,education-num,capital-gain,capital-loss,hours-per-week
0,39,77516,13,2174,0,40
1,50,83311,13,0,0,13
2,38,215646,9,0,0,40
3,53,234721,7,0,0,40
4,28,338409,13,0,0,40
...,...,...,...,...,...,...
32556,27,257302,12,0,0,38
32557,40,154374,9,0,0,40
32558,58,151910,9,0,0,40
32559,22,201490,9,0,0,20


In [12]:
num_cols = X.select_dtypes(include=np.number).columns
num_cols

Index(['age', ' fnlwgt', ' education-num', ' capital-gain', ' capital-loss',
       ' hours-per-week'],
      dtype='object')

In [13]:
cat_cols = X.select_dtypes(exclude=np.number)
cat_cols

Unnamed: 0,workclass,education,marital-status,occupation,relationship,race,sex,native-country
0,State-gov,Bachelors,Never-married,Adm-clerical,Not-in-family,White,Male,United-States
1,Self-emp-not-inc,Bachelors,Married-civ-spouse,Exec-managerial,Husband,White,Male,United-States
2,Private,HS-grad,Divorced,Handlers-cleaners,Not-in-family,White,Male,United-States
3,Private,11th,Married-civ-spouse,Handlers-cleaners,Husband,Black,Male,United-States
4,Private,Bachelors,Married-civ-spouse,Prof-specialty,Wife,Black,Female,Cuba
...,...,...,...,...,...,...,...,...
32556,Private,Assoc-acdm,Married-civ-spouse,Tech-support,Wife,White,Female,United-States
32557,Private,HS-grad,Married-civ-spouse,Machine-op-inspct,Husband,White,Male,United-States
32558,Private,HS-grad,Widowed,Adm-clerical,Unmarried,White,Female,United-States
32559,Private,HS-grad,Never-married,Adm-clerical,Own-child,White,Male,United-States


In [14]:
cat_cols = X.select_dtypes(exclude=np.number).columns
cat_cols

Index([' workclass', ' education', ' marital-status', ' occupation',
       ' relationship', ' race', ' sex', ' native-country'],
      dtype='object')

In [15]:
ct = ColumnTransformer([
                        ('rob', RobustScaler(), num_cols),
                        ('ohe', OneHotEncoder(sparse=False, handle_unknown='ignore'), cat_cols)
])

In [16]:
pipe = Pipeline([
                 ('ct_step', ct),
                 ('model', RandomForestClassifier(n_estimators=10, random_state=0))
])

##KFold

In [17]:
kf = KFold(n_splits=5, shuffle=False, random_state=None)

In [18]:
i = 1
for train, val in kf.split(X):
  print("iteration", i)
  print(train, "train fold:", len(train))
  print(val, "val fold:", len(val))
  print("........................")
  i += 1


iteration 1
[ 6513  6514  6515 ... 32558 32559 32560] train fold: 26048
[   0    1    2 ... 6510 6511 6512] val fold: 6513
........................
iteration 2
[    0     1     2 ... 32558 32559 32560] train fold: 26049
[ 6513  6514  6515 ... 13022 13023 13024] val fold: 6512
........................
iteration 3
[    0     1     2 ... 32558 32559 32560] train fold: 26049
[13025 13026 13027 ... 19534 19535 19536] val fold: 6512
........................
iteration 4
[    0     1     2 ... 32558 32559 32560] train fold: 26049
[19537 19538 19539 ... 26046 26047 26048] val fold: 6512
........................
iteration 5
[    0     1     2 ... 26046 26047 26048] train fold: 26049
[26049 26050 26051 ... 32558 32559 32560] val fold: 6512
........................


In [68]:
scores_kf = []
i = 1
for train, val in kf.split(X):
  pipe.fit(X.loc[train], y[train])
  sco = pipe.score(X.loc[val], y[val])
  scores_kf.append(sco)
  print("intiration", i)
  print("train len", len(train))
  print("val len", len(val))
  i += 1

intiration 1
train len 26048
val len 6513
intiration 2
train len 26049
val len 6512
intiration 3
train len 26049
val len 6512
intiration 4
train len 26049
val len 6512
intiration 5
train len 26049
val len 6512


In [69]:
scores_kf

[0.8481498541378781,
 0.8445945945945946,
 0.8468980343980343,
 0.8487407862407862,
 0.8493550368550369]

In [74]:
np.array(scores_kf)

array([0.84814985, 0.84459459, 0.84689803, 0.84874079, 0.84935504])

In [75]:
np.array(scores_kf).mean()

0.8475476612452659

In [76]:
np.array(scores_kf).std()

0.0016849574364709557

##Stratified KFold

In [24]:
y.value_counts()

 <=50K    24720
 >50K      7841
Name:  income, dtype: int64

In [25]:
print("each fold should have less than 50 K:", 24720/5)

each fold should have less than 50 K: 4944.0


In [26]:
print("each fold should have greater than 50 K:", 7841/5)

each fold should have greater than 50 K: 1568.2


In [27]:
skf = StratifiedKFold(n_splits=5, shuffle=False, random_state=None)

In [28]:
scores_skf = []
i = 1
for train, val in skf.split(X, y):
  pipe.fit(X.loc[train], y[train])
  sco = pipe.score(X.loc[val], y[val])
  scores_skf.append(sco)
  print("intiration", i)
  print("train len", len(train))
  print("val len", len(val))
  i += 1

intiration 1
train len 26048
val len 6513
intiration 2
train len 26049
val len 6512
intiration 3
train len 26049
val len 6512
intiration 4
train len 26049
val len 6512
intiration 5
train len 26049
val len 6512


In [29]:
scores_skf

[0.8460003070781514,
 0.8422911547911548,
 0.8445945945945946,
 0.847512285012285,
 0.8481265356265356]

In [30]:
np.array(scores_skf)

array([0.84600031, 0.84229115, 0.84459459, 0.84751229, 0.84812654])

In [31]:
np.array(scores_skf).mean()

0.8457049754205442

In [32]:
np.array(scores_skf).std()

0.0021026012198182417

In [33]:
i = 1
for train, val in skf.split(X, y):
  print("iteration", i)
  print(train, "train fold:", len(train))
  print(val, "val fold:", len(val))
  print("y train count", y[train].value_counts())
  print("y val count", y[val].value_counts())
  print("........................")
  i += 1

iteration 1
[ 6499  6500  6512 ... 32558 32559 32560] train fold: 26048
[   0    1    2 ... 6514 6515 6516] val fold: 6513
y train count  <=50K    19776
 >50K      6272
Name:  income, dtype: int64
y val count  <=50K    4944
 >50K     1569
Name:  income, dtype: int64
........................
iteration 2
[    0     1     2 ... 32558 32559 32560] train fold: 26049
[ 6499  6500  6512 ... 13121 13123 13125] val fold: 6512
y train count  <=50K    19776
 >50K      6273
Name:  income, dtype: int64
y val count  <=50K    4944
 >50K     1568
Name:  income, dtype: int64
........................
iteration 3
[    0     1     2 ... 32558 32559 32560] train fold: 26049
[12997 12999 13000 ... 19727 19729 19733] val fold: 6512
y train count  <=50K    19776
 >50K      6273
Name:  income, dtype: int64
y val count  <=50K    4944
 >50K     1568
Name:  income, dtype: int64
........................
iteration 4
[    0     1     2 ... 32558 32559 32560] train fold: 26049
[19482 19484 19485 ... 26161 26168 26170

##All in One KFOLD

In [38]:
start = time.time()
all_kfold = cross_val_score(estimator=pipe, X=X, y=y, scoring='accuracy', cv=5)
print("time taken:", time.time()-start)

time taken: 3.0432114601135254


In [39]:
all_kfold

array([0.84600031, 0.84229115, 0.84459459, 0.84751229, 0.84812654])

In [40]:
all_kfold.mean()

0.8457049754205442

In [41]:
all_kfold.std()

0.0021026012198182417

##LOO CV

In [88]:
start = time.time()
loocv = cross_val_score(estimator=pipe, X=X.head(1000), y=y.head(1000), scoring='accuracy', cv=LeaveOneOut())
print("time taken:", time.time()-start)

time taken: 41.26244306564331


In [89]:
loocv

array([1., 0., 1., 1., 1., 0., 1., 0., 0., 1., 1., 1., 1., 1., 0., 1., 1.,
       1., 1., 0., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1.,
       1., 1., 1., 1., 0., 1., 1., 0., 1., 1., 1., 0., 1., 1., 0., 1., 1.,
       1., 1., 1., 1., 0., 1., 1., 1., 0., 1., 1., 1., 1., 1., 1., 1., 0.,
       1., 1., 1., 1., 0., 1., 1., 1., 0., 1., 1., 1., 1., 1., 1., 1., 0.,
       1., 0., 1., 1., 0., 0., 1., 1., 1., 0., 1., 1., 0., 1., 1., 1., 1.,
       1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 0., 1., 1., 1., 0., 0., 0.,
       1., 1., 0., 1., 0., 1., 1., 1., 0., 1., 1., 1., 1., 0., 0., 1., 0.,
       1., 1., 1., 0., 1., 1., 1., 0., 1., 1., 1., 1., 1., 1., 1., 1., 1.,
       1., 0., 1., 1., 1., 1., 1., 1., 1., 1., 1., 0., 1., 1., 1., 1., 1.,
       1., 1., 0., 1., 0., 1., 1., 1., 1., 0., 1., 1., 1., 0., 0., 1., 1.,
       1., 1., 1., 1., 1., 0., 1., 0., 1., 1., 1., 1., 1., 1., 1., 1., 1.,
       1., 1., 1., 1., 1., 1., 1., 0., 1., 1., 0., 0., 1., 0., 1., 1., 1.,
       1., 1., 1., 1., 1.

In [90]:
loocv.mean()

0.839

In [91]:
loocv.std()

0.36753095107759287

##Repeated KFold

In [59]:
start = time.time()
repeated_kfold = cross_val_score(estimator=pipe, X=X, y=y, scoring='accuracy', cv=RepeatedKFold(n_splits=5, n_repeats=5))
print("time taken:", time.time()-start)

time taken: 14.802448987960815


In [60]:
repeated_kfold

array([0.83893751, 0.84029484, 0.84966216, 0.8544226 , 0.85119779,
       0.84600031, 0.8404484 , 0.84474816, 0.85288698, 0.85396192,
       0.84600031, 0.85104423, 0.84720516, 0.84474816, 0.84505528,
       0.83955167, 0.84136978, 0.84766585, 0.84321253, 0.85334767,
       0.84139413, 0.84674447, 0.85119779, 0.85027641, 0.84152334])

In [61]:
repeated_kfold.mean()

0.8465158975188914

In [62]:
repeated_kfold.std()

0.00479322570478419

##Repeated StratifiedKFold

In [63]:
start = time.time()
repeated_stratifiedKfold = cross_val_score(estimator=pipe, X=X, y=y, scoring='accuracy', cv=RepeatedStratifiedKFold(n_splits=5, n_repeats=5))
print("time taken:", time.time()-start)

time taken: 14.937236785888672


In [64]:
repeated_stratifiedKfold

array([0.85183479, 0.84382678, 0.85211916, 0.85027641, 0.84275184,
       0.84661446, 0.85089066, 0.84613022, 0.83845209, 0.84797297,
       0.84784278, 0.84428747, 0.84735872, 0.84720516, 0.85257985,
       0.85290957, 0.84490172, 0.84136978, 0.84843366, 0.85519042,
       0.84292953, 0.84490172, 0.84551597, 0.84812654, 0.84797297])

In [65]:
repeated_stratifiedKfold.mean()

0.8472958100053909

In [66]:
repeated_stratifiedKfold.std()

0.0039050076369137885

##Tally

In [92]:
#accuracy means of 5 folds
print("kfold:",np.array(scores_kf).mean())
print("stratified kfold:",np.array(scores_skf).mean())
print("All-in-one KFold:", all_kfold.mean())
print("LOO CV:", loocv.mean())
print("Repeated KFold:", repeated_kfold.mean())
print("Repeated Stratified KFold:", repeated_stratifiedKfold.mean())

kfold: 0.8475476612452659
stratified kfold: 0.8457049754205442
All-in-one KFold: 0.8457049754205442
LOO CV: 0.839
Repeated KFold: 0.8465158975188914
Repeated Stratified KFold: 0.8472958100053909
