# KFold Cross validation and StratifiedKFold validation

In [1]:
import numpy as np
from sklearn.datasets import load_digits
from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC
from sklearn.ensemble import RandomForestClassifier

In [2]:
digit=load_digits()

In [3]:
print(dir(digit))

['DESCR', 'data', 'feature_names', 'frame', 'images', 'target', 'target_names']


# train_test_split

In [4]:
from sklearn.model_selection import train_test_split

In [35]:
# Hold out 30% of the digits for testing.
# random_state is fixed so the split -- and every score derived from it --
# is reproducible across kernel restarts and re-runs of this cell.
X_train,X_test,y_train,y_test=train_test_split(digit.data,digit.target,test_size=0.3,random_state=42)

### Logistic Regression

In [11]:
lr = LogisticRegression(solver='liblinear')
lr.fit(X_train,y_train)
lr.score(X_train,y_train)*100

99.68178202068417

In [12]:
lr.score(X_test,y_test)*100

96.29629629629629

## Support vector Machine

In [13]:
svm=SVC()
svm.fit(X_train,y_train)
svm.score(X_train,y_train)*100

99.36356404136833

## RandomForest Classifier

In [14]:
rf=RandomForestClassifier()
rf.fit(X_train,y_train)
rf.score(X_train,y_train)*100

100.0

# K fold

In [15]:
#Provides train/test indices to split data in train/test sets.
from sklearn.model_selection import KFold

In [17]:
# KFold defaults to n_splits=5; 2 folds are used here so the split is easy to inspect.
kf = KFold(n_splits=2)

In [18]:
kf

KFold(n_splits=2, random_state=None, shuffle=False)

In [19]:
len([8,12,22,36,4,85,68,97,8,9])

10

In [20]:
for indices in kf.split([8,12,22,36,4,85,68,97,8,9]):
    print(indices)

(array([5, 6, 7, 8, 9]), array([0, 1, 2, 3, 4]))
(array([0, 1, 2, 3, 4]), array([5, 6, 7, 8, 9]))


In [21]:
for train_index,test_index in kf.split([8,12,22,36,4,85,68,97,8,9]):
    print('Train index:',train_index,'Test index:',test_index)

Train index: [5 6 7 8 9] Test index: [0 1 2 3 4]
Train index: [0 1 2 3 4] Test index: [5 6 7 8 9]


In [22]:
kf2 = KFold(n_splits=5)
kf2

KFold(n_splits=5, random_state=None, shuffle=False)

In [23]:
for indices in kf2.split([8,12,22,36,4,85,68,97,8,9]):
    print(indices)

(array([2, 3, 4, 5, 6, 7, 8, 9]), array([0, 1]))
(array([0, 1, 4, 5, 6, 7, 8, 9]), array([2, 3]))
(array([0, 1, 2, 3, 6, 7, 8, 9]), array([4, 5]))
(array([0, 1, 2, 3, 4, 5, 8, 9]), array([6, 7]))
(array([0, 1, 2, 3, 4, 5, 6, 7]), array([8, 9]))


In [30]:
import numpy as np
a = np.array([8,12,22,36,4,85,68,97,8,9])
# Split the array itself (rather than a second copy of the literal) so the
# indices are guaranteed to refer to the same data they are used to index.
for train_index,test_index in kf2.split(a):
    # Fancy-index with each fold's indices to show the actual values selected.
    print(a[train_index],a[test_index])

[22 36  4 85 68 97  8  9] [ 8 12]
[ 8 12  4 85 68 97  8  9] [22 36]
[ 8 12 22 36 68 97  8  9] [ 4 85]
[ 8 12 22 36  4 85  8  9] [68 97]
[ 8 12 22 36  4 85 68 97] [8 9]


In [28]:
# How is the size of each test fold decided?
# test fold size = total number of elements / number of folds
# (when this does not divide evenly, scikit-learn makes the first
#  n_samples % n_splits folds one element larger than the rest)
len(a)/5

2.0

## To compare models side by side, let's create a helper function that trains a model and returns its test score

In [31]:
def get_model(model,X_train,X_test,y_train,y_test):
    """Fit ``model`` on the training split and return its test score as a percentage.

    Despite the name, the return value is a number, not the model:
    ``model.fit`` is called on the object passed in, then the result of
    ``model.score(X_test, y_test)`` is multiplied by 100 and returned.
    """
    model.fit(X_train, y_train)
    test_score = model.score(X_test, y_test)
    return test_score * 100

In [32]:
#calculate testing data accuracy for random forest
get_model(rf,X_train,X_test,y_train,y_test)

96.11111111111111

In [33]:
#calculate Testing accuracy for SVM
get_model(svm,X_train,X_test,y_train,y_test)

98.51851851851852

In [36]:
# Refit each classifier on the current training split and report its test accuracy.
# Order matters only for the printed layout: logistic regression, SVM, random forest.
for label, estimator in (
    ('Testing data accuracy for Logistic reg:', lr),
    ('Testing data accuracy for SVM:', svm),
    ('Testing data accuracy for Random Forest:', rf),
):
    print(label, get_model(estimator, X_train, X_test, y_train, y_test))

Testing data accuracy for Logistic reg: 97.22222222222221
Testing data accuracy for SVM: 98.88888888888889
Testing data accuracy for Random Forest: 97.77777777777777


### Now apply the same method using StratifiedKFold

In [37]:
from sklearn.model_selection import StratifiedKFold

In [38]:
StratifiedKFold()

StratifiedKFold(n_splits=5, random_state=None, shuffle=False)

In [39]:
X = np.array([[1, 2], [3, 4], [1, 2], [3, 4]])
y = np.array([0, 0, 1, 1])

In [40]:
skf = StratifiedKFold(n_splits=2)

In [42]:
#Generate indices to split data into training and test set.
list(skf.split(X,y))

[(array([1, 3]), array([0, 2])), (array([0, 2]), array([1, 3]))]

In [44]:
#lets apply kfold
list(kf.split(X))

[(array([2, 3]), array([0, 1])), (array([0, 1]), array([2, 3]))]

In [45]:
list(skf.split([8,12,22,36,4,85,68,97,8,9],[0,0,0,1,1,1,1,1,1,1]))

[(array([2, 6, 7, 8, 9]), array([0, 1, 3, 4, 5])),
 (array([0, 1, 3, 4, 5]), array([2, 6, 7, 8, 9]))]

In [46]:
# Walk through every StratifiedKFold split of the toy X/y arrays.
# NOTE: this rebinds the notebook-level X_train/X_test to rows of the toy X.
for train_index,test_index in skf.split(X,y):
    print('train_index:',train_index,'test index:',test_index)
    # Select the rows inside the loop so each fold is materialised,
    # not only the indices left over after the final iteration.
    X_train, X_test = X[train_index], X[test_index]

train_index: [1 3] test index: [0 2]
train_index: [0 2] test index: [1 3]


In [47]:
# Same walk-through with plain KFold; y is passed but KFold's output above
# shows purely positional splits ([2 3]/[0 1], then [0 1]/[2 3]).
# NOTE: this rebinds the notebook-level X_train/X_test to rows of the toy X.
for train_index,test_index in kf.split(X,y):
    print('train_index:',train_index,'test index:',test_index)
    # Select the rows inside the loop so each fold is materialised,
    # not only the indices left over after the final iteration.
    X_train, X_test = X[train_index], X[test_index]

train_index: [2 3] test index: [0 1]
train_index: [0 1] test index: [2 3]


## Combine all of the above into one block

In [48]:
kf

KFold(n_splits=2, random_state=None, shuffle=False)

In [49]:
for train_index,test_index in kf.split(digit.data):
    print(train_index,test_index)

[ 899  900  901  902  903  904  905  906  907  908  909  910  911  912
  913  914  915  916  917  918  919  920  921  922  923  924  925  926
  927  928  929  930  931  932  933  934  935  936  937  938  939  940
  941  942  943  944  945  946  947  948  949  950  951  952  953  954
  955  956  957  958  959  960  961  962  963  964  965  966  967  968
  969  970  971  972  973  974  975  976  977  978  979  980  981  982
  983  984  985  986  987  988  989  990  991  992  993  994  995  996
  997  998  999 1000 1001 1002 1003 1004 1005 1006 1007 1008 1009 1010
 1011 1012 1013 1014 1015 1016 1017 1018 1019 1020 1021 1022 1023 1024
 1025 1026 1027 1028 1029 1030 1031 1032 1033 1034 1035 1036 1037 1038
 1039 1040 1041 1042 1043 1044 1045 1046 1047 1048 1049 1050 1051 1052
 1053 1054 1055 1056 1057 1058 1059 1060 1061 1062 1063 1064 1065 1066
 1067 1068 1069 1070 1071 1072 1073 1074 1075 1076 1077 1078 1079 1080
 1081 1082 1083 1084 1085 1086 1087 1088 1089 1090 1091 1092 1093 1094
 1095 

In [None]:
kf

In [50]:
# Per-fold test accuracy (in %) for each classifier, using plain KFold.
scoreof_Logistic, scoreof_SVM, scoreof_rf = [], [], []

for train_index, test_index in kf.split(digit.data):
    # Slice features and labels with the same indices so rows stay aligned.
    X_train, X_test = digit.data[train_index], digit.data[test_index]
    y_train, y_test = digit.target[train_index], digit.target[test_index]
    fold = (X_train, X_test, y_train, y_test)

    # A new estimator is constructed for every fold, so each score comes
    # from a model trained only on that fold's training rows.
    scoreof_Logistic.append(get_model(LogisticRegression(solver='liblinear',multi_class='ovr'), *fold))
    scoreof_SVM.append(get_model(SVC(kernel='linear'), *fold))
    scoreof_rf.append(get_model(RandomForestClassifier(n_estimators=100), *fold))

In [51]:
scoreof_Logistic

[88.54282536151278, 91.64810690423162]

In [52]:
# Average accuracy of logistic regression across the folds.
# Divide by the number of folds actually scored instead of a hard-coded 2,
# so the result stays correct if n_splits changes.
sum(scoreof_Logistic)/len(scoreof_Logistic)

90.0954661328722

In [53]:
import numpy as np
np.mean(scoreof_Logistic)

90.0954661328722

In [54]:
scoreof_SVM

[92.99221357063404, 94.43207126948775]

In [55]:
scoreof_rf

[92.76974416017798, 92.98440979955457]

## Apply stratified kfold

In [56]:
skf

StratifiedKFold(n_splits=2, random_state=None, shuffle=False)

In [57]:
# Per-fold test accuracy (in %) for each classifier, using StratifiedKFold.
# The lists are reset here, replacing the plain-KFold results stored under the same names.
scoreof_Logistic, scoreof_SVM, scoreof_rf = [], [], []

# StratifiedKFold needs the labels as well as the features to build its splits.
for train_index, test_index in skf.split(digit.data, digit.target):
    X_train, X_test = digit.data[train_index], digit.data[test_index]
    y_train, y_test = digit.target[train_index], digit.target[test_index]
    fold = (X_train, X_test, y_train, y_test)

    # A new estimator is constructed for every fold, so each score comes
    # from a model trained only on that fold's training rows.
    scoreof_Logistic.append(get_model(LogisticRegression(solver='liblinear',multi_class='ovr'), *fold))
    scoreof_SVM.append(get_model(SVC(kernel='linear'), *fold))
    scoreof_rf.append(get_model(RandomForestClassifier(n_estimators=100), *fold))

In [61]:
# Refits the shared `rf` instance on whatever X_train/y_train currently hold --
# here, the last StratifiedKFold fold left over from the loop above -- and scores it
# on that fold's test rows. This is a single-fold number, not a cross-validated average.
get_model(rf,X_train,X_test,y_train,y_test)

93.54120267260579

In [58]:
scoreof_Logistic

[88.76529477196885, 91.87082405345211]

In [59]:
scoreof_rf

[92.21357063403782, 93.20712694877506]

In [60]:
scoreof_SVM

[93.32591768631812, 94.76614699331849]

## Magic of ML starts from here :-)

In [62]:
from sklearn.model_selection import cross_val_score

In [63]:
# cross_val_score needs at least an estimator and X. Calling it with no
# arguments raises TypeError; catch and print it so the demonstration does
# not abort a Restart & Run All of the notebook.
try:
    cross_val_score()
except TypeError as err:
    print('TypeError:', err)

TypeError: cross_val_score() missing 2 required positional arguments: 'estimator' and 'X'

In [64]:
#it splits data into 5 folds by default
cross_val_score(lr,digit.data,digit.target)*100

array([92.22222222, 88.33333333, 95.26462396, 95.82172702, 89.41504178])

In [65]:
cross_val_score(lr,digit.data,digit.target,cv=4)*100

array([93.77777778, 91.09131403, 95.76837416, 90.42316258])

In [66]:
cross_val_score(svm,digit.data,digit.target,cv=3)*100

array([96.49415693, 97.9966611 , 96.49415693])

In [67]:
cross_val_score(rf,digit.data,digit.target)*100

array([91.66666667, 91.38888889, 96.37883008, 96.93593315, 93.59331476])

In [68]:
(cross_val_score(rf,digit.data,digit.target,cv=5)*100).mean()

93.60244506344785

In [70]:
digit.data.shape

(1797, 64)

In [72]:
1797/10

179.7

In [None]:
Data wrangling/munging
Feature engineering
Feature selection
Feature extraction

Assumptions of each algorithm
Accuracy measures/Performance metrics of each algo.
Advantages/Disadvantages of each model
Cost function of each algorithm

Hypothesis testing
AB testing
P value: can we change p value?
Type I / Type II error
Bias-variance tradeoff
Overfitting , underfitting
Undersampling, over sampling
AUC curve, ROC curve

How to improve accuracy of a model?
When to select which algorithm??