## KFold Cross Validation method

In [3]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from sklearn.svm import SVC
from sklearn.tree import DecisionTreeClassifier
from sklearn.datasets import load_iris

In [14]:
iris = load_iris()

In [15]:
df = pd.DataFrame(iris.data, columns=iris.feature_names)
df.head()

Unnamed: 0,sepal length (cm),sepal width (cm),petal length (cm),petal width (cm)
0,5.1,3.5,1.4,0.2
1,4.9,3.0,1.4,0.2
2,4.7,3.2,1.3,0.2
3,4.6,3.1,1.5,0.2
4,5.0,3.6,1.4,0.2


In [16]:
df['target'] = iris.target
df.head()

Unnamed: 0,sepal length (cm),sepal width (cm),petal length (cm),petal width (cm),target
0,5.1,3.5,1.4,0.2,0
1,4.9,3.0,1.4,0.2,0
2,4.7,3.2,1.3,0.2,0
3,4.6,3.1,1.5,0.2,0
4,5.0,3.6,1.4,0.2,0


In [19]:
x = df.drop(['target'], axis='columns')
y = df.target

In [28]:
# train test split 
x_train, x_test, y_train, y_test = train_test_split(x, y, test_size=0.2)

In [None]:
lr = LogisticRegression()
lr.fit(x_train, y_train)
lr.score(x_test, y_test)

In [30]:
rf = RandomForestClassifier()
rf.fit(x_train, y_train)
rf.score(x_test, y_test)

0.9333333333333333

In [31]:
dt = DecisionTreeClassifier()
dt.fit(x_train, y_train)
dt.score(x_test, y_test)

0.9666666666666667

In [32]:
sv = SVC()
sv.fit(x_train, y_train)
sv.score(x_test, y_test)

0.9666666666666667

#### KFold

In [23]:
from sklearn.model_selection import KFold
kf = KFold(n_splits=3)

In [25]:
# example for kflod split
for train_index, test_index in kf.split([1,2,3,4,5,6,7,8,9,10]):
    print (train_index, test_index)

[4 5 6 7 8 9] [0 1 2 3]
[0 1 2 3 7 8 9] [4 5 6]
[0 1 2 3 4 5 6] [7 8 9]


_use KFold for iris dataset_

In [33]:
# method to return score of a model
def get_score (model, x_train, x_test, y_train, y_test):
    model.fit(x_train, y_train)
    return model.score(x_test, y_test)

In [None]:
from sklearn.model_selection import StratifiedKFold
folds = StratifiedKFold(n_splits=3)

# to store scores of each model
score_lr=[]  # scores of LogisticRegression
score_rf=[]
score_svm=[]
score_dt=[]

for train_index, test_index in folds.split(iris.data, iris.target):
    x_train, x_test, y_train, y_test = iris.data[train_index], iris.data[test_index] ,iris.target[train_index] ,\
                                        iris.target[test_index]
    
    score_lr.append(get_score(LogisticRegression(),x_train, x_test, y_train, y_test))
    score_rf.append(get_score(RandomForestClassifier(n_estimators=30),x_train, x_test, y_train, y_test))
    score_svm.append(get_score(SVC(),x_train, x_test, y_train, y_test))
    score_dt.append(get_score(DecisionTreeClassifier(),x_train, x_test, y_train, y_test))


In [43]:
score_lr

[0.98, 0.96, 0.98]

In [44]:
score_rf

[0.98, 0.94, 0.94]

In [45]:
score_svm

[0.96, 0.98, 0.94]

In [46]:
score_dt

[0.98, 0.92, 1.0]

### cross_val_score func

In [47]:
from sklearn.model_selection import cross_val_score

In [None]:
cross_val_score(LogisticRegression(), x, y, cv=3)

In [74]:
cross_val_score(RandomForestClassifier(n_estimators=40), x, y, cv=3)

array([0.98, 0.94, 0.96])

In [75]:
cross_val_score(DecisionTreeClassifier(), x, y, cv=3)

array([0.98, 0.92, 1.  ])

In [76]:
cross_val_score(SVC(), x, y, cv=3)

array([0.96, 0.98, 0.94])

_Parameter tunning using cross_val_score_

In [78]:
cross_val_score(RandomForestClassifier(n_estimators=40), x, y, cv=3)

array([0.98, 0.94, 0.94])

In [81]:
cross_val_score(RandomForestClassifier(n_estimators=80), x, y, cv=3)

array([0.98, 0.94, 0.98])

In [84]:
cross_val_score(RandomForestClassifier(n_estimators=100), x, y, cv=3)

array([0.98, 0.94, 0.98])