In [1]:
import pandas as pd
import numpy as np
import seaborn as sns
from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC
from sklearn.ensemble import RandomForestClassifier
from sklearn.datasets import load_digits

In [2]:
digit = load_digits()

In [3]:
from sklearn.model_selection import train_test_split
# Hold out 20% of the digits as a test set. No random_state is passed,
# so the split is not pinned to a fixed seed.
X_train, X_test, y_train, y_test = train_test_split(
    digit.data,
    digit.target,
    test_size=0.2,
)

### Logistic Regression

In [7]:
# Fit on the training split (fit returns the estimator itself),
# then display the score on the held-out test split.
lr_model = LogisticRegression(max_iter=10000).fit(X_train, y_train)
lr_model.score(X_test, y_test)

0.9638888888888889

### SVM

In [8]:
# Fit a default SVC on the training split (fit returns the estimator itself),
# then display the score on the held-out test split.
svm_model = SVC().fit(X_train, y_train)
svm_model.score(X_test, y_test)

0.9888888888888889

### Random Forest

In [13]:
# Fit a default random forest on the training split (fit returns the estimator
# itself), then display the score on the held-out test split.
rf_model = RandomForestClassifier().fit(X_train, y_train)
rf_model.score(X_test, y_test)

0.9833333333333333

Each time we re-execute `train_test_split`, the samples that end up in the train and test sets change, so the score of each model changes from run to run.

Let's try K-Fold cross-validation instead.

In [14]:
from sklearn.model_selection import KFold

In [15]:
# Build a 3-fold splitter; echoing it displays the remaining settings
# (the repr shown in the output reports random_state=None, shuffle=False).
kf = KFold(n_splits=3)
kf

KFold(n_splits=3, random_state=None, shuffle=False)

In [17]:
# Toy example: print the train/test index arrays that `kf` yields
# for a 9-item list, one line per fold.
for train_index, test_index in kf.split([1, 2, 3, 4, 5, 6, 7, 8, 9]):
    print(train_index, test_index)

[3 4 5 6 7 8] [0 1 2]
[0 1 2 6 7 8] [3 4 5]
[0 1 2 3 4 5] [6 7 8]


In [35]:
# Re-create the 3-fold splitter and repeat the toy demonstration:
# print the train/test index arrays for a 9-item list, one line per fold.
kf = KFold(n_splits=3)
for train_index, test_index in kf.split([1, 2, 3, 4, 5, 6, 7, 8, 9]):
    print(train_index, test_index)

[3 4 5 6 7 8] [0 1 2]
[0 1 2 6 7 8] [3 4 5]
[0 1 2 3 4 5] [6 7 8]


In [36]:
def get_score(model, X_train, X_test, y_train, y_test):
    """Fit ``model`` on the training data and return its score on the test data.

    Parameters
    ----------
    model
        Object exposing ``fit(X, y)`` and ``score(X, y)``. It is fitted in place.
    X_train, y_train
        Passed to ``model.fit``.
    X_test, y_test
        Passed to ``model.score``.

    Returns
    -------
    Whatever ``model.score(X_test, y_test)`` returns.
    """
    model.fit(X_train, y_train)
    test_score = model.score(X_test, y_test)
    return test_score

Using the above models in the function

In [23]:
get_score(LogisticRegression(max_iter=10000),X_train,X_test,y_train,y_test)

0.9638888888888889

In [24]:
get_score(SVC(),X_train,X_test,y_train,y_test)

0.9888888888888889

In [25]:
get_score(RandomForestClassifier(),X_train,X_test,y_train,y_test)

0.975

Using KFold

In [26]:
from sklearn.model_selection import StratifiedKFold

In [27]:
# Stratified 3-fold splitter.
# NOTE(review): no cell visible in this notebook references `folds`;
# the per-fold scoring loop iterates the plain KFold splitter `kf` instead.
folds = StratifiedKFold(n_splits=3)

In [37]:
# Collect one score per fold for each model, using the splits produced by `kf`.
# NOTE(review): the StratifiedKFold instance `folds` created in an earlier cell
# is not used here; this loop iterates the plain KFold splitter `kf`.
lr_score = []
svc_score = []
rf_score = []
for train_index, test_index in kf.split(digit.data):
    # Use fold-local names. Rebinding X_train/X_test/y_train/y_test here would
    # overwrite the hold-out split made by train_test_split, so re-running the
    # earlier cells afterwards would silently evaluate on the last fold instead.
    X_fold_train, X_fold_test = digit.data[train_index], digit.data[test_index]
    y_fold_train, y_fold_test = digit.target[train_index], digit.target[test_index]
    lr_score.append(get_score(LogisticRegression(max_iter=10000), X_fold_train, X_fold_test, y_fold_train, y_fold_test))
    svc_score.append(get_score(SVC(), X_fold_train, X_fold_test, y_fold_train, y_fold_test))
    rf_score.append(get_score(RandomForestClassifier(), X_fold_train, X_fold_test, y_fold_train, y_fold_test))
    

In [38]:
lr_score

[0.9282136894824707, 0.9415692821368948, 0.9165275459098498]

In [39]:
svc_score

[0.9666110183639399, 0.9816360601001669, 0.9549248747913188]

In [40]:
rf_score

[0.9449081803005008, 0.9465776293823038, 0.9265442404006677]

After that, we can take the average of each list to get the final score for each model.

We can also do all of the above using **`cross_val_score`**.

In [41]:
from sklearn.model_selection import cross_val_score

In [51]:
# Cross-validated scores for logistic regression, one value per fold;
# no cv argument is passed, so the library default number of folds applies.
lr = cross_val_score(
    LogisticRegression(max_iter=10000),
    digit.data,
    digit.target,
)
lr

array([0.925     , 0.87777778, 0.93871866, 0.93314763, 0.89693593])

In [52]:
np.mean(lr)

0.9143160012380068

In [47]:
# Cross-validated scores for a default SVC, one value per fold;
# no cv argument is passed, so the library default number of folds applies.
svc = cross_val_score(
    SVC(),
    digit.data,
    digit.target,
)
svc

array([0.96111111, 0.94444444, 0.98328691, 0.98885794, 0.93871866])

In [50]:
np.mean(svc)

0.9632838130609718

In [48]:
# Cross-validated scores for a default random forest, one value per fold;
# no cv argument is passed, so the library default number of folds applies.
rf = cross_val_score(
    RandomForestClassifier(),
    digit.data,
    digit.target,
)
rf

array([0.93888889, 0.90555556, 0.96100279, 0.96935933, 0.91086351])

In [49]:
np.mean(rf)

0.9371340142370783