## Initial imports and training

In [18]:
import pandas as pd
from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC
from sklearn.ensemble import RandomForestClassifier
from sklearn.datasets import load_digits
from sklearn.model_selection import train_test_split

In [19]:
# Load scikit-learn's bundled digits dataset and list the fields the returned
# object exposes, so we know which attributes (data, target, ...) are available.
digits = load_digits()
dir(digits)

['DESCR', 'data', 'feature_names', 'frame', 'images', 'target', 'target_names']

In [20]:
# Unpack the feature-matrix shape so both dimensions are named, then display them.
n_samples, n_features = digits.data.shape
(n_samples, n_features)

(1797, 64)

In [29]:
# Feature matrix and label vector for the digits dataset.
X = digits.data
y = digits.target

# Fix the seed so that Restart & Run All reproduces the same split and the same
# forest. A different SEED still gives different scores, which is why the
# cross-validation further down is a fairer comparison than one hold-out split.
SEED = 42

# Hold out 30% of the rows for testing.
X_train, X_test, y_train, y_test = train_test_split(
  X, y, test_size=0.3, random_state=SEED
)

lr = LogisticRegression(max_iter=200)
svm = SVC()
rf = RandomForestClassifier(random_state=SEED)

# Fit every model on the same training split and print its hold-out score,
# in the order: logistic regression, SVM, random forest.
for model in (lr, svm, rf):
  model.fit(X_train, y_train)
  print(model.score(X_test, y_test))

0.9722222222222222
0.9888888888888889
0.9833333333333333


In [22]:
# A single train_test_split gives a score that depends on which rows happen to land in the test set, so it changes from split to split.
# K-fold cross-validation avoids this: it divides the data into k folds and trains the model k times, each time using one fold as the test set and the remaining folds as training data.
# This gives k scores per model, which we can average for a fairer comparison of which model is better.

## Manual implementation

In [23]:
# Toy example: look at what KFold.split yields before using it on real data.
from sklearn.model_selection import KFold
kf = KFold(n_splits=3)

# Each iteration yields a (train, test) pair of index arrays: positions into the
# input, not the values themselves. The first array is always the training part
# and the second is the held-out test fold.
for train_idx, test_idx in kf.split([1, 2, 3, 4, 5, 6, 7, 8, 9]):
  print(train_idx, test_idx)


[3 4 5 6 7 8] [0 1 2]
[0 1 2 6 7 8] [3 4 5]
[0 1 2 3 4 5] [6 7 8]


### StratifiedKFold

In [24]:
# get_score method to calculate scores
def get_score(model, xtrain, xtest, ytrain, ytest):
  """Fit ``model`` on the training split and return its score on the test split.

  Args:
    model: Object exposing ``fit(X, y)`` and ``score(X, y)``. It is fitted in
      place, so any previously learned state is replaced.
    xtrain: Training features, passed to ``model.fit``.
    xtest: Test features, passed to ``model.score``.
    ytrain: Training labels, passed to ``model.fit``.
    ytest: Test labels, passed to ``model.score``.

  Returns:
    Whatever ``model.score(xtest, ytest)`` returns.
  """
  model.fit(xtrain, ytrain)
  test_score = model.score(xtest, ytest)
  return test_score

In [30]:
# For the digits dataset we use StratifiedKFold: every fold keeps roughly the
# same class proportions as the full dataset.
from sklearn.model_selection import StratifiedKFold

# Per-fold scores for each model, filled in by the loop below.
lr_arr = []
svm_arr = []
rf_arr = []

kff = StratifiedKFold(n_splits=5)

# split() only yields row indices; index into the arrays to get each fold's data.
# Fold-specific names are used so the hold-out split created earlier
# (X_train / X_test / y_train / y_test) is not silently overwritten.
for train_idx, test_idx in kff.split(digits.data, digits.target):
  X_fold_train, X_fold_test = digits.data[train_idx], digits.data[test_idx]
  y_fold_train, y_fold_test = digits.target[train_idx], digits.target[test_idx]

  # Same fold for every model, evaluated in the order lr, svm, rf.
  for fold_scores, model in ((lr_arr, lr), (svm_arr, svm), (rf_arr, rf)):
    fold_scores.append(
      get_score(model, X_fold_train, X_fold_test, y_fold_train, y_fold_test)
    )

print(lr_arr)
print(svm_arr)
print(rf_arr)

[0.9222222222222223, 0.8722222222222222, 0.9415041782729805, 0.9415041782729805, 0.8969359331476323]
[0.9611111111111111, 0.9444444444444444, 0.9832869080779945, 0.9888579387186629, 0.9387186629526463]
[0.9388888888888889, 0.9166666666666666, 0.9693593314763231, 0.9665738161559888, 0.9331476323119777]


### cross_val_score method for everything

In [32]:
# cross_val_score wraps the whole manual loop in a single call: it creates the
# folds, fits the model on each training part, scores it on the matching test
# fold, and returns the per-fold scores as an array.
from sklearn.model_selection import cross_val_score
cross_val_score(estimator=lr, X=digits.data, y=digits.target)

array([0.92222222, 0.87222222, 0.94150418, 0.94150418, 0.89693593])

In [33]:
# Per-fold cross-validation scores for the SVM.
cross_val_score(estimator=svm, X=digits.data, y=digits.target)

array([0.96111111, 0.94444444, 0.98328691, 0.98885794, 0.93871866])

In [34]:
# Per-fold cross-validation scores for the random forest.
cross_val_score(estimator=rf, X=digits.data, y=digits.target)

array([0.93333333, 0.92222222, 0.96100279, 0.9637883 , 0.93036212])