In [35]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
%matplotlib inline
from sklearn.datasets import load_digits
digits = load_digits()

In [36]:
dir(digits)

['DESCR', 'data', 'feature_names', 'frame', 'images', 'target', 'target_names']

In [37]:
X = digits.data
y = digits.target

In [38]:
X.shape

(1797, 64)

In [39]:
y.shape

(1797,)

In [40]:
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3)

**Logistic Regression**

In [41]:
from sklearn.linear_model import LogisticRegression
lr = LogisticRegression(solver='liblinear', multi_class='ovr')
lr.fit(X_train, y_train)
lr.score(X_test, y_test)

0.9611111111111111

**SVM**

In [42]:
from sklearn.svm import SVC
svm = SVC(gamma='auto')
svm.fit(X_train, y_train)
svm.score(X_test, y_test)

0.36666666666666664

**Random Forest**

In [43]:
from sklearn.ensemble import RandomForestClassifier
rf = RandomForestClassifier(n_estimators=40)
rf.fit(X_train, y_train)
rf.score(X_test, y_test)

0.9796296296296296

In [44]:
# by dividing the data by using train_test_split it is not the best way because,
# if training set contain all the questions related to algebra and test set contain question related,
# to calculus then our model will not perform well, 
# so by using cross validation we can avoid this problem

In [45]:
# in kfold cross validation we split the total data into folds eg take 5
# in 1st model take svm 1st fold is test set and remaninig 2, 3, 4, 5 folds are training set
# then we train the model(svm1) on training set and test the testset and will note the score of model1
# in 2nd model 2nd fold is test set and remaining 1, 3, 4, 5 flods are training set,
# with this data we train the 2nd model(svm2) and will note the score.
# we do this for 5 times and create 5 svm models, svm1, svm2,..,svm5,
# now we average the scores of all 5 svms,
# now we will note the average score for svm
# like this we calculate the scores for all models (DecisionT, LogisticRegression, RandomForest, ....)
# finally we take the best model with good average score

In [46]:
# by using cross validation we can choose the best model eg(svm, or logisticregression or .....)

<h2 style='color:purple'>KFold cross validation</h2>

**Use KFold for our digits example**

In [47]:
def get_score(model, X_train, X_test, y_train, y_test):
    model.fit(X_train, y_train)
    return model.score(X_test, y_test)

In [48]:
get_score(SVC(gamma='auto'), X_train, X_test, y_train, y_test)

0.36666666666666664

In [49]:
pd.Series(y).value_counts() # from 0-9 classification total 10 classes

3    183
1    182
5    182
4    181
6    181
9    180
7    179
0    178
2    177
8    174
dtype: int64

In [50]:
from sklearn.model_selection import KFold
# n_splits is no of folds
kf = KFold(n_splits=3)
kf

KFold(n_splits=3, random_state=None, shuffle=False)

In [51]:
# kf.split([1,2,3,4,5,6,7,8,9]) will split into 3 folds one fold for testing and 2 folds for training
for train_index, test_index in kf.split([1,2,3,4,5,6,7,8,9]):
    print(train_index, test_index)
# in 1st fold test index is 0, 1, 2 remaining indexs are train indexes
# in 2nd fold test index is 3, 4, 5 remaining indexs are train indexes
# in 3rd fold test index is 6, 7, 8 remaining indexs are train indexes

[3 4 5 6 7 8] [0 1 2]
[0 1 2 6 7 8] [3 4 5]
[0 1 2 3 4 5] [6 7 8]


In [52]:
scores_logistic = []
scores_svm = []
scores_rf = []

# for kf no need y(target) because it will go in sequence of test data
for train_index, test_index in kf.split(X):
    X_train, X_test, y_train, y_test = X[train_index], X[test_index], \
                                       y[train_index], y[test_index]
    
    scores_logistic.append(get_score(LogisticRegression(solver='liblinear', multi_class='ovr'),
                                     X_train, X_test, y_train, y_test))  
    scores_svm.append(get_score(SVC(gamma='auto'), 
                                X_train, X_test, y_train, y_test))
    scores_rf.append(get_score(RandomForestClassifier(n_estimators=40),
                               X_train, X_test, y_train, y_test))

In [53]:
print(scores_logistic)
print(scores_svm)
print(scores_rf)

[0.8964941569282137, 0.9515859766277128, 0.9115191986644408]
[0.41068447412353926, 0.41569282136894825, 0.4273789649415693]
[0.9365609348914858, 0.9482470784641068, 0.9115191986644408]


In [54]:
print(np.mean(np.array(scores_logistic)))
print(np.mean(np.array(scores_svm)))
print(np.mean(np.array(scores_rf)))

0.9198664440734557
0.41791875347801893
0.9321090706733445


In [55]:
# StratifiedKFold is similar to KFold it is better when seperating folds based on target categories,
# it will create folds on each classification(target) category in an uniform way

In [56]:
from sklearn.model_selection import StratifiedKFold
folds = StratifiedKFold(n_splits=3)

# we are creating three arrays for three models
scores_logistic = []
scores_svm = []
scores_rf = []

# for dividing folds uniformly based on target variable we need to supply target variable also(y)
# but for kfold no need to supply target variable(y) just X is enough
# but in skf both X and y are needed
for train_index, test_index in folds.split(X, y):
    X_train, X_test, y_train, y_test = X[train_index], X[test_index], \
                                       y[train_index], y[test_index]
    
    # for every training and testing folds classifiers get traind 
    # as we did 3folds 3 scores are created for each classifier
    scores_logistic.append(get_score(LogisticRegression(solver='liblinear', multi_class='ovr'),
                                     X_train, X_test, y_train, y_test))  
    scores_svm.append(get_score(SVC(gamma='auto'), 
                                X_train, X_test, y_train, y_test))
    scores_rf.append(get_score(RandomForestClassifier(n_estimators=40),
                               X_train, X_test, y_train, y_test))

In [57]:
print(np.mean(np.array(scores_logistic)))
print(np.mean(np.array(scores_svm)))
print(np.mean(np.array(scores_rf)))

0.9193099610461881
0.4346132442960489
0.9354479688369505


In [58]:
print(scores_logistic)
print(scores_svm)
print(scores_rf)

[0.8948247078464107, 0.9532554257095158, 0.9098497495826378]
[0.3806343906510851, 0.41068447412353926, 0.5125208681135225]
[0.9298831385642737, 0.9532554257095158, 0.9232053422370617]


<h2 style='color:purple'>cross_val_score function</h2>

In [59]:
from sklearn.model_selection import cross_val_score

**Logistic regression model performance using cross_val_score**

In [60]:
# here cv is no of folds
# if cv=3 means there are 3 folds which means 3 test datas which means 3 models which means 3 scores
# the below code will do the same thing as stratifiedkfold for loop
print(scores_logistic)
cross_val_score(LogisticRegression(solver='liblinear', multi_class='ovr'), X, y, cv=3)
# from below outputs we can observe that both outputs are nearly same

[0.8948247078464107, 0.9532554257095158, 0.9098497495826378]


array([0.89482471, 0.95325543, 0.90984975])

**svm model performance using cross_val_score**

In [61]:
print(scores_svm)
cross_val_score(SVC(gamma='auto'), X, y, cv=3)

[0.3806343906510851, 0.41068447412353926, 0.5125208681135225]


array([0.38063439, 0.41068447, 0.51252087])

**random forest performance using cross_val_score**

In [62]:
print(scores_rf)
cross_val_score(RandomForestClassifier(n_estimators=40), X, y, cv=3)

[0.9298831385642737, 0.9532554257095158, 0.9232053422370617]


array([0.92153589, 0.94490818, 0.92153589])

In [None]:
# cross_val_score uses stratifiedkfold as default

<h2 style='color:purple'>Parameter tunning using k fold cross validation</h2>

In [None]:
# we can also do parameter tuning with cross_val_score
# instead of using different classifiers we use the same classifier with different parameters,
# which is parameter tuning

In [63]:
cross_val_score(RandomForestClassifier(n_estimators=5), X, y, cv=10)

array([0.87222222, 0.91111111, 0.89444444, 0.77777778, 0.88333333,
       0.9       , 0.92222222, 0.93854749, 0.79888268, 0.8547486 ])

In [64]:
np.mean(cross_val_score(RandomForestClassifier(n_estimators=5), X, y, cv=10))

0.8686871508379888

In [65]:
np.mean(cross_val_score(RandomForestClassifier(n_estimators=20), X, y, cv=10))

0.9365704531346989

In [66]:
np.mean(cross_val_score(RandomForestClassifier(n_estimators=30), X, y, cv=10))

0.9382371198013656

In [67]:
np.mean(cross_val_score(RandomForestClassifier(n_estimators=40), X, y, cv=10))

0.942675356921167

Here we used cross_val_score to
fine tune our random forest classifier and figured that having around 40 trees in random forest gives best result. 