In [1]:
# 2023 OCT 15

In [2]:
import sklearn
import numpy as np
import pandas as pd

In [3]:
# import modules
from sklearn.tree import DecisionTreeClassifier
from sklearn.model_selection import train_test_split
from sklearn.model_selection import KFold
from sklearn.model_selection import StratifiedKFold

# evaluation
from sklearn.metrics import accuracy_score

# dataset
from sklearn.datasets import load_iris

In [4]:
# Load the iris dataset and keep the feature matrix and the target vector
# under separate names.
iris = load_iris()
iris_features, iris_labels = iris.data, iris.target

# Report the number of samples (rows of the feature matrix).
print("dataset size:", len(iris_features))

# Classifier shared by every experiment below; random_state is pinned so the
# tree construction is reproducible between runs.
classifier = DecisionTreeClassifier(random_state=156)

dataset size: 150


**K-fold setting**

In [5]:
# Five folds. shuffle=False is the KFold default and is spelled out here
# because it means every fold is a contiguous slice of the rows.
k_fold = KFold(n_splits=5, shuffle=False)

# Per-fold accuracy scores are collected here by the evaluation loop.
batch_accuracies = list()

**training & prediction with K-fold cross validation**

In [6]:
# Start from an empty list so that re-running this cell does not append to the
# scores of a previous run and skew the mean.
batch_accuracies = []

for iteration, (train_indices, test_indices) in enumerate(k_fold.split(iris_features), start=1):
    # divide into folds (positional indexing on the numpy arrays)
    X_train, X_test = iris_features[train_indices], iris_features[test_indices]
    y_train, y_test = iris_labels[train_indices], iris_labels[test_indices]

    # train & predict (fit() refits the shared classifier on this fold only)
    classifier.fit(X_train, y_train)
    predictions = classifier.predict(X_test)

    # accuracy: computed once and reused for both the list and the report
    batch_accuracy = accuracy_score(y_test, predictions)
    batch_accuracies.append(batch_accuracy)

    # per-fold report
    print(f"batch #{iteration}, batch accuracy score: {batch_accuracy: .3f}")
    print(f"training set size: {X_train.shape[0]}, test set size: {X_test.shape[0]}")
    # these are the held-out rows, so label them as the test set
    print(f"test set indices: {test_indices}")

# evaluation: (mean batch performances)
print("accuracy score:", np.mean(batch_accuracies))

batch #1, batch accuracy score:  1.000
training set size: 120, test set size: 30
training set indices: [ 0  1  2  3  4  5  6  7  8  9 10 11 12 13 14 15 16 17 18 19 20 21 22 23
 24 25 26 27 28 29]
batch #2, batch accuracy score:  0.967
training set size: 120, test set size: 30
training set indices: [30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53
 54 55 56 57 58 59]
batch #3, batch accuracy score:  0.867
training set size: 120, test set size: 30
training set indices: [60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83
 84 85 86 87 88 89]
batch #4, batch accuracy score:  0.933
training set size: 120, test set size: 30
training set indices: [ 90  91  92  93  94  95  96  97  98  99 100 101 102 103 104 105 106 107
 108 109 110 111 112 113 114 115 116 117 118 119]
batch #5, batch accuracy score:  0.733
training set size: 120, test set size: 30
training set indices: [120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137
 138 139 1

**Wrapping the procedure in a function (using a pandas DataFrame)**

In [7]:
# One DataFrame holding the measurements, with the class label appended as the
# last column.
iris_df = pd.DataFrame(data=iris.data, columns=iris.feature_names).assign(label=iris.target)

# Show how many samples each class has.
print(iris_df["label"].value_counts())

# NOTE: from here on `iris_features` / `iris_labels` are pandas objects rather
# than the numpy arrays bound earlier; the functions below index them with .iloc.
iris_labels = iris_df["label"]
iris_features = iris_df.drop(columns="label")

0    50
1    50
2    50
Name: label, dtype: int64


In [8]:
def classify_iris(k_fold_n_splits):
    """Evaluate the shared decision tree on iris with plain K-fold cross validation.

    Relies on the notebook-level ``iris_df``, ``iris_features``,
    ``iris_labels`` and ``classifier``. For every fold it displays the first
    three test rows, prints the label counts of the training and test splits
    and the fold accuracy; after the last fold it prints the mean accuracy.

    Parameters
    ----------
    k_fold_n_splits : int
        Number of folds, forwarded to ``KFold(n_splits=...)``.

    Returns
    -------
    None
        Results are only printed / displayed.
    """
    # k-fold setting
    k_fold = KFold(n_splits=k_fold_n_splits)
    batch_accuracies = []

    for iteration, (train_indices, test_indices) in enumerate(k_fold.split(iris_df), start=1):
        # divide into folds (the split yields positions, hence .iloc)
        X_train, X_test = iris_features.iloc[train_indices], iris_features.iloc[test_indices]
        y_train, y_test = iris_labels.iloc[train_indices], iris_labels.iloc[test_indices]

        # show which rows and which classes ended up on each side of the split
        display(X_test.head(3))
        print("training set labels:\n", y_train.value_counts(), sep="")
        print("test set labels:\n", y_test.value_counts(), sep="")

        # train & predict (fit() refits the shared classifier on this fold only)
        classifier.fit(X_train, y_train)
        predictions = classifier.predict(X_test)

        # accuracy: computed once and reused for both the list and the report
        batch_accuracy = accuracy_score(y_test, predictions)
        batch_accuracies.append(batch_accuracy)

        print(f"batch #{iteration}, batch accuracy score: {batch_accuracy: .3f}")

    # evaluation: (mean batch performances)
    print("\n<EVALUATION RESULT>")
    print("accuracy score:", np.mean(batch_accuracies))


*K-Fold OK CASE (5 splits: every training set still contains all three classes)*

In [9]:
classify_iris(5)

Unnamed: 0,sepal length (cm),sepal width (cm),petal length (cm),petal width (cm)
0,5.1,3.5,1.4,0.2
1,4.9,3.0,1.4,0.2
2,4.7,3.2,1.3,0.2


taining set labels:
1    50
2    50
0    20
Name: label, dtype: int64
test set labels:
0    30
Name: label, dtype: int64
batch #1, batch accuracy score:  1.000


Unnamed: 0,sepal length (cm),sepal width (cm),petal length (cm),petal width (cm)
30,4.8,3.1,1.6,0.2
31,5.4,3.4,1.5,0.4
32,5.2,4.1,1.5,0.1


taining set labels:
2    50
1    40
0    30
Name: label, dtype: int64
test set labels:
0    20
1    10
Name: label, dtype: int64
batch #2, batch accuracy score:  0.967


Unnamed: 0,sepal length (cm),sepal width (cm),petal length (cm),petal width (cm)
60,5.0,2.0,3.5,1.0
61,5.9,3.0,4.2,1.5
62,6.0,2.2,4.0,1.0


taining set labels:
0    50
2    50
1    20
Name: label, dtype: int64
test set labels:
1    30
Name: label, dtype: int64
batch #3, batch accuracy score:  0.867


Unnamed: 0,sepal length (cm),sepal width (cm),petal length (cm),petal width (cm)
90,5.5,2.6,4.4,1.2
91,6.1,3.0,4.6,1.4
92,5.8,2.6,4.0,1.2


taining set labels:
0    50
1    40
2    30
Name: label, dtype: int64
test set labels:
2    20
1    10
Name: label, dtype: int64
batch #4, batch accuracy score:  0.933


Unnamed: 0,sepal length (cm),sepal width (cm),petal length (cm),petal width (cm)
120,6.9,3.2,5.7,2.3
121,5.6,2.8,4.9,2.0
122,7.7,2.8,6.7,2.0


taining set labels:
0    50
1    50
2    20
Name: label, dtype: int64
test set labels:
2    30
Name: label, dtype: int64
batch #5, batch accuracy score:  0.733

<EVALUATION RESULT>
accuracy score: 0.9


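*K-Fold FAIL CASE (3 splits: the rows are sorted by class and not shuffled, so each test fold is one whole class that the training set never saw, giving accuracy 0)*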

In [10]:
classify_iris(3)

Unnamed: 0,sepal length (cm),sepal width (cm),petal length (cm),petal width (cm)
0,5.1,3.5,1.4,0.2
1,4.9,3.0,1.4,0.2
2,4.7,3.2,1.3,0.2


taining set labels:
1    50
2    50
Name: label, dtype: int64
test set labels:
0    50
Name: label, dtype: int64
batch #1, batch accuracy score:  0.000


Unnamed: 0,sepal length (cm),sepal width (cm),petal length (cm),petal width (cm)
50,7.0,3.2,4.7,1.4
51,6.4,3.2,4.5,1.5
52,6.9,3.1,4.9,1.5


taining set labels:
0    50
2    50
Name: label, dtype: int64
test set labels:
1    50
Name: label, dtype: int64
batch #2, batch accuracy score:  0.000


Unnamed: 0,sepal length (cm),sepal width (cm),petal length (cm),petal width (cm)
100,6.3,3.3,6.0,2.5
101,5.8,2.7,5.1,1.9
102,7.1,3.0,5.9,2.1


taining set labels:
0    50
1    50
Name: label, dtype: int64
test set labels:
2    50
Name: label, dtype: int64
batch #3, batch accuracy score:  0.000

<EVALUATION RESULT>
accuracy score: 0.0


**stratified K-fold cross validation**

In [13]:
def classify_iris_SKF(k_fold_n_splits, labels):
    """Evaluate the shared decision tree on iris with stratified K-fold cross validation.

    Relies on the notebook-level ``iris_df``, ``iris_features`` and
    ``classifier``. For every fold it displays the first three test rows,
    prints the label counts of the training and test splits and the fold
    accuracy; after the last fold it prints the mean accuracy.

    Parameters
    ----------
    k_fold_n_splits : int
        Number of folds, forwarded to ``StratifiedKFold(n_splits=...)``.
    labels : pandas.Series
        Class labels, positionally aligned with the rows of ``iris_df``.
        They drive the stratification and are also used as the targets.

    Returns
    -------
    None
        Results are only printed / displayed.
    """
    # k-fold setting
    k_fold = StratifiedKFold(n_splits=k_fold_n_splits)
    batch_accuracies = []

    # The targets must be passed to split() so that each fold keeps the class
    # proportions. Use the `labels` argument rather than a notebook global so
    # the caller actually controls which labels are used.
    fold_splits = k_fold.split(iris_df, labels.to_numpy())
    for iteration, (train_indices, test_indices) in enumerate(fold_splits, start=1):
        # divide into folds (the split yields positions, hence .iloc)
        X_train, X_test = iris_features.iloc[train_indices], iris_features.iloc[test_indices]
        y_train, y_test = labels.iloc[train_indices], labels.iloc[test_indices]

        # show which rows and which classes ended up on each side of the split
        display(X_test.head(3))
        print("training set labels:\n", y_train.value_counts(), sep="")
        print("test set labels:\n", y_test.value_counts(), sep="")

        # train & predict (fit() refits the shared classifier on this fold only)
        classifier.fit(X_train, y_train)
        predictions = classifier.predict(X_test)

        # accuracy: computed once and reused for both the list and the report
        batch_accuracy = accuracy_score(y_test, predictions)
        batch_accuracies.append(batch_accuracy)

        print(f"batch #{iteration}, batch accuracy score: {batch_accuracy: .3f}")

    # evaluation: (mean batch performances)
    print("\n<EVALUATION RESULT>")
    print("accuracy score:", np.mean(batch_accuracies))


In [14]:
classify_iris_SKF(3, iris_labels)

Unnamed: 0,sepal length (cm),sepal width (cm),petal length (cm),petal width (cm)
0,5.1,3.5,1.4,0.2
1,4.9,3.0,1.4,0.2
2,4.7,3.2,1.3,0.2


taining set labels:
2    34
0    33
1    33
Name: label, dtype: int64
test set labels:
0    17
1    17
2    16
Name: label, dtype: int64
batch #1, batch accuracy score:  0.980


Unnamed: 0,sepal length (cm),sepal width (cm),petal length (cm),petal width (cm)
17,5.1,3.5,1.4,0.3
18,5.7,3.8,1.7,0.3
19,5.1,3.8,1.5,0.3


taining set labels:
1    34
0    33
2    33
Name: label, dtype: int64
test set labels:
0    17
2    17
1    16
Name: label, dtype: int64
batch #2, batch accuracy score:  0.940


Unnamed: 0,sepal length (cm),sepal width (cm),petal length (cm),petal width (cm)
34,4.9,3.1,1.5,0.2
35,5.0,3.2,1.2,0.2
36,5.5,3.5,1.3,0.2


taining set labels:
0    34
1    33
2    33
Name: label, dtype: int64
test set labels:
1    17
2    17
0    16
Name: label, dtype: int64
batch #3, batch accuracy score:  0.980

<EVALUATION RESULT>
accuracy score: 0.9666666666666667
