<a href="https://colab.research.google.com/github/TrollRider-Kristian/Springboard-AI-Mini-Projects/blob/main/codebasics_kfold_cross_validation_exercise.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [2]:
from sklearn.datasets import load_iris
from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split, KFold, StratifiedKFold
from sklearn.svm import SVC

In [3]:
# Load the iris data as pandas objects: a feature frame and a target series.
iris_features, iris_target = load_iris(return_X_y=True, as_frame=True)

In [4]:
# Plain-text preview of the first ten feature rows.
print(iris_features.head(n=10))

   sepal length (cm)  sepal width (cm)  petal length (cm)  petal width (cm)
0                5.1               3.5                1.4               0.2
1                4.9               3.0                1.4               0.2
2                4.7               3.2                1.3               0.2
3                4.6               3.1                1.5               0.2
4                5.0               3.6                1.4               0.2
5                5.4               3.9                1.7               0.4
6                4.6               3.4                1.4               0.3
7                5.0               3.4                1.5               0.2
8                4.4               2.9                1.4               0.2
9                4.9               3.1                1.5               0.1


In [5]:
# Plain-text preview of the first ten target labels.
print(iris_target.head(n=10))

0    0
1    0
2    0
3    0
4    0
5    0
6    0
7    0
8    0
9    0
Name: target, dtype: int64


In [6]:
# Count the missing values in each feature column (isna is the documented alias of isnull).
print(iris_features.isna().sum())

sepal length (cm)    0
sepal width (cm)     0
petal length (cm)    0
petal width (cm)     0
dtype: int64


In [7]:
# Experiment with KFold.  Its split() yields positional indices, not the values themselves.
play_with_kfold = KFold(n_splits=4)

split_1d = list(range(1, 11))
print(split_1d)
# https://scikit-learn.org/stable/modules/generated/sklearn.model_selection.KFold.html
# Per the docs, n_samples = 10 (the list holds 10 items) and n_splits = 4, so the first
# 10 % 4 = 2 test folds hold 10 // 4 + 1 = 3 indices each and the remaining folds hold 2.
for partition_train, partition_test in play_with_kfold.split(split_1d):
  print(f"Training Partition: {partition_train}\nTest Partition: {partition_test}")

print("-----------------------------------------------------------")

split_2d = [[col * row for col in range(1, 5)] for row in range(1, 6)]
print(split_2d)
# Here n_samples = 5 because only the outer dimension of the 5x4 matrix is split; the inner
# rows would have to be k-folded separately.  The first 5 % 4 = 1 test fold holds
# 5 // 4 + 1 = 2 indices and the remaining folds hold 1.
for partition_train, partition_test in play_with_kfold.split(split_2d):
  print(f"Training Partition: {partition_train}\nTest Partition: {partition_test}")

[1, 2, 3, 4, 5, 6, 7, 8, 9, 10]
Training Partition: [3 4 5 6 7 8 9]
Test Partition: [0 1 2]
Training Partition: [0 1 2 6 7 8 9]
Test Partition: [3 4 5]
Training Partition: [0 1 2 3 4 5 8 9]
Test Partition: [6 7]
Training Partition: [0 1 2 3 4 5 6 7]
Test Partition: [8 9]
-----------------------------------------------------------
[[1, 2, 3, 4], [2, 4, 6, 8], [3, 6, 9, 12], [4, 8, 12, 16], [5, 10, 15, 20]]
Training Partition: [2 3 4]
Test Partition: [0 1]
Training Partition: [0 1 3 4]
Test Partition: [2]
Training Partition: [0 1 2 4]
Test Partition: [3]
Training Partition: [0 1 2 3]
Test Partition: [4]


In [13]:
NUM_SPLITS = 4
NUM_CLASSES = 2
# KRISTIAN_NOTE - Default is 5 splits, but "n_splits cannot be greater than the number of members in each class."
play_with_stratified_kfold = StratifiedKFold(n_splits=NUM_SPLITS)
test_arr = ["Fire", "Red", "Water", "Blue", "Air", "White", "Earth", "Green"]
print(test_arr)
# KRISTIAN_NOTE - test_target must have the same length as test_arr.
# The labels cycle through 0..NUM_CLASSES-1, so the 8 items here give 4 members to each of the 2 classes.
test_target = [i % NUM_CLASSES for i in range(len(test_arr))]
print(test_target)
# Only split when every class can be shared out evenly across the folds, i.e. when the sample
# count is a multiple of NUM_SPLITS * NUM_CLASSES.  The parentheses are required: % and * have
# the same precedence and group left-to-right, so without them the check collapses to
# len(test_arr) % NUM_SPLITS == 0 and NUM_CLASSES is ignored.
if len(test_arr) % (NUM_SPLITS * NUM_CLASSES) == 0:
  # The "Stratified" part of StratifiedKFold keeps the class proportions of the target the same
  # in every fold.  With 4 members in each of 2 classes and 4 splits, each test partition
  # receives one member of each class and the other three members of each class go into the
  # training partition.  This per-class division is the reason split() requires both the data
  # AND a target; it is not just cutting the data into consecutive folds.
  for partition_train, partition_test in play_with_stratified_kfold.split(test_arr, test_target):
    print(f"Training Partition: {partition_train}")
    print(f"Test Partition: {partition_test}")
else:
  print("Sorry, can't divide the data points evenly among the number of distinct classes in test_target.")

['Fire', 'Red', 'Water', 'Blue', 'Air', 'White', 'Earth', 'Green']
[0, 1, 0, 1, 0, 1, 0, 1]
Training Partition: [2 3 4 5 6 7]
Test Partition: [0 1]
Training Partition: [0 1 4 5 6 7]
Test Partition: [2 3]
Training Partition: [0 1 2 3 6 7]
Test Partition: [4 5]
Training Partition: [0 1 2 3 4 5]
Test Partition: [6 7]


In [14]:
# Hold out 20% of the iris rows for testing; random_state is fixed so the split is repeatable.
features_train, features_test, targets_train, targets_test = train_test_split(
  iris_features,
  iris_target,
  test_size=0.2,
  random_state=9,
)

In [15]:
# Stratified splitter configured for three folds.
stratified_kfold_models = StratifiedKFold(n_splits=3)