# Stratified Cross Validation

## Requirements

In [18]:
from sklearn.model_selection import RepeatedStratifiedKFold
from google.colab import drive
import pathlib
import pandas as pd
import numpy as np

In [2]:
# Mount Google Drive inside the Colab VM at /content/gDrive so the dataset
# files can be reached as ordinary paths. force_remount=True is passed so the
# cell can be re-run even when a mount is already present.
drive.mount('/content/gDrive', force_remount=True)

Mounted at /content/gDrive


In [3]:
# Root folder of the project dataset on the mounted Drive. The input files are
# read from "<dataset_path>/Original" and the generated folds are written under
# "<dataset_path>/CrossValidation".
dataset_path = "/content/gDrive/MyDrive/Aulas/AprendizadoDeMáquina/Projeto/Dataset"

## Load datasets

In [4]:
# The mfeat-* files are whitespace-delimited and have no header row, so pandas
# assigns integer column names. `sep=r"\s+"` is the documented equivalent of
# `delim_whitespace=True`, which is deprecated since pandas 2.2 and emits a
# FutureWarning.
mfeat_fac = pd.read_csv(f"{dataset_path}/Original/mfeat-fac", sep=r"\s+", header=None)
mfeat_fou = pd.read_csv(f"{dataset_path}/Original/mfeat-fou", sep=r"\s+", header=None)
mfeat_zer = pd.read_csv(f"{dataset_path}/Original/mfeat-zer", sep=r"\s+", header=None)

# labels.csv is a regular CSV read with pandas' defaults (first row = header).
labels = pd.read_csv(f"{dataset_path}/Original/labels.csv")

In [5]:
# Sanity check: (rows, columns) of the mfeat-zer feature table.
mfeat_zer.shape

(2000, 47)

In [6]:
# Sanity check: (rows, columns) of the labels table; the row count should match
# the feature tables loaded above.
labels.shape

(2000, 1)

In [7]:
# Preview the first rows of the mfeat-fac feature table.
mfeat_fac.head()

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,206,207,208,209,210,211,212,213,214,215
0,98,236,531,673,607,647,2,9,3,6,...,474,536,628,632,18,36,8,15,12,13
1,121,193,607,611,585,665,7,9,2,4,...,520,458,570,634,15,32,11,13,15,11
2,115,141,590,605,557,627,12,6,3,3,...,535,498,572,656,20,35,16,14,13,6
3,90,122,627,692,607,642,0,6,4,5,...,576,549,628,621,16,35,7,12,15,9
4,157,167,681,666,587,666,8,6,1,4,...,594,525,568,653,16,35,10,15,13,13


In [8]:
# Convert every feature table to a NumPy array (X) and pair it with the label
# array (y). The same `labels` table supplies y for all three feature sets.
mfeat_fac_X, mfeat_fac_y = mfeat_fac.values, labels.values
mfeat_fou_X, mfeat_fou_y = mfeat_fou.values, labels.values
mfeat_zer_X, mfeat_zer_y = mfeat_zer.values, labels.values

## Cross Validation

In [21]:
def stratidies_kfold(x, y, dataset_name, df, n_splits=10, n_repeats=30,
                     random_state=42, output_root=None):
    """Write repeated stratified k-fold train/test splits of a dataset to CSV.

    For every (repeat, split) pair produced by ``RepeatedStratifiedKFold`` the
    selected rows are saved as ``train_fold.csv`` and ``test_fold.csv`` in
    ``<output_root>/CrossValidation/<dataset_name>/repeat<r>/split<s>/``, where
    ``r`` and ``s`` are 1-based. Each file starts with a ``label`` column,
    followed by the feature columns named after ``df.columns``. Directories are
    created on demand and existing files are overwritten.

    Args:
        x: 2-D NumPy array of features, one row per sample.
        y: NumPy array of class labels aligned with ``x``; either shape
            ``(n_samples,)`` or a single column ``(n_samples, 1)``.
        dataset_name: Name used in the progress output and as the output
            sub-directory.
        df: DataFrame whose ``columns`` provide the names of the feature
            columns of ``x``.
        n_splits: Number of folds per repetition (default 10).
        n_repeats: Number of repetitions (default 30).
        random_state: Seed passed to ``RepeatedStratifiedKFold`` (default 42).
        output_root: Base directory for the ``CrossValidation`` tree. When
            ``None`` the notebook-level ``dataset_path`` is used.

    Returns:
        None. The folds are written to disk and progress is printed.
    """
    root = pathlib.Path(dataset_path if output_root is None else output_root)
    rskf = RepeatedStratifiedKFold(
        n_splits=n_splits, n_repeats=n_repeats, random_state=random_state
    )

    # Loop-invariant values, computed once.
    feature_columns = list(df.columns)
    # Flatten an (n, 1) label column to 1-D so it can be inserted as a single
    # DataFrame column below.
    labels_1d = np.ravel(y)

    print(f"Dataset: {dataset_name}")
    for i, (train_index, test_index) in enumerate(rskf.split(x, labels_1d)):
        # The splitter yields all folds of one repetition before starting the
        # next, so the flat index maps directly onto (repeat, split).
        repeat_idx, split_idx = divmod(i, n_splits)
        repeat_num, split = repeat_idx + 1, split_idx + 1

        fold_dir = (
            root / "CrossValidation" / dataset_name
            / f"repeat{repeat_num}" / f"split{split}"
        )
        fold_dir.mkdir(parents=True, exist_ok=True)

        for file_name, row_index in (
            ("train_fold.csv", train_index),
            ("test_fold.csv", test_index),
        ):
            # Build the frame from the features and insert the labels as their
            # own column: stacking y and X into one array would coerce the
            # labels to the feature dtype (e.g. integer classes become floats).
            fold_df = pd.DataFrame(x[row_index], columns=feature_columns)
            fold_df.insert(0, "label", labels_1d[row_index], allow_duplicates=True)
            fold_df.to_csv(fold_dir / file_name, index=False)

        print(f"Repeat {repeat_num}, Split {split} Saved.")

In [22]:
# Generate and save the cross-validation folds for each of the three feature sets.
stratidies_kfold(
    x=mfeat_fac_X, y=mfeat_fac_y, dataset_name="mfeat_fac", df=mfeat_fac
)
stratidies_kfold(
    x=mfeat_fou_X, y=mfeat_fou_y, dataset_name="mfeat_fou", df=mfeat_fou
)
stratidies_kfold(
    x=mfeat_zer_X, y=mfeat_zer_y, dataset_name="mfeat_zer", df=mfeat_zer
)

Dataset: mfeat_fac
Repeat 1, Split 1 Saved.
Repeat 1, Split 2 Saved.
Repeat 1, Split 3 Saved.
Repeat 1, Split 4 Saved.
Repeat 1, Split 5 Saved.
Repeat 1, Split 6 Saved.
Repeat 1, Split 7 Saved.
Repeat 1, Split 8 Saved.
Repeat 1, Split 9 Saved.
Repeat 1, Split 10 Saved.
Repeat 2, Split 1 Saved.
Repeat 2, Split 2 Saved.
Repeat 2, Split 3 Saved.
Repeat 2, Split 4 Saved.
Repeat 2, Split 5 Saved.
Repeat 2, Split 6 Saved.
Repeat 2, Split 7 Saved.
Repeat 2, Split 8 Saved.
Repeat 2, Split 9 Saved.
Repeat 2, Split 10 Saved.
Repeat 3, Split 1 Saved.
Repeat 3, Split 2 Saved.
Repeat 3, Split 3 Saved.
Repeat 3, Split 4 Saved.
Repeat 3, Split 5 Saved.
Repeat 3, Split 6 Saved.
Repeat 3, Split 7 Saved.
Repeat 3, Split 8 Saved.
Repeat 3, Split 9 Saved.
Repeat 3, Split 10 Saved.
Repeat 4, Split 1 Saved.
Repeat 4, Split 2 Saved.
Repeat 4, Split 3 Saved.
Repeat 4, Split 4 Saved.
Repeat 4, Split 5 Saved.
Repeat 4, Split 6 Saved.
Repeat 4, Split 7 Saved.
Repeat 4, Split 8 Saved.
Repeat 4, Split 9 Saved.
Rep