This notebook demonstrates how to use the `RepeatedMultilabelStratifiedKfold` class from the `RepeatedMultilabelStratifiedKfoldModule` module.

This class generates training, validation and holdout test dataset splits for a k-fold cross-validation that is repeated `kfold_reps` times.

It follows the MongoDB convention for nested fields: if the labels in your .json file are located in a nested dictionary, you can address them with "dot" notation, e.g. `"label_1.sublabel_0"` (see the example below).

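The .json file is expected to hold the collection of documents (one per sample) to be split; each document carries the label fields used for stratification.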

# Imports

In [31]:
import os
import json
import numpy as np
import pandas as pd
from sklearn.preprocessing import OneHotEncoder
from repeatedmultilabelstratifiedkfold.RepeatedMultilabelStratifiedKfoldModule import RepeatedMultilabelStratifiedKfold

# Example use

In [32]:
def validateLoop(rmskSplitter, onehotEncodeLabel):
    """Print the per-label proportions of every split, for each repetition and fold.

    For every (repetition, fold) pair the training, validation and holdout test
    sets are requested from ``rmskSplitter``. For each set, the proportion of
    every stratification label (column sum divided by the number of rows) is
    printed so the splits can be compared by eye.

    Args:
        rmskSplitter: splitter object exposing ``kfold_reps``, ``k``,
            ``getDataSplits(rep=..., fold=..., verbose=...)`` and
            ``get_multiStratificationLabels(docs)``.
        onehotEncodeLabel: iterable of label names holding categorical values.
            Each one is replaced by one indicator column per category, named
            ``<label>_<column index>``.

    Returns:
        None. All results are written to stdout.
    """
    for rep in range(rmskSplitter.kfold_reps):
        for fold in range(rmskSplitter.k):
            train_set, val_set, holdout_test_set = rmskSplitter.getDataSplits(
                rep=rep, fold=fold, verbose=True
            )

            # check proportions
            print("Repetition", rep, "Fold", fold)
            splits = {
                "training": train_set,
                "validation": val_set,
                "holdout test": holdout_test_set,
            }
            for key, _set in splits.items():
                # Shallow copy: the dict is edited below (columns added, the
                # categorical label removed) and those edits should not reach
                # the object handed back by the splitter.
                multi_strat_labels_dict = dict(
                    rmskSplitter.get_multiStratificationLabels(_set)
                )

                # one hot encode labels with categorical labels to calculate
                # individual proportions
                for label in onehotEncodeLabel:
                    # One sample per row, single feature column.
                    data = np.array([[i] for i in multi_strat_labels_dict[label]])
                    # Use the encoder's default (sparse) output and densify it.
                    # The former ``sparse=False`` keyword was renamed to
                    # ``sparse_output`` and later removed, so passing either
                    # name ties the notebook to a scikit-learn version range.
                    onehot = OneHotEncoder().fit_transform(data).toarray()
                    # NOTE(review): the encoder is fitted on each split
                    # separately, so column ``i`` is the i-th category present
                    # in *this* split. If a category is absent from a split,
                    # the indices will not line up across splits.
                    for i in range(onehot.shape[1]):
                        multi_strat_labels_dict[label + '_' + str(i)] = list(onehot[:, i])

                    multi_strat_labels_dict.pop(label, None)

                df = pd.DataFrame(multi_strat_labels_dict)
                df_prop = df.sum() / df.shape[0]
                print(key, 'set proportions:')
                print(df_prop)

In [34]:
# replace this string with the path to your own .json file
# (by default "dataset.json" is looked up in the current working directory)
jsonfile = os.path.join(os.getcwd(), "dataset.json")
# Decode explicitly as UTF-8 so loading does not depend on the platform's
# default locale encoding.
with open(jsonfile, encoding="utf-8") as f:
    docs = json.load(f)
print(len(docs))

# define which dictionary keys/labels should be used for the multi
# stratification process; nested keys are written in "dot" notation
multi_strat_labels = [
    "label_0",
    "label_1.sublabel_0",
    "label_1.sublabel_1",
    "label_1.sublabel_2",
    "label_1.sublabel_3",
    "label_2",
]
rmskSplitter = RepeatedMultilabelStratifiedKfold(
    docs=docs,
    multi_strat_labels=multi_strat_labels,
    kfolds=4,
    kfold_reps=100,
    holdout_ratio=0.1,
)
# "label_0" holds categorical values, so it is one-hot encoded before the
# per-category proportions are computed.
onehotEncodeLabel = ["label_0"]
validateLoop(rmskSplitter, onehotEncodeLabel)

504
Repetition 0 Fold 0
Training:   340 0.7472527472527473
Validation:  115 0.25274725274725274
Test:       49 0.09722222222222222
Repetition 0 Fold 0
training set proportions:
label_1.sublabel_0    0.973529
label_1.sublabel_1    0.400000
label_1.sublabel_2    0.173529
label_1.sublabel_3    0.052941
label_2               0.688235
label_0_0             0.067647
label_0_1             0.164706
label_0_2             0.073529
label_0_3             0.105882
label_0_4             0.161765
label_0_5             0.426471
dtype: float64
validation set proportions:
label_1.sublabel_0    0.965217
label_1.sublabel_1    0.408696
label_1.sublabel_2    0.191304
label_1.sublabel_3    0.078261
label_2               0.669565
label_0_0             0.069565
label_0_1             0.165217
label_0_2             0.086957
label_0_3             0.113043
label_0_4             0.156522
label_0_5             0.408696
dtype: float64
holdout test set proportions:
label_1.sublabel_0    1.000000
label_1.sublabel_1    