# Install Packages

In [None]:
!pip install torch scikit-multilearn

Collecting scikit-multilearn
  Downloading scikit_multilearn-0.2.0-py3-none-any.whl (89 kB)
[?25l[K     |███▊                            | 10 kB 25.3 MB/s eta 0:00:01[K     |███████▍                        | 20 kB 19.4 MB/s eta 0:00:01[K     |███████████                     | 30 kB 22.3 MB/s eta 0:00:01[K     |██████████████▊                 | 40 kB 25.9 MB/s eta 0:00:01[K     |██████████████████▍             | 51 kB 27.9 MB/s eta 0:00:01[K     |██████████████████████          | 61 kB 30.2 MB/s eta 0:00:01[K     |█████████████████████████▊      | 71 kB 31.3 MB/s eta 0:00:01[K     |█████████████████████████████▍  | 81 kB 32.6 MB/s eta 0:00:01[K     |████████████████████████████████| 89 kB 7.4 MB/s 
Installing collected packages: scikit-multilearn
Successfully installed scikit-multilearn-0.2.0


In [None]:
"b33797064593ab2229a0135dc69001bea05cb56a20c2f243b1231213642e260a"



# Load Classes

In [None]:
from google.colab import drive
# Mount Google Drive at /content/drive so the project files can be read.
# To force a fresh mount instead, use:
#   drive.mount("/content/drive", force_remount=True)
drive.mount("/content/drive")

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [None]:
# RepeatedMultilabelStratifiedKfoldModule.py

import numpy as np
import pandas as pd
# import random
# import torch
import pprint
import sys
import os
# The module version of this cell evaluated
# os.path.dirname(os.path.abspath(__file__)) here and discarded the result.
# `__file__` is not defined in a notebook kernel, so that statement could only
# raise NameError (aborting the cell before the class is defined); it is dropped.
projec_dir = "/content/drive/MyDrive/OHRI/OMNI Group/Ultrasound Deep Learning/test/"
# Make the project folder importable for the import that follows. Guarded so
# that re-running the cell does not keep appending duplicate sys.path entries.
if projec_dir not in sys.path:
    sys.path.append(projec_dir)
from iterative_stratification import IterativeStratification # https://github.com/scikit-multilearn/scikit-multilearn/issues/217 

# TODO: turn this into a general-purpose module for repeated k-fold CV with multi-label stratification, with an option to balance the dataset (class imbalance) by upsampling or downsampling classes.
class RepeatedMultilabelStratifiedKfoldModule():
    """Repeated k-fold cross-validation splits with multi-label stratification.

    For each repetition a stratified holdout test set is first carved out of
    ``docs``; the remaining documents are then partitioned into ``kfolds``
    stratified validation folds. Every set is stored as a list of positions
    into ``docs``:

    * ``rskf_holdout_test_sets[rep]``  -- holdout indices of repetition ``rep``
    * ``rskf_val_sets[rep][fold]``     -- validation indices of that fold

    Args:
        docs: sequence of documents, each indexable by the label keys.
        multi_strat_labels: list of keys to stratify on. A dotted key such as
            ``"baseline.IPH"`` is resolved through nested containers
            (``doc["baseline"]["IPH"]``).
        kfolds: number of validation folds per repetition.
        kfold_reps: number of repetitions; repetition ``rep`` uses
            ``random_state = rep * 369``.
        holdout_ratio: fraction of ``docs`` to hold out as a test set. It is
            realised as one fold of an ``int(1 / holdout_ratio)``-way split;
            ``0`` disables the holdout set.
    """

    def __init__(self, docs, multi_strat_labels, kfolds=4, kfold_reps=10, holdout_ratio=0.1):
        self.holdout_ratio = holdout_ratio
        self.rskf_holdout_test_sets = []
        self.rskf_val_sets = []
        self.kfold_reps = kfold_reps
        self.k = kfolds
        self.train_val_ratio = 1/self.k  # share of the CV data used for validation
        self.multi_strat_labels = multi_strat_labels
        self.docs = docs
        self.setupMultiLabelStratifiedRepeatedKFold()

    def get_multiStratificationLabels(self, docs):
        """Collect the stratification label values of ``docs``.

        Returns:
            dict mapping every key in ``self.multi_strat_labels`` to the list
            of that key's values, one entry per document, in document order.
        """
        multi_strat_labels_dict = {key: [] for key in self.multi_strat_labels}
        for doc in docs:
            for key in self.multi_strat_labels:
                # Walk the dotted path; a key without "." is a single lookup.
                val = doc
                for part in key.split("."):
                    val = val[part]
                multi_strat_labels_dict[key].append(val)

        return multi_strat_labels_dict

    def get_holdout_indices(self, random_state):
        """Split ``self.docs`` into CV indices and holdout test indices.

        Returns:
            ``(train_val_indices, holdout_indices)`` as two lists. When
            ``holdout_ratio`` is 0 the holdout list is empty and every index
            is returned in the first list.
        """
        all_indices = list(range(len(self.docs)))
        if self.holdout_ratio == 0:
            return all_indices, []

        # Multi-label stratified sample: Y holds one column per label key.
        Y = np.array(pd.DataFrame(self.get_multiStratificationLabels(self.docs)))
        # The splitter only needs a 2-D X of the right length; the document
        # indices (duplicated into two columns) serve as the features.
        X = np.array([all_indices, all_indices]).transpose()

        n_splits = int(1/self.holdout_ratio)
        k_fold = IterativeStratification(n_splits=n_splits, order=1, random_state=random_state)
        # Only the first fold of the split is used as the holdout set.
        train, test = next(iter(k_fold.split(X, Y)))

        return list(train), list(test)

    def setupMultiLabelStratifiedRepeatedKFold(self):
        """(Re)build the holdout and validation index sets for every repetition."""
        self.rskf_holdout_test_sets = []
        self.rskf_val_sets = []
        for rep in range(self.kfold_reps):
            # A distinct, deterministic seed per repetition keeps the splits
            # reproducible while differing between repetitions.
            random_state = rep*369
            train_val_indices, holdout_indices = self.get_holdout_indices(random_state=random_state)
            self.rskf_holdout_test_sets.append(holdout_indices)

            train_val_docs = [self.docs[i] for i in train_val_indices]
            Y = np.array(pd.DataFrame(self.get_multiStratificationLabels(train_val_docs)))
            X = np.array([train_val_indices, train_val_indices]).transpose()
            # Use the fold count directly instead of int(1 / (1 / k)), which
            # depends on a float round-trip and must agree with self.k, the
            # value getDataSplits callers iterate over.
            k_fold = IterativeStratification(n_splits=self.k, order=1, random_state=random_state)
            # `test` indexes rows of X; column 0 maps back to positions in docs.
            self.rskf_val_sets.append([list(X[test, 0]) for _, test in k_fold.split(X, Y)])

        # Warn when two repetitions produced the same holdout set. Without a
        # holdout set every repetition trivially shares the empty set, so the
        # check would only produce a spurious warning and is skipped.
        if self.holdout_ratio != 0:
            self.checkIfDuplicates(self.rskf_holdout_test_sets)

    def checkIfDuplicates(self, listOfElems):
        """Check whether any two holdout test sets contain the same indices.

        The input lists are compared order-insensitively and are NOT modified.

        Returns:
            ``(duplicate_exists, duplicate_reps)`` where ``duplicate_reps``
            lists the positions whose set already appeared earlier.
        """
        duplicate_reps = []
        seen = set()
        for rep, elem in enumerate(listOfElems):
            # sorted() builds a copy, so the caller's list keeps its order.
            signature = tuple(sorted(elem))
            if signature in seen:
                duplicate_reps.append(rep)
            else:
                seen.add(signature)

        if duplicate_reps:
            print("Duplicate holdout test sets found. Possibility of a repetition to be redundant. Consider reducing the number of repetition")
            print("10 repetitions for a 4-fold CV provides 40(10x4) estimates which is usually enough for most experiments")
            print("duplicate_reps", duplicate_reps)
            # Author's note: the following was tested and worked:
            # single-class stratification, data size 1200, reps = 1000, kfolds = 4
            return True, duplicate_reps

        return False, duplicate_reps

    def getDataSplits(self, rep, fold, verbose=False):
        """Return the documents of one repetition/fold as three lists.

        Args:
            rep: repetition index.
            fold: fold index within the repetition.
            verbose: when true, print the size and share of each set.

        Returns:
            ``(train_set, val_set, holdout_test_set)``; a document is in the
            holdout set if its index is listed there, otherwise in the
            validation set if listed for this fold, otherwise in training.
        """
        # Sets give constant-time membership tests instead of scanning the
        # index lists once per document.
        holdout_indices = set(self.rskf_holdout_test_sets[rep])
        val_indices = set(self.rskf_val_sets[rep][fold])

        train_set, val_set, holdout_test_set = [], [], []
        for i, doc in enumerate(self.docs):
            in_holdout = i in holdout_indices
            in_val = i in val_indices

            # Sanity check: an index must never be in both sets.
            if in_holdout and in_val:
                print("OH NOOOOOOOO!!!!!!!!!")

            if in_holdout:
                holdout_test_set.append(doc)
            elif in_val:
                val_set.append(doc)
            else:
                train_set.append(doc)

        if verbose:
            cv_set_size = len(train_set) + len(val_set)
            print("Repetition", rep, "Fold", fold)
            print('Training:  ', len(train_set), len(train_set) / cv_set_size)
            print('Validation: ', len(val_set), len(val_set) / cv_set_size)
            print('Test:      ', len(holdout_test_set), len(holdout_test_set) / len(self.docs))

        return train_set, val_set, holdout_test_set



# Test Code

In [None]:
from sklearn.preprocessing import OneHotEncoder

def validateLoop(rmskSplitter, onehotEncodeLabel):
    """Print the label proportions of every split produced by ``rmskSplitter``.

    For each repetition and fold the training, validation and holdout test
    sets are fetched and, per set, the mean of every stratification label
    column is printed. Labels listed in ``onehotEncodeLabel`` are categorical
    and are expanded into one indicator column per category
    (``<label>_<i>``) so that each category's share is reported separately.

    Args:
        rmskSplitter: splitter exposing ``kfold_reps``, ``k``, ``docs``,
            ``getDataSplits`` and ``get_multiStratificationLabels``.
        onehotEncodeLabel: list of label keys to expand into indicator columns.
    """
    # Derive the category list of each categorical label once, from the full
    # dataset. Fitting an encoder on each split separately would shift the
    # column numbering whenever a split lacks a category, so `<label>_<i>`
    # would refer to different categories in different splits.
    all_labels = rmskSplitter.get_multiStratificationLabels(rmskSplitter.docs)
    categories = {label: sorted(set(all_labels[label])) for label in onehotEncodeLabel}

    for rep in range(rmskSplitter.kfold_reps):
        for fold in range(rmskSplitter.k):
            train_set, val_set, holdout_test_set = rmskSplitter.getDataSplits(rep=rep, fold=fold, verbose=True)

            # check proportions
            print("Repetition", rep, "Fold", fold)
            named_sets = {"training": train_set, "validation": val_set, "holdout test": holdout_test_set}
            for key, _set in named_sets.items():
                if len(_set) == 0:
                    # e.g. the holdout set when holdout_ratio is 0
                    print(key, 'set is empty; no proportions to report')
                    continue

                multi_strat_labels_dict = rmskSplitter.get_multiStratificationLabels(_set)

                # Replace each categorical column by one 0/1 indicator column
                # per category so individual proportions can be computed.
                for label in onehotEncodeLabel:
                    values = multi_strat_labels_dict.pop(label)
                    for i, category in enumerate(categories[label]):
                        multi_strat_labels_dict[label+'_'+str(i)] = [float(v == category) for v in values]

                df = pd.DataFrame(multi_strat_labels_dict)
                df_prop = df.sum()/df.shape[0]
                print(key, 'set proportions:')
                print(df_prop)

def recursive_remap(d, keymap):
    """Rename the keys of a (possibly nested) dict according to ``keymap``.

    Keys that do not appear in ``keymap`` are dropped together with their
    values. Values that are dicts are remapped recursively; any other value
    (including lists) is returned unchanged.
    """
    if not isinstance(d, dict):
        return d

    remapped = {}
    for old_key, value in d.items():
        if old_key not in keymap:
            continue
        remapped[keymap[old_key]] = recursive_remap(value, keymap)
    return remapped

## test FetalUS

In [None]:
import json

# Load the FetalUS subject documents from the mounted Drive folder.
with open("/content/drive/MyDrive/OHRI/OMNI Group/Ultrasound Deep Learning/test/database.json") as f:
    subjects_docs = json.load(f)

# Integer code assigned to each label string.
categorical_mapping = {
    'NormalNT': 0,
    'CysticHygroma': 1,
    "NormalNeuro": 2,
    "Ventriculomegaly": 3,
    "NormalKidney": 4,
    "MCDK": 5,
}

# Convert each document's label string to its categorical value (in place).
for i, doc in enumerate(subjects_docs):
    doc["label"] = categorical_mapping[doc["label"]]

# Stratify on the single "label" key, then print the per-split proportions.
onehotEncodeLabel = ["label"]
rmskSplitter = RepeatedMultilabelStratifiedKfoldModule(
    docs=subjects_docs,
    multi_strat_labels=["label"],
    kfolds=4,
    kfold_reps=10,
    holdout_ratio=0.1,
)
validateLoop(rmskSplitter, onehotEncodeLabel)

Repetition 0 Fold 0
Training:   735 0.7507660878447395
Validation:  244 0.24923391215526047
Test:       109 0.10018382352941177
Repetition 0 Fold 0
training set proportions:
label_0    0.146939
label_1    0.121088
label_2    0.194558
label_3    0.034014
label_4    0.389116
label_5    0.114286
dtype: float64
validation set proportions:
label_0    0.143443
label_1    0.118852
label_2    0.196721
label_3    0.032787
label_4    0.393443
label_5    0.114754
dtype: float64
holdout test set proportions:
label_0    0.155963
label_1    0.110092
label_2    0.201835
label_3    0.027523
label_4    0.385321
label_5    0.119266
dtype: float64
Repetition 0 Fold 1
Training:   734 0.7497446373850868
Validation:  245 0.25025536261491316
Test:       109 0.10018382352941177
Repetition 0 Fold 1
training set proportions:
label_0    0.145777
label_1    0.119891
label_2    0.196185
label_3    0.032698
label_4    0.391008
label_5    0.114441
dtype: float64
validation set proportions:
label_0    0.146939
label_

## test ICH dataset - multi stratification

In [None]:
import json

# Load the ICH subject documents from the mounted Drive folder.
with open("/content/drive/MyDrive/OHRI/OMNI Group/Ultrasound Deep Learning/test/dataset.json") as f:
    subjects_docs = json.load(f)

# Integer code assigned to each source-dataset name.
categorical_mapping = {
    "SPOTLIGHT": 0,
    "PREDICT": 1,
    "STOPIT": 2,
    "BS": 3,
    "FE": 4,
    "TOH": 5,
}

# Convert each document's source_dataset string to its categorical value (in place).
for i, doc in enumerate(subjects_docs):
    doc["source_dataset"] = categorical_mapping[doc["source_dataset"]]

print(len(subjects_docs))

# Stratify jointly on the source dataset, the nested baseline flags and
# non_expander; only source_dataset is categorical and needs one-hot expansion.
multi_strat_labels = [
    "source_dataset",
    "baseline.IPH",
    "baseline.IVH",
    "baseline.SAH",
    "baseline.SDH",
    "non_expander",
]
onehotEncodeLabel = ["source_dataset"]
rmskSplitter = RepeatedMultilabelStratifiedKfoldModule(
    docs=subjects_docs,
    multi_strat_labels=multi_strat_labels,
    kfolds=4,
    kfold_reps=10,
    holdout_ratio=0.1,
)
validateLoop(rmskSplitter, onehotEncodeLabel)

504


NameError: ignored

In [None]:
import json

# Purpose: load the ICH dataset, convert source_dataset to integer codes,
# rename the document keys via `keymap`, and write the result back to disk.
jsonfile = "/content/drive/MyDrive/OHRI/OMNI Group/Ultrasound Deep Learning/test/dataset.json"
with open(jsonfile) as f:
    subjects_docs = json.load(f)


# Integer code assigned to each source-dataset name.
categorical_mapping = {"SPOTLIGHT": 0, "PREDICT": 1,
                        "STOPIT": 2, "BS": 3,
                        "FE": 4, "TOH": 5}
# convert labels to categorical values
for i, doc in enumerate(subjects_docs):
  subjects_docs[i]["source_dataset"] = categorical_mapping[doc["source_dataset"]]

print(len(subjects_docs))

# Old key -> new key. recursive_remap keeps ONLY keys listed here, so any
# other field of a document is absent from the remapped copy.
keymap = {"source_dataset":"label_0", "baseline":"label_1", "non_expander": "label_2", "IPH":"sublabel_0",  "IVH": "sublabel_1",  "SAH": "sublabel_2",  "SDH": "sublabel_3"}
new_docs = []
for doc in subjects_docs:
  new_doc = recursive_remap(doc, keymap)
  new_docs.append(new_doc)


# NOTE(review): this writes to the same path that was read above, replacing
# the source file with the remapped (and key-filtered) documents. After the
# write the file no longer has a "source_dataset" key, so re-running this cell
# or any cell that reads that key from this file presumably raises KeyError.
# Confirm, and consider writing to a separate output file.
with open(jsonfile, "w") as write_file:
  json.dump(new_docs, write_file, indent=4)

# Disabled leftovers from the stratification test:
# multi_strat_labels_dict.pop(label, None)

# onehotEncodeLabel = ["source_dataset"]
# rmskSplitter = RepeatedMultilabelStratifiedKfoldModule(docs=subjects_docs, multi_strat_labels=multi_strat_labels, kfolds=4, kfold_reps=10, holdout_ratio=0.1)
# validateLoop(rmskSplitter, onehotEncodeLabel)

504


# Other Approach

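Reference (Cross Validated discussion on using scikit-learn's cross-validation functions with multi-label classifiers): https://stats.stackexchange.com/questions/65828/how-to-use-scikit-learns-cross-validation-functions-on-multi-label-classifiers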