In [2]:
import os

class Args:
    """Configuration for generating the kidney cross-validation splits.

    Calling ``Args()`` with no arguments yields the notebook's default
    settings; any field can be overridden by keyword for other runs.

    Parameters
    ----------
    clinical_path : str
        Path of the header-less clinical CSV file to read.
    seed : int
        Seed used for all random number generators and for the K-fold shuffle.
    fold_range : int
        Number of folds to generate.
    save_path : str
        Directory the ``split_<k>.npy`` files are written to.
    """

    def __init__(
        self,
        clinical_path="../preprocess/preprocessed_data/clinical_kidney.csv",
        seed=24,
        fold_range=5,
        save_path="./kidney_splits",
    ):
        self.clinical_path = clinical_path
        self.seed = seed
        self.fold_range = fold_range
        self.save_path = save_path

    @property
    def clinicla_path(self):
        """Legacy, misspelled alias of ``clinical_path`` kept for existing code."""
        return self.clinical_path

    @clinicla_path.setter
    def clinicla_path(self, value):
        # Writes through the alias so both spellings always agree.
        self.clinical_path = value


args = Args()

In [4]:
import random
import torch
import numpy as np
import pandas as pd
from sklearn.model_selection import StratifiedKFold
from sklearn.model_selection import train_test_split
    

def preprocess_clinical_data(clinical_path):
    """Read the header-less clinical CSV and slice it into four column groups.

    Columns are addressed by their integer position labels because the file
    is read with ``header=None``.

    Parameters
    ----------
    clinical_path : str
        Path of the CSV file to load.

    Returns
    -------
    tuple of pandas.DataFrame
        ``(categorical, continuous, target, first_column)`` where
        ``categorical`` holds columns 1-4, ``continuous`` holds column 5,
        ``target`` holds columns 6-7 and ``first_column`` holds column 0.
        NOTE(review): column 0 is presumably a sample identifier and
        columns 6-7 the outcome labels -- confirm against the CSV schema.
    """
    table = pd.read_csv(clinical_path, header=None)

    first_column = table[[0]]
    categorical = table[[1, 2, 3, 4]]
    continuous = table[[5]]
    target = table[[6, 7]]

    return categorical, continuous, target, first_column

def set_seed(seed):
    """Seed every random number generator this notebook relies on.

    Seeds Python's ``random`` module, NumPy's global generator and PyTorch's
    CPU and CUDA generators, exports ``PYTHONHASHSEED`` into the process
    environment, and switches cuDNN to deterministic mode with benchmarking
    turned off.

    Parameters
    ----------
    seed : int
        Value passed to each seeding call.
    """
    # Python and NumPy generators, plus the hash seed environment variable.
    random.seed(seed)
    os.environ['PYTHONHASHSEED'] = str(seed)
    np.random.seed(seed)

    # PyTorch: CPU generator first, then the current and all CUDA devices.
    for seed_fn in (torch.manual_seed, torch.cuda.manual_seed, torch.cuda.manual_seed_all):
        seed_fn(seed)

    # cuDNN: no benchmarking, deterministic mode on.
    cudnn = torch.backends.cudnn
    cudnn.benchmark = False
    cudnn.deterministic = True
        
# Create the output directory, including any missing parents; this is a no-op
# when the directory already exists.
os.makedirs(args.save_path, exist_ok=True)

set_seed(args.seed)
clin_data_x, _, clin_data_y, clinical_label = preprocess_clinical_data(args.clinicla_path)
clinical_label = clinical_label.to_numpy()
# reset_index returns a new frame rather than modifying in place, so the
# result has to be assigned back for the call to have any effect.
clin_data_x = clin_data_x.reset_index(drop=True)
clin_data_y = clin_data_y.reset_index(drop=True)

# Stratify the folds on column 6 of the target frame, passed as a 1-D column.
# NOTE(review): column 6 is presumably the class/event label -- confirm.
train_testVal_strtfdKFold = StratifiedKFold(n_splits=args.fold_range, random_state=args.seed, shuffle=True)
train_testVal_kfold = train_testVal_strtfdKFold.split(clin_data_x, clin_data_y[6])
for k, (train_val, test) in enumerate(train_testVal_kfold):
    # Each fold stores two arrays of row indices under "train" and "test".
    split = {"train": train_val, "test": test}
    # np.save stores the dict as a pickled object array; read it back with
    # np.load(path, allow_pickle=True).item().
    np.save(os.path.join(args.save_path, f"split_{k}.npy"), split)