In [10]:
import os
import json
import numpy as np
import pandas as pd
from sklearn.model_selection import GroupKFold, KFold
from sklearn.preprocessing import LabelEncoder

In [31]:
class DataSetHandler:
    """
    Loads a CSV dataset, label-encodes its phenotype column and creates, saves and reloads
    k-fold splits of its rows (grouped by an identifier column when one is given).

    Attributes:
        file_path: Path the data was read from.
        data: The loaded DataFrame (rows may be dropped / imputed by ``preprocess``).
        random_state: Seed passed to ``KFold`` when no group identifier is used.
        X: Feature DataFrame (every column except the phenotype column); set by ``preprocess``.
        Y: Integer-encoded phenotype labels; set by ``preprocess``.
        labels: DataFrame mapping each integer label to its phenotype name; set by ``preprocess``.
        group_identifier: Series of group ids handed to ``GroupKFold``, or None.
        kfolds: The scikit-learn splitter object.
        fold_indices: List of ``(train_index, test_index)`` positional index arrays.
        fold_data: List of dicts with ``X_train``/``X_test``/``Y_train``/``Y_test`` per fold;
            set by ``createKfold``.
    """

    def __init__(self, path, random_state):
        """
        Reads the CSV at ``path`` immediately and stores ``random_state`` for later splitting.
        All derived attributes start as None until the corresponding method is called.
        """
        self.file_path = path
        self.data = pd.read_csv(self.file_path)
        self.random_state = random_state
        self.X = None
        self.Y = None
        self.labels = None
        self.group_identifier = None
        self.kfolds = None
        self.fold_indices = None
        self.fold_data = None
        print("DataSetHandler initialized successfully")


    def preprocess(self, dropna: bool, impute_value: float = None, phenotype_column: str = 'cell_type', group_identifier_column: str = None) -> None:
        """
        This function processes the loaded dataframe. If NA values are present, they can be dropped or imputed with a specified value. A group identifier can be
        specified so the kfolds are created with the group identifier in mind. The phenotype column is encoded and the data is split into X and Y.

        Args:
            dropna: Drop every row that contains at least one NA value.
            impute_value: If not None, replace remaining NA values with this number.
            phenotype_column: Column holding the class labels to encode.
            group_identifier_column: Optional column whose values define the groups for GroupKFold.

        Raises:
            TypeError: If ``dropna`` is not a bool or ``impute_value`` is not a number / None.
        """

        if not isinstance(dropna, bool):
            raise TypeError("dropna must be a boolean value")
        if not isinstance(impute_value, (float, int, type(None))):
            raise TypeError("impute_value must be a number")

        # Dropping runs before imputing, so with both options set nothing is left to impute.
        if dropna:
            self.data = self.data.dropna()
        if impute_value is not None:
            self.data = self.data.fillna(impute_value)

        label_encoder = LabelEncoder()
        self.Y = label_encoder.fit_transform(self.data[phenotype_column])
        self.labels = pd.DataFrame({
            'label': range(len(label_encoder.classes_)),
            'phenotype': label_encoder.classes_
            })
        # NOTE(review): X keeps every non-phenotype column, including the group identifier
        # column if one is given — confirm downstream models are meant to see it.
        self.X = self.data.drop(columns=[phenotype_column])
        # Always reassign, so re-running preprocess without a group column does not leave a
        # stale grouping behind that createKfold would silently keep using.
        if group_identifier_column is not None:
            self.group_identifier = self.data[group_identifier_column]
        else:
            self.group_identifier = None

        print("Data successfully preprocessed")

    def save_labels(self, save_path: str = None) -> None:
        """
        Writes the label/phenotype mapping to ``<save_path>/labels.csv``.

        Args:
            save_path: Target directory; defaults to the current working directory when None.

        Raises:
            ValueError: If ``preprocess`` has not been called yet.
        """
        if self.labels is None:
            raise ValueError("No labels have been created. Call preprocess first.")

        if save_path is None:
            save_path = os.getcwd()

        # Create the directory up front, mirroring what save_folds does for its own output.
        os.makedirs(save_path, exist_ok=True)
        self.labels.to_csv(os.path.join(save_path, 'labels.csv'), index=False)
        print(f"Labels saved in: {save_path} as labels.csv")

    def createKfold(self, k: int) -> None:
        """
        Creates folds, depending if the user has specified a group identifier or not. If a group identifier is present, GroupKFold is used, otherwise KFold is used.
        Folds will be carried by the fold_data attribute.

        Args:
            k: Number of folds.

        Raises:
            TypeError: If ``k`` is not an integer.
            ValueError: If ``preprocess`` has not been called yet.
        """
        if not isinstance(k, int):
            raise TypeError("k must be an integer")

        if self.X is None or self.Y is None:
            raise ValueError("No preprocessed data available. Call preprocess first.")

        if self.group_identifier is not None:
            # GroupKFold takes no seed here; random_state only affects the ungrouped branch.
            self.kfolds = GroupKFold(n_splits=k)
            fold_generator = self.kfolds.split(self.X, self.Y, self.group_identifier)
        else:
            self.kfolds = KFold(n_splits=k, shuffle=True, random_state=self.random_state)
            fold_generator = self.kfolds.split(self.X)

        # Materialise the generator so the indices can be reused by save_folds.
        self.fold_indices = list(fold_generator)

        # The indices are positional, hence iloc for the DataFrame and plain indexing for Y.
        self.fold_data = [
            {
                'X_train': self.X.iloc[train_index],
                'X_test': self.X.iloc[test_index],
                'Y_train': self.Y[train_index],
                'Y_test': self.Y[test_index]
            }
            for train_index, test_index in self.fold_indices
        ]

        print(f"{k} folds created. To save the folds, call save_folds method.")


    def save_folds(self, save_path: str = None) -> None:
        """
        Saves the fold indices and the per-fold train/test tables under ``<save_path>/kfolds``.

        Writes ``fold_indices.json`` (random state, source data path and the positional
        train/test indices of every fold) plus ``fold_<i>_train.csv`` / ``fold_<i>_test.csv``.

        Args:
            save_path: Parent directory; defaults to the current working directory when None.

        Raises:
            ValueError: If no folds exist yet.
        """

        if self.fold_indices is None:
            raise ValueError("No folds have been created. Call createKfold first.")

        if save_path is None:
            save_path = os.getcwd()

        kfolds_dir = os.path.join(save_path, 'kfolds')
        os.makedirs(kfolds_dir, exist_ok=True)

        # Record where the data came from so load_folds can find it without being told.
        # Only local paths are made absolute; anything else (e.g. a URL) is stored verbatim,
        # and non-path sources such as file objects are stored as null.
        data_path = None
        if isinstance(self.file_path, (str, os.PathLike)):
            data_path = os.fsdecode(self.file_path)
            if os.path.exists(data_path):
                data_path = os.path.abspath(data_path)

        fold_data = {
            'random_state': self.random_state,
            'data_path': data_path,
            'folds': [{'train': train.tolist(), 'test': test.tolist()} for train, test in self.fold_indices]
        }

        with open(os.path.join(kfolds_dir, 'fold_indices.json'), 'w') as f:
            json.dump(fold_data, f)

        for i, (train_index, test_index) in enumerate(self.fold_indices, start=1):
            self.data.iloc[train_index].to_csv(os.path.join(kfolds_dir, f'fold_{i}_train.csv'), index=False)
            self.data.iloc[test_index].to_csv(os.path.join(kfolds_dir, f'fold_{i}_test.csv'), index=False)

        print(f"Folds saved in: {kfolds_dir}")


    @classmethod
    def load_folds(cls, load_path: str = None, data_path: str = None) -> 'DataSetHandler':
        """
        Rebuilds a handler from ``<load_path>/kfolds/fold_indices.json``.

        The data file is resolved in this order: the ``data_path`` argument, the ``data_path``
        recorded in the JSON by ``save_folds``, then a ``fold_1_train_<name>`` file inside the
        kfolds directory (whose ``<name>`` is looked up in that same directory).

        Only ``data``, ``random_state``, ``fold_indices`` and ``kfolds`` are restored; call
        ``preprocess`` afterwards to obtain ``X``, ``Y`` and ``labels``.

        Args:
            load_path: Directory containing the ``kfolds`` folder; defaults to the current
                working directory when None.
            data_path: CSV the fold indices refer to.

        Returns:
            A new handler with the data loaded and the saved fold indices attached.

        Raises:
            ValueError: If no data file can be determined, or if the saved folds do not cover
                exactly the rows of the loaded data.
        """
        if load_path is None:
            load_path = os.getcwd()

        kfolds_dir = os.path.join(load_path, 'kfolds')

        with open(os.path.join(kfolds_dir, 'fold_indices.json'), 'r') as f:
            fold_data = json.load(f)

        if data_path is None:
            data_path = fold_data.get('data_path')
        if data_path is None:
            # Fallback kept from the previous implementation; save_folds itself never writes
            # files with this prefix, so this only helps with differently produced folders.
            prefixed = sorted(name for name in os.listdir(kfolds_dir) if name.startswith('fold_1_train_'))
            if prefixed:
                data_path = os.path.join(kfolds_dir, prefixed[0].replace('fold_1_train_', ''))
        if data_path is None:
            raise ValueError(
                "Could not determine the data file for the saved folds. Pass data_path explicitly."
            )

        # The constructor already reads the CSV, so no second read is needed.
        handler = cls(data_path, fold_data['random_state'])
        # dtype=int keeps an empty index list usable with iloc (np.array([]) would be float).
        handler.fold_indices = [
            (np.array(fold['train'], dtype=int), np.array(fold['test'], dtype=int))
            for fold in fold_data['folds']
        ]

        # The indices are positional. If the table has a different number of rows than when
        # the folds were created (e.g. rows were dropped before splitting), they would
        # silently select the wrong rows, so fail loudly instead.
        n_rows = len(handler.data)
        for i, (train_index, test_index) in enumerate(handler.fold_indices, start=1):
            if len(train_index) + len(test_index) != n_rows:
                raise ValueError(
                    f"Fold {i} covers {len(train_index) + len(test_index)} rows but the data at "
                    f"{data_path} has {n_rows} rows. The folds were created from a different table."
                )

        # NOTE(review): always a KFold, even if the folds were originally made with GroupKFold.
        handler.kfolds = KFold(n_splits=len(handler.fold_indices), shuffle=True, random_state=handler.random_state)

        return handler

        


In [14]:
# Load the cHL1_MIBI_cleaned.csv quantification table into a plain DataFrame for inspection.
# NOTE(review): absolute path on an external volume — this cell only runs where that drive is mounted.
df = pd.read_csv('/Volumes/Lukas_SSD/phenotyping_benchmark/datasets/Maps_data/cHL_1_MIBI/quantification/processed/cHL1_MIBI_cleaned.csv')

In [15]:
# Display the loaded frame; the notebook renders a truncated preview of its rows and columns.
df

Unnamed: 0,CD45,CD20,CD163,Histone H3,CD45RO,CD28,CD153 (CD30L),Lag3,CD4,CD11c,...,CD30,TIM3,RORgT,TCRgd,CD86,CD25,Na-K ATPase,cellSize,identifier,cell_type
0,0.001217,0.016976,0.131463,0.192797,0.001151,0.000207,0.000074,0.000160,0.001707,0.000000,...,0.000000,0.001195,0.000785,0.000921,0.001140,0.001094,0.003093,249,31,M2
1,0.000000,0.000000,0.024141,0.202666,0.000945,0.000000,0.000000,0.000112,0.000000,0.000000,...,0.000000,0.000000,0.000000,0.000000,0.000307,0.000030,0.001797,22,31,M2
2,0.000451,0.000000,0.040998,0.051099,0.003537,0.000000,0.000000,0.000000,0.002056,0.000000,...,0.000000,0.000010,0.000205,0.000214,0.000653,0.001518,0.005575,35,31,M2
3,0.000000,0.000000,0.159136,0.057322,0.000526,0.000000,0.000000,0.000000,0.000156,0.000000,...,0.000000,0.000262,0.000000,0.000328,0.000456,0.000939,0.004156,29,31,M2
4,0.000000,0.000000,0.117901,0.072533,0.003073,0.000000,0.000000,0.000000,0.000000,0.000000,...,0.000000,0.000030,0.000000,0.000702,0.000378,0.001069,0.008932,14,31,M2
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1669848,0.000345,0.000000,0.002531,0.196096,0.001716,0.000462,0.002012,0.000046,0.006395,0.005166,...,0.000003,0.000364,0.000058,0.000490,0.000726,0.000953,0.005393,145,6,Neutrophil
1669849,0.000205,0.000000,0.000139,0.184302,0.001173,0.000058,0.006320,0.000045,0.000000,0.000000,...,0.000000,0.000000,0.000242,0.000564,0.000394,0.000344,0.001827,36,6,Neutrophil
1669850,0.000812,0.000608,0.000000,0.179846,0.001337,0.000555,0.002020,0.000016,0.003382,0.000000,...,0.000000,0.000012,0.000111,0.000868,0.000354,0.000809,0.004953,44,6,Neutrophil
1669851,0.000000,0.000000,0.000000,0.080264,0.000978,0.000000,0.001857,0.000000,0.000000,0.000000,...,0.000000,0.000000,0.000000,0.000150,0.000395,0.000022,0.001163,29,6,Neutrophil


In [26]:
# Build the handler for the cHL1 dataset. The constructor reads the CSV immediately;
# random_state is stored and only passed to KFold when no group identifier is used.
cHL1_datahandler = DataSetHandler('/Volumes/Lukas_SSD/phenotyping_benchmark/datasets/Maps_data/cHL_1_MIBI/quantification/processed/cHL1_MIBI_cleaned.csv', random_state=20240922)

DataSetHandler initialized successfully


In [27]:
# Keep all rows (no NA dropping, no imputation), label-encode 'cell_type' as the target and
# register 'identifier' as the grouping column so the folds are built with GroupKFold.
cHL1_datahandler.preprocess(dropna=False, phenotype_column='cell_type', group_identifier_column='identifier')

Data successfully preprocessed


In [28]:
# Create 5 folds; with a group identifier set by preprocess, the grouped splitter is used.
cHL1_datahandler.createKfold(5)

5 folds created. To save the folds, call save_folds method.


In [29]:
# Persist the folds: writes fold_indices.json plus one train and one test CSV per fold
# into a 'kfolds' directory created under the given save_path.
cHL1_datahandler.save_folds(save_path='/Volumes/Lukas_SSD/phenotyping_benchmark/datasets/Maps_data/cHL_1_MIBI/quantification/')

Folds saved in: /Volumes/Lukas_SSD/phenotyping_benchmark/datasets/Maps_data/cHL_1_MIBI/quantification/kfolds
