In [None]:
import numpy as np
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
from sklearn.model_selection import GroupKFold, KFold, cross_val_score, train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, confusion_matrix, classification_report
from sklearn.preprocessing import StandardScaler, LabelEncoder

In [None]:
class DataSetHandler:
    """Load a CSV dataset and prepare it for cross-validated modelling.

    The handler reads the file once at construction time, then
    ``preprocess`` splits the table into features (``X``) and an
    integer-encoded target (``Y``), and ``createKfold`` chooses a
    cross-validation splitter.
    """

    def __init__(self, path, random_state):
        """Read the CSV at *path* and initialise empty modelling state.

        Args:
            path: Location of the CSV file, passed to ``pandas.read_csv``.
            random_state: Seed forwarded to the shuffling ``KFold`` splitter.
        """
        self.file_path = path
        self.data = pd.read_csv(self.file_path)
        self.random_state = random_state
        # Feature table and encoded target, filled in by ``preprocess``.
        self.X = None
        self.Y = None
        # Train/test partitions; not populated by any method in this class.
        self.X_train = None
        self.X_test = None
        self.Y_train = None
        self.Y_test = None
        # Mapping between integer labels and the original phenotype values.
        self.labels = None
        # Optional per-row group column; its presence selects GroupKFold.
        self.group_identifier = None
        # Cross-validation splitter, created by ``createKfold``.
        self.kfolds = None

    def preprocess(self, dropna: bool, impute_value: float = None, phenotype_index: int = -1, group_identifier_index: int = None) -> None:
        """Clean the data and split it into features and encoded target.

        Args:
            dropna: When True, rows containing missing values are removed
                (in place) before anything else happens.
            impute_value: When not None, remaining missing values are
                replaced (in place) by this number.
            phenotype_index: Positional index of the target column.
                Defaults to the last column.
            group_identifier_index: Positional index of a column holding
                group identifiers. When None, ``group_identifier`` is left
                unchanged.

        Raises:
            TypeError: If ``dropna`` is not a bool or ``impute_value`` is
                neither a number nor None.
            IndexError: If a positional column index is out of bounds.
        """
        if not isinstance(dropna, bool):
            raise TypeError("dropna must be a boolean value")
        if not isinstance(impute_value, (float, int, type(None))):
            raise TypeError("impute_value must be a number")

        if dropna:
            self.data.dropna(inplace=True)
        if impute_value is not None:
            self.data.fillna(impute_value, inplace=True)

        # Resolve every column selection before touching any attribute so
        # that a bad index does not leave the handler half-updated.
        phenotype_column = self.data.columns[phenotype_index]
        # Positional access needs ``.iloc``; ``DataFrame[:, i]`` is not
        # valid pandas indexing.
        phenotype_values = self.data.iloc[:, phenotype_index]
        # Drop the column that was actually used as the target rather than
        # unconditionally dropping the last one.
        features = self.data.drop(columns=[phenotype_column])
        groups = None
        if group_identifier_index is not None:
            groups = self.data.iloc[:, group_identifier_index]

        label_encoder = LabelEncoder()
        self.Y = label_encoder.fit_transform(phenotype_values)
        self.labels = pd.DataFrame({
            'label': range(len(label_encoder.classes_)),
            'phenotype': label_encoder.classes_,
        })
        # NOTE(review): the group identifier column, if any, is still part
        # of X - confirm whether it should be excluded from the features.
        self.X = features
        if groups is not None:
            self.group_identifier = groups

    def createKfold(self, k: int) -> None:
        """Create the cross-validation splitter and store it in ``kfolds``.

        ``GroupKFold`` is used when a group identifier has been set,
        otherwise a shuffled ``KFold`` seeded with ``random_state``.

        Args:
            k: Number of folds; must be at least 2.

        Raises:
            TypeError: If ``k`` is not an integer.
            ValueError: If ``k`` is smaller than 2.
        """
        if not isinstance(k, int):
            raise TypeError("k must be an integer")
        # Fail early with a clear message; this also rejects True/False,
        # which pass the isinstance check but are not usable fold counts.
        if k < 2:
            raise ValueError("k must be at least 2")

        if self.group_identifier is not None:
            self.kfolds = GroupKFold(n_splits=k)
        else:
            self.kfolds = KFold(n_splits=k, shuffle=True, random_state=self.random_state)
        

        
    
    
        

 
        
        
        
     
