In [1]:
from ucimlrepo import fetch_ucirepo
import numpy as np
import pandas as pd
import seaborn as sns
from sklearn.experimental import enable_iterative_imputer
from sklearn.impute import IterativeImputer
from sklearn.decomposition import PCA
from sklearn.model_selection import train_test_split
from sklearn.model_selection import GridSearchCV
from sklearn.metrics import *
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.naive_bayes import MultinomialNB
from sklearn.preprocessing import MinMaxScaler
from sklearn.pipeline import Pipeline
from imblearn.metrics import specificity_score

from sklearnex import patch_sklearn
patch_sklearn()

Intel(R) Extension for Scikit-learn* enabled (https://github.com/intel/scikit-learn-intelex)


In [2]:
pd.set_option('display.max_columns', 200)

In [3]:
def percent_of_nan_rows(dataset:pd.DataFrame, column:str) -> float:
    return len(dataset[dataset[column].isna()]) / len(dataset) * 100

In [4]:
def weighed_specificity_score(y_test, y_pred):
    return specificity_score(y_test, y_pred, average='weighted')

In [5]:
def PerformGridSearchCV(data:pd.DataFrame, *, feature_cols:list, label_col:str, label_col_type:str,
                        train_row_len:int = 1200, model_name:str, cv:int):
    __docstring__ = 'Model can be "xgb", "rf", "dt", "nb".\nLabec col can be categorical "cat" or binary "bin".'
    X_train, X_test, y_train, y_test = (data[feature_cols][:train_row_len],
                                        data[feature_cols][train_row_len:],
                                        data[label_col][:train_row_len],
                                        data[label_col][train_row_len:])
    y_train = y_train.to_numpy().ravel()
    y_test = y_test.to_numpy().ravel()
    if model_name == 'xgb':
        model = GradientBoostingClassifier()
        param_grid = [
    {'n_estimators':[10, 100, 200, 500],
     'learning_rate':[1e-3, 1e-2, 1e-1, 1],
     'max_depth':[1, 2, 3],
    }
    ]
    elif model_name == 'rf':
        model = RandomForestClassifier()
        param_grid = [
    {'n_estimators':[10, 100, 200, 500],
     'criterion':['gini', 'entropy', 'log_loss'],
     'bootstrap': [True, False],
    }
    ]
    elif model_name == 'dt':
        model = DecisionTreeClassifier()
        param_grid = [
    {'criterion':['gini', 'entropy'],
    'class_weight':[None, 'balanced'],
    }
    ]
    elif model_name == 'nb':
        model = Pipeline([('Normalizing',MinMaxScaler()),('MultinomialNB',MultinomialNB())])
        param_grid = [{
            'verbose':[False]
        }]
    else:
        raise ValueError(f'Model should be in ["xgb", "rf", "dt", "nb"], {model} given.')
    
    if label_col_type == 'cat':
        specificity_score_ = make_scorer(weighed_specificity_score, greater_is_better=True)
        scoring = {'accuracy':get_scorer('accuracy'),
                   'recall':get_scorer('recall_weighted'),
                   'f1':get_scorer('f1_weighted'), 'specificity_score':specificity_score_}
    elif label_col_type == 'bin':
        specificity_score_ = make_scorer(specificity_score, greater_is_better=True)
        scoring = {'accuracy':get_scorer('accuracy'),
                   'recall':get_scorer('recall'), 'f1':get_scorer('f1'), 'specificity_score':specificity_score_}
    else:
        raise ValueError(f'Label type should be in ["cat", "bin"], {label_col_type} given.')
    cols = list(map(lambda x: 'mean_test_'+str(x), scoring))
    cols.extend(['mean_fit_time', 'params'])
    cv_obj = GridSearchCV(model, param_grid,
                              cv=cv,
                              scoring=scoring, verbose=0,
                              n_jobs = -1,
                              refit=False,
                              )
    cv_obj.fit(X_train, y_train)
    df = pd.DataFrame(cv_obj.cv_results_)[cols]
    df['model'] = model_name
    df['label_column'] = label_col
    return df

In [6]:
def get_model(model_name:str, params:dict):
    if model_name == 'xgb':
        model = GradientBoostingClassifier(**params)
    elif model_name == 'rf':
        model = RandomForestClassifier(**params)
    elif model_name == 'dt':
        model = DecisionTreeClassifier(**params)
    elif model_name == 'nb':
        model = Pipeline([('Normalizing',MinMaxScaler()),('MultinomialNB',MultinomialNB())], **params)
    return model

In [7]:
#Load dataset
myocardial_infarction_complications = fetch_ucirepo(id=579)
X = myocardial_infarction_complications.data.features
y = myocardial_infarction_complications.data.targets

In [8]:
#Assure that all NaN values are np.nan
X = X.fillna(np.nan)

In [9]:
#Get data on columns
vars_ = pd.DataFrame(myocardial_infarction_complications.variables)

In [10]:
#Number of missing values and number of complete rows
print(f'Missing values amount: {X.isna().sum().sum()}')
print(f'Fully entered rows amount: {len(X.dropna(axis=0, how="any"))}')

Missing values amount: 15974
Fully entered rows amount: 0


In [11]:
#Drop rows with more than 20% of values are NaN
X.drop(index = X[X.count(axis=1)<len(X.columns)*0.8].index, inplace=True)

In [12]:
#Drop columns with more than 30% of values are NaN
#Note: labels columns do not contain NaN values
for column in X.columns:
    if percent_of_nan_rows(X, column) > 30:
        X.drop(columns = [column], inplace=True)
        vars_.drop(index = vars_.loc[vars_['name'] == column].index, axis = 0, inplace=True)

In [13]:
#Get column names for columns of categorical, binary, continuous and int variable types
cat_cols = vars_[(vars_['role'] == 'Feature') & (vars_['type'] == 'Categorical')]['name'].tolist()
bin_cols = vars_[(vars_['role'] == 'Feature') & (vars_['type'] == 'Binary')]['name'].tolist()
int_cols = vars_[(vars_['role'] == 'Feature') & (vars_['type'] == 'Integer')]['name'].tolist()
float_cols = vars_[(vars_['role'] == 'Feature') & (vars_['type'] == 'Continuous')]['name'].tolist()

In [14]:
bin_cols_preds = vars_[(vars_['role'] == 'Target') & (vars_['type'] == 'Binary')]['name'].tolist()
cat_cols_preds = vars_[(vars_['role'] == 'Target') & (vars_['type'] == 'Categorical')]['name'].tolist()

In [15]:
#comment out
#y = pd.get_dummies(data = y, columns = cat_cols_preds)

In [16]:
data = X.merge(y, left_index=True, right_index=True)

In [17]:
#Multivariate iterative imputing of missing variables
imp = IterativeImputer(max_iter=100)

In [18]:
#Number of missing values and number of complete rows
print(f'Missing values amount: {data.isna().sum().sum()}')
print(f'Fully entered rows amount: {len(data.dropna(axis=0, how="any"))}')

Missing values amount: 4473
Fully entered rows amount: 544


In [19]:
imp.fit(data)

In [20]:
#Applying multivariate iterative imputing of missing variables /w rounding of values (needed for categorical data)
imp_data = pd.DataFrame(np.round(imp.transform(data)), columns=data.columns)

In [21]:
imp_data

Unnamed: 0,AGE,SEX,INF_ANAM,STENOK_AN,FK_STENOK,IBS_POST,GB,SIM_GIPERT,DLIT_AG,ZSN_A,nr_11,nr_01,nr_02,nr_03,nr_04,nr_07,nr_08,np_01,np_04,np_05,np_07,np_08,np_09,np_10,endocr_01,endocr_02,endocr_03,zab_leg_01,zab_leg_02,zab_leg_03,zab_leg_04,zab_leg_06,S_AD_ORIT,D_AD_ORIT,O_L_POST,K_SH_POST,MP_TP_POST,SVT_POST,GT_POST,FIB_G_POST,ant_im,lat_im,inf_im,post_im,IM_PG_P,ritm_ecg_p_01,ritm_ecg_p_02,ritm_ecg_p_04,ritm_ecg_p_06,ritm_ecg_p_07,ritm_ecg_p_08,n_r_ecg_p_01,n_r_ecg_p_02,n_r_ecg_p_03,n_r_ecg_p_04,n_r_ecg_p_05,n_r_ecg_p_06,n_r_ecg_p_08,n_r_ecg_p_09,n_r_ecg_p_10,n_p_ecg_p_01,n_p_ecg_p_03,n_p_ecg_p_04,n_p_ecg_p_05,n_p_ecg_p_06,n_p_ecg_p_07,n_p_ecg_p_08,n_p_ecg_p_09,n_p_ecg_p_10,n_p_ecg_p_11,n_p_ecg_p_12,fibr_ter_01,fibr_ter_02,fibr_ter_03,fibr_ter_05,fibr_ter_06,fibr_ter_07,fibr_ter_08,GIPO_K,K_BLOOD,GIPER_NA,NA_BLOOD,ALT_BLOOD,AST_BLOOD,L_BLOOD,ROE,TIME_B_S,R_AB_1_n,R_AB_2_n,R_AB_3_n,NITR_S,NA_R_1_n,NA_R_2_n,NA_R_3_n,NOT_NA_1_n,NOT_NA_2_n,NOT_NA_3_n,LID_S_n,B_BLOK_S_n,ANT_CA_S_n,GEPAR_S_n,ASP_S_n,TIKL_S_n,TRENT_S_n,FIBR_PREDS,PREDS_TAH,JELUD_TAH,FIBR_JELUD,A_V_BLOK,OTEK_LANC,RAZRIV,DRESSLER,ZSN,REC_IM,P_IM_STEN,LET_IS
0,77.0,1.0,2.0,1.0,1.0,2.0,3.0,0.0,7.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,180.0,100.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,5.0,0.0,138.0,0.0,0.0,8.0,16.0,4.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,1.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,55.0,1.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,120.0,90.0,0.0,0.0,0.0,0.0,0.0,0.0,4.0,1.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,4.0,0.0,132.0,0.0,0.0,8.0,3.0,2.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,1.0,0.0,1.0,1.0,1.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,52.0,1.0,0.0,0.0,0.0,2.0,2.0,0.0,2.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,180.0,100.0,0.0,0.0,0.0,0.0,0.0,0.0,4.0,1.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,4.0,0.0,132.0,0.0,0.0,11.0,10.0,3.0,3.0,0.0,0.0,0.0,1.0,0.0,0.0,3.0,2.0,2.0,1.0,1.0,0.0,1.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,68.0,0.0,0.0,0.0,0.0,2.0,2.0,0.0,3.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,120.0,70.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,1.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,4.0,0.0,146.0,1.0,0.0,9.0,14.0,2.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,1.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0
4,60.0,1.0,0.0,0.0,0.0,2.0,3.0,0.0,7.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,160.0,90.0,0.0,0.0,0.0,0.0,0.0,0.0,4.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,4.0,0.0,132.0,0.0,0.0,8.0,16.0,9.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,1.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1555,70.0,0.0,0.0,2.0,2.0,1.0,2.0,0.0,7.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,140.0,80.0,0.0,0.0,0.0,0.0,0.0,0.0,4.0,1.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,4.0,0.0,138.0,1.0,0.0,10.0,14.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,3.0
1556,77.0,0.0,0.0,0.0,0.0,0.0,2.0,0.0,7.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,150.0,90.0,0.0,0.0,0.0,0.0,0.0,0.0,4.0,1.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,4.0,0.0,136.0,0.0,0.0,6.0,20.0,3.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,1.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,3.0
1557,77.0,0.0,0.0,4.0,2.0,1.0,2.0,0.0,7.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,110.0,70.0,0.0,0.0,0.0,0.0,0.0,0.0,3.0,1.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,4.0,0.0,130.0,1.0,1.0,13.0,6.0,2.0,0.0,-0.0,0.0,0.0,0.0,-0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,1.0,0.0,0.0,0.0,0.0,3.0
1558,70.0,0.0,0.0,6.0,2.0,1.0,2.0,0.0,7.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,50.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,2.0,1.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,4.0,0.0,136.0,0.0,0.0,12.0,14.0,2.0,0.0,0.0,0.0,0.0,0.0,0.0,-0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0


In [22]:
#Standardize int and float columns
for column in imp_data.columns:
    if (column in int_cols or column in float_cols):
        imp_data[column] = imp_data[column].map(
            lambda l: (l-imp_data[column].mean())/imp_data[column].std())

In [23]:
#Getting correlation matrix
corr_matrix = imp_data.corr(method='spearman')

In [24]:
identity = np.identity(n=len(corr_matrix.values))

In [25]:
where_ = np.where(corr_matrix.values-identity > 0.8)

In [26]:
where_ = np.vstack([where_[0], where_[1]])

In [27]:
#Displaying columns with significant correlation
for i in range(len(where_[0]))[::2]:
    print(corr_matrix.iloc[where_[0][i]].index[where_[0][i]], end='\t')
    print(corr_matrix.columns[where_[1][i]], end='\t')
    print(corr_matrix.iloc[where_[0][i]][where_[1][i]])

STENOK_AN	FK_STENOK	0.8450335438356769
S_AD_ORIT	D_AD_ORIT	0.8331132905991112
MP_TP_POST	ritm_ecg_p_02	0.845850627969873


In [28]:
#Miocardial rupture leads to death in 100% cases - RAZRIV is not needed as a label (repeats LET_IS_3)
imp_data.drop(columns = 'RAZRIV', inplace=True)
vars_.drop(index = vars_.loc[vars_['name'] == 'RAZRIV'].index, axis = 0, inplace=True)

In [29]:
#Paroxysms of atrial fibrillation at the time of admission ==
#ECG rhythm at the time of admission to hospital: atrial fibrillation - can be dropped to reduce dimensionality
imp_data.drop(columns = 'MP_TP_POST', inplace=True)
vars_.drop(index = vars_.loc[vars_['name'] == 'MP_TP_POST'].index, axis = 0, inplace=True)

In [30]:
imp_data

Unnamed: 0,AGE,SEX,INF_ANAM,STENOK_AN,FK_STENOK,IBS_POST,GB,SIM_GIPERT,DLIT_AG,ZSN_A,nr_11,nr_01,nr_02,nr_03,nr_04,nr_07,nr_08,np_01,np_04,np_05,np_07,np_08,np_09,np_10,endocr_01,endocr_02,endocr_03,zab_leg_01,zab_leg_02,zab_leg_03,zab_leg_04,zab_leg_06,S_AD_ORIT,D_AD_ORIT,O_L_POST,K_SH_POST,SVT_POST,GT_POST,FIB_G_POST,ant_im,lat_im,inf_im,post_im,IM_PG_P,ritm_ecg_p_01,ritm_ecg_p_02,ritm_ecg_p_04,ritm_ecg_p_06,ritm_ecg_p_07,ritm_ecg_p_08,n_r_ecg_p_01,n_r_ecg_p_02,n_r_ecg_p_03,n_r_ecg_p_04,n_r_ecg_p_05,n_r_ecg_p_06,n_r_ecg_p_08,n_r_ecg_p_09,n_r_ecg_p_10,n_p_ecg_p_01,n_p_ecg_p_03,n_p_ecg_p_04,n_p_ecg_p_05,n_p_ecg_p_06,n_p_ecg_p_07,n_p_ecg_p_08,n_p_ecg_p_09,n_p_ecg_p_10,n_p_ecg_p_11,n_p_ecg_p_12,fibr_ter_01,fibr_ter_02,fibr_ter_03,fibr_ter_05,fibr_ter_06,fibr_ter_07,fibr_ter_08,GIPO_K,K_BLOOD,GIPER_NA,NA_BLOOD,ALT_BLOOD,AST_BLOOD,L_BLOOD,ROE,TIME_B_S,R_AB_1_n,R_AB_2_n,R_AB_3_n,NITR_S,NA_R_1_n,NA_R_2_n,NA_R_3_n,NOT_NA_1_n,NOT_NA_2_n,NOT_NA_3_n,LID_S_n,B_BLOK_S_n,ANT_CA_S_n,GEPAR_S_n,ASP_S_n,TIKL_S_n,TRENT_S_n,FIBR_PREDS,PREDS_TAH,JELUD_TAH,FIBR_JELUD,A_V_BLOK,OTEK_LANC,DRESSLER,ZSN,REC_IM,P_IM_STEN,LET_IS
0,1.356955,1.0,2.0,1.0,1.0,2.0,3.0,0.0,7.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.632210,1.059206,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.116897,0.0,0.245763,-0.672061,-0.302823,-0.234990,0.227566,4.0,0.0,0.0,1.0,0.0,-0.621742,-0.261379,-0.203819,0.0,-0.279280,-0.236833,1.0,0.0,0.0,1.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,-0.581310,1.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,-0.567952,0.417712,0.0,0.0,0.0,0.0,0.0,4.0,1.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,-0.251806,0.0,-0.774344,-0.672061,-0.302823,-0.234990,-0.969282,2.0,0.0,0.0,0.0,0.0,-0.621742,-0.261379,-0.203819,1.0,-0.279280,-0.236833,1.0,0.0,1.0,1.0,1.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,-0.845619,1.0,0.0,0.0,0.0,2.0,2.0,0.0,2.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.632210,1.059206,0.0,0.0,0.0,0.0,0.0,4.0,1.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,-0.251806,0.0,-0.774344,-0.672061,-0.302823,0.665709,-0.324825,3.0,3.0,0.0,0.0,0.0,0.659523,-0.261379,-0.203819,3.0,4.757454,5.403772,1.0,1.0,0.0,1.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,0.564028,0.0,0.0,0.0,0.0,2.0,2.0,0.0,3.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,-0.567952,-0.865277,0.0,0.0,0.0,0.0,0.0,0.0,1.0,1.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,-0.251806,0.0,1.605905,1.141806,-0.302823,0.065243,0.043436,2.0,0.0,0.0,1.0,0.0,-0.621742,-0.261379,-0.203819,0.0,-0.279280,-0.236833,0.0,0.0,1.0,1.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0
4,-0.140795,1.0,0.0,0.0,0.0,2.0,3.0,0.0,7.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.898823,0.417712,0.0,0.0,0.0,0.0,0.0,4.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,-0.251806,0.0,-0.774344,-0.672061,-0.302823,-0.234990,0.227566,9.0,0.0,0.0,0.0,0.0,-0.621742,-0.261379,-0.203819,0.0,-0.279280,-0.236833,0.0,0.0,1.0,0.0,1.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1555,0.740234,0.0,0.0,2.0,2.0,1.0,2.0,0.0,7.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.165435,-0.223783,0.0,0.0,0.0,0.0,0.0,4.0,1.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,-0.251806,0.0,0.245763,1.141806,-0.302823,0.365476,0.043436,1.0,0.0,0.0,0.0,0.0,-0.621742,-0.261379,-0.203819,1.0,-0.279280,-0.236833,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,3.0
1556,1.356955,0.0,0.0,0.0,0.0,0.0,2.0,0.0,7.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.532129,0.417712,0.0,0.0,0.0,0.0,0.0,4.0,1.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,-0.251806,0.0,-0.094273,-0.672061,-0.302823,-0.835456,0.595828,3.0,0.0,0.0,0.0,1.0,-0.621742,-0.261379,-0.203819,0.0,2.239087,-0.236833,0.0,0.0,1.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,3.0
1557,1.356955,0.0,0.0,4.0,2.0,1.0,2.0,0.0,7.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,-0.934646,-0.865277,0.0,0.0,0.0,0.0,0.0,3.0,1.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,-0.251806,0.0,-1.114379,1.141806,3.196462,1.266175,-0.693086,2.0,0.0,-0.0,0.0,0.0,-0.621742,-0.261379,-0.203819,0.0,-0.279280,-0.236833,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,3.0
1558,0.740234,0.0,0.0,6.0,2.0,1.0,2.0,0.0,7.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,-3.134808,-5.355739,0.0,1.0,0.0,0.0,0.0,0.0,0.0,2.0,1.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,-0.251806,0.0,-0.094273,-0.672061,-0.302823,0.965942,0.043436,2.0,0.0,0.0,0.0,0.0,-0.621742,-0.261379,-0.203819,0.0,-0.279280,-0.236833,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0


In [31]:
#Arterial hypertension presence correlating to each other
#imp_data[['GB', 'DLIT_AG']]
#Exertional angina pectoris == Functional class (FC) of angina pectoris correlation of presence too
#imp_data[['STENOK_AN', 'FK_STENOK']]

In [32]:
imp_data = pd.get_dummies(data = imp_data, columns = cat_cols).sample(frac=1)

In [33]:
imp_data

Unnamed: 0,AGE,SEX,SIM_GIPERT,nr_11,nr_01,nr_02,nr_03,nr_04,nr_07,nr_08,np_01,np_04,np_05,np_07,np_08,np_09,np_10,endocr_01,endocr_02,endocr_03,zab_leg_01,zab_leg_02,zab_leg_03,zab_leg_04,zab_leg_06,S_AD_ORIT,D_AD_ORIT,O_L_POST,K_SH_POST,SVT_POST,GT_POST,FIB_G_POST,IM_PG_P,ritm_ecg_p_01,ritm_ecg_p_02,ritm_ecg_p_04,ritm_ecg_p_06,ritm_ecg_p_07,ritm_ecg_p_08,n_r_ecg_p_01,n_r_ecg_p_02,n_r_ecg_p_03,n_r_ecg_p_04,n_r_ecg_p_05,n_r_ecg_p_06,n_r_ecg_p_08,n_r_ecg_p_09,n_r_ecg_p_10,n_p_ecg_p_01,n_p_ecg_p_03,n_p_ecg_p_04,n_p_ecg_p_05,n_p_ecg_p_06,n_p_ecg_p_07,n_p_ecg_p_08,n_p_ecg_p_09,n_p_ecg_p_10,n_p_ecg_p_11,n_p_ecg_p_12,fibr_ter_01,fibr_ter_02,fibr_ter_03,fibr_ter_05,fibr_ter_06,fibr_ter_07,fibr_ter_08,GIPO_K,K_BLOOD,GIPER_NA,NA_BLOOD,ALT_BLOOD,AST_BLOOD,L_BLOOD,ROE,NITR_S,NA_R_1_n,NA_R_2_n,NA_R_3_n,NOT_NA_2_n,NOT_NA_3_n,LID_S_n,B_BLOK_S_n,ANT_CA_S_n,GEPAR_S_n,ASP_S_n,TIKL_S_n,TRENT_S_n,FIBR_PREDS,PREDS_TAH,JELUD_TAH,FIBR_JELUD,A_V_BLOK,OTEK_LANC,DRESSLER,ZSN,REC_IM,P_IM_STEN,LET_IS,INF_ANAM_0.0,INF_ANAM_1.0,INF_ANAM_2.0,INF_ANAM_3.0,STENOK_AN_0.0,STENOK_AN_1.0,STENOK_AN_2.0,STENOK_AN_3.0,STENOK_AN_4.0,STENOK_AN_5.0,STENOK_AN_6.0,FK_STENOK_0.0,FK_STENOK_1.0,FK_STENOK_2.0,FK_STENOK_3.0,FK_STENOK_4.0,IBS_POST_0.0,IBS_POST_1.0,IBS_POST_2.0,GB_0.0,GB_1.0,GB_2.0,GB_3.0,DLIT_AG_0.0,DLIT_AG_1.0,DLIT_AG_2.0,DLIT_AG_3.0,DLIT_AG_4.0,DLIT_AG_5.0,DLIT_AG_6.0,DLIT_AG_7.0,DLIT_AG_8.0,DLIT_AG_9.0,ZSN_A_0.0,ZSN_A_1.0,ZSN_A_2.0,ZSN_A_3.0,ZSN_A_4.0,ant_im_0.0,ant_im_1.0,ant_im_2.0,ant_im_3.0,ant_im_4.0,lat_im_0.0,lat_im_1.0,lat_im_2.0,lat_im_3.0,lat_im_4.0,inf_im_-1.0,inf_im_0.0,inf_im_1.0,inf_im_2.0,inf_im_3.0,inf_im_4.0,post_im_0.0,post_im_1.0,post_im_2.0,post_im_3.0,post_im_4.0,TIME_B_S_1.0,TIME_B_S_2.0,TIME_B_S_3.0,TIME_B_S_4.0,TIME_B_S_5.0,TIME_B_S_6.0,TIME_B_S_7.0,TIME_B_S_8.0,TIME_B_S_9.0,R_AB_1_n_0.0,R_AB_1_n_1.0,R_AB_1_n_2.0,R_AB_1_n_3.0,R_AB_2_n_0.0,R_AB_2_n_1.0,R_AB_2_n_2.0,R_AB_2_n_3.0,R_AB_3_n_0.0,R_AB_3_n_1.0,R_AB_3_n_2.0,R_AB_3_n_3.0,NOT_NA_1_n_0.0,NOT_NA_1_n_1.0,NOT_NA_1_n_2.0,NOT_NA_1_n_3.0,NOT_NA_1_n_4.0
101,-0.493207,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.385451,0.289413,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.116897,0.0,0.585798,1.141806,-0.302823,-0.535223,-0.877217,0.0,1.940787,2.610107,-0.203819,4.757454,-0.236833,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,1.0,1.0,0.0,0.0,True,False,False,False,False,False,False,False,False,False,True,False,False,True,False,False,False,False,True,False,False,False,True,False,False,False,False,False,False,True,False,False,False,True,False,False,False,False,True,False,False,False,False,True,False,False,False,False,False,True,False,False,False,False,False,False,True,False,False,False,False,True,False,False,False,False,False,False,True,False,False,False,True,False,False,False,False,False,True,False,False,True,False,False,False
766,-1.374236,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.998904,1.700701,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,-0.251806,0.0,0.585798,-0.672061,-0.302823,-0.535223,-0.877217,0.0,-0.621742,-0.261379,-0.203819,4.757454,5.403772,0.0,0.0,1.0,1.0,1.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,False,False,True,False,False,False,False,False,False,True,False,False,True,False,False,False,False,True,False,False,False,True,False,False,False,False,False,False,True,False,False,False,False,True,False,False,False,False,False,False,False,False,True,False,True,False,False,False,False,True,False,False,False,False,True,False,False,False,False,True,False,False,False,False,False,False,False,False,True,False,False,False,True,False,False,False,True,False,False,False,False,False,False,True,False
1454,0.123514,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,-1.301340,-0.223783,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,-0.251806,0.0,-0.094273,1.141806,-0.302823,-0.835456,3.173655,0.0,-0.621742,-0.261379,-0.203819,-0.279280,-0.236833,0.0,0.0,1.0,1.0,1.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,5.0,True,False,False,False,True,False,False,False,False,False,False,True,False,False,False,False,False,False,True,True,False,False,False,True,False,False,False,False,False,False,False,False,False,True,False,False,False,False,True,False,False,False,False,True,False,False,False,False,False,False,False,False,False,True,False,True,False,False,False,False,False,False,False,False,False,False,False,True,True,False,False,False,True,False,False,False,True,False,False,False,True,False,False,False,False
985,-1.021825,1.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.165435,-0.223783,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.116897,0.0,-1.114379,-0.672061,-0.302823,-1.435923,-0.969282,0.0,-0.621742,-0.261379,-0.203819,-0.279280,-0.236833,0.0,1.0,1.0,1.0,0.0,0.0,1.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,True,False,False,False,False,False,False,False,False,False,True,False,False,True,False,False,False,True,False,False,False,True,False,False,False,False,False,True,False,False,False,False,False,True,False,False,False,False,True,False,False,False,False,True,False,False,False,False,False,False,False,True,False,False,True,False,False,False,False,True,False,False,False,False,False,False,False,False,True,False,False,False,True,False,False,False,True,False,False,False,True,False,False,False,False
1276,2.502293,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.348782,0.032815,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,-0.251806,0.0,1.095852,-0.672061,-0.302823,-0.535223,2.253003,1.0,-0.621742,-0.261379,-0.203819,-0.279280,-0.236833,0.0,0.0,1.0,1.0,1.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,False,True,False,False,False,False,False,False,False,False,True,False,False,True,False,False,False,True,False,False,False,True,False,False,False,False,False,False,False,False,True,False,False,True,False,False,False,False,True,False,False,False,False,True,False,False,False,False,False,False,False,False,False,True,True,False,False,False,False,False,False,False,False,True,False,False,False,False,True,False,False,False,True,False,False,False,True,False,False,False,True,False,False,False,False
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1282,-0.581310,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.018758,-0.095484,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,-0.251806,-0.0,-0.264290,1.141806,-0.302823,0.365476,1.516480,1.0,-0.621742,-0.261379,-0.203819,-0.279280,-0.236833,1.0,0.0,0.0,1.0,1.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,False,False,True,False,False,False,False,False,False,False,True,False,False,True,False,False,False,False,True,False,False,True,False,False,False,False,False,False,False,False,True,False,False,False,False,True,False,False,True,False,False,False,False,True,False,False,False,False,False,False,False,True,False,False,True,False,False,False,False,False,True,False,False,False,False,False,False,False,True,False,False,False,True,False,False,False,True,False,False,False,True,False,False,False,False
71,0.299719,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.055427,0.161114,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,-0.251806,0.0,0.075745,-0.672061,-0.302823,-0.535223,0.411697,0.0,-0.621742,-0.261379,-0.203819,-0.279280,-0.236833,0.0,0.0,1.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,True,False,False,False,False,True,False,False,False,False,False,False,False,False,True,False,False,False,True,False,False,True,False,False,True,False,False,False,False,False,False,False,False,False,True,False,False,False,False,False,False,False,True,False,True,False,False,False,False,True,False,False,False,False,True,False,False,False,False,False,False,False,False,False,False,False,False,True,True,False,False,False,True,False,False,False,True,False,False,False,True,False,False,False,False
575,-1.550442,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,-0.567952,-0.223783,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,-0.251806,0.0,0.075745,1.141806,-0.302823,-1.135690,0.872023,0.0,-0.621742,-0.261379,-0.203819,-0.279280,-0.236833,0.0,1.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,True,False,False,False,True,False,False,False,False,False,False,True,False,False,False,False,True,False,False,True,False,False,False,True,False,False,False,False,False,False,False,False,False,True,False,False,False,False,True,False,False,False,False,True,False,False,False,False,False,False,False,True,False,False,True,False,False,False,False,False,False,False,False,False,False,False,False,True,True,False,False,False,True,False,False,False,True,False,False,False,True,False,False,False,False
124,0.123514,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.422121,0.353562,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,-0.251806,0.0,-0.774344,-0.672061,-0.302823,-0.835456,-0.877217,0.0,0.659523,-0.261379,-0.203819,-0.279280,-0.236833,1.0,0.0,1.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,True,False,False,False,False,False,False,False,True,False,False,False,False,True,False,False,False,True,False,False,False,True,False,False,False,False,False,False,False,True,False,False,False,True,False,False,False,False,False,True,False,False,False,False,True,False,False,False,False,True,False,False,False,False,False,True,False,False,False,False,False,True,False,False,False,False,False,False,True,False,False,False,True,False,False,False,True,False,False,False,True,False,False,False,False


In [34]:
#Features and labels for classification
features = imp_data.columns[~imp_data.columns.isin(y.columns)]

In [35]:
labels = imp_data.columns[imp_data.columns.isin(y.columns)]

In [36]:
#Attemptimg PCA on dummy dataset
pca = PCA(0.9, svd_solver='full')
imp_data_pcanalysed = pca.fit_transform(imp_data[features])
imp_data_pcanalysed

array([[-0.60034643,  3.59476145,  1.2206785 , ...,  0.01785905,
        -0.10805249, -0.16765673],
       [-3.18141715,  3.92283655,  2.86704957, ..., -0.54711917,
         0.16683508,  0.12848818],
       [ 0.93576684, -0.66276598, -0.71105061, ...,  0.17472208,
         0.06629801, -0.49180179],
       ...,
       [ 0.96540434, -1.09063807,  1.5289888 , ...,  0.31717388,
        -0.13639015,  0.11628874],
       [-0.86195825, -0.36697482, -0.23794998, ...,  0.3775306 ,
        -0.59199761, -0.53570789],
       [-0.95594945,  1.90192453,  2.2190573 , ..., -0.36325857,
         0.47205633,  0.23588763]])

In [37]:
imp_data_pcanalysed.shape
#51-dimensional data - 3 times reduction, but the curse of dimensionality is not lifted

(1560, 51)

In [38]:
imp_data_pcanalysed_train = imp_data_pcanalysed[:1200]
imp_data_pcanalysed_test = imp_data_pcanalysed[1200:]

In [39]:
#To avoid the need for multi-label classification, the model will be trained on one label at a time
#Starting with Letality prediction

In [40]:
labels = labels[labels.str.startswith('LET_IS')][0]

In [41]:
#Train-test split on shuffled data w/o sklearn
X_train, X_test, y_train, y_test = imp_data[features][:1200], imp_data[features][1200:], imp_data[labels][:1200], imp_data[labels][1200:]
y_train = y_train.to_numpy().ravel()
y_test = y_test.to_numpy().ravel()

In [42]:
#The best classification algorithm will be searched via Grid search cross-validation
#It allows the testing of best parameters for any classifier
#Classifiers tested will be:
'''
xGB
Random Forests
Decision Trees
Multinomial Naive Bayes
'''
model_list = ['xgb', 'rf', 'dt', 'nb']
#Adding other models is not recommended - addition of class instances required to all functions

In [43]:
cv_ = 6

In [44]:
out = pd.DataFrame()
for model in model_list:
    res = PerformGridSearchCV(imp_data, feature_cols=features, label_col=labels, label_col_type='cat',
                             model_name=model, cv=cv_)
    out = pd.concat([out, res], ignore_index=True, axis=0)

In [45]:
first_metric = 'mean_test_specificity_score'
sec_metric = 'mean_test_f1'

In [46]:
#Employ a sum of 2 metrics - both F1 and specificity are equaly important
metric_sum = 'sum_f1_spec'
out[metric_sum] = out[first_metric]+out[sec_metric]
out = out.sort_values(metric_sum, ascending=False)

In [47]:
out

Unnamed: 0,mean_test_accuracy,mean_test_recall,mean_test_f1,mean_test_specificity_score,mean_fit_time,params,model,label_column,sum_f1_spec
40,0.755833,0.755833,0.760593,0.479289,0.193084,"{'learning_rate': 1, 'max_depth': 2, 'n_estima...",xgb,LET_IS,1.239882
31,0.839167,0.839167,0.806233,0.432948,9.756044,"{'learning_rate': 0.1, 'max_depth': 2, 'n_esti...",xgb,LET_IS,1.239181
72,0.764167,0.764167,0.767430,0.469904,0.013229,"{'class_weight': None, 'criterion': 'gini'}",dt,LET_IS,1.237334
27,0.836667,0.836667,0.806422,0.428202,5.834016,"{'learning_rate': 0.1, 'max_depth': 1, 'n_esti...",xgb,LET_IS,1.234624
46,0.735000,0.735000,0.745174,0.484130,5.020217,"{'learning_rate': 1, 'max_depth': 3, 'n_estima...",xgb,LET_IS,1.229304
...,...,...,...,...,...,...,...,...,...
8,0.842500,0.842500,0.770484,0.157500,0.294432,"{'learning_rate': 0.001, 'max_depth': 3, 'n_es...",xgb,LET_IS,0.927984
6,0.842500,0.842500,0.770484,0.157500,3.938636,"{'learning_rate': 0.001, 'max_depth': 2, 'n_es...",xgb,LET_IS,0.927984
5,0.842500,0.842500,0.770484,0.157500,1.942425,"{'learning_rate': 0.001, 'max_depth': 2, 'n_es...",xgb,LET_IS,0.927984
0,0.842500,0.842500,0.770484,0.157500,0.133295,"{'learning_rate': 0.001, 'max_depth': 1, 'n_es...",xgb,LET_IS,0.927984


In [48]:
model_name, params = out.loc[0][['model' ,'params']]

In [49]:
model = get_model(model_name, params)

In [50]:
y_pred = model.fit(X_train, y_train).predict(X_test)

In [51]:
#F1 score for predicting mortality
f1_score(y_test, y_pred, average='weighted')

0.7889554224883565

In [52]:
#Low specificity due to class imbalance
specificity_score(y_test, y_pred, average='weighted')

0.14444444444444443

In [53]:
#Binary variables now
labels = imp_data.columns[imp_data.columns.isin(y.columns)]
labels = labels[~labels.str.startswith('LET_IS')]

In [54]:
#Reduce to 3 since dataset is too imbalanced - some values do not appear in entire fractions of the train set
cv_ = 3

In [55]:
out = pd.DataFrame()
for label in labels:
    for model in model_list:
        res = PerformGridSearchCV(imp_data, feature_cols=features, label_col=label, label_col_type='bin',
                                 model_name=model, cv=cv_)
        out = pd.concat([out, res], ignore_index=True, axis=0)
#Precision is ill-defined and being set to 0.0 due to no predicted samples. - some GridSearchCV instances
# only contain "False" label values, so the TP/P ratio cannot be calculated due to profound set imbalance

In [56]:
metric_sum = 'sum_f1_spec'
out[metric_sum] = out[first_metric]+out[sec_metric]
out = out.sort_values(metric_sum, ascending=False)

In [57]:
#Max F1 score is up to ~0.47 - too low
out

Unnamed: 0,mean_test_accuracy,mean_test_recall,mean_test_f1,mean_test_specificity_score,mean_fit_time,params,model,label_column,sum_f1_spec
575,0.788333,0.310788,0.406572,0.935644,0.017082,"{'learning_rate': 1, 'max_depth': 1, 'n_estima...",xgb,ZSN,1.342216
566,0.789167,0.275439,0.380797,0.947652,0.514262,"{'learning_rate': 0.1, 'max_depth': 1, 'n_esti...",xgb,ZSN,1.328448
565,0.795833,0.247144,0.361929,0.965106,0.209648,"{'learning_rate': 0.1, 'max_depth': 1, 'n_esti...",xgb,ZSN,1.327035
600,0.795833,0.236581,0.351802,0.968370,0.216782,"{'bootstrap': False, 'criterion': 'gini', 'n_e...",rf,ZSN,1.320173
569,0.783333,0.286002,0.382451,0.936748,0.362807,"{'learning_rate': 0.1, 'max_depth': 2, 'n_esti...",xgb,ZSN,1.319199
...,...,...,...,...,...,...,...,...,...
152,0.970000,0.000000,0.000000,0.981445,0.006909,"{'class_weight': 'balanced', 'criterion': 'ent...",dt,PREDS_TAH,0.981445
229,0.932500,0.030303,0.022989,0.958012,0.005916,"{'class_weight': 'balanced', 'criterion': 'ent...",dt,JELUD_TAH,0.981001
151,0.965833,0.000000,0.000000,0.977224,0.006743,"{'class_weight': 'balanced', 'criterion': 'gini'}",dt,PREDS_TAH,0.977224
502,0.926667,0.000000,0.000000,0.972878,0.022048,"{'learning_rate': 1, 'max_depth': 2, 'n_estima...",xgb,DRESSLER,0.972878


In [58]:
#Testing PCA prediction F1 score for predicting mortality
model = get_model(model_name, params)
y_pred = model.fit(imp_data_pcanalysed_train, y_train).predict(imp_data_pcanalysed_test)
f1_score(y_test, y_pred, average='weighted')
#It is lower than that of the dataset without SVD dimensionality reduction - PCA is detremental to scoring
#(may however improve fit and pred times)

0.7889554224883565

In [59]:
specificity_score(y_test, y_pred, average='weighted')

0.14444444444444443

In [60]:
#Max values for separate variables - bad overall
out.groupby('label_column').max('mean_test_f1')

Unnamed: 0_level_0,mean_test_accuracy,mean_test_recall,mean_test_f1,mean_test_specificity_score,mean_fit_time,sum_f1_spec
label_column,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
A_V_BLOK,0.965833,0.244444,0.264198,1.0,1.239439,1.239955
DRESSLER,0.9525,0.087719,0.080556,1.0,1.220166,1.044685
FIBR_JELUD,0.959167,0.218137,0.204373,1.0,1.218622,1.164377
FIBR_PREDS,0.899167,0.216996,0.235918,1.0,1.257875,1.174978
JELUD_TAH,0.9725,0.181818,0.183288,1.0,1.234465,1.168918
OTEK_LANC,0.905,0.347503,0.349215,1.0,1.264928,1.28187
PREDS_TAH,0.989167,0.216667,0.228571,1.0,1.296779,1.22138
P_IM_STEN,0.915833,0.186275,0.171908,1.0,1.232417,1.103504
REC_IM,0.913333,0.209524,0.179177,1.0,1.240906,1.110389
ZSN,0.795833,0.363904,0.419009,1.0,1.303235,1.342216
