In [1]:
from ucimlrepo import fetch_ucirepo
import numpy as np
import pandas as pd
import seaborn as sns
from sklearn.experimental import enable_iterative_imputer
from sklearn.impute import IterativeImputer
from sklearn.decomposition import PCA
from sklearn.model_selection import train_test_split
from sklearn.model_selection import GridSearchCV
from sklearn.metrics import *
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.naive_bayes import MultinomialNB
from sklearn.preprocessing import MinMaxScaler
from sklearn.pipeline import Pipeline
from imblearn.metrics import specificity_score

from sklearnex import patch_sklearn
patch_sklearn()

Intel(R) Extension for Scikit-learn* enabled (https://github.com/intel/scikit-learn-intelex)


In [2]:
pd.set_option('display.max_columns', 200)

In [3]:
def percent_of_nan_rows(dataset:pd.DataFrame, column:str) -> float:
    return len(dataset[dataset[column].isna()]) / len(dataset) * 100

In [4]:
def weighed_specificity_score(y_test, y_pred):
    return specificity_score(y_test, y_pred, average='weighted')

In [5]:
def PerformGridSearchCV(data:pd.DataFrame, *, feature_cols:list, label_col:str, label_col_type:str,
                        train_row_len:int = 1200, model_name:str, cv:int):
    __docstring__ = 'Model can be "xgb", "rf", "dt", "nb".\nLabec col can be categorical "cat" or binary "bin".'
    X_train, X_test, y_train, y_test = (data[feature_cols][:train_row_len],
                                        data[feature_cols][train_row_len:],
                                        data[label_col][:train_row_len],
                                        data[label_col][train_row_len:])
    y_train = y_train.to_numpy().ravel()
    y_test = y_test.to_numpy().ravel()
    if model_name == 'xgb':
        model = GradientBoostingClassifier()
        param_grid = [
    {'n_estimators':[10, 100, 200, 500],
     'learning_rate':[1e-3, 1e-2, 1e-1, 1],
     'max_depth':[1, 2, 3],
    }
    ]
    elif model_name == 'rf':
        model = RandomForestClassifier()
        param_grid = [
    {'n_estimators':[10, 100, 200, 500],
     'criterion':['gini', 'entropy', 'log_loss'],
     'bootstrap': [True, False],
    }
    ]
    elif model_name == 'dt':
        model = DecisionTreeClassifier()
        param_grid = [
    {'criterion':['gini', 'entropy'],
    'class_weight':[None, 'balanced'],
    }
    ]
    elif model_name == 'nb':
        model = Pipeline([('Normalizing',MinMaxScaler()),('MultinomialNB',MultinomialNB())])
        param_grid = [{
            'verbose':[False]
        }]
    else:
        raise ValueError(f'Model should be in ["xgb", "rf", "dt", "nb"], {model} given.')
    
    if label_col_type == 'cat':
        specificity_score_ = make_scorer(weighed_specificity_score, greater_is_better=True)
        scoring = {'accuracy':get_scorer('accuracy'),
                   'recall':get_scorer('recall_weighted'),
                   'f1':get_scorer('f1_weighted'), 'specificity_score':specificity_score_}
    elif label_col_type == 'bin':
        specificity_score_ = make_scorer(specificity_score, greater_is_better=True)
        scoring = {'accuracy':get_scorer('accuracy'),
                   'recall':get_scorer('recall'), 'f1':get_scorer('f1'), 'specificity_score':specificity_score_}
    else:
        raise ValueError(f'Label type should be in ["cat", "bin"], {label_col_type} given.')
    cols = list(map(lambda x: 'mean_test_'+str(x), scoring))
    cols.extend(['mean_fit_time', 'params'])
    cv_obj = GridSearchCV(model, param_grid,
                              cv=cv,
                              scoring=scoring, verbose=0,
                              n_jobs = -1,
                              refit=False,
                              )
    cv_obj.fit(X_train, y_train)
    df = pd.DataFrame(cv_obj.cv_results_)[cols]
    df['model'] = model_name
    df['label_column'] = label_col
    return df

In [6]:
def get_model(model_name:str, params:dict):
    if model_name == 'xgb':
        model = GradientBoostingClassifier(**params)
    elif model_name == 'rf':
        model = RandomForestClassifier(**params)
    elif model_name == 'dt':
        model = DecisionTreeClassifier(**params)
    elif model_name == 'nb':
        model = Pipeline([('Normalizing',MinMaxScaler()),('MultinomialNB',MultinomialNB())], **params)
    return model

In [7]:
#Load dataset
myocardial_infarction_complications = fetch_ucirepo(id=579)
X = myocardial_infarction_complications.data.features
y = myocardial_infarction_complications.data.targets

In [8]:
#Assure that all NaN values are np.nan
X = X.fillna(np.nan)

In [9]:
#Get data on columns
vars_ = pd.DataFrame(myocardial_infarction_complications.variables)

In [10]:
#Number of missing values and number of complete rows
print(f'Missing values amount: {X.isna().sum().sum()}')
print(f'Fully entered rows amount: {len(X.dropna(axis=0, how="any"))}')

Missing values amount: 15974
Fully entered rows amount: 0


In [11]:
#Drop rows with more than 20% of values are NaN
X.drop(index = X[X.count(axis=1)<len(X.columns)*0.8].index, inplace=True)

In [12]:
#Drop columns with more than 30% of values are NaN
#Note: labels columns do not contain NaN values
for column in X.columns:
    if percent_of_nan_rows(X, column) > 30:
        X.drop(columns = [column], inplace=True)
        vars_.drop(index = vars_.loc[vars_['name'] == column].index, axis = 0, inplace=True)

In [13]:
#Get column names for columns of categorical, binary, continuous and int variable types
cat_cols = vars_[(vars_['role'] == 'Feature') & (vars_['type'] == 'Categorical')]['name'].tolist()
bin_cols = vars_[(vars_['role'] == 'Feature') & (vars_['type'] == 'Binary')]['name'].tolist()
int_cols = vars_[(vars_['role'] == 'Feature') & (vars_['type'] == 'Integer')]['name'].tolist()
float_cols = vars_[(vars_['role'] == 'Feature') & (vars_['type'] == 'Continuous')]['name'].tolist()

In [14]:
bin_cols_preds = vars_[(vars_['role'] == 'Target') & (vars_['type'] == 'Binary')]['name'].tolist()
cat_cols_preds = vars_[(vars_['role'] == 'Target') & (vars_['type'] == 'Categorical')]['name'].tolist()

In [15]:
#comment out
y = pd.get_dummies(data = y, columns = cat_cols_preds)

In [16]:
data = X.merge(y, left_index=True, right_index=True)

In [17]:
#Multivariate iterative imputing of missing variables
imp = IterativeImputer(max_iter=100)

In [18]:
#Number of missing values and number of complete rows
print(f'Missing values amount: {data.isna().sum().sum()}')
print(f'Fully entered rows amount: {len(data.dropna(axis=0, how="any"))}')

Missing values amount: 4473
Fully entered rows amount: 544


In [19]:
imp.fit(data)

In [20]:
#Applying multivariate iterative imputing of missing variables /w rounding of values (needed for categorical data)
imp_data = pd.DataFrame(np.round(imp.transform(data)), columns=data.columns)

In [21]:
imp_data

Unnamed: 0,AGE,SEX,INF_ANAM,STENOK_AN,FK_STENOK,IBS_POST,GB,SIM_GIPERT,DLIT_AG,ZSN_A,nr_11,nr_01,nr_02,nr_03,nr_04,nr_07,nr_08,np_01,np_04,np_05,np_07,np_08,np_09,np_10,endocr_01,endocr_02,endocr_03,zab_leg_01,zab_leg_02,zab_leg_03,zab_leg_04,zab_leg_06,S_AD_ORIT,D_AD_ORIT,O_L_POST,K_SH_POST,MP_TP_POST,SVT_POST,GT_POST,FIB_G_POST,ant_im,lat_im,inf_im,post_im,IM_PG_P,ritm_ecg_p_01,ritm_ecg_p_02,ritm_ecg_p_04,ritm_ecg_p_06,ritm_ecg_p_07,ritm_ecg_p_08,n_r_ecg_p_01,n_r_ecg_p_02,n_r_ecg_p_03,n_r_ecg_p_04,n_r_ecg_p_05,n_r_ecg_p_06,n_r_ecg_p_08,n_r_ecg_p_09,n_r_ecg_p_10,n_p_ecg_p_01,n_p_ecg_p_03,n_p_ecg_p_04,n_p_ecg_p_05,n_p_ecg_p_06,n_p_ecg_p_07,n_p_ecg_p_08,n_p_ecg_p_09,n_p_ecg_p_10,n_p_ecg_p_11,n_p_ecg_p_12,fibr_ter_01,fibr_ter_02,fibr_ter_03,fibr_ter_05,fibr_ter_06,fibr_ter_07,fibr_ter_08,GIPO_K,K_BLOOD,GIPER_NA,NA_BLOOD,ALT_BLOOD,AST_BLOOD,L_BLOOD,ROE,TIME_B_S,R_AB_1_n,R_AB_2_n,R_AB_3_n,NITR_S,NA_R_1_n,NA_R_2_n,NA_R_3_n,NOT_NA_1_n,NOT_NA_2_n,NOT_NA_3_n,LID_S_n,B_BLOK_S_n,ANT_CA_S_n,GEPAR_S_n,ASP_S_n,TIKL_S_n,TRENT_S_n,FIBR_PREDS,PREDS_TAH,JELUD_TAH,FIBR_JELUD,A_V_BLOK,OTEK_LANC,RAZRIV,DRESSLER,ZSN,REC_IM,P_IM_STEN,LET_IS_0,LET_IS_1,LET_IS_2,LET_IS_3,LET_IS_4,LET_IS_5,LET_IS_6,LET_IS_7
0,77.0,1.0,2.0,1.0,1.0,2.0,3.0,0.0,7.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,180.0,100.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,5.0,0.0,138.0,0.0,0.0,8.0,16.0,4.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,1.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,55.0,1.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,120.0,90.0,0.0,0.0,0.0,0.0,0.0,0.0,4.0,1.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,4.0,0.0,132.0,0.0,0.0,8.0,3.0,2.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,1.0,0.0,1.0,1.0,1.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,52.0,1.0,0.0,0.0,0.0,2.0,2.0,0.0,2.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,180.0,100.0,0.0,0.0,0.0,0.0,0.0,0.0,4.0,1.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,4.0,0.0,132.0,0.0,0.0,11.0,10.0,3.0,3.0,0.0,0.0,0.0,1.0,0.0,0.0,3.0,2.0,2.0,1.0,1.0,0.0,1.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,68.0,0.0,0.0,0.0,0.0,2.0,2.0,0.0,3.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,120.0,70.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,1.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,4.0,0.0,146.0,1.0,0.0,9.0,14.0,2.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,1.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,60.0,1.0,0.0,0.0,0.0,2.0,3.0,0.0,7.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,160.0,90.0,0.0,0.0,0.0,0.0,0.0,0.0,4.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,4.0,0.0,132.0,0.0,0.0,8.0,16.0,9.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,1.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1555,70.0,0.0,0.0,2.0,2.0,1.0,2.0,0.0,7.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,140.0,80.0,0.0,0.0,0.0,0.0,0.0,0.0,4.0,1.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,4.0,0.0,137.0,1.0,0.0,10.0,14.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,-0.0,1.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0
1556,77.0,0.0,0.0,0.0,0.0,0.0,2.0,0.0,7.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,150.0,90.0,0.0,0.0,0.0,0.0,0.0,0.0,4.0,1.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,4.0,0.0,136.0,0.0,0.0,6.0,20.0,3.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,1.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0
1557,77.0,0.0,0.0,4.0,2.0,1.0,2.0,0.0,7.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,110.0,70.0,0.0,0.0,0.0,0.0,0.0,0.0,3.0,1.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,4.0,0.0,130.0,1.0,1.0,13.0,6.0,2.0,0.0,-0.0,0.0,0.0,0.0,-0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0
1558,70.0,0.0,0.0,6.0,2.0,1.0,2.0,0.0,7.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,50.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,2.0,1.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,4.0,0.0,136.0,0.0,0.0,12.0,15.0,2.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0


In [22]:
#Standardize int and float columns
for column in imp_data.columns:
    if (column in int_cols or column in float_cols):
        imp_data[column] = imp_data[column].map(
            lambda l: (l-imp_data[column].mean())/imp_data[column].std())

In [23]:
#Getting correlation matrix
corr_matrix = imp_data.corr(method='spearman')

In [24]:
identity = np.identity(n=len(corr_matrix.values))

In [25]:
where_ = np.where(corr_matrix.values-identity > 0.8)

In [26]:
where_ = np.vstack([where_[0], where_[1]])

In [27]:
#Displaying columns with significant correlation
for i in range(len(where_[0]))[::2]:
    print(corr_matrix.iloc[where_[0][i]].index[where_[0][i]], end='\t')
    print(corr_matrix.columns[where_[1][i]], end='\t')
    print(corr_matrix.iloc[where_[0][i]][where_[1][i]])

STENOK_AN	FK_STENOK	0.8452368695301404
S_AD_ORIT	D_AD_ORIT	0.8336463802737631
MP_TP_POST	ritm_ecg_p_02	0.845850627969873
RAZRIV	LET_IS_3	1.0


In [28]:
#Miocardial rupture leads to death in 100% cases - RAZRIV is not needed as a label (repeats LET_IS_3)
imp_data.drop(columns = 'RAZRIV', inplace=True)
vars_.drop(index = vars_.loc[vars_['name'] == 'RAZRIV'].index, axis = 0, inplace=True)

In [29]:
#Paroxysms of atrial fibrillation at the time of admission ==
#ECG rhythm at the time of admission to hospital: atrial fibrillation - can be dropped to reduce dimensionality
imp_data.drop(columns = 'MP_TP_POST', inplace=True)
vars_.drop(index = vars_.loc[vars_['name'] == 'MP_TP_POST'].index, axis = 0, inplace=True)

In [30]:
imp_data

Unnamed: 0,AGE,SEX,INF_ANAM,STENOK_AN,FK_STENOK,IBS_POST,GB,SIM_GIPERT,DLIT_AG,ZSN_A,nr_11,nr_01,nr_02,nr_03,nr_04,nr_07,nr_08,np_01,np_04,np_05,np_07,np_08,np_09,np_10,endocr_01,endocr_02,endocr_03,zab_leg_01,zab_leg_02,zab_leg_03,zab_leg_04,zab_leg_06,S_AD_ORIT,D_AD_ORIT,O_L_POST,K_SH_POST,SVT_POST,GT_POST,FIB_G_POST,ant_im,lat_im,inf_im,post_im,IM_PG_P,ritm_ecg_p_01,ritm_ecg_p_02,ritm_ecg_p_04,ritm_ecg_p_06,ritm_ecg_p_07,ritm_ecg_p_08,n_r_ecg_p_01,n_r_ecg_p_02,n_r_ecg_p_03,n_r_ecg_p_04,n_r_ecg_p_05,n_r_ecg_p_06,n_r_ecg_p_08,n_r_ecg_p_09,n_r_ecg_p_10,n_p_ecg_p_01,n_p_ecg_p_03,n_p_ecg_p_04,n_p_ecg_p_05,n_p_ecg_p_06,n_p_ecg_p_07,n_p_ecg_p_08,n_p_ecg_p_09,n_p_ecg_p_10,n_p_ecg_p_11,n_p_ecg_p_12,fibr_ter_01,fibr_ter_02,fibr_ter_03,fibr_ter_05,fibr_ter_06,fibr_ter_07,fibr_ter_08,GIPO_K,K_BLOOD,GIPER_NA,NA_BLOOD,ALT_BLOOD,AST_BLOOD,L_BLOOD,ROE,TIME_B_S,R_AB_1_n,R_AB_2_n,R_AB_3_n,NITR_S,NA_R_1_n,NA_R_2_n,NA_R_3_n,NOT_NA_1_n,NOT_NA_2_n,NOT_NA_3_n,LID_S_n,B_BLOK_S_n,ANT_CA_S_n,GEPAR_S_n,ASP_S_n,TIKL_S_n,TRENT_S_n,FIBR_PREDS,PREDS_TAH,JELUD_TAH,FIBR_JELUD,A_V_BLOK,OTEK_LANC,DRESSLER,ZSN,REC_IM,P_IM_STEN,LET_IS_0,LET_IS_1,LET_IS_2,LET_IS_3,LET_IS_4,LET_IS_5,LET_IS_6,LET_IS_7
0,1.356955,1.0,2.0,1.0,1.0,2.0,3.0,0.0,7.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.632851,1.058282,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.123430,0.0,0.246623,-0.668148,-0.302823,-0.234584,0.228162,4.0,0.0,0.0,1.0,0.0,-0.620119,-0.261379,-0.202208,0.0,-0.279280,-0.238136,1.0,0.0,0.0,1.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,-0.581310,1.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,-0.567731,0.416799,0.0,0.0,0.0,0.0,0.0,4.0,1.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,-0.247893,0.0,-0.772984,-0.668148,-0.302823,-0.234584,-0.969024,2.0,0.0,0.0,0.0,0.0,-0.620119,-0.261379,-0.202208,1.0,-0.279280,-0.238136,1.0,0.0,1.0,1.0,1.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,-0.845619,1.0,0.0,0.0,0.0,2.0,2.0,0.0,2.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.632851,1.058282,0.0,0.0,0.0,0.0,0.0,4.0,1.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,-0.247893,0.0,-0.772984,-0.668148,-0.302823,0.666034,-0.324385,3.0,3.0,0.0,0.0,0.0,0.661187,-0.261379,-0.202208,3.0,4.757454,5.390544,1.0,1.0,0.0,1.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,0.564028,0.0,0.0,0.0,0.0,2.0,2.0,0.0,3.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,-0.567731,-0.866166,0.0,0.0,0.0,0.0,0.0,0.0,1.0,1.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,-0.247893,0.0,1.606098,1.147725,-0.302823,0.065622,0.043979,2.0,0.0,0.0,1.0,0.0,-0.620119,-0.261379,-0.202208,0.0,-0.279280,-0.238136,0.0,0.0,1.0,1.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,-0.140795,1.0,0.0,0.0,0.0,2.0,3.0,0.0,7.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.899323,0.416799,0.0,0.0,0.0,0.0,0.0,4.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,-0.247893,0.0,-0.772984,-0.668148,-0.302823,-0.234584,0.228162,9.0,0.0,0.0,0.0,0.0,-0.620119,-0.261379,-0.202208,0.0,-0.279280,-0.238136,0.0,0.0,1.0,0.0,1.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1555,0.740234,0.0,0.0,2.0,2.0,1.0,2.0,0.0,7.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.165796,-0.224683,0.0,0.0,0.0,0.0,0.0,4.0,1.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,-0.247893,0.0,0.076688,1.147725,-0.302823,0.365828,0.043979,1.0,0.0,0.0,0.0,0.0,-0.620119,-0.261379,-0.202208,1.0,-0.279280,-0.238136,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0
1556,1.356955,0.0,0.0,0.0,0.0,0.0,2.0,0.0,7.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.532560,0.416799,0.0,0.0,0.0,0.0,0.0,4.0,1.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,-0.247893,0.0,-0.093246,-0.668148,-0.302823,-0.834996,0.596527,3.0,0.0,0.0,0.0,1.0,-0.620119,-0.261379,-0.202208,0.0,2.239087,-0.238136,0.0,0.0,1.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0
1557,1.356955,0.0,0.0,4.0,2.0,1.0,2.0,0.0,7.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,-0.934495,-0.866166,0.0,0.0,0.0,0.0,0.0,3.0,1.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,-0.247893,0.0,-1.112852,1.147725,3.196462,1.266446,-0.692750,2.0,0.0,-0.0,0.0,0.0,-0.620119,-0.261379,-0.202208,0.0,-0.279280,-0.238136,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0
1558,0.740234,0.0,0.0,6.0,2.0,1.0,2.0,0.0,7.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,-3.135077,-5.356544,0.0,1.0,0.0,0.0,0.0,0.0,0.0,2.0,1.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,-0.247893,0.0,-0.093246,-0.668148,-0.302823,0.966240,0.136071,2.0,0.0,0.0,0.0,0.0,-0.620119,-0.261379,-0.202208,0.0,-0.279280,-0.238136,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0


In [31]:
#Arterial hypertension presence correlating to each other
#imp_data[['GB', 'DLIT_AG']]
#Exertional angina pectoris == Functional class (FC) of angina pectoris correlation of presence too
#imp_data[['STENOK_AN', 'FK_STENOK']]

In [32]:
imp_data = pd.get_dummies(data = imp_data, columns = cat_cols).sample(frac=1)

In [33]:
imp_data

Unnamed: 0,AGE,SEX,SIM_GIPERT,nr_11,nr_01,nr_02,nr_03,nr_04,nr_07,nr_08,np_01,np_04,np_05,np_07,np_08,np_09,np_10,endocr_01,endocr_02,endocr_03,zab_leg_01,zab_leg_02,zab_leg_03,zab_leg_04,zab_leg_06,S_AD_ORIT,D_AD_ORIT,O_L_POST,K_SH_POST,SVT_POST,GT_POST,FIB_G_POST,IM_PG_P,ritm_ecg_p_01,ritm_ecg_p_02,ritm_ecg_p_04,ritm_ecg_p_06,ritm_ecg_p_07,ritm_ecg_p_08,n_r_ecg_p_01,n_r_ecg_p_02,n_r_ecg_p_03,n_r_ecg_p_04,n_r_ecg_p_05,n_r_ecg_p_06,n_r_ecg_p_08,n_r_ecg_p_09,n_r_ecg_p_10,n_p_ecg_p_01,n_p_ecg_p_03,n_p_ecg_p_04,n_p_ecg_p_05,n_p_ecg_p_06,n_p_ecg_p_07,n_p_ecg_p_08,n_p_ecg_p_09,n_p_ecg_p_10,n_p_ecg_p_11,n_p_ecg_p_12,fibr_ter_01,fibr_ter_02,fibr_ter_03,fibr_ter_05,fibr_ter_06,fibr_ter_07,fibr_ter_08,GIPO_K,K_BLOOD,GIPER_NA,NA_BLOOD,ALT_BLOOD,AST_BLOOD,L_BLOOD,ROE,NITR_S,NA_R_1_n,NA_R_2_n,NA_R_3_n,NOT_NA_2_n,NOT_NA_3_n,LID_S_n,B_BLOK_S_n,ANT_CA_S_n,GEPAR_S_n,ASP_S_n,TIKL_S_n,TRENT_S_n,FIBR_PREDS,PREDS_TAH,JELUD_TAH,FIBR_JELUD,A_V_BLOK,OTEK_LANC,DRESSLER,ZSN,REC_IM,P_IM_STEN,LET_IS_0,LET_IS_1,LET_IS_2,LET_IS_3,LET_IS_4,LET_IS_5,LET_IS_6,LET_IS_7,INF_ANAM_0.0,INF_ANAM_1.0,INF_ANAM_2.0,INF_ANAM_3.0,STENOK_AN_0.0,STENOK_AN_1.0,STENOK_AN_2.0,STENOK_AN_3.0,STENOK_AN_4.0,STENOK_AN_5.0,STENOK_AN_6.0,FK_STENOK_0.0,FK_STENOK_1.0,FK_STENOK_2.0,FK_STENOK_3.0,FK_STENOK_4.0,IBS_POST_0.0,IBS_POST_1.0,IBS_POST_2.0,GB_0.0,GB_1.0,GB_2.0,GB_3.0,DLIT_AG_0.0,DLIT_AG_1.0,DLIT_AG_2.0,DLIT_AG_3.0,DLIT_AG_4.0,DLIT_AG_5.0,DLIT_AG_6.0,DLIT_AG_7.0,DLIT_AG_8.0,DLIT_AG_9.0,ZSN_A_0.0,ZSN_A_1.0,ZSN_A_2.0,ZSN_A_3.0,ZSN_A_4.0,ant_im_0.0,ant_im_1.0,ant_im_2.0,ant_im_3.0,ant_im_4.0,lat_im_0.0,lat_im_1.0,lat_im_2.0,lat_im_3.0,lat_im_4.0,inf_im_0.0,inf_im_1.0,inf_im_2.0,inf_im_3.0,inf_im_4.0,post_im_0.0,post_im_1.0,post_im_2.0,post_im_3.0,post_im_4.0,TIME_B_S_1.0,TIME_B_S_2.0,TIME_B_S_3.0,TIME_B_S_4.0,TIME_B_S_5.0,TIME_B_S_6.0,TIME_B_S_7.0,TIME_B_S_8.0,TIME_B_S_9.0,R_AB_1_n_0.0,R_AB_1_n_1.0,R_AB_1_n_2.0,R_AB_1_n_3.0,R_AB_2_n_0.0,R_AB_2_n_1.0,R_AB_2_n_2.0,R_AB_2_n_3.0,R_AB_3_n_0.0,R_AB_3_n_1.0,R_AB_3_n_2.0,R_AB_3_n_3.0,NOT_NA_1_n_0.0,NOT_NA_1_n_1.0,NOT_NA_1_n_2.0,NOT_NA_1_n_3.0,NOT_NA_1_n_4.0
82,-1.374236,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,-0.164291,0.096058,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,-0.247893,0.0,0.586492,1.147725,-0.302823,-0.234584,-0.140203,0.0,-0.620119,-0.261379,-0.202208,-0.279280,-0.238136,0.0,1.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,False,False,True,False,False,False,True,False,False,False,False,False,False,True,False,False,False,False,True,False,False,True,False,False,True,False,False,False,False,False,False,False,False,True,False,False,False,False,False,False,False,False,True,False,True,False,False,False,True,False,False,False,False,True,False,False,False,False,False,False,False,False,False,False,False,False,True,True,False,False,False,True,False,False,False,True,False,False,False,True,False,False,False,False
380,0.740234,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.266087,1.058282,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,-0.247893,0.0,0.076688,-0.668148,-0.302823,-0.834996,-0.692750,0.0,1.942493,-0.261379,-0.202208,-0.279280,-0.238136,1.0,0.0,1.0,1.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,True,False,False,False,False,False,False,True,False,False,False,False,False,True,False,False,False,True,False,False,False,True,False,False,False,False,False,False,False,True,False,False,False,False,True,False,False,False,True,False,False,False,False,False,True,False,False,False,False,False,False,True,False,False,True,False,False,False,False,False,True,False,False,False,False,False,False,False,False,True,False,True,False,False,False,True,False,False,False,True,False,False,False,False
107,-0.669413,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.459207,0.416799,0.0,0.0,0.0,-0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.123430,0.0,-0.772984,-0.668148,-0.302823,-0.834996,-1.061115,0.0,-0.620119,-0.261379,-0.202208,-0.279280,-0.238136,0.0,0.0,1.0,1.0,1.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,True,False,False,False,True,False,False,False,False,False,False,True,False,False,False,False,False,False,True,False,False,True,False,False,False,False,False,False,False,False,True,False,False,True,False,False,False,False,False,False,True,False,False,False,True,False,False,False,True,False,False,False,False,True,False,False,False,False,False,True,False,False,False,False,False,False,False,True,False,False,False,True,False,False,False,True,False,False,False,True,False,False,False,False
1025,0.035411,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.899323,0.416799,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.123430,0.0,-0.093246,-0.668148,-0.302823,0.966240,-0.048112,0.0,0.661187,-0.261379,-0.202208,-0.279280,-0.238136,0.0,0.0,1.0,1.0,1.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,True,False,False,False,False,True,False,False,False,False,False,False,False,True,False,False,False,True,False,False,False,True,False,False,False,False,False,False,False,True,False,False,False,True,False,False,False,False,False,False,False,False,True,False,True,False,False,False,False,True,False,False,False,True,False,False,False,False,False,True,False,False,False,False,False,False,False,False,True,False,False,True,False,False,False,True,False,False,False,True,False,False,False,False
1211,0.652131,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.165796,0.416799,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,-2.990539,0.0,-1.112852,-0.668148,-0.302823,0.065622,0.412344,0.0,0.661187,-0.261379,-0.202208,-0.279280,-0.238136,1.0,0.0,1.0,1.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,False,True,False,False,False,True,False,False,False,False,False,False,False,True,False,False,False,True,False,False,False,True,False,False,False,False,False,False,True,False,False,False,False,False,False,True,False,False,False,False,True,False,False,False,True,False,False,False,False,True,False,False,False,True,False,False,False,False,False,False,False,False,False,False,True,False,False,False,True,False,False,True,False,False,False,True,False,False,False,True,False,False,False,False
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
919,1.092646,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,2.366378,1.058282,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,-0.247893,0.0,1.096295,-0.668148,-0.302823,-0.534790,0.412344,0.0,-0.620119,-0.261379,-0.202208,-0.279280,-0.238136,0.0,0.0,1.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,True,False,False,False,False,False,False,False,False,False,True,False,False,True,False,False,False,True,False,False,False,True,False,False,False,False,False,False,False,False,True,False,False,True,False,False,False,False,False,True,False,False,False,False,True,False,False,False,True,False,False,False,False,True,False,False,False,False,False,False,False,False,False,False,False,True,False,True,False,False,False,True,False,False,False,True,False,False,False,False,False,True,False,False
595,-1.286133,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.165796,0.416799,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,-0.247893,0.0,-2.302393,1.147725,-0.302823,-0.834996,-0.600659,0.0,-0.620119,-0.261379,-0.202208,4.757454,5.390544,0.0,0.0,1.0,1.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,False,True,False,False,False,False,False,False,True,False,False,False,False,True,False,False,False,True,False,False,False,True,False,False,False,False,False,True,False,False,False,False,False,True,False,False,False,False,True,False,False,False,False,True,False,False,False,False,False,False,False,False,True,True,False,False,False,False,False,True,False,False,False,False,False,False,False,False,False,True,False,True,False,False,False,True,False,False,False,True,False,False,False,False
554,0.740234,1.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.899323,-0.224683,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,-0.247893,0.0,-0.093246,1.147725,-0.302823,0.065622,0.136071,0.0,0.661187,-0.261379,-0.202208,-0.279280,-0.238136,0.0,0.0,1.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,True,False,False,False,False,False,False,False,False,False,True,False,False,True,False,False,False,True,False,True,False,False,False,True,False,False,False,False,False,False,False,False,False,True,False,False,False,False,True,False,False,False,False,True,False,False,False,False,False,False,False,False,True,True,False,False,False,False,False,False,True,False,False,False,False,False,False,True,False,False,False,True,False,False,False,True,False,False,False,True,False,False,False,False
77,1.004543,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.312501,0.096058,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.123430,0.0,0.586492,1.147725,-0.302823,-0.234584,0.596527,0.0,-0.620119,-0.261379,-0.202208,-0.279280,-0.238136,1.0,0.0,1.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,True,False,False,False,True,False,False,False,False,False,False,True,False,False,False,False,False,False,True,True,False,False,False,False,False,False,False,False,False,True,False,False,False,False,True,False,False,False,False,False,True,False,False,False,True,False,False,False,True,False,False,False,False,True,False,False,False,False,False,False,False,False,True,False,False,False,False,True,False,False,False,True,False,False,False,True,False,False,False,True,False,False,False,False


In [34]:
#Features and labels for classification
features = imp_data.columns[~imp_data.columns.isin(y.columns)]

In [35]:
labels = imp_data.columns[imp_data.columns.isin(y.columns)]

In [36]:
#Attemptimg PCA on dummy dataset
pca = PCA(0.9, svd_solver='full')
imp_data_pcanalysed = pca.fit_transform(imp_data[features])
imp_data_pcanalysed

array([[ 0.12070965, -0.71179005,  0.93304608, ...,  0.13770376,
        -0.37660399,  0.19676556],
       [-1.78454426,  0.22199818, -0.6782037 , ...,  0.5942162 ,
        -0.34637457, -0.82232762],
       [-0.93418405, -1.12570155,  0.76514312, ...,  0.01402526,
        -0.06116416,  0.30198638],
       ...,
       [ 0.08666321, -0.11434814, -0.38419303, ...,  0.33069229,
         0.20328459, -0.17595501],
       [-0.37660468, -0.95780815,  0.08802316, ..., -0.07330407,
         0.34663038,  0.50592704],
       [-2.39563406, -1.11003938, -0.21197806, ..., -0.43867964,
         0.19788426, -0.31182607]])

In [37]:
imp_data_pcanalysed.shape
#51-dimensional data - 3 times reduction, but the curse of dimensionality is not lifted

(1560, 51)

In [38]:
imp_data_pcanalysed_train = imp_data_pcanalysed[:1200]
imp_data_pcanalysed_test = imp_data_pcanalysed[1200:]

In [39]:
#To avoid the need for multi-label classification, the model will be trained on one label at a time
#Starting with Letality prediction

In [40]:
labels = labels[labels.str.startswith('LET_IS')][0]

In [41]:
#Train-test split on shuffled data w/o sklearn
X_train, X_test, y_train, y_test = imp_data[features][:1200], imp_data[features][1200:], imp_data[labels][:1200], imp_data[labels][1200:]
y_train = y_train.to_numpy().ravel()
y_test = y_test.to_numpy().ravel()

In [42]:
#The best classification algorithm will be searched via Grid search cross-validation
#It allows the testing of best parameters for any classifier
#Classifiers tested will be:
'''
xGB
Random Forests
Decision Trees
Multinomial Naive Bayes
'''
model_list = ['xgb', 'rf', 'dt', 'nb']
#Adding other models is not recommended - addition of class instances required to all functions

In [43]:
cv_ = 6

In [44]:
out = pd.DataFrame()
for model in model_list:
    res = PerformGridSearchCV(imp_data, feature_cols=features, label_col=labels, label_col_type='cat',
                             model_name=model, cv=cv_)
    out = pd.concat([out, res], ignore_index=True, axis=0)

In [45]:
first_metric = 'mean_test_specificity_score'
sec_metric = 'mean_test_f1'

In [46]:
#Employ a sum of 2 metrics - both F1 and specificity are equaly important
metric_sum = 'sum_f1_spec'
out[metric_sum] = out[first_metric]+out[sec_metric]
out = out.sort_values(metric_sum, ascending=False)

In [47]:
out

Unnamed: 0,mean_test_accuracy,mean_test_recall,mean_test_f1,mean_test_specificity_score,mean_fit_time,params,model,label_column,sum_f1_spec
76,0.865000,0.865000,0.863006,0.570112,0.016094,{'verbose': False},nb,LET_IS_0,1.433117
39,0.862500,0.862500,0.858587,0.541402,0.631316,"{'learning_rate': 1, 'max_depth': 1, 'n_estima...",xgb,LET_IS_0,1.399989
38,0.869167,0.869167,0.863289,0.533597,0.250636,"{'learning_rate': 1, 'max_depth': 1, 'n_estima...",xgb,LET_IS_0,1.396886
42,0.846667,0.846667,0.844221,0.515992,0.437040,"{'learning_rate': 1, 'max_depth': 2, 'n_estima...",xgb,LET_IS_0,1.360213
47,0.865833,0.865833,0.857865,0.500070,0.914743,"{'learning_rate': 1, 'max_depth': 3, 'n_estima...",xgb,LET_IS_0,1.357935
...,...,...,...,...,...,...,...,...,...
8,0.852500,0.852500,0.784624,0.147500,0.040722,"{'learning_rate': 0.001, 'max_depth': 3, 'n_es...",xgb,LET_IS_0,0.932124
7,0.852500,0.852500,0.784624,0.147500,1.059870,"{'learning_rate': 0.001, 'max_depth': 2, 'n_es...",xgb,LET_IS_0,0.932124
6,0.852500,0.852500,0.784624,0.147500,0.435089,"{'learning_rate': 0.001, 'max_depth': 2, 'n_es...",xgb,LET_IS_0,0.932124
5,0.852500,0.852500,0.784624,0.147500,0.220751,"{'learning_rate': 0.001, 'max_depth': 2, 'n_es...",xgb,LET_IS_0,0.932124


In [48]:
model_name, params = out.loc[0][['model' ,'params']]

In [49]:
model = get_model(model_name, params)

In [50]:
y_pred = model.fit(X_train, y_train).predict(X_test)

In [51]:
#F1 score for predicting mortality
f1_score(y_test, y_pred, average='weighted')

0.7420054200542006

In [52]:
#Low specificity due to class imbalance
specificity_score(y_test, y_pred, average='weighted')

0.17777777777777778

In [53]:
#Binary variables now
labels = imp_data.columns[imp_data.columns.isin(y.columns)]
labels = labels[~labels.str.startswith('LET_IS')]

In [54]:
#Reduce to 3 since dataset is too imbalanced - some values do not appear in entire fractions of the train set
cv_ = 3

In [55]:
out = pd.DataFrame()
for label in labels:
    for model in model_list:
        res = PerformGridSearchCV(imp_data, feature_cols=features, label_col=label, label_col_type='bin',
                                 model_name=model, cv=cv_)
        out = pd.concat([out, res], ignore_index=True, axis=0)
#Precision is ill-defined and being set to 0.0 due to no predicted samples. - some GridSearchCV instances
# only contain "False" label values, so the TP/P ratio cannot be calculated due to profound set imbalance

In [56]:
metric_sum = 'sum_f1_spec'
out[metric_sum] = out[first_metric]+out[sec_metric]
out = out.sort_values(metric_sum, ascending=False)

In [57]:
#Max F1 score is up to ~0.47 - too low
out

Unnamed: 0,mean_test_accuracy,mean_test_recall,mean_test_f1,mean_test_specificity_score,mean_fit_time,params,model,label_column,sum_f1_spec
568,0.814167,0.257576,0.378313,0.971154,0.177629,"{'learning_rate': 0.1, 'max_depth': 2, 'n_esti...",xgb,ZSN,1.349467
570,0.800833,0.314394,0.410256,0.938034,0.837260,"{'learning_rate': 0.1, 'max_depth': 2, 'n_esti...",xgb,ZSN,1.348291
573,0.801667,0.291667,0.392224,0.945513,0.489210,"{'learning_rate': 0.1, 'max_depth': 3, 'n_esti...",xgb,ZSN,1.337737
572,0.807500,0.265152,0.376847,0.960470,0.250255,"{'learning_rate': 0.1, 'max_depth': 3, 'n_esti...",xgb,ZSN,1.337317
552,0.812500,0.246212,0.364113,0.972222,0.118105,"{'learning_rate': 0.01, 'max_depth': 1, 'n_est...",xgb,ZSN,1.336335
...,...,...,...,...,...,...,...,...,...
200,0.956667,0.000000,0.000000,0.977865,0.433307,"{'learning_rate': 1, 'max_depth': 3, 'n_estima...",xgb,JELUD_TAH,0.977865
199,0.950833,0.000000,0.000000,0.971895,0.237241,"{'learning_rate': 1, 'max_depth': 3, 'n_estima...",xgb,JELUD_TAH,0.971895
196,0.929167,0.037037,0.016260,0.948895,0.351236,"{'learning_rate': 1, 'max_depth': 2, 'n_estima...",xgb,JELUD_TAH,0.965155
197,0.940000,0.000000,0.000000,0.960830,0.776181,"{'learning_rate': 1, 'max_depth': 2, 'n_estima...",xgb,JELUD_TAH,0.960830


In [58]:
#Testing PCA prediction F1 score for predicting mortality
model = get_model(model_name, params)
y_pred = model.fit(imp_data_pcanalysed_train, y_train).predict(imp_data_pcanalysed_test)
f1_score(y_test, y_pred, average='weighted')
#It is lower than that of the dataset without SVD dimensionality reduction - PCA is detremental to scoring
#(may however improve fit and pred times)

0.7420054200542006

In [59]:
specificity_score(y_test, y_pred, average='weighted')

0.17777777777777778

In [60]:
out.groupby('label_column').max('mean_test_f1')

Unnamed: 0_level_0,mean_test_accuracy,mean_test_recall,mean_test_f1,mean_test_specificity_score,mean_fit_time,sum_f1_spec
label_column,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
A_V_BLOK,0.964167,0.225397,0.170909,1.0,1.213335,1.15794
DRESSLER,0.960833,0.065278,0.083906,1.0,1.215411,1.061357
FIBR_JELUD,0.963333,0.160317,0.167869,1.0,1.354447,1.157484
FIBR_PREDS,0.901667,0.192949,0.191139,1.0,1.348354,1.162461
JELUD_TAH,0.978333,0.148148,0.097037,1.0,1.196402,1.068071
OTEK_LANC,0.9175,0.333333,0.327823,1.0,1.270746,1.261157
PREDS_TAH,0.990833,0.194444,0.16,1.0,1.2029,1.138117
P_IM_STEN,0.92,0.210636,0.185625,1.0,1.216285,1.133608
REC_IM,0.919167,0.205492,0.18915,1.0,1.224218,1.103844
ZSN,0.814167,0.363636,0.420378,1.0,1.277458,1.349467
