In [None]:
%load_ext autoreload
%autoreload 2

import sys
import os
import pandas as pd
from collections import namedtuple
import numpy as np
import scipy as sc
from scipy import stats

sys.path.insert(0, os.path.join(os.path.abspath('.'),'..', 'src'))
import tree_utils, ctree
import pickle
import json
import tqdm

from sklearn.preprocessing import OneHotEncoder
from sklearn.preprocessing import LabelEncoder, LabelBinarizer
from sklearn.tree import DecisionTreeClassifier
from sklearn.model_selection import RepeatedStratifiedKFold

from sklearn.metrics import accuracy_score
from sklearn.linear_model import LinearRegression, LogisticRegression, ElasticNet, Lasso, BayesianRidge
from sklearn.ensemble import RandomForestRegressor
from sklearn.experimental import enable_iterative_imputer
from sklearn.impute import SimpleImputer, IterativeImputer, KNNImputer

from xgboost import XGBClassifier

from sklearn.metrics import f1_score, roc_auc_score


In [5]:
# Read the parquet artifact stored in the repository's `artifacts` folder.
# NOTE(review): the lowercase `data` frame is not referenced by any later cell shown
# here (the pipeline works on `DATA`, read from `data_dir`) — confirm this cell is still needed.
data = pd.read_parquet('../artifacts/data.parquet')

```
We’d like to show in the tree-visualization the names for the outcomes as follows: 
-	[X] In the conduction/muscle trees: SR should be ‘control’
-	[X] In the axis tree, ‘normal axis’ should not be ‘control’, but just ‘normal axis’
-	[X] In the conduction tree, ‘BF’ should be ‘BfB’

The trees that we will show and are thus the most important:
1.	Conduction: customized tree (no missing-indicator features, with the morphology maps as we defined the 4 categories, and with the customization of the QRS duration (110 and 120 ms, as done before)).
2.	Axis/muscle: semi-customized tree (only use the features of the selection that we provided, no missing-indicator features, with the morphology maps in 4 categories).

We’ll compare 3 models in the ROC curves and the net benefit curves, so it would be great if we could have the net benefit curves for the following combinations of models (we can make the ROC-curve figures ourselves once we have the results of the new trees).
-	Conduction: 
o	1. xgb 2. lr 3. dt 4. Customized dt (the decision tree being the customized one as described in point 1 before)
-	Axis/muscle: 
o	1.xgb 2. lr 3. Semi-customized dt (the decision tree being the semi-customized one as described in point 2 before)
```


In [None]:
# Coarse morphology groups: each entry pairs a category label with the raw
# dot-separated morphology codes that are collapsed into that category.
morphology_categories = dict([
    ('only positive, no notch/acc', ['R']),
    ('only negative, no notch/acc', ['S']),
    ('both positive and negative, no notch/acc', ['Q.R', 'Q.R.S', 'R.S']),
    ('only positive with notch/accent', [
        'R.R_acc', 'R.Rn', 'R.Rn.R_acc',
        'Rn.R', 'Rn.R.R_acc', 'Rn.R.Rn',
    ]),
    ('only negative with notch/accent', ['S.Sn', 'Sn.S', 'Sn.S.Sn']),
    ('both positive and negative with notch/accent', [
        'Q.R.R_acc', 'Q.R.R_acc.S', 'Q.R.Rn', 'Q.R.Rn.S', 'Q.R.S.R_acc',
        'Q.R.S.R_acc.S_acc', 'Q.R.S.Sn', 'Q.Rn.R', 'Q.Rn.R.S', 'R.R_acc.S',
        'R.R_acc.S.S_acc', 'R.Rn.S', 'R.S.R_acc', 'R.S.R_acc.S_acc', 'R.S.Rn',
        'R.S.Rn.Sn', 'R.S.S_acc', 'R.S.Sn', 'R.Sn.S', 'R.Sn.S.R_acc',
        'R.Sn.S.Sn', 'Rn.R.R_acc.S', 'Rn.R.Rn.S', 'Rn.R.S', 'Rn.R.S.R_acc',
        'Rn.R.S.R_acc.S_acc', 'Rn.R.S.Rn',
    ]),
    ('none', ['none']),
])

# Reverse lookup: raw morphology code -> category label.
inv_morpho_map = dict(
    (code, category)
    for category, codes in morphology_categories.items()
    for code in codes
)


In [None]:
# Folder containing the parquet research data. This is a hard-coded absolute Windows
# path on the J: drive, so the notebook only runs where that location is reachable.
data_dir = r'J:\Onderzoek\21-763_rvanes_MiniECG-2-Data\E_ResearchData\2_ResearchData\Parquet'

In [None]:
# Column-name translation table, read from the directory one level above `data_dir`.
# Its 'Old_Name' and 'New_Name' columns are turned into a lookup dict in the next cell.
NameMap = pd.read_parquet(os.path.join(data_dir, '..', 'Name_toSimpleName.parquet'))

In [None]:
# Lookup from the original column name ('Old_Name') to its simplified name ('New_Name').
NameMapDict = dict(zip(NameMap['Old_Name'].values, NameMap['New_Name'].values))

In [None]:
# Run configuration for the whole notebook.

# Passed as `min_frequency` to the OneHotEncoder below. As a float it is a fraction of
# the samples (0.15 = 15 %), not a percentage value.
MIN_MORPHO_PRESENCE = 0.15 # fraction of samples
# False: the target is reduced to two classes via `Reduction_map` further down.
MULTI_CLASS = False
# n_splits / n_repeats of the RepeatedStratifiedKFold splitter (splits * repeats model fits).
num_splits = 10
num_repeats = 10
# Forwarded as `add_indicator` to the imputer inside the cross-validation loop.
MISSINGNESS_INDICATOR = False
# True: raw morphology codes are replaced by their category via `inv_morpho_map`.
MORPHO_MAP = True

# Suffixes that encode the settings above in every output file name.
MULTI_CLASS_STRING = "_MultiClass" if MULTI_CLASS else "_BinaryClass"
MISSINGNESS_INDICATOR_STRING = "_wMissing" if MISSINGNESS_INDICATOR else ""
MORPHO_MAP_STRING = "_wMorphoMap" if MORPHO_MAP else ""

TARGET = "muscle" # one of: axis, muscle, conduction
# JSON rule/config file for the chosen target (hard-coded absolute location).
rules_path = f'T://laupodteam/AIOS/Bram/notebooks/code_dev/miniECG_interpretation/TreeBuilder/assets/{TARGET}_tree.json'

In [None]:
# Output folder for this run: one sub-folder per target / class-setup combination
# below a hard-coded absolute base path. Created if it does not exist yet.
output_dir = os.path.join(r'J:\Onderzoek\21-763_rvanes_MiniECG-2-Data\G_Output\2_Data\CustomTree', f'{TARGET}{MULTI_CLASS_STRING}')
os.makedirs(output_dir, exist_ok=True)

In [None]:
# Read the rule/config file for the selected target and expose its settings as
# SplitColumn, TargetCol, IgnoreCols and FeaturesToUse.
#
# Only the conduction target uses hand-written rules (via ctree.LoadRules); for the
# other targets the JSON is read directly and `processed_rules` stays None, which
# later disables the custom-tree branch of the cross-validation loop.
if TARGET == 'conduction':
    rules_loader = ctree.LoadRules(rules_path, name_map=NameMapDict)
    processed_rules = rules_loader.get_processed_rules()

    SplitColumn = rules_loader.fold_split_col
    TargetCol = rules_loader.target_col
    IgnoreCols = rules_loader.ignore_cols + [SplitColumn]
    FeaturesToUse = rules_loader.features_to_use
else:
    rules_loader = None
    processed_rules = None
    # Context manager so the file handle is closed deterministically
    # (the bare `json.load(open(...))` left it open).
    with open(rules_path, 'r') as rules_file:
        rules_loader_dict = json.load(rules_file)

    SplitColumn = rules_loader_dict['fold_split_col']
    TargetCol = rules_loader_dict['target_col']
    IgnoreCols = rules_loader_dict['ignore_cols'] + [SplitColumn]
    FeaturesToUse = rules_loader_dict['features_to_use']

In [None]:
# Load the full dataset and keep only the modelling columns plus the target.
DATA = pd.read_parquet(os.path.join(data_dir, 'DATA.parquet'))

# Column selection is done with order-preserving list operations. The previous
# set()-based de-duplication made the column order depend on Python's string hash
# randomisation, so the feature order (and anything sensitive to it) could change
# between kernel restarts.
ignored_columns = set(IgnoreCols)
if len(FeaturesToUse) > 0:
    # An explicit feature list was configured: use it, minus ignored columns.
    keep_columns = [c for c in FeaturesToUse if c not in ignored_columns]
else:
    # No explicit list: use every column that is not ignored.
    keep_columns = [c for c in DATA.columns if c not in ignored_columns]

# Always include the target, then drop duplicates while keeping first-seen order.
keep_columns = list(dict.fromkeys(keep_columns + [TargetCol]))

DATA = DATA.loc[:, keep_columns]

In [None]:
# Translate every column to its simplified name (raises KeyError for a column that is
# missing from NameMapDict), then pick out the morphology and lead columns by name.
DATA.columns = [NameMapDict[name] for name in DATA.columns]
morphology_columns = [name for name in DATA.columns if 'morphology' in name.lower()]
lead_columns = [
    name for name in DATA.columns
    if 'lead' in name.lower() and 'morphology' not in name.lower()
]


def _first_morphology_code(raw):
    """Return the first element of `raw` with commas/spaces stripped, or "none" if blank.

    NOTE(review): assumes each cell holds a sequence of strings. On an already
    cleaned string, `raw[0]` would be its first character, so this cell is not
    safe to run twice on the same frame.
    """
    code = raw[0].strip(",").strip(" ")
    return "none" if code.strip() == "" else code


for col in morphology_columns:
    DATA.loc[:, col] = DATA[col].apply(_first_morphology_code)

In [None]:
# Optionally replace each raw morphology code by its coarse category label.
# NOTE(review): Series.map with a dict yields NaN for any code that is not a key of
# inv_morpho_map — confirm every code occurring in the data is covered.
if MORPHO_MAP:
    for c in morphology_columns:
        DATA.loc[:, c] = DATA[c].map(inv_morpho_map)
        

In [None]:
# Collect every distinct dot-separated token occurring in the morphology columns.
vocab = set()
for c in morphology_columns:
    # Missing cells (e.g. codes left unmapped by the morphology mapping) split to NaN,
    # which is not iterable; skip them instead of crashing.
    for tokens in DATA[c].dropna().str.split("."):
        vocab.update(tokens)

# Integer id -> token. Sorting makes the ids stable across kernel restarts;
# plain set iteration order depends on string hash randomisation.
Vocab = dict(enumerate(sorted(vocab)))

In [None]:
# One-hot encode the morphology columns. Categories rarer than MIN_MORPHO_PRESENCE
# are handled through the encoder's infrequent-category mechanism, and unknown
# categories at transform time use 'infrequent_if_exist'.
OneHot = OneHotEncoder(
    handle_unknown='infrequent_if_exist',
    min_frequency=MIN_MORPHO_PRESENCE,
    sparse_output=False,
    drop=None,
)

# Dense indicator frame aligned on DATA's index. The encoder is fitted by the first
# argument before the feature names are requested.
MorphologyOneHot = pd.DataFrame(
    OneHot.fit_transform(DATA[morphology_columns]),
    index=DATA.index,
    columns=OneHot.get_feature_names_out(morphology_columns),
)

In [None]:
# Swap the raw morphology columns for their one-hot indicator columns and refresh
# the list of columns that the feature matrix is built from.
DATA = pd.concat(
    [DATA.drop(columns=morphology_columns), MorphologyOneHot],
    axis=1,
)
keep_columns = DATA.columns


In [None]:
#TODO: Make multiple DATA, X,Y for: AXIS, MUSCLE and CONDUCTION

In [None]:
# Harmonise the diagnosis labels for the selected target and define `Reduction_map`,
# which collapses the harmonised labels into the binary outcome.
# Raw diagnosis strings that are not listed map to NaN.
if TARGET == 'conduction':
    DATA = DATA.assign(Diagnosis=DATA.Diagnosis.map({
        raw: label
        for label, raws in {
            'Control': ['SR'],
            'BfB': ['BF', 'Microvoltages , BF', 'LVH , BF'],
            'RBBB': ['RBBB', 'Microvoltages , RBBB', 'LVH , RBBB'],
            'LBBB': ['LBBB', 'LVH , LBBB'],
            'LAFB': ['LAFB', 'LAFB , LVH', 'Microvoltages , LAFB'],
        }.items()
        for raw in raws
    }))
    Reduction_map = {
        **dict.fromkeys(['BfB', 'LBBB', 'RBBB', 'LAFB'], 'Abnormal'),
        'Control': 'Control',
    }
elif TARGET == 'axis':
    # 'Extreme' is only part of the data in the binary setting.
    target_inclusion = (
        ['Left', 'Normal', 'Right']
        if MULTI_CLASS
        else ['Left', 'Normal', 'Right', 'Extreme']
    )
    DATA = DATA.loc[DATA['Heart Axis Diagnosis'].isin(target_inclusion)]
    Reduction_map = {
        **dict.fromkeys(['Left', 'Right', 'Extreme'], 'Abnormal'),
        'Normal': 'Normal',
    }
elif TARGET == 'muscle':
    DATA = DATA.assign(Diagnosis=DATA.Diagnosis.map({
        raw: label
        for label, raws in {
            'Control': ['SR'],
            'Microvoltages': [
                'Microvoltages', 'Microvoltages , BF',
                'Microvoltages , RBBB', 'Microvoltages , LAFB',
            ],
            'LVH': ['LVH', 'LAFB , LVH', 'LVH , BF', 'LVH , RBBB', 'LVH , LBBB'],
        }.items()
        for raw in raws
    }))
    target_inclusion = ['Control', 'LVH', 'Microvoltages']
    DATA = DATA.loc[DATA['Diagnosis'].isin(target_inclusion)]

    Reduction_map = {
        **dict.fromkeys(['Microvoltages', 'LVH'], 'Abnormal'),
        'Control': 'Control',
    }
else:
    raise ValueError(f'Unknown target {TARGET}')


In [None]:
# In the binary setting, collapse the harmonised labels to the two classes defined
# in Reduction_map. `assign` builds a new frame instead of writing through `.loc`
# into a frame that was just row-filtered, which can raise SettingWithCopyWarning
# and matches how the label harmonisation cell updates DATA.
if not MULTI_CLASS:
    DATA = DATA.assign(**{TargetCol: DATA[TargetCol].map(Reduction_map)})
# Rows whose label could not be mapped are NaN at this point; drop them.
DATA = DATA.dropna(subset=[TargetCol])

In [None]:
# Per morphology column, the categories the encoder grouped as infrequent.
# `infrequent_categories_[i]` is None when a column has no infrequent categories;
# map that to an empty list instead of failing on list(None).
Infreq_cat_dict = {
    column: ([] if inf_cats is None else list(inf_cats))
    for column, inf_cats in zip(morphology_columns, OneHot.infrequent_categories_)
}

In [None]:
# Persist the infrequent-category map next to the other run outputs. The context
# manager guarantees the file is flushed and closed (the bare open() was never closed).
with open(os.path.join(output_dir, 'infrequent_categories_map.json'), 'w') as infreq_file:
    json.dump(Infreq_cat_dict, infreq_file)

# Make tree

In [2]:
# Hyper-parameters for the three model families compared in the cross-validation loop.

# Decision tree settings: passed to DecisionTreeClassifier and, as Tree_kwargs,
# to the custom tree.
TreeKwargs = dict(
    criterion='gini',
    splitter='best',
    max_depth=5,
    min_samples_split=10,
    min_samples_leaf=5,
    min_weight_fraction_leaf=0.05,
    max_features=None,
    random_state=7,
    max_leaf_nodes=50,
    class_weight='balanced',
)

# Gradient-boosted trees (XGBClassifier).
xgboost_kwargs = dict(
    n_estimators=150,
    max_depth=6,
    max_leaves=50,
    learning_rate=2e-3,
    gamma=0.4,
    subsample=0.55,
    colsample_bytree=0.85,
    reg_alpha=0.005,
)

# Elastic-net logistic regression (LogisticRegression with the saga solver).
logistic_kwargs = dict(
    penalty='elasticnet',
    solver='saga',
    dual=False,
    tol=0.0001,
    C=1.0,
    fit_intercept=True,
    intercept_scaling=1,
    class_weight=None,
    random_state=7,
    max_iter=3000,
    verbose=0,
    warm_start=False,
    n_jobs=-1,
    l1_ratio=0.5,
)

In [None]:
# Repeated stratified k-fold: num_splits folds, repeated num_repeats times, with a
# fixed seed so the same partitions are produced on every run.
Splitter = RepeatedStratifiedKFold(
    random_state=7,
    n_repeats=num_repeats,
    n_splits=num_splits,
)

In [None]:
# Target vector and feature matrix (every kept column except the target).
Y = DATA[TargetCol]
X = DATA.loc[:, [name for name in keep_columns if name != TargetCol]]

In [None]:
# Fit the label encoder on the full target so every split shares one class coding.
lb = LabelBinarizer()
lbe = LabelEncoder().fit(Y)

# Encoded integer -> class label, in the encoder's class order.
TargetMap = dict(enumerate(lbe.classes_))

In [None]:
# Show the class labels; they are used as column suffixes in the results frame.
TargetMap.values()

In [None]:
# Persist the feature matrix used for modelling; the file name encodes the class
# setup, missingness-indicator and morphology-map settings of this run.
X.to_parquet(os.path.join(output_dir, f'data{MULTI_CLASS_STRING}{MISSINGNESS_INDICATOR_STRING}{MORPHO_MAP_STRING}.parquet'))

In [None]:
def _export_tree(tree, prefix, fold, repeat):
    """Write `tree` to `output_dir` as JSON and as an HTML page built from the template.

    `prefix` distinguishes the custom tree ('tree') from the sklearn tree
    ('sklearn_tree'); `fold` and `repeat` identify the cross-validation split.
    """
    stem = f"{prefix}_Fold{fold}_{repeat}{MULTI_CLASS_STRING}{MISSINGNESS_INDICATOR_STRING}{MORPHO_MAP_STRING}"
    # Context manager so the JSON file is flushed and closed before moving on.
    with open(os.path.join(output_dir, f"{stem}.json"), mode='w') as tree_file:
        json.dump(tree, tree_file)
    ctree.update_html(tree=tree,
                      html_path="../src/treeTemplate.html",
                      output_path=os.path.join(output_dir, f"{stem}.html"))


# Repeated stratified cross-validation: per split, impute, fit the models, export
# the trees and collect train/test class probabilities into one frame.
results_list = []
for i, (train_index, test_index) in tqdm.tqdm(enumerate(Splitter.split(X, Y)),
                                              total=num_splits * num_repeats):
    # Fold/Repeat must be known before any file is written. They used to be assigned
    # only after the custom-tree export, which raised NameError on the first split
    # (or silently reused the previous split's numbers in the file names).
    Fold = i % num_splits
    Repeat = i // num_splits

    result_df = pd.DataFrame()

    X_train, X_test = X.iloc[train_index], X.iloc[test_index]
    Y_train, Y_test = Y.iloc[train_index], Y.iloc[test_index]

    y_train_encoded = lbe.transform(Y_train)
    y_test_encoded = lbe.transform(Y_test)

    #Imputer = IterativeImputer(Lasso(),
    #                           add_indicator=MISSINGNESS_INDICATOR,
    #                           max_iter=5_000, verbose=0)
    # The imputer is fitted on the training part only and then applied to both parts.
    Imputer = KNNImputer(add_indicator=MISSINGNESS_INDICATOR, n_neighbors=10, weights='distance')

    Imputer.fit(X_train)

    X_train_imputed = Imputer.transform(X_train)
    X_test_imputed = Imputer.transform(X_test)

    X_train_imputed = pd.DataFrame(data=X_train_imputed,
                                   columns=Imputer.get_feature_names_out())

    X_test_imputed = pd.DataFrame(data=X_test_imputed,
                                  columns=Imputer.get_feature_names_out())

    # Always constructed: it is also used below to convert the sklearn tree
    # into the exportable rule format, even when no custom rules exist.
    clf = ctree.CustomDecisionTree(custom_rules=processed_rules,
                                   prune_threshold=None,
                                   Tree_kwargs=TreeKwargs,
                                   TargetMap=TargetMap,
                                   tot_max_depth=5)
    if processed_rules is not None:
        # Custom (rule-seeded) tree: only when rules were loaded for this target.
        clf.fit(X_train_imputed, y_train_encoded)
        enriched_rules = clf.get_enriched_rules()
        final_tree = clf.get_custom_rules_model()

        _export_tree(final_tree, 'tree', Fold, Repeat)

        cust_probas_train = clf.predict_proba(X_train_imputed)
        cust_probas_test = clf.predict_proba(X_test_imputed)

    # Reference models: plain decision tree, XGBoost and logistic regression.
    clf_base = DecisionTreeClassifier(**TreeKwargs)
    clf_xgb = XGBClassifier(**xgboost_kwargs)
    clf_logistic = LogisticRegression(**logistic_kwargs)

    clf_base.fit(X_train_imputed, y_train_encoded)
    clf_xgb.fit(X_train_imputed, y_train_encoded)
    clf_logistic.fit(X_train_imputed, y_train_encoded)

    # Export the plain sklearn tree in the same JSON/HTML format as the custom tree.
    sklearn_tree = clf.load_from_sklearn_tree(clf_base, X_train_imputed, y_train_encoded)
    final_tree_sklearn = sklearn_tree.get_custom_rules_model()
    _export_tree(final_tree_sklearn, 'sklearn_tree', Fold, Repeat)

    base_probas_train = clf_base.predict_proba(X_train_imputed)
    base_probas_test = clf_base.predict_proba(X_test_imputed)

    xgb_probas_train = clf_xgb.predict_proba(X_train_imputed)
    xgb_probas_test = clf_xgb.predict_proba(X_test_imputed)

    logistic_probas_train = clf_logistic.predict_proba(X_train_imputed)
    logistic_probas_test = clf_logistic.predict_proba(X_test_imputed)

    # Train rows first, then test rows; 'indices' are positions into X/Y
    # (as returned by the splitter), not index labels.
    result_df['indices'] = np.hstack([train_index, test_index])
    result_df['Fold'] = Fold
    result_df['Repeat'] = Repeat
    result_df['Y_true'] = np.hstack([Y_train.values, Y_test.values])
    result_df[[f'Y_pred_normalDT_{cname}' for cname in TargetMap.values()]] = np.vstack([base_probas_train, base_probas_test])
    if processed_rules is not None:
        result_df[[f'Y_pred_customDT_{cname}' for cname in TargetMap.values()]] = np.vstack([cust_probas_train, cust_probas_test])
    result_df[[f'Y_pred_XGB_{cname}' for cname in TargetMap.values()]] = np.vstack([xgb_probas_train, xgb_probas_test])
    result_df[[f'Y_pred_LR_{cname}' for cname in TargetMap.values()]] = np.vstack([logistic_probas_train, logistic_probas_test])
    result_df['Dataset'] = ['train' for _ in train_index]+['test' for _ in test_index]

    results_list.append(result_df)
    


In [None]:
# Stack the per-split result frames into one table with a fresh 0..n-1 index.
Final_results = pd.concat(results_list, axis=0, ignore_index=True)
# Previously assigned to the misspelled name `Fina_results`, so the result was discarded.
Final_results = Final_results.reset_index(drop=True)

# Recover the class labels from the plain decision tree's probability columns, which
# are written for every target. Stripping the known prefix (instead of splitting on
# "_") keeps labels that contain an underscore intact.
_proba_prefix = 'Y_pred_normalDT_'
classes = set(c[len(_proba_prefix):] for c in Final_results.columns if c.startswith(_proba_prefix))

# One 0/1 indicator column per class; sorted so the column order is the same on every run.
for _class in sorted(classes):
    Final_results[f'Y_true_{_class}'] = (Final_results['Y_true'] == _class).astype(int)

# Write the table as ';'-separated CSV and as parquet, with the run settings in the name.
_results_stem = f"results{MULTI_CLASS_STRING}{MISSINGNESS_INDICATOR_STRING}{MORPHO_MAP_STRING}"

Final_results.to_csv(
    os.path.join(output_dir, f"{_results_stem}.csv"),
    index=False, sep=";")

Final_results.to_parquet(
    os.path.join(output_dir, f"{_results_stem}.parquet"))

In [None]:
# Display the combined results table as the cell output.
Final_results