In [1]:
%load_ext autoreload
%autoreload 2

import sys
import os
sys.path.insert(0, os.path.join(os.path.abspath('.'),'..', 'src'))
import tree_utils, ctree

import numpy as np
import scipy as sc
from scipy import stats
import pandas as pd
import pickle
import json
import tqdm

from sklearn.preprocessing import OneHotEncoder
from sklearn.preprocessing import LabelEncoder, LabelBinarizer
from sklearn.tree import DecisionTreeClassifier
from sklearn.model_selection import RepeatedStratifiedKFold

from sklearn.metrics import accuracy_score
from sklearn.linear_model import LinearRegression
from sklearn.ensemble import RandomForestRegressor
from sklearn.experimental import enable_iterative_imputer
from sklearn.impute import SimpleImputer, IterativeImputer, KNNImputer

# TODO: cross validation, output all HTMLs AND add JSONs

In [2]:
# Absolute Windows network-share locations used by this notebook.
# data_dir:   folder that DATA.parquet is read from.
# output_dir: folder that receives the exported feature table, the per-fold
#             tree JSON/HTML files and the results CSV/parquet.
# NOTE(review): both paths are machine-specific; the notebook only runs where
# these shares are mounted.
data_dir = r'J:\Onderzoek\21-763_rvanes_MiniECG-2-Data\E_ResearchData\2_ResearchData\Parquet'
output_dir = r'J:\Onderzoek\21-763_rvanes_MiniECG-2-Data\G_Output\2_Data\CustomTree'

In [3]:
# --- Run configuration -------------------------------------------------------

# Cross-validation layout: num_splits folds, repeated num_repeats times.
num_splits = 10
num_repeats = 10

# True  -> keep the individual diagnosis classes.
# False -> collapse the diagnoses to 'Disease' / 'Normal'.
MULTI_CLASS = True

# Passed as `min_frequency` to the morphology OneHotEncoder. It is a fraction
# of the samples (0.15 means 15 %, not 0.15 %).
MIN_MORPHO_PRESENCE = 0.15

# Suffix appended to every output file name so that multi-class and binary
# runs do not overwrite each other.
if MULTI_CLASS:
    MULTI_CLASS_STRING = "_MultiClass"
else:
    MULTI_CLASS_STRING = ""

In [4]:
# Load the full dataset from the research-data share.
DATA = pd.read_parquet(os.path.join(data_dir, 'DATA.parquet'))

# Column groups, selected by substring of the column name.
morphology_columns = [name for name in DATA.columns if 'morphology' in name]
lead_columns = [name for name in DATA.columns
                if 'lead' in name and 'morphology' not in name]


def _clean_morphology(raw):
    """Return the cleaned morphology label for one cell value.

    Takes the first element of ``raw``, strips surrounding commas and then
    surrounding spaces, and substitutes "none" when nothing but whitespace
    is left.
    """
    label = raw[0].strip(",").strip(" ")
    if label.strip() != "":
        return label
    return "none"


# Normalise every morphology column in place before encoding.
for c in morphology_columns:
    DATA.loc[:, c] = DATA[c].apply(_clean_morphology)

# One-hot encode the morphology labels; MIN_MORPHO_PRESENCE is handed to the
# encoder as its `min_frequency` threshold.
OneHot = OneHotEncoder(drop=None, sparse_output=False, min_frequency=MIN_MORPHO_PRESENCE)
_morphology_encoded = OneHot.fit_transform(DATA[morphology_columns])
MorphologyOneHot = pd.DataFrame(
    data=_morphology_encoded,
    columns=OneHot.get_feature_names_out(morphology_columns),
    index=DATA.index,
)

In [5]:
# Replace the raw morphology columns by their one-hot encoding.
# Note: this rebinds DATA, so the cell cannot be re-run without first
# re-running the loading cell (the morphology columns are gone afterwards).
DATA = pd.concat(
    [DATA.drop(morphology_columns, axis=1), MorphologyOneHot],
    axis=1,
)

In [6]:
# Collapse the raw diagnosis strings onto the primary diagnosis.
# Series.map with a dict turns every label that is NOT a key below into NaN;
# those rows are removed by the dropna at the end of this cell.
DATA = DATA.assign(Diagnosis=DATA.Diagnosis.map({
    'SR': 'SR',
    'BF': 'BF',
    'RBBB': 'RBBB',
    'LBBB': 'LBBB',
    'LAFB': 'LAFB',
    'LAFB , LVH': 'LAFB',
    'Microvoltages , BF': 'BF',
    'Microvoltages , RBBB': 'RBBB',
    'Microvoltages , LAFB': 'LAFB',
    'LVH , BF': 'BF',
    'LVH , RBBB': 'RBBB',
    'LVH , LBBB': 'LBBB',
}))

# Binary reduction: every diagnosis other than 'SR' counts as 'Disease'.
Reduction_map = {'BF': 'Disease',
                 'LBBB': 'Disease',
                 'RBBB': 'Disease',
                 'LAFB': 'Disease',
                 'SR': 'Normal'}

# Only applied for the binary run; the multi-class run keeps the labels above.
if not MULTI_CLASS:
    DATA = DATA.assign(Diagnosis=DATA.Diagnosis.map(Reduction_map))

# Drop rows whose diagnosis could not be mapped (NaN after the map calls).
DATA = DATA.dropna(subset=['Diagnosis'])


# Make tree

In [7]:
# JSON file with the hand-written tree rules, read by ctree.LoadRules.
rules_path = r'T:\laupodteam\AIOS\Bram\notebooks\code_dev\miniECG_interpretation\TreeBuilder\assets\conduction_tree.json'

# Keyword arguments shared by the baseline DecisionTreeClassifier and the
# custom tree (where they are passed as `Tree_kwargs`).
TreeKwargs = dict(
    criterion='gini',
    splitter='best',
    max_depth=3,
    min_samples_split=10,
    min_samples_leaf=5,
    min_weight_fraction_leaf=0.05,
    max_features=None,
    random_state=7,
    max_leaf_nodes=50,
    class_weight='balanced',
)

In [8]:
# Build the rules loader from the JSON file at rules_path and fetch the
# processed rules; these are later handed to the custom tree as `custom_rules`.
# NOTE(review): what "processed" entails is defined in ctree.LoadRules, which
# is not visible from this notebook.
rules_loader = ctree.LoadRules(rules_path)
processed_rules = rules_loader.get_processed_rules()

In [9]:
# Display the `features_to_use_next` attribute of the processed rules
# (an empty list in the recorded run).
processed_rules.features_to_use_next

[]

In [10]:
# Pull the column configuration off the rules loader:
#   SplitColumn / TargetCol   - excluded from the feature matrix; TargetCol
#                               is the label column.
#   IgnoreCols / FeaturesToUse - columns to drop / to restrict the features to.
SplitColumn, TargetCol = rules_loader.fold_split_col, rules_loader.target_col
IgnoreCols, FeaturesToUse = rules_loader.ignore_cols, rules_loader.features_to_use

In [11]:
# Select the candidate columns for the feature matrix.
#
# keep_columns is always an ordered list. A set difference was used here
# before, but the iteration order of a set of strings changes between
# interpreter sessions (hash randomisation), which made the column order of X
# - and everything derived from it - differ from run to run.
if len(FeaturesToUse) > 0:
    _ignored = set(IgnoreCols)
    # dict.fromkeys de-duplicates (as the set did) while keeping the order in
    # which the features are listed in the rules file.
    keep_columns = [c for c in dict.fromkeys(FeaturesToUse) if c not in _ignored]
else:
    # No explicit feature list: use every DATA column that is not ignored.
    keep_columns = [c for c in DATA.columns if c not in IgnoreCols]

# Repeated stratified K-fold: num_splits folds, repeated num_repeats times,
# with a fixed seed so the fold assignment is reproducible.
Splitter = RepeatedStratifiedKFold(n_splits=num_splits,
                                   n_repeats=num_repeats,
                                   random_state=7)

In [12]:
# Feature matrix: the kept columns minus the fold-split and target columns.
_non_feature_cols = [SplitColumn, TargetCol]
_feature_names = [name for name in keep_columns if name not in _non_feature_cols]
X = DATA[_feature_names]

# Target vector.
Y = DATA[TargetCol]

In [13]:
# `lb` is created here but not used by any cell visible in this notebook.
lb = LabelBinarizer()

# Fit the label encoder on the full target so that every fold shares the same
# class -> integer assignment.
lbe = LabelEncoder().fit(Y)

# Integer code -> class name, in the order of lbe.classes_.
TargetMap = dict(enumerate(lbe.classes_))

In [14]:
# Display the class names in encoded order (the values of TargetMap).
TargetMap.values()

dict_values(['BF', 'LAFB', 'LBBB', 'RBBB', 'SR'])

In [15]:
# Persist the (un-imputed) feature matrix next to the other outputs.
_features_path = os.path.join(output_dir, f'data{MULTI_CLASS_STRING}.parquet')
X.to_parquet(_features_path)

In [None]:
# Repeated stratified cross-validation.
#
# For every split: impute missing values (imputer fitted on the training part
# only), fit the custom rule-based tree and a plain DecisionTreeClassifier on
# the same data, export the custom tree as JSON and HTML, and collect the
# predicted class probabilities of both models for the train and test rows.
results_list = []
for i, (train_index, test_index) in tqdm.tqdm(enumerate(Splitter.split(X, Y)),
                                              total=num_splits * num_repeats):
    # Fold / repeat numbers derived from the running split counter.
    Fold = i % num_splits
    Repeat = i // num_splits

    # The splitter yields POSITIONAL indices, so both X and Y must be sliced
    # with .iloc. Plain Y[train_index] would look the integers up as index
    # labels, which breaks (KeyError or wrong rows) once rows have been
    # dropped from DATA and the index has gaps.
    X_train, X_test = X.iloc[train_index], X.iloc[test_index]
    Y_train, Y_test = Y.iloc[train_index], Y.iloc[test_index]

    y_train_encoded = lbe.transform(Y_train)
    y_test_encoded = lbe.transform(Y_test)

    # Fit the imputer on the training rows only and apply it to both parts.
    # add_indicator=True appends missing-value indicator columns.
    Imputer = IterativeImputer(LinearRegression(),
                               add_indicator=True, max_iter=25, verbose=1)
    Imputer.fit(X_train)

    imputed_columns = Imputer.get_feature_names_out()
    X_train_imputed = pd.DataFrame(data=Imputer.transform(X_train),
                                   columns=imputed_columns)
    X_test_imputed = pd.DataFrame(data=Imputer.transform(X_test),
                                  columns=imputed_columns)

    clf = ctree.CustomDecisionTreeV2(custom_rules=processed_rules,
                                     prune_threshold=None,
                                     Tree_kwargs=TreeKwargs,
                                     TargetMap=TargetMap,
                                     tot_max_depth=5)
    clf_base = DecisionTreeClassifier(**TreeKwargs)

    clf_base.fit(X_train_imputed, y_train_encoded)
    clf.fit(X_train_imputed, y_train_encoded)
    enriched_rules = clf.get_enriched_rules()
    final_tree = clf.get_custom_rules_model()

    # Export the fitted custom tree for this split as JSON and as HTML.
    tree_stem = os.path.join(output_dir, f"tree_Fold{Fold}_{Repeat}{MULTI_CLASS_STRING}")
    # Context manager so the file is flushed and closed on every iteration.
    with open(tree_stem + ".json", mode='w') as tree_file:
        json.dump(final_tree, tree_file)
    ctree.update_html(tree=final_tree,
                      html_path="../src/treeTemplate.html",
                      output_path=tree_stem + ".html")

    cust_probas_train = clf.predict_proba(X_train_imputed)
    cust_probas_test = clf.predict_proba(X_test_imputed)

    base_probas_train = clf_base.predict_proba(X_train_imputed)
    base_probas_test = clf_base.predict_proba(X_test_imputed)

    # One row per sample of this split, training rows first, then test rows.
    # 'indices' holds the positional row numbers within X / Y.
    result_df = pd.DataFrame()
    result_df['indices'] = np.hstack([train_index, test_index])
    result_df['Fold'] = Fold
    result_df['Repeat'] = Repeat
    result_df['Y_true'] = np.hstack([Y_train.values, Y_test.values])
    result_df[[f'Y_pred_normal_DT_{cname}' for cname in TargetMap.values()]] = np.vstack([base_probas_train, base_probas_test])
    result_df[[f'Y_pred_custom_DT_{cname}' for cname in TargetMap.values()]] = np.vstack([cust_probas_train, cust_probas_test])
    result_df['Dataset'] = ['train' for _ in train_index] + ['test' for _ in test_index]

    results_list.append(result_df)

# Stack the per-split tables and write them out as CSV (';'-separated) and parquet.
Final_results = pd.concat(results_list, axis=0)
Final_results.to_csv(os.path.join(output_dir, f"results{MULTI_CLASS_STRING}.csv"), index=False, sep=";")
Final_results.to_parquet(os.path.join(output_dir, f"results{MULTI_CLASS_STRING}.parquet"))


  0%|          | 0/100 [00:00<?, ?it/s]

[IterativeImputer] Completing matrix with shape (1215, 63)
