In [1]:
%load_ext autoreload
%autoreload 2

import sys
import os
# Put ../src (resolved against the current working directory) at the front of
# the module search path so the `import tree_utils, ctree` on the next line can
# resolve -- presumably both modules live in that folder; confirm.
sys.path.insert(0, os.path.join(os.path.abspath('.'),'..', 'src'))
import tree_utils, ctree

import numpy as np
import scipy as sc
from scipy import stats
import pandas as pd
import pickle
import json

from sklearn.preprocessing import OneHotEncoder
from sklearn.preprocessing import LabelEncoder, LabelBinarizer
from sklearn.tree import DecisionTreeClassifier

from sklearn.metrics import accuracy_score
from sklearn.linear_model import LinearRegression
from sklearn.ensemble import RandomForestRegressor
from sklearn.experimental import enable_iterative_imputer
from sklearn.impute import SimpleImputer, IterativeImputer, KNNImputer

In [2]:
# Directory that holds the Parquet research data (DATA.parquet is read from it).
# The network-share location is only the default: set the MINIECG_DATA_DIR
# environment variable to run the notebook against another copy of the data
# without editing this cell.
data_dir = os.environ.get(
    'MINIECG_DATA_DIR',
    r'J:\Onderzoek\21-763_rvanes_MiniECG-2-Data\E_ResearchData\2_ResearchData\Parquet',
)

In [3]:
# Minimum share of samples a morphology category must reach to keep its own
# one-hot column. The value is handed to OneHotEncoder(min_frequency=...), where
# a float is interpreted as a FRACTION of the samples: 0.15 means 15 %, not 0.15 %.
MIN_MORPHO_PRESENCE = 0.15 # fraction of samples (0.15 == 15 %)


In [4]:
# Load the full dataset and group its columns by name.
DATA = pd.read_parquet(os.path.join(data_dir, 'DATA.parquet'))

morphology_columns = [name for name in DATA.columns if 'morphology' in name]
lead_columns = [name for name in DATA.columns
                if 'lead' in name and 'morphology' not in name]


def _clean_morphology(raw):
    """Return the first entry of `raw` stripped of surrounding commas and spaces.

    Entries that are blank after stripping are replaced by the literal "none".
    """
    label = raw[0].strip(",").strip(" ")
    return label if label.strip() != "" else "none"


# Each morphology cell is indexed with [0] before cleaning, i.e. only its first
# entry is kept.
for c in morphology_columns:
    DATA.loc[:, c] = DATA[c].apply(_clean_morphology)

# One-hot encode the cleaned labels; categories rarer than MIN_MORPHO_PRESENCE
# are handled by the encoder's `min_frequency` setting.
# NOTE(review): the encoder is fitted on every row of DATA, i.e. before the
# training/test split that is applied further down in this notebook.
OneHot = OneHotEncoder(min_frequency=MIN_MORPHO_PRESENCE,
                       sparse_output=False,
                       drop=None)
_morphology_encoded = OneHot.fit_transform(DATA[morphology_columns])
MorphologyOneHot = pd.DataFrame(
    _morphology_encoded,
    index=DATA.index,
    columns=OneHot.get_feature_names_out(morphology_columns),
)

In [5]:
# Swap the raw morphology text columns for their one-hot encoded counterparts
# (both frames share DATA's index, so the concat is a column-wise join).
DATA = pd.concat(
    [DATA.drop(columns=morphology_columns), MorphologyOneHot],
    axis=1,
)

In [6]:
# Step 1: collapse the raw (possibly combined) diagnosis labels onto a single
# primary label. Any label that is not a key here maps to NaN.
_primary_diagnosis = {
    'SR': 'SR',
    'BF': 'BF',
    'RBBB': 'RBBB',
    'LBBB': 'LBBB',
    'LAFB': 'LAFB',
    'LAFB , LVH': 'LAFB',
    'Microvoltages , BF': 'BF',
    'Microvoltages , RBBB': 'RBBB',
    'Microvoltages , LAFB': 'LAFB',
    'LVH , BF': 'BF',
    'LVH , RBBB': 'RBBB',
    'LVH , LBBB': 'LBBB',
}

# Step 2: reduce the primary labels to a binary target.
Reduction_map = {
    **dict.fromkeys(['BF', 'LBBB', 'RBBB', 'LAFB'], 'Disease'),
    'SR': 'Normal',
}

# Apply both mappings in sequence and discard the rows whose diagnosis could
# not be mapped (they are NaN after the first step and stay NaN in the second).
DATA = (
    DATA
    .assign(Diagnosis=lambda frame: frame.Diagnosis
            .map(_primary_diagnosis)
            .map(Reduction_map))
    .dropna(subset=['Diagnosis'])
)


# Make tree

In [7]:
# JSON file with the custom tree rules. The network-share location is only the
# default: set the MINIECG_RULES_PATH environment variable to point the notebook
# at another rules file without editing this cell.
rules_path = os.environ.get(
    'MINIECG_RULES_PATH',
    r'T:\laupodteam\AIOS\Bram\notebooks\code_dev\miniECG_interpretation\TreeBuilder\assets\conduction_tree.json',
)

# Hyper-parameters shared by the baseline DecisionTreeClassifier and the custom
# tree (which receives them through its `Tree_kwargs` argument).
TreeKwargs = dict(
    criterion='gini',
    splitter='best',
    max_depth=5,
    min_samples_split=10,
    min_samples_leaf=5,
    min_weight_fraction_leaf=0.05,
    max_features=None,
    random_state=7,            # fixed seed so repeated fits give the same tree
    max_leaf_nodes=50,
    class_weight='balanced',
)

In [80]:
# Build the rules loader from the JSON file at `rules_path`.
rules_loader = ctree.LoadRules(rules_path)
# Processed form of the rules; passed to the custom tree as `custom_rules` below.
processed_rules = rules_loader.get_processed_rules()

In [81]:
# Display the `features_to_use_next` attribute of the processed rules
# (the recorded output below shows an empty list).
processed_rules.features_to_use_next

[]

In [10]:
# Column-role configuration carried by the rules loader:
#   SplitColumn   - column whose values ('training' / 'test') define the split
#   TargetCol     - label column
#   IgnoreCols    - columns excluded from the feature set
#   FeaturesToUse - explicit feature list; empty means "all remaining columns"
SplitColumn, TargetCol = rules_loader.fold_split_col, rules_loader.target_col
IgnoreCols, FeaturesToUse = rules_loader.ignore_cols, rules_loader.features_to_use

In [11]:
# Columns to keep: either the explicit feature list or every column of DATA,
# in both cases minus the ignored columns.
# `keep_columns` is always an ordered list: pandas no longer accepts a set as a
# `.loc` column indexer, and a set would also make the feature order (and thus
# the fitted tree) vary between runs.
_ignored_columns = set(IgnoreCols)
if len(FeaturesToUse) > 0:
    # dict.fromkeys de-duplicates while keeping the order given in the rules.
    keep_columns = [c for c in dict.fromkeys(FeaturesToUse)
                    if c not in _ignored_columns]
else:
    keep_columns = [c for c in DATA.columns if c not in _ignored_columns]

# The target and split columns are never predictors. Filtering them out here
# (instead of dropping them afterwards) also works when they are absent from
# `keep_columns`, e.g. not part of FeaturesToUse or listed in IgnoreCols.
_feature_columns = [c for c in keep_columns if c not in (TargetCol, SplitColumn)]

_is_training = DATA[SplitColumn] == 'training'
_is_test = DATA[SplitColumn] == 'test'

y_train = DATA.loc[_is_training, TargetCol]
y_test = DATA.loc[_is_test, TargetCol]
X_train = DATA.loc[_is_training, _feature_columns]
X_test = DATA.loc[_is_test, _feature_columns]

In [12]:
lb = LabelBinarizer()  # instantiated here but not used in the cells shown

# Fit the integer label encoding on the training labels only, then apply the
# same encoding to both splits.
lbe = LabelEncoder().fit(y_train)
y_train_encoded, y_test_encoded = (
    lbe.transform(labels) for labels in (y_train, y_test)
)

In [13]:
# Number of missing values per training feature ...
missings = X_train.isnull().sum()
# ... and how many features have at least one missing value.
missings.loc[missings.gt(0)].shape

(34,)

In [14]:
# Iterative imputation with a linear-regression estimator, fitted on the
# training split only. `add_indicator=True` appends missing-indicator columns,
# so the output has more columns than the input.
Imputer = IterativeImputer(
    estimator=LinearRegression(),
    max_iter=20,
    add_indicator=True,
    verbose=1,
)
Imputer.fit(X_train)

# Wrap the imputed arrays back into DataFrames. No index is passed, so both
# frames get a fresh 0..n-1 index (row positions), not the index of DATA.
X_train_imputed, X_test_imputed = (
    pd.DataFrame(data=Imputer.transform(split),
                 columns=Imputer.get_feature_names_out())
    for split in (X_train, X_test)
)

[IterativeImputer] Completing matrix with shape (1081, 63)
[IterativeImputer] Change: 58.31640962580376, scaled tolerance: 1.812 
[IterativeImputer] Change: 36.94952394885863, scaled tolerance: 1.812 
[IterativeImputer] Change: 13.00268372738287, scaled tolerance: 1.812 
[IterativeImputer] Change: 3.476502693501251, scaled tolerance: 1.812 
[IterativeImputer] Change: 3.0009677362383527, scaled tolerance: 1.812 
[IterativeImputer] Change: 2.397634645181521, scaled tolerance: 1.812 
[IterativeImputer] Change: 1.8144918346494925, scaled tolerance: 1.812 
[IterativeImputer] Change: 1.3742119567232078, scaled tolerance: 1.812 
[IterativeImputer] Early stopping criterion reached.
[IterativeImputer] Completing matrix with shape (1081, 63)
[IterativeImputer] Completing matrix with shape (270, 63)


In [82]:
# Custom tree driven by the processed rules; it receives the same tree
# hyper-parameters as the baseline through `Tree_kwargs`. No prune threshold is set.
clf = ctree.CustomDecisionTreeV2(custom_rules=processed_rules,
                                 prune_threshold=None,
                                 Tree_kwargs=TreeKwargs)
# Baseline for comparison: a plain scikit-learn decision tree with identical settings.
clf_base = DecisionTreeClassifier(**TreeKwargs)

In [83]:
# Fit the baseline tree on the imputed training features and encoded labels.
clf_base.fit(X_train_imputed, y_train_encoded)

In [84]:
# Fit the rule-based custom tree on exactly the same training data as the baseline.
clf.fit(X_train_imputed, y_train_encoded)

INFO:ctree:Starting fit method
INFO:ctree:Number of features: 97
INFO:ctree:Number of classes: 2
INFO:ctree:Features to consider: ['qrs_vector mean lead_0', 'p_vector mean lead_0', 't_vector mean lead_0', 'qrs_ampl mean lead_0', 'qrs_vector mean lead_1', 'p_vector mean lead_1', 't_vector mean lead_1', 'qrs_ampl mean lead_1', 'qrs_vector mean lead_2', 'p_vector mean lead_2', 't_vector mean lead_2', 'qrs_ampl mean lead_2', 'qrs_vector mean lead_3', 'p_vector mean lead_3', 't_vector mean lead_3', 'qrs_ampl mean lead_3', 'qrs_vector mean lead_4', 'p_vector mean lead_4', 't_vector mean lead_4', 'qrs_ampl mean lead_4', 'qrs_vector mean lead_5', 'p_vector mean lead_5', 't_vector mean lead_5', 'qrs_ampl mean lead_5', 'qrs_vector mean lead_6', 'p_vector mean lead_6', 't_vector mean lead_6', 'qrs_ampl mean lead_6', 'qrs_vector mean lead_7', 'p_vector mean lead_7', 't_vector mean lead_7', 'qrs_ampl mean lead_7', 'QQ_intervals mean', 'HRs mean', 'PQ_duration mean', 'QT_duration mean', 'QTc_duratio

In [85]:
# Retrieve the rules as enriched by the fitted custom tree
# (NOTE(review): what "enriched" adds is defined in ctree -- confirm there).
enriched_rules = clf.get_enriched_rules()

In [86]:
# Tree representation of the fitted custom-rules model; passed to
# ctree.update_html in the next cell.
final_tree = clf.get_custom_rules_model()

In [87]:
# Hand the final tree to the HTML template. The path is relative to the
# notebook's working directory.
# NOTE(review): presumably this rewrites treeTemplate.html in place -- confirm in ctree.
ctree.update_html(tree=final_tree, html_path="../src/treeTemplate.html")

In [35]:
# Class probabilities of the baseline tree on the training and test splits
# (columns follow the encoded class order).
base_probas_train, base_probas_test = (
    clf_base.predict_proba(features)
    for features in (X_train_imputed, X_test_imputed)
)

In [36]:
# Class probabilities of the custom rule-based tree on the same two splits.
cust_probas_train, cust_probas_test = (
    clf.predict_proba(features)
    for features in (X_train_imputed, X_test_imputed)
)

In [37]:
# Pearson correlation between the two models' test-set probabilities for the
# class encoded as 1 (column index 1); [0,1] picks the off-diagonal entry of
# the 2x2 correlation matrix.
np.corrcoef(base_probas_test[:,1], cust_probas_test[:,1])[0,1]

0.8628637495085457

In [92]:
# Index labels of the training rows whose mean QRS duration is at least 110.
# X_train_imputed carries a default 0..n-1 index, so these labels double as
# row positions.
_wide_qrs = X_train_imputed['QRS_duration mean'].ge(110)
sel = X_train_imputed.index[_wide_qrs]

In [101]:
# Share of the selected training rows whose encoded label is 0, plus the size
# of the selection. `y_train_encoded` is indexed by position here, which only
# works because `sel` comes from X_train_imputed's default 0..n-1 index.
# NOTE(review): with the labels 'Disease'/'Normal', LabelEncoder's sorted class
# order would make 0 == 'Disease' -- confirm via lbe.classes_.
1-y_train_encoded[sel].mean(), len(sel)

(0.9642857142857143, 448)