In [1]:
%load_ext autoreload
%autoreload 2

import sys
import os
sys.path.insert(0, os.path.join(os.path.abspath('.'),'..', 'src'))
import tree_utils, ctree

import numpy as np
import scipy as sc
from scipy import stats
import pandas as pd
import pickle

from sklearn.preprocessing import OneHotEncoder

In [2]:
# Folder holding the Parquet research-data exports.
# The network share stays the default so existing setups keep working; set the
# MINIECG_DATA_DIR environment variable to run the notebook where the share is
# mounted elsewhere.
data_dir = os.environ.get(
    'MINIECG_DATA_DIR',
    r'J:\Onderzoek\21-763_rvanes_MiniECG-2-Data\E_ResearchData\2_ResearchData\Parquet',
)

In [3]:
# Minimum relative frequency for a morphology category, handed to
# OneHotEncoder(min_frequency=...). As a float it is a FRACTION of the samples
# (0.10 == 10 %), not a percentage value; rarer categories are treated as
# infrequent by the encoder.
MIN_MORPHO_PRESENCE = 0.10 # fraction of samples, i.e. 10 %


In [4]:
def _clean_morphology(raw):
    """Return the first morphology label of ``raw`` without surrounding commas/spaces.

    Missing entries (``None`` or an empty sequence) and labels that are blank
    after cleaning are mapped to the placeholder ``"none"`` so that they form
    their own category in the one-hot encoding.
    """
    if raw is None or len(raw) == 0:
        return "none"
    label = raw[0].strip(",").strip(" ")
    return label if label.strip() != "" else "none"


# Load the full data set from the Parquet export.
DATA = pd.read_parquet(os.path.join(data_dir, 'DATA.parquet'))

# Column groups are identified purely by name.
morphology_columns = [c for c in DATA.columns if 'morphology' in c]
lead_columns = [c for c in DATA.columns if 'lead' in c and 'morphology' not in c]

# Reduce every morphology cell to a single cleaned label (one pass per column).
# Plain column assignment replaces the column instead of writing into the
# existing array, so the result does not depend on the column's original dtype.
for col in morphology_columns:
    DATA[col] = DATA[col].apply(_clean_morphology)

# One-hot encode the labels; MIN_MORPHO_PRESENCE is passed as min_frequency,
# so categories below that relative frequency are handled as infrequent.
OneHot = OneHotEncoder(drop=None, sparse_output=False, min_frequency=MIN_MORPHO_PRESENCE)
MorphologyOneHot = pd.DataFrame(
    data=OneHot.fit_transform(DATA[morphology_columns]),
    columns=OneHot.get_feature_names_out(morphology_columns),
    index=DATA.index,
)

In [5]:
# Swap the raw morphology columns for their one-hot encoding.
# One-hot columns left over from an earlier execution are dropped as well and
# missing columns are ignored, so re-running this cell neither raises a
# KeyError (raw columns already gone) nor duplicates the encoded columns.
stale_columns = [*morphology_columns, *MorphologyOneHot.columns]
DATA = pd.concat(
    [DATA.drop(columns=stale_columns, errors='ignore'), MorphologyOneHot],
    axis=1,
)

# Make tree

In [6]:
# JSON file with the custom tree rules.
# The original location stays the default; set MINIECG_RULES_PATH to point the
# notebook at a different copy without editing this cell.
rules_path = os.environ.get(
    'MINIECG_RULES_PATH',
    r'T:\laupodteam\AIOS\Bram\notebooks\code_dev\miniECG_interpretation\TreeBuilder\assets\conduction_tree.json',
)

# Keyword arguments handed to the tree classifier as `Tree_kwargs`.
# The keys match scikit-learn's DecisionTreeClassifier parameters.
TreeKwargs = {
    'criterion': 'gini',
    'splitter': 'best',
    'max_depth': 10,
    'min_samples_split': 10,
    'min_samples_leaf': 5,
    # Every leaf must carry at least 5 % of the total sample weight, which by
    # itself limits the tree to 20 leaves; max_leaf_nodes=50 is therefore the
    # looser of the two bounds.
    'min_weight_fraction_leaf': 0.05,
    'max_features': None,
    'random_state': 7,  # fixed seed for reproducible fits
    'max_leaf_nodes': 50,
    'class_weight': 'balanced',
}

In [7]:
# Load the rule definitions found at `rules_path` and fetch their processed
# form; `processed_rules` is what the classifier receives as `custom_rules`,
# while `rules_loader` also exposes the column settings read further down.
rules_loader = ctree.LoadRules(rules_path)
processed_rules = rules_loader.get_processed_rules()

In [8]:
# Inspection only: show what the processed rules hold in `features_to_use_next`
# (the recorded output of this cell is an empty list).
processed_rules.features_to_use_next

[]

In [9]:
# Column settings exposed by the rules loader, bound in one step:
# the fold/split column, the target column, the columns to ignore and the
# (possibly empty) list of features to use.
SplitColumn, TargetCol, IgnoreCols, FeaturesToUse = (
    rules_loader.fold_split_col,
    rules_loader.target_col,
    rules_loader.ignore_cols,
    rules_loader.features_to_use,
)

In [10]:
# --- Column selection -------------------------------------------------------
# Start from the explicit feature list when one is given, otherwise from every
# column, and remove the ignored columns. A list (not a set) is used because
# pandas does not accept a set as an indexer and because a set would make the
# feature order - and with it the fitted tree - vary between runs.
# dict.fromkeys() removes duplicates while preserving the original order.
ignored_columns = set(IgnoreCols)
if len(FeaturesToUse) > 0:
    keep_columns = [c for c in dict.fromkeys(FeaturesToUse) if c not in ignored_columns]
else:
    keep_columns = [c for c in DATA.columns if c not in ignored_columns]

# The target and the split indicator are never model inputs. Filtering them out
# here (instead of dropping them afterwards) also works when they are missing
# from the feature list or are themselves listed in IgnoreCols.
feature_columns = [c for c in keep_columns if c not in (TargetCol, SplitColumn)]

# --- Row selection ----------------------------------------------------------
is_train = DATA[SplitColumn] == 'training'
is_test = DATA[SplitColumn] == 'test'

# Fail early with a readable message instead of the estimator's
# "Found array with 0 sample(s)" error further down.
for split_label, split_mask in (('training', is_train), ('test', is_test)):
    if not split_mask.any():
        found_values = sorted(map(str, DATA[SplitColumn].dropna().unique()))
        raise ValueError(
            f"No rows with {SplitColumn!r} == {split_label!r}; "
            f"values present in that column: {found_values}"
        )

y_train = DATA.loc[is_train, TargetCol]
y_test = DATA.loc[is_test, TargetCol]
X_train = DATA.loc[is_train, feature_columns]
X_test = DATA.loc[is_test, feature_columns]

In [11]:
# Build the rule-aware tree classifier: the processed rules are supplied as
# `custom_rules` and the hyper-parameter dict as `Tree_kwargs`.
# NOTE(review): presumably Tree_kwargs is forwarded to scikit-learn's
# DecisionTreeClassifier (the estimator named in the recorded error) - confirm in ctree.
clf = ctree.CustomDecisionTreeV2(custom_rules=processed_rules, Tree_kwargs=TreeKwargs)

In [12]:
# Fit on the training split, then predict the test split.
# NOTE(review): the recorded output of this cell is a ValueError ("Found array
# with 0 sample(s) (shape=(0, 67)) ..."), i.e. a DecisionTreeClassifier call
# received zero rows. Whether that comes from an empty X_train/X_test or from an
# empty partition created inside ctree cannot be told from this notebook - confirm.
clf.fit(X_train, y_train)
# Make predictions
y_pred = clf.predict(X_test)

ValueError: Found array with 0 sample(s) (shape=(0, 67)) while a minimum of 1 is required by DecisionTreeClassifier.

In [None]:
# Calculate accuracy of the test-set predictions and print it with four decimals.
# NOTE(review): accuracy_score is reached through the ctree namespace; presumably
# it is sklearn.metrics.accuracy_score re-exported by that module - confirm.
# This cell needs `y_pred` from the fit/predict cell and has no recorded execution.
accuracy = ctree.accuracy_score(y_test, y_pred)
print(f"Model accuracy: {accuracy:.4f}")