# Decision Tree

In [1]:
import pandas as pd
import numpy as np
import seaborn as sns

from sklearn.preprocessing import MinMaxScaler,StandardScaler,OneHotEncoder
import matplotlib.pyplot as plt
from sklearn.metrics import classification_report
from sklearn.metrics import confusion_matrix
from sklearn.model_selection import train_test_split

from sklearn.tree import DecisionTreeClassifier

In [2]:
# load the df
df = pd.read_csv('../../data/df_dummies.csv')

In [3]:
# train_test_split
features = df.drop(columns=['heart_attack_risk','income','smoking'])
target = df['heart_attack_risk']

X_train, X_test, y_train, y_test = train_test_split(features, target, test_size=0.20, random_state=0)

In [4]:
# normalization
normalizer = MinMaxScaler()
normalizer.fit(X_train)
X_train_norm = normalizer.transform(X_train)
X_test_norm = normalizer.transform(X_test)

X_train_norm = pd.DataFrame(X_train_norm,columns=X_train.columns)
X_test_norm = pd.DataFrame(X_test_norm,columns=X_test.columns)

## smote

In [5]:
from imblearn.over_sampling import SMOTE

In [6]:
sm = SMOTE(random_state=123,sampling_strategy=1.0)
X_train_SMOTE, y_train_SMOTE = sm.fit_resample(X_train_norm,y_train)

In [7]:
y_train_SMOTE.value_counts()

heart_attack_risk
0    4510
1    4510
Name: count, dtype: int64

In [8]:
# decision tree
tree = DecisionTreeClassifier(max_depth=10,random_state=10)

tree.fit(X_train_SMOTE, y_train_SMOTE)

pred_tree = tree.predict(X_test_norm)

In [9]:
print(classification_report(y_test,pred_tree))

              precision    recall  f1-score   support

           0       0.64      0.89      0.75      1114
           1       0.40      0.13      0.19       639

    accuracy                           0.61      1753
   macro avg       0.52      0.51      0.47      1753
weighted avg       0.55      0.61      0.54      1753



In [10]:
confusion_matrix(y_test, pred_tree)

array([[989, 125],
       [561,  78]], dtype=int64)

In [11]:
import plotly.express as px

In [12]:
cm = pd.DataFrame(confusion_matrix(y_test, pred_tree))
# Rename columns to predicted values - 0 = No Risk, 1 = Risk
cm.rename({0: 'No - True', 1: 'Yes - True'}, axis=1, inplace=True)
# Rename rows to real values - 0 = No Risk, 1 = Risk
cm.rename({0: 'No - Pred', 1: 'Yes - Pred'}, axis=0, inplace=True)
px.imshow(cm, text_auto=True, color_continuous_scale='RdBu', color_continuous_midpoint=0)

In [13]:
#check what are the most relevant features

tree_importance = {feature : importance for feature, importance in zip(X_train_norm.columns, tree.feature_importances_)}
tree_importance     

{'Unnamed: 0': 0.03092122846398604,
 'age': 0.038370590858409416,
 'cholesterol': 0.034850913332456455,
 'heart_rate': 0.038731535845582254,
 'diabetes': 0.0011749692144734502,
 'family_history': 0.001678527449247786,
 'smoking': 0.006070546073498451,
 'obesity': 0.004297615377681912,
 'alcohol_consumption': 0.0030353371373897466,
 'exercise_hours_per_week': 0.03856181012282725,
 'previous_heart_problems': 0.00322277270255575,
 'medication_use': 0.0,
 'stress_level': 0.009354855791900525,
 'sedentary_hours_per_day': 0.04940655554487886,
 'bmi': 0.052211189884456805,
 'triglycerides': 0.027577416231183288,
 'physical_activity_days_per_week': 0.01847525149979347,
 'sleep_hours_per_day': 0.6284750092920618,
 'sex_Male': 0.0,
 'continent_Asia': 0.0005595091497492623,
 'continent_Australia': 0.0,
 'continent_Europe': 0.002765734650640463,
 'continent_North America': 0.0,
 'continent_South America': 0.005486537158182294,
 'hemisphere_Southern Hemisphere': 0.0007833128096489668,
 'diet_Health

In [14]:
from sklearn.tree import export_text

tree_viz = export_text(tree, feature_names=list(X_train_norm.columns))
print(tree_viz)


|--- sleep_hours_per_day <= 1.00
|   |--- sleep_hours_per_day <= 0.83
|   |   |--- sleep_hours_per_day <= 0.83
|   |   |   |--- sleep_hours_per_day <= 0.67
|   |   |   |   |--- sleep_hours_per_day <= 0.00
|   |   |   |   |   |--- bmi <= 0.47
|   |   |   |   |   |   |--- cholesterol <= 0.15
|   |   |   |   |   |   |   |--- bmi <= 0.24
|   |   |   |   |   |   |   |   |--- heart_rate <= 0.99
|   |   |   |   |   |   |   |   |   |--- sedentary_hours_per_day <= 0.60
|   |   |   |   |   |   |   |   |   |   |--- class: 1
|   |   |   |   |   |   |   |   |   |--- sedentary_hours_per_day >  0.60
|   |   |   |   |   |   |   |   |   |   |--- class: 1
|   |   |   |   |   |   |   |   |--- heart_rate >  0.99
|   |   |   |   |   |   |   |   |   |--- class: 0
|   |   |   |   |   |   |   |--- bmi >  0.24
|   |   |   |   |   |   |   |   |--- physical_activity_days_per_week <= 0.25
|   |   |   |   |   |   |   |   |   |--- age <= 0.11
|   |   |   |   |   |   |   |   |   |   |--- class: 0
|   |   |   |   |  