Importing all Libraries


In [None]:
import pandas as panda #Manipulação de dados
import numpy as np #Cálculos numéricos/arrays
import matplotlib.pyplot as plot #Graficos
import os #Para trabalhar com caminhos de arquivos
from sklearn.model_selection import train_test_split #Split
from sklearn.tree import DecisionTreeClassifier #Decision tree

Accessing and preparing data


In [None]:
# Locate the folder that holds the dataset. `__file__` only exists when this
# code runs as a plain script; inside a Jupyter kernel it is normally undefined,
# so fall back to the current working directory in that case.
try:
    script_dir = os.path.dirname(os.path.abspath(__file__))
except NameError:
    script_dir = os.getcwd()
csv_path = os.path.join(script_dir, 'hungarian_heart_diseases.csv')
data = panda.read_csv(csv_path)

# All rows, every column except the last one
x = data.iloc[:, :-1]
# All rows, only the last column
y = data.iloc[:, -1]

**1st Question**

In [None]:
# Split the data into two groups: 80% for training, 20% for testing.
# The split is stratified on y and uses a fixed seed.
X_train, X_test, y_train, y_test = train_test_split(
    x,
    y,
    test_size=0.2,
    stratify=y,
    random_state=1,
)

leaf_capacity = [1, 3, 5, 10, 25, 50, 100]
train_accuracy = []
test_accuracy = []

# For each leaf_capacity value, fit a decision tree on the training split,
# then record its accuracy on the training split and on the test split.
for leaf in leaf_capacity:
    tree = DecisionTreeClassifier(min_samples_leaf=leaf, random_state=1)
    tree.fit(X_train, y_train)
    train_score = tree.score(X_train, y_train)
    test_score = tree.score(X_test, y_test)
    train_accuracy.append(train_score)
    test_accuracy.append(test_score)

Plotting the accuracy graph

In [None]:
# Draw both accuracy curves on one set of axes and write the figure to disk.
fig, ax = plot.subplots(figsize=(12, 8))

# Circles mark the training curve, squares mark the test curve.
ax.plot(leaf_capacity, train_accuracy, marker='o', label="Train Accuracy")
ax.plot(leaf_capacity, test_accuracy, marker='s', label="Test Accuracy")

ax.set_xlabel("Min Samples per Leaf")
ax.set_ylabel("Accuracy")
ax.set_title("Decision Tree Accuracy by Minimum Samples per Leaf")
ax.legend()
ax.grid(True)

# The image is saved for the Markdown cell below; the figure is then closed
# instead of being shown inline.
fig.savefig('80-20.png', dpi=300, bbox_inches='tight')
plot.close(fig)

<img src="80-20.png" alt="Decision Tree Accuracy" width="1200">

<div style="max-width: 800px; text-align: left;">

**2nd Question -> Graph Analysis**

By analyzing the graph, we can conclude that for a minimum leaf capacity of 1 the training accuracy is 100%, while the test accuracy is lower (around 77%). This indicates overfitting: the tree is overly complicated and is memorizing the training data. For larger minimum leaf capacities, the training accuracy decreases and the test accuracy increases, which is evidence of a tree that generalizes better. The best performance is obtained with a minimum leaf capacity of 25, where the test accuracy reaches its peak (around 86%).

For values >25, we encounter a case of underfitting: both accuracies plummet because the tree is excessively generalized. The minimum leaf size is so restrictive that it does not allow the necessary splits.

</div>


**3rd Question**

In [None]:
from sklearn.model_selection import train_test_split, ParameterGrid
from sklearn.metrics import accuracy_score
from sklearn.tree import plot_tree
import matplotlib.pyplot as plot
import pandas as panda
import os
from sklearn.tree import DecisionTreeClassifier #Decision tree

# Locate the folder that holds the dataset. `__file__` only exists when this
# code runs as a plain script; inside a Jupyter kernel it is normally undefined,
# so fall back to the current working directory in that case.
try:
    script_dir = os.path.dirname(os.path.abspath(__file__))
except NameError:
    script_dir = os.getcwd()
csv_path = os.path.join(script_dir, 'hungarian_heart_diseases.csv')
data = panda.read_csv(csv_path)

# All rows, every column except the last one
x = data.iloc[:, :-1]
# All rows, only the last column
y = data.iloc[:, -1]


60-20-20 split

In [None]:
# First cut: 60% of the rows go to training, the other 40% are set aside.
x_train, x_rest, y_train, y_rest = train_test_split(
    x,
    y,
    test_size=0.4,
    stratify=y,
    random_state=1,
)

# Second cut: the set-aside 40% is halved, giving 20% of the rows for
# validation and 20% for testing.
x_val, x_test, y_val, y_test = train_test_split(
    x_rest,
    y_rest,
    test_size=0.5,
    stratify=y_rest,
    random_state=1,
)

Finding the best parameters

In [None]:
# Hyperparameter search space: every combination of these values is tried.
parameters = {"max_depth": [2, 3, 4], "min_samples_split": range(2, 101)}
best_model = None
best_params = None
best_val_acc = 0
best_test_acc = 0

for params in ParameterGrid(parameters):
    classifier = DecisionTreeClassifier(
        max_depth=params["max_depth"],
        min_samples_split=params["min_samples_split"],
        random_state=1,
    )
    classifier.fit(x_train, y_train)
    val_acc = classifier.score(x_val, y_val)
    test_acc = classifier.score(x_test, y_test)

    # A candidate must reach at least 80% validation accuracy and 78.5% test
    # accuracy; among those, the one with the highest test accuracy is kept.
    # NOTE(review): ranking candidates by test accuracy uses the test split for
    # model selection, so the reported test score is optimistic. Ranking by
    # `val_acc > best_val_acc` would avoid that, but it may pick a different
    # tree than the one analysed in the write-up, so it is left unchanged here.
    if val_acc >= 0.80 and test_acc >= 0.785 and test_acc > best_test_acc:
        best_model = classifier
        best_val_acc = val_acc
        best_params = params
        best_test_acc = test_acc

# Fail here with a clear message instead of letting a later cell receive
# `best_model = None`.
if best_model is None:
    raise ValueError(
        "No hyperparameter combination reached 80% validation accuracy "
        "and 78.5% test accuracy."
    )

Plotting

In [None]:
# Render the selected tree (best_model) and save it as an image.
plot.figure(figsize=(12,8))
plot.suptitle("Decision Tree", fontsize=16, y=0.95)
# NOTE(review): tight_layout() runs before plot_tree() creates any axes, so it
# presumably has no effect here — TODO confirm. Kept in place so the rendered
# image stays the same.
plot.tight_layout()
# feature_names is passed as a plain list: some scikit-learn releases validate
# this argument strictly and reject a pandas Index.
# NOTE(review): class_names assumes the classifier's classes sort as
# (normal, heart disease) — confirm against best_model.classes_.
plot_tree(
    best_model,
    feature_names=list(x.columns),
    class_names=["Normal","Heart Disease"],
    filled=True,
)
# The image is saved for the Markdown cell below; the figure is then closed
# instead of being shown inline.
plot.savefig('60-20-20.png', dpi=300, bbox_inches='tight')
plot.close()

<img src="60-20-20.png" alt="Decision Tree" width="1200">

Decision tree explanation and analysis


<div style="max-width: 800px; text-align: left">
What identifies heart disease->

- With chest_pain <= 3.5 AND old_peak <= 0.5: 1 heart_disease -> 98.8% NORMAL

- With chest_pain <= 3.5 AND old_peak > 0.5: 6 heart_disease -> 66.7% NORMAL

- With chest_pain > 3.5: 10 normal -> 85.51% HEART DISEASE

- With chest_pain > 3.5 AND old_peak <= 0.75: 8 heart_disease -> 65.22% NORMAL    

- With chest_pain > 3.5 AND old_peak > 0.75: 36 heart_disease -> 100% HEART DISEASE

The principal factors that indicate a higher risk of heart disease for Hungarian citizens are elevated chest pain, being male,
and a higher ST deviation as a result of physical exercise, especially if there are discrepancies in a resting electrocardiographic test.
</div>