In [3]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.tree import DecisionTreeClassifier
from sklearn.metrics import accuracy_score
from sklearn.tree import export_graphviz
from IPython.display import Image
import pydotplus

# Load the raw heart-disease table.
df = pd.read_csv('heart-disease-dataset2.csv')

# '?' is used as a missing-value marker; convert it (and any other
# non-numeric token) to NaN so that every column ends up numeric.
df = df.replace('?', np.nan).apply(pd.to_numeric, errors='coerce')

# A row without a label cannot be used for supervised learning. Filling the
# label with the column mean would invent a fractional class that does not
# exist, so such rows are dropped instead of imputed.
df = df.dropna(subset=['result'])

# Impute the feature columns only. Passing a Series to fillna fills just the
# columns named in its index, so 'result' is never touched.
# NOTE(review): the means are computed on the full dataset, i.e. before the
# train/test split, so test-set statistics leak into the training features.
# Fitting the imputation on the training split only would be cleaner.
feature_means = df.drop(columns=['result']).mean()
df = df.fillna(feature_means)

# Separate the features from the target label.
X = df.drop(columns=['result'])
y = df['result']

# Hold out 20% of the rows for testing; the fixed seed keeps the split reproducible.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)


def decision_tree_analysis(criterion='gini', max_depth=None):
    """Train a decision tree, report its test accuracy and save a PNG of it.

    The model is fit on the module-level ``X_train``/``y_train`` split and
    scored on ``X_test``/``y_test``. The rendered tree is written to
    ``decision_tree_<criterion>_depth_<max_depth>.png`` in the working
    directory.

    Args:
        criterion: Split-quality measure passed to ``DecisionTreeClassifier``
            (for example ``'gini'`` or ``'entropy'``).
        max_depth: Maximum depth of the tree; ``None`` leaves it unrestricted.

    Returns:
        The accuracy on the test split (fraction of correctly classified
        samples), so callers can compare runs without hard-coding numbers.
    """
    dt = DecisionTreeClassifier(criterion=criterion, max_depth=max_depth, random_state=42)
    dt.fit(X_train, y_train)
    y_pred = dt.predict(X_test)
    accuracy = accuracy_score(y_test, y_pred)
    print(f'Accuracy with {criterion} and max_depth={max_depth}: {accuracy}')

    # Take the feature names from the frame the tree was actually fit on and
    # hand them over as a plain list of strings.
    dot_data = export_graphviz(
        dt,
        out_file=None,
        feature_names=list(X_train.columns),
        class_names=True,
        filled=True,
        rounded=True,
    )
    graph = pydotplus.graph_from_dot_data(dot_data)
    image_path = f'decision_tree_{criterion}_depth_{max_depth}.png'
    graph.write_png(image_path)
    print(f'Decision Tree visualization saved as: {image_path}')

    return accuracy


# Run the three configurations and keep the measured accuracies, so that the
# summary below always reflects the numbers produced by this very run.
gini_full_acc = decision_tree_analysis(criterion='gini')


gini_depth3_acc = decision_tree_analysis(criterion='gini', max_depth=3)


entropy_full_acc = decision_tree_analysis(criterion='entropy')


print("\nComparison of Decision Tree Accuracies:\n")
print(
    "- Decision Tree with Gini Index and no max depth (max_depth=None) "
    f"has an accuracy of {gini_full_acc:.2%}. This model might be overfitting "
    "as it is allowed to grow without restrictions."
)

# Only claim an improvement from depth limiting when it was actually measured.
if gini_depth3_acc > gini_full_acc:
    print(
        "- Decision Tree with Gini Index and max depth set to 3 (max_depth=3) "
        f"shows improved accuracy at {gini_depth3_acc:.2%}. Limiting the depth "
        "likely helped in reducing overfitting, resulting in better performance "
        "on the test set."
    )
else:
    print(
        "- Decision Tree with Gini Index and max depth set to 3 (max_depth=3) "
        f"has an accuracy of {gini_depth3_acc:.2%}. Limiting the depth did not "
        "improve performance on the test set."
    )

# Only state that the criterion makes no difference when the scores agree.
if np.isclose(entropy_full_acc, gini_full_acc):
    print(
        "- Decision Tree with Entropy and no max depth (max_depth=None) "
        f"also has an accuracy of {entropy_full_acc:.2%}, indicating that simply "
        "changing the criterion to Entropy without limiting depth does not "
        "significantly affect performance."
    )
else:
    print(
        "- Decision Tree with Entropy and no max depth (max_depth=None) "
        f"has an accuracy of {entropy_full_acc:.2%}, compared with "
        f"{gini_full_acc:.2%} for the Gini Index at the same unlimited depth."
    )

# Pick the overall winner from the measured values instead of assuming it.
results = {
    "Gini Index and no max depth": gini_full_acc,
    "Gini Index and a max depth of 3": gini_depth3_acc,
    "Entropy and no max depth": entropy_full_acc,
}
if gini_depth3_acc > max(gini_full_acc, entropy_full_acc):
    print(
        "\nOverall, the Decision Tree with Gini Index and a max depth of 3 "
        "provides the best accuracy among the three, suggesting that controlling "
        "the depth of the tree can be beneficial for preventing overfitting and "
        "improving model performance on unseen data."
    )
else:
    best_label = max(results, key=results.get)
    print(
        f"\nOverall, the Decision Tree with {best_label} provides the best "
        f"accuracy among the three ({results[best_label]:.2%}); limiting the "
        "depth to 3 did not give the best result on this test split."
    )


Accuracy with gini and max_depth=None: 0.4918032786885246
Decision Tree visualization saved as: decision_tree_gini_depth_None.png
Accuracy with gini and max_depth=3: 0.5409836065573771
Decision Tree visualization saved as: decision_tree_gini_depth_3.png
Accuracy with entropy and max_depth=None: 0.4918032786885246
Decision Tree visualization saved as: decision_tree_entropy_depth_None.png

Comparison of Decision Tree Accuracies:

- Decision Tree with Gini Index and no max depth (max_depth=None) has an accuracy of 49.18%. This model might be overfitting as it is allowed to grow without restrictions.
- Decision Tree with Gini Index and max depth set to 3 (max_depth=3) shows improved accuracy at 54.10%. Limiting the depth likely helped in reducing overfitting, resulting in better performance on the test set.
- Decision Tree with Entropy and no max depth (max_depth=None) also has an accuracy of 49.18%, indicating that simply changing the criterion to Entropy without limiting depth does not s