# In [None]:

import sys
import os

# Make the local ``oracle`` directory importable. The quoted '__file__' is a
# plain string (not the module attribute), so abspath() resolves it against
# the current working directory; the appended path is <cwd>/oracle.
sys.path.append(os.path.join(os.path.dirname(os.path.abspath('__file__')), 'oracle'))

# The module must be imported (after sys.path is extended) before it can be
# referenced by name; without this import the call below raises NameError.
import oracle

# Fetch the hyperparameters associated with ID 23475 from the oracle module.
data = oracle.q3_hyper(23475)

import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split
from sklearn.tree import DecisionTreeClassifier, plot_tree
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score
from oracle import q3_hyper

# Load the dataset: column names are supplied explicitly and '?' entries are
# parsed as missing values.
columns = [
    "age", "sex", "cp", "trestbps", "chol", "fbs", "restecg",
    "thalach", "exang", "oldpeak", "slope", "ca", "thal", "target",
]
df = pd.read_csv(
    "heart+disease/processed.cleveland.data",
    names=columns,
    na_values='?',
)

# Data cleaning: drop rows with missing values, then collapse the target to
# binary (0 = No Disease, 1 = Disease).
df.dropna(inplace=True)
df["target"] = df["target"].gt(0).astype(int)

# Split into features / label and hold out 20% of the rows for testing.
y = df["target"]
X = df.drop(columns="target")
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42
)

# Get hyperparameters (unpacked as criterion, splitter, max_depth).
data = q3_hyper(23475)
criterion, splitter, max_depth = data

# Train the decision tree with the supplied hyperparameters.
clf = DecisionTreeClassifier(
    criterion=criterion,
    splitter=splitter,
    max_depth=max_depth,
)
clf.fit(X_train, y_train)

# Predict on the held-out split; the metrics below are computed from this.
y_pred = clf.predict(X_test)

# Define custom (hand-written) metric functions
def accuracy(y_true, y_pred):
    """Return the fraction of predictions that equal the true labels.

    Args:
        y_true: Ground-truth labels.
        y_pred: Predicted labels, compared element-wise against ``y_true``.

    Returns:
        The number of matching positions divided by ``len(y_true)``.
    """
    n_samples = len(y_true)
    n_matches = np.sum(y_true == y_pred)
    return n_matches / n_samples

def precision(y_true, y_pred):
    """Return the precision of binary predictions: TP / (TP + FP).

    Args:
        y_true: Ground-truth labels, with 1 marking the positive class
            and 0 the negative class.
        y_pred: Predicted labels using the same 0/1 encoding.

    Returns:
        TP / (TP + FP), or 0 when no sample was predicted positive (the
        ratio is undefined in that case).
    """
    TP = np.sum((y_pred == 1) & (y_true == 1))  # True Positives
    FP = np.sum((y_pred == 1) & (y_true == 0))  # False Positives
    predicted_positive = TP + FP
    if predicted_positive > 0:
        return TP / predicted_positive
    # An explicit return is required here: a bare ``0`` expression would be
    # evaluated and discarded, leaving the function to return None.
    return 0

def recall(y_true, y_pred):
    """Return the recall of binary predictions: TP / (TP + FN).

    Args:
        y_true: Ground-truth labels, with 1 marking the positive class.
        y_pred: Predicted labels using the same 0/1 encoding.

    Returns:
        TP / (TP + FN), or 0 when that denominator is not positive.
    """
    actual_positive = (y_true == 1)
    hits = np.sum((y_pred == 1) & actual_positive)    # true positives
    misses = np.sum((y_pred == 0) & actual_positive)  # false negatives
    denominator = hits + misses
    if not denominator > 0:
        return 0
    return hits / denominator

def f1_score(y_true, y_pred):
    """Return the F1 score (harmonic mean of precision and recall).

    The score is computed directly from true-positive, false-positive and
    false-negative counts, so the function is self-contained and does not
    depend on any other metric helper. Note that this definition rebinds the
    ``f1_score`` name imported from ``sklearn.metrics`` at the top of the
    file.

    Args:
        y_true: Ground-truth labels, with 1 marking the positive class
            and 0 the negative class.
        y_pred: Predicted labels using the same 0/1 encoding.

    Returns:
        2 * P * R / (P + R), or 0 when both precision and recall are 0.
    """
    tp = np.sum((y_pred == 1) & (y_true == 1))
    fp = np.sum((y_pred == 1) & (y_true == 0))
    fn = np.sum((y_pred == 0) & (y_true == 1))

    # Local names are deliberately not ``precision`` / ``recall`` so they do
    # not shadow the module-level metric functions of the same name.
    prec = tp / (tp + fp) if (tp + fp) > 0 else 0
    rec = tp / (tp + fn) if (tp + fn) > 0 else 0

    if (prec + rec) > 0:
        return 2 * (prec * rec) / (prec + rec)
    # Explicit return: a bare ``0`` expression would leave the result as None.
    return 0

# Evaluate the predictions on the held-out split. The results are stored
# under separate *_value names: assigning them to ``accuracy`` / ``precision``
# / ``recall`` would overwrite the metric functions with floats, so this
# block could not be executed a second time (e.g. when re-running the cell).
y_true = y_test.to_numpy()
accuracy_value = accuracy(y_true, y_pred)
precision_value = precision(y_true, y_pred)
recall_value = recall(y_true, y_pred)
f1 = f1_score(y_true, y_pred)

print(f"Accuracy: {accuracy_value:.4f}")
print(f"Precision: {precision_value:.4f}")
print(f"Recall: {recall_value:.4f}")
print(f"F1 Score: {f1:.4f}")

# Visualize the fitted decision tree with Matplotlib and save it to disk.
plt.figure(figsize=(100, 80))
plot_tree(
    clf,
    # Pass a plain list: some scikit-learn releases validate feature_names
    # as a list and reject a pandas Index.
    feature_names=list(X.columns),
    class_names=["No Disease", "Disease"],
    filled=True,
)
# Save before show(): the figure may be cleared once the window is closed.
plt.savefig("decision_tree.png")
plt.show()

# Find the feature with the highest importance in the fitted tree.
feature_importances = pd.Series(
    data=clf.feature_importances_,
    index=X.columns,
)
most_important_feature = feature_importances.idxmax()
print("Most Important Feature: {}".format(most_important_feature))

