# 1. Install scikit-learn & required library
This project requires numpy, pandas, matplotlib, and scikit-learn to be installed. Run the following code to install the requirements.

In [None]:
from tornado.escape import json_encode
from webencodings import labels
!python -m pip install -U scikit-learn
!python -m pip show scikit-learn
!python -c "import sklearn; sklearn.show_versions()"
!python -m pip install matplotlib
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd

# 2. Preparing the datasets
The following code downloads the Heart Disease dataset from its URL.

• Binary class dataset: The [UCI Heart Disease dataset](https://archive.ics.uci.edu/dataset/45/heart+disease) is used for classifying whether a
patient has heart disease or not based on age, blood pressure, cholesterol level, and other
medical indicators. This dataset includes 303 samples, with labels indicating presence (1) or
absence (0) of heart disease. Experiments with the Cleveland database have concentrated on
simply attempting to distinguish presence (values 1, 2, 3, 4) from absence (value 0), so the code below maps every non-zero value to 1.

In [None]:
# Location of the processed Cleveland data file of the UCI heart-disease collection.
heart_disease_db_url = (
    "https://archive.ics.uci.edu/ml/machine-learning-databases/"
    "heart-disease/processed.cleveland.data"
)
# Column names applied to the file when it is read; "num" is the column later used as the target.
heart_disease_columns = [
    "age",
    "sex",
    "cp",
    "trestbps",
    "chol",
    "fbs",
    "restecg",
    "thalach",
    "exang",
    "oldpeak",
    "slope",
    "ca",
    "thal",
    "num",
]

def split_dataset(_dataset: pd.DataFrame, targets: list):
    """
    Separate a dataset into its feature columns and its target columns.

    Names in ``targets`` that are not columns of ``_dataset`` are reported
    with a printed message and otherwise ignored.

    :param _dataset: The full dataset (features and targets together).
    :param targets: Names of the columns to treat as targets.
    :return: A dict with two DataFrames: ``"feature"`` (the dataset without
        the target columns that were found) and ``"target"`` (only the
        target columns that were found).
    """
    found = []
    not_found = []
    for name in targets:
        if name in _dataset.columns:
            found.append(name)
        else:
            not_found.append(name)

    if not_found:
        print("These columns are not found in the dataset:", not_found)

    features = _dataset.drop(columns=found)
    labels = _dataset[found]
    return {"feature": features, "target": labels}

# Fetch the dataset from the URL. Column names are supplied explicitly, and
# '?' cells are parsed as NaN at read time so the affected rows can be dropped.
raw_heart_db = pd.read_csv(heart_disease_db_url, names=heart_disease_columns, na_values='?')
raw_heart_db = raw_heart_db.dropna().astype(float)

# Collapse the target to binary: any value above 0 becomes 1, everything else 0.
raw_heart_db['num'] = (raw_heart_db['num'] > 0).astype('int64')

# Separate the 'num' target column from the feature columns.
dataset = split_dataset(raw_heart_db, targets=['num'])

# 3. Prepare, Building and Evaluating with the decision tree
> You must run the code in [Section #2](#2-preparing-the-datasets) before continuing!
### 3.1 Prepare the splits for the building of the decision tree
The following code splits the dataset into several train/test splits with predefined ratios.
The splits are structured as follows: `[ratio: float => (feature_train, feature_test, label_train, label_test)]`

In [None]:
from sklearn.model_selection import train_test_split

split_ratios = [0.6, 0.4, 0.2, 0.1] # test-size ratios, i.e. train/test 40/60 60/40 80/20 90/10
random_seed = 42                    # 42 for testing and cultural reference :)

def prepare_dataset(features, labels, test_size, seed=None):
    """
    Shuffle and split features/labels into train and test parts, stratified by label.

    :param features: Feature data to split.
    :param labels: Labels matching ``features``; also used for stratification.
    :param test_size: Test size ratio (test/(train+test))
    :param seed: Random state passed to the splitter; ``None`` leaves it unseeded.
    :return: feature_train, feature_test, label_train, label_test
    """
    split_options = {
        "test_size": test_size,
        "random_state": seed,
        "shuffle": True,
        "stratify": labels,
    }
    parts = train_test_split(features, labels, **split_options)
    return parts

def prepare_all_splits(features, labels, seed=None):
    """
    Build one train/test split for every ratio listed in ``split_ratios``.

    :param features: Feature data to split.
    :param labels: Labels matching ``features``.
    :param seed: Seed forwarded to ``prepare_dataset`` for every split.
    :return: dict mapping each test ratio to the result of ``prepare_dataset``
    """
    return {
        ratio: prepare_dataset(features, labels, test_size=ratio, seed=seed)
        for ratio in split_ratios
    }

# Build every train/test split once, keyed by test ratio, using the shared seed.
dataset_splits = prepare_all_splits(
    features=dataset['feature'],
    labels=dataset['target'],
    seed=random_seed,
)

### 3.2 Training

In [None]:
from sklearn.tree import DecisionTreeClassifier
from sklearn.metrics import accuracy_score

def train_and_evaluate(feature_train, label_train, max_depth=None) -> DecisionTreeClassifier:
    """
    Fit an entropy-criterion decision tree on the training data and return it.

    Despite the name, this function only trains; it computes no metrics.

    :param feature_train: Training features.
    :param label_train: Training labels.
    :param max_depth: Maximum tree depth; ``None`` is passed through unchanged.
    :return: The fitted classifier, seeded with ``random_seed``.
    """
    classifier = DecisionTreeClassifier(
        max_depth=max_depth,
        criterion='entropy',
        random_state=random_seed,
    )
    classifier.fit(feature_train, label_train)
    return classifier

In [None]:
def run_all_splits(dataset_name):
    """
    Train, plot and evaluate a depth-2 decision tree on every prepared split.

    For each test ratio in ``split_ratios`` the matching entry of
    ``dataset_splits`` is used to fit a tree via ``train_and_evaluate``. The
    tree is drawn, a classification report is printed, and the confusion
    matrix is shown.

    :param dataset_name: Name shown in the header printed before each report.
    """
    # Imported here so the function does not rely on imports made elsewhere.
    from sklearn import tree
    from sklearn.metrics import ConfusionMatrixDisplay, classification_report, confusion_matrix

    class_names = ["No Disease", "Disease"]
    max_depth = 2

    for test_size in split_ratios:
        test_pct = round(test_size * 100)
        train_pct = 100 - test_pct

        _feature_train, _feature_test, _label_train, _label_test = dataset_splits[test_size]
        dtc = train_and_evaluate(_feature_train, _label_train, max_depth=max_depth)

        # Draw the tree fitted for THIS split, with this split's feature names.
        plt.figure(dpi=1200)
        tree.plot_tree(
            dtc,
            rounded=True,
            filled=True,
            feature_names=list(_feature_train.columns),
            class_names=class_names,
        )
        plt.show()

        # Predict the labels of the held-out features with the fitted tree.
        _label_predict = dtc.predict(_feature_test)
        print(f"\n=== {dataset_name} Train/Test {train_pct}/{test_pct} ===")
        print(classification_report(_label_test, _label_predict, target_names=class_names))

        cm = confusion_matrix(_label_test, _label_predict)
        disp = ConfusionMatrixDisplay(confusion_matrix=cm, display_labels=class_names)
        _, ax = plt.subplots(figsize=(8, 6))
        disp.plot(ax=ax, cmap='Blues')
        plt.title(f"Confusion Matrix (Depth={max_depth}, {train_pct}/{test_pct})")
        plt.yticks(rotation=90)
        plt.show()


run_all_splits("Heart Disease")

In [None]:


from sklearn import tree
from sklearn.tree import DecisionTreeClassifier
import matplotlib.pyplot as plt

# Binary labels as a plain list: 1 where 'num' is above 0, otherwise 0.
label = [int(x > 0) for x in dataset['target']['num']]
print(len(label))

# 80/20 train/test split, seeded with the shared seed so reruns give the same split.
feature_train, feature_test, label_train, label_test = prepare_dataset(
    dataset['feature'], label, test_size=0.2, seed=random_seed
)
print(len(feature_train), len(feature_test), len(label_train), len(label_test))

# Depth-2 entropy tree, using the same seed as the rest of the notebook.
dt = DecisionTreeClassifier(max_depth=2, criterion='entropy', random_state=random_seed)
dt.fit(feature_train, label_train)

# Draw the fitted tree; feature names come from the training DataFrame's columns.
plt.figure(dpi=1200)
tree.plot_tree(
    dt,
    rounded=True,
    filled=True,
    feature_names=list(feature_train.columns),
    class_names=["No Disease", "Disease"],
)
plt.show()

In [None]:
from sklearn.metrics import ConfusionMatrixDisplay, confusion_matrix, classification_report

def plot_confusion(y_true, y_pred, title, class_names=("No Disease", "Disease")):
    """
    Print a classification report and show the confusion matrix as a plot.

    :param y_true: True labels.
    :param y_pred: Predicted labels.
    :param title: Title placed on the confusion-matrix figure.
    :param class_names: Display names for the classes, used in both the
        report and the plot.
    """
    names = list(class_names)
    print(classification_report(y_true, y_pred, target_names=names))

    cm = confusion_matrix(y_true, y_pred)
    disp = ConfusionMatrixDisplay(confusion_matrix=cm, display_labels=names)
    _, ax = plt.subplots(figsize=(8, 6))
    disp.plot(ax=ax, cmap='Blues_r')
    plt.title(title)
    plt.yticks(rotation=90)
    plt.show()

# Predict on the held-out features, then report and plot the result.
y_pred = dt.predict(feature_test)
plot_confusion(label_test, y_pred, 'Confusion Matrix')

# 1.4