1- We will start by data preparation and processing

In [19]:
from google.colab import drive
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [21]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.utils.class_weight import compute_class_weight
from sklearn.compose import ColumnTransformer

path = "/content/drive/MyDrive/heart.csv"
df = pd.read_csv(path)
print(df.head())
X = df.drop(columns=['HeartDisease'])
y = df['HeartDisease']
np.random.seed(42)
# Stratified Split: 70% Train, 10% Validation, 20% Test
X_train, X_temp, y_train, y_temp = train_test_split(X, y, test_size=0.3, stratify=y, random_state=42)
X_val, X_test, y_val, y_test = train_test_split(X_temp, y_temp, test_size=2/3, stratify=y_temp, random_state=42)


categorical_features = ['Sex', 'ChestPainType', 'RestingECG', 'ExerciseAngina', 'ST_Slope']

preprocessor = ColumnTransformer(
    transformers=[
        ('num', StandardScaler(), [col for col in X_train.columns if col not in categorical_features]), # Apply StandardScaler to numerical features
        ('cat', OneHotEncoder(sparse_output=False, handle_unknown='ignore'), categorical_features), # Apply OneHotEncoder to categorical features, sparse=False for numpy array
    ])

# Fit and transform the data
X_train = preprocessor.fit_transform(X_train)
X_val = preprocessor.transform(X_val)
X_test = preprocessor.transform(X_test)

print("Data prepared: Train =", len(y_train), ", Val =", len(y_val), ", Test =", len(y_test))


   Age Sex ChestPainType  RestingBP  Cholesterol  FastingBS RestingECG  MaxHR  \
0   40   M           ATA        140          289          0     Normal    172   
1   49   F           NAP        160          180          0     Normal    156   
2   37   M           ATA        130          283          0         ST     98   
3   48   F           ASY        138          214          0     Normal    108   
4   54   M           NAP        150          195          0     Normal    122   

  ExerciseAngina  Oldpeak ST_Slope  HeartDisease  
0              N      0.0       Up             0  
1              N      1.0     Flat             1  
2              N      0.0       Up             0  
3              Y      1.5     Flat             1  
4              N      0.0       Up             0  
Data prepared: Train = 642 , Val = 92 , Test = 184


**2 - Implementation**

We will implement 3 different classifiers and compare between their results

**2.1 - Decision Tree**

In [22]:
class Node:
    def __init__(self,left=None,right=None,feature=None,threshold=None,*,value=None):
        self.left=left
        self.right=right
        self.value=value
        self.feature=feature
        self.threshold=threshold

    def is_leaf(self):
        return self.value is not None



In [28]:
from collections import Counter

class DecisionTree:
    def __init__(self,max_depth=100,min_samples_split=2):
        self.root=None
        self.max_depth=max_depth
        self.min_samples_split=min_samples_split

    def fit(self,x,y):
        self.n_feautures=x.shape[1]
        self.root=self.grow_tree(x,y)

    def grow_tree(self,x,y,depth=0):
        n_samples,n_features=x.shape
        n_labels=len(np.unique(y))

        #check the stopping criteria
        if(depth>=self.max_depth or n_labels==1 or n_samples<self.min_samples_split):
            leaf_value=self.most_common_label(y)
            return Node(value=leaf_value)
        #find the best split
        feat_indxs= np.random.choice(n_features,self.n_feautures,replace=False)
        best_feat,best_thresh=self.best_split(x,y,feat_indxs)

        #create child nodes
        left_idxs,right_idxs=self.split(x[:,best_feat],best_thresh)
        left=self.grow_tree(x[left_idxs,:],y[left_idxs],depth+1)
        right=self.grow_tree(x[right_idxs,:],y[right_idxs],depth+1)
        return Node(left,right,best_feat,best_thresh)

    def best_split(self,x,y,feat_indxs):
        best_gain=-1
        best_feat,best_thresh=None,None

        for feat_index in feat_indxs:
            x_column=x[:,feat_index]
            thresholds=np.unique(x_column)
            for thr in thresholds:
                gain=self.information_gain(y,x_column,thr)

                if gain>best_gain:
                    best_gain=gain
                    best_feat=feat_index
                    best_thresh=thr

        return best_feat,best_thresh

    def information_gain(self,y,x_column,threshold):
        #parent entropy
        parent_entropy=self.entropy(y)
        #create children
        left_idx,right_idx=self.split(x_column,threshold)
        if len(left_idx)==0 or len(right_idx)==0:
            return 0
        #calculate the weighted avg.entropy of the children
        n=len(y)
        n_l,n_r=len(left_idx),len(right_idx)
        e_l,e_r=self.entropy(y[left_idx]),self.entropy(y[right_idx])
        child_entropy=(n_l/n)*e_l+(n_r/n)*e_r
        #calculate the information gain
        ig=parent_entropy-child_entropy
        return ig

    def split(self,x_column,threshold):
        left_idxs=np.argwhere(x_column<=threshold).flatten()
        right_idxs=np.argwhere(x_column>threshold).flatten()
        return left_idxs,right_idxs

    def entropy(self,y):
        hist= np.bincount(y)
        ps= hist/len(y)
        return -np.sum([p*np.log2(p) for p in ps if p>0])

    def most_common_label(self,y):
        counter= Counter(y)
        most_common=counter.most_common(1)[0][0]
        return most_common

    def predict(self,x):
        return np.array([self.traverse_tree(xi,self.root) for xi in x])

    def traverse_tree(self,x,node):
        if node.is_leaf():
            return node.value
        if x[node.feature]<=node.threshold:
            return self.traverse_tree(x,node.left)
        return self.traverse_tree(x,node.right)
    def accuracy(self,y_pred,y_test):
        return np.sum(y_pred==y_test)/len(y_test)

Then we will train the decision tree on different depth values and see the scores

In [35]:

best_depth = None
best_acc = 0
for depth in range(1, 101, 10):  # Example tuning loop
    clf = DecisionTree(max_depth=depth)
    clf.fit(X_train, y_train.to_numpy())
    val_pred = clf.predict(X_val)
    val_acc = clf.accuracy(val_pred, y_val.to_numpy())
    if val_acc > best_acc:
        best_acc = val_acc
        best_depth = depth

clf=DecisionTree(best_depth)
clf.fit(np.vstack((X_train,X_val)), np.hstack((y_train,y_val)))
test_pred = clf.predict(X_test)
test_acc = clf.accuracy(test_pred, y_test.to_numpy())

dt_f1 = f1_score(y_test, test_pred)

print(f"DT Accuracy: {test_acc:.4f}, F1-Score: {dt_f1:.4f}")
print("Confusion Matrix:\n", confusion_matrix(y_test, test_pred))


(642,) (92,)
DT Accuracy: 0.8261, F1-Score: 0.8351
Confusion Matrix:
 [[71 11]
 [21 81]]


**3.2 - Bagging**

In [38]:
class Bagging:
    def __init__(self, base_learner, n_estimators, sample_size=0.8):
        self.base_learner = base_learner
        self.n_estimators = n_estimators
        self.sample_size = sample_size

    def fit(self,x,y):
        n_samples=int(self.sample_size*len(x))
        self.models=[]
        for i in range(self.n_estimators):
            indxs=np.random.choice(len(x),n_samples,replace=True)
            x_sample,y_sample=x[indxs],y[indxs]
            model = DecisionTree()
            model.fit(x_sample,y_sample)
            self.models.append(model)

    def predict(self,x):
        predictions=np.array([model.predict(x) for model in self.models])
        return np.apply_along_axis(lambda x: np.bincount(x).argmax(), axis=0, arr=predictions)
    def accuracy(self,y_pred,y_test):
        return np.sum(y_pred==y_test)/len(y_test)

Then we will train the bagging algorithm and calculate the accuracy and score


In [39]:
best_estimators = None
best_acc = 0
for estimators in range(10, 20, 30):
    clf = Bagging(base_learner=DecisionTree(), n_estimators=estimators)
    clf.fit(X_train, y_train.to_numpy())
    val_pred = clf.predict(X_val)
    val_acc = clf.accuracy(val_pred, y_val.to_numpy())
    if val_acc > best_acc:
        best_acc = val_acc
        best_estimators = estimators

clf=Bagging(base_learner=DecisionTree(), n_estimators=best_estimators)
clf.fit(np.vstack((X_train,X_val)), np.hstack((y_train,y_val)))
test_pred = clf.predict(X_test)
bagging_acc = clf.accuracy(test_pred, y_test.to_numpy())

bagging_f1 = f1_score(y_test, test_pred)

print(f"Bagging Accuracy: {bagging_acc:.4f}, F1-Score: {bagging_f1:.4f}")
print("Confusion Matrix:\n", confusion_matrix(y_test, test_pred))

Bagging Accuracy: 0.8696, F1-Score: 0.8763
Confusion Matrix:
 [[75  7]
 [17 85]]


Then finally, We will implement the ADAboost classifier

In [40]:
class decisionStump:
    def __init__(self):
        self.polarity = 1
        self.feature_index = None
        self.threshold = None
        self.alpha = None

    def predict(self,X):
        n_samples = X.shape[0]
        X_column = X[:,self.feature_index]
        predictions = np.ones(n_samples)
        if self.polarity == 1:
            predictions[X_column < self.threshold]= -1
        else:
            predictions[X_column > self.threshold]= -1
        return predictions

In [42]:
class adaboost:
    def __init__(self,n_clf=50):
        self.n_clf = n_clf

    def fit(self,X,y):
        n_samples,n_features = X.shape
        #initialize weights
        w = np.full(n_samples, (1/n_samples))
        self.clfs = []

        for i in range(self.n_clf):
            clf= decisionStump()
            min_error = float('inf')
            for feature_i in range(n_features):
                X_column =X[:,feature_i]
                thresholds = np.unique(X_column)
                for threshold in (thresholds):
                    p=1
                    predictions = np.ones(n_samples)
                    predictions[X_column < threshold] = -1
                    misclassified = w[y!=predictions]
                    error = sum(misclassified)
                    if error > 0.5:
                        error = 1-error
                        p = -1

                    if error < min_error:
                        min_error = error
                        clf.polarity = p
                        clf.threshold = threshold
                        clf.feature_index = feature_i
            eps=1e-10
            clf.alpha = 0.5*np.log((1.0-min_error)/(min_error+eps))

            predictions = clf.predict(X)

            w *= np.exp(-clf.alpha*y*predictions)
            w /= np.sum(w)

            self.clfs.append(clf)

    def predict(self,X):
        clf_preds = [clf.alpha*clf.predict(X) for clf in self.clfs]
        ypred = np.sum(clf_preds,axis=0)
        return np.sign(ypred)
    def accuracy(self,y_pred,y_test):
        return np.sum(y_pred==y_test)/len(y_test)


And again, the training loop.

In [43]:
best_estimators = None
best_acc = 0
for estimators in range(10, 20, 30):
    clf = adaboost(n_clf=estimators)
    clf.fit(X_train,  np.where(y_train==0,-1,1))
    val_pred = clf.predict(X_val)
    val_pred = np.where(val_pred==-1,0,1)
    val_acc = clf.accuracy(val_pred, y_val.to_numpy())
    if val_acc > best_acc:
        best_acc = val_acc
        best_estimators = estimators

clf=adaboost(best_estimators)
temp =  np.hstack((y_train,y_val))
clf.fit(np.vstack((X_train,X_val)),np.where(temp==0,-1,1))
test_pred = clf.predict(X_test)
test_pred = np.where(test_pred==-1,0,1)
ada_acc = clf.accuracy(test_pred, y_test.to_numpy())

ada_f1 = f1_score(y_test, test_pred)

print(f"Adaboost Accuracy: {ada_acc:.4f}, F1-Score: {ada_f1:.4f}")
print("Confusion Matrix:\n", confusion_matrix(y_test, test_pred))

Adaboost Accuracy: 0.8478, F1-Score: 0.8557
Confusion Matrix:
 [[73  9]
 [19 83]]


4-Bonus part

We will Start by preparing the data and splitting it into train, validation and test

In [None]:
from google.colab import drive
drive.mount('/content/drive')


Mounted at /content/drive


In [10]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.utils.class_weight import compute_class_weight
from sklearn.compose import ColumnTransformer

path = "/content/drive/MyDrive/heart.csv"
df = pd.read_csv(path)
print(df.head())

   Age Sex ChestPainType  RestingBP  Cholesterol  FastingBS RestingECG  MaxHR  \
0   40   M           ATA        140          289          0     Normal    172   
1   49   F           NAP        160          180          0     Normal    156   
2   37   M           ATA        130          283          0         ST     98   
3   48   F           ASY        138          214          0     Normal    108   
4   54   M           NAP        150          195          0     Normal    122   

  ExerciseAngina  Oldpeak ST_Slope  HeartDisease  
0              N      0.0       Up             0  
1              N      1.0     Flat             1  
2              N      0.0       Up             0  
3              Y      1.5     Flat             1  
4              N      0.0       Up             0  


Here comes the splitting and standardizing

In [11]:

X = df.drop(columns=['HeartDisease'])
y = df['HeartDisease']

# Stratified Split: 70% Train, 10% Validation, 20% Test
X_train, X_temp, y_train, y_temp = train_test_split(X, y, test_size=0.3, stratify=y, random_state=42)
X_val, X_test, y_val, y_test = train_test_split(X_temp, y_temp, test_size=2/3, stratify=y_temp, random_state=42)


categorical_features = ['Sex', 'ChestPainType', 'RestingECG', 'ExerciseAngina', 'ST_Slope']

preprocessor = ColumnTransformer(
    transformers=[
        ('num', StandardScaler(), [col for col in X_train.columns if col not in categorical_features]), # Apply StandardScaler to numerical features
        ('cat', OneHotEncoder(sparse_output=False, handle_unknown='ignore'), categorical_features), # Apply OneHotEncoder to categorical features, sparse=False for numpy array
    ])

# Fit and transform the data
X_train = preprocessor.fit_transform(X_train)
X_val = preprocessor.transform(X_val)
X_test = preprocessor.transform(X_test)

print("Data prepared: Train =", len(y_train), ", Val =", len(y_val), ", Test =", len(y_test))


Data prepared: Train = 642 , Val = 92 , Test = 184


After preparing and preprocessing the non-numerical value using one-hot encoding, we will begin by training a KNN model using different values for k.

In [15]:
from sklearn.neighbors import KNeighborsClassifier
from sklearn.metrics import accuracy_score, f1_score, confusion_matrix

best_k = None
best_val = 0

for k in [3, 5, 7, 11,13,15]:
    knn = KNeighborsClassifier(n_neighbors=k)
    knn.fit(X_train, y_train)
    val = accuracy_score(y_val, knn.predict(X_val))

    print(f"K={k}, accuracy: {val:.4f}")

    if val > best_val:
        best_val = val
        best_k = k

knn_final = KNeighborsClassifier(n_neighbors=best_k)
knn_final.fit(np.vstack((X_train, X_val)), np.hstack((y_train, y_val)))

y_pred_knn = knn_final.predict(X_test)
knn_acc = accuracy_score(y_test, y_pred_knn)
knn_f1 = f1_score(y_test, y_pred_knn)

print(f"KNN Accuracy: {knn_acc:.4f}, F1-Score: {knn_f1:.4f}")
print("Confusion Matrix:\n", confusion_matrix(y_test, y_pred_knn))


K=3, accuracy: 0.8696
K=5, accuracy: 0.8913
K=7, accuracy: 0.9130
K=11, accuracy: 0.9348
K=13, accuracy: 0.9022
K=15, accuracy: 0.9130
KNN Accuracy: 0.8967, F1-Score: 0.9073
Confusion Matrix:
 [[72 10]
 [ 9 93]]


Next, We are implementing Logistic Regression

In [16]:
from sklearn.linear_model import LogisticRegression

best_C = None
best_val = 0

for C in [0.01, 0.1, 1, 10]:
    log_reg = LogisticRegression(C=C, max_iter=1000, class_weight='balanced', random_state=42)
    log_reg.fit(X_train, y_train)
    val = accuracy_score(y_val, log_reg.predict(X_val))

    print(f"C={C}, Accuracy: {val:.4f}")

    if val > best_val:
        best_val = val
        best_C = C

log_reg_final = LogisticRegression(C=best_C, max_iter=1000, class_weight='balanced', random_state=42)
log_reg_final.fit(np.vstack((X_train, X_val)), np.hstack((y_train, y_val)))

y_pred_log = log_reg_final.predict(X_test)
log_acc = accuracy_score(y_test, y_pred_log)
log_f1 = f1_score(y_test, y_pred_log)

print(f"Logistic Regression Accuracy: {log_acc:.4f}, F1-Score: {log_f1:.4f}")
print("Confusion Matrix:\n", confusion_matrix(y_test, y_pred_log))


C=0.01, Accuracy: 0.9022
C=0.1, Accuracy: 0.9022
C=1, Accuracy: 0.9022
C=10, Accuracy: 0.8913
Logistic Regression Test Accuracy: 0.8641, F1-Score: 0.8744
Confusion Matrix:
 [[72 10]
 [15 87]]


And finally, We get to the Feedforward NN

In [17]:
import torch
import torch.nn as nn
import torch.optim as optim
from torch.utils.data import TensorDataset, DataLoader

X_train_tensor = torch.tensor(X_train, dtype=torch.float32)
y_train_tensor = torch.tensor(y_train.values, dtype=torch.float32).reshape(-1, 1)
X_val_tensor = torch.tensor(X_val, dtype=torch.float32)
y_val_tensor = torch.tensor(y_val.values, dtype=torch.float32).reshape(-1, 1)
X_test_tensor = torch.tensor(X_test, dtype=torch.float32)
y_test_tensor = torch.tensor(y_test.values, dtype=torch.float32).reshape(-1, 1)

class FNN(nn.Module):
    def __init__(self, input_size):
        super(FNN, self).__init__()
        self.fc1 = nn.Linear(input_size, 32)
        self.relu = nn.ReLU()
        self.fc2 = nn.Linear(32, 1)
        self.sigmoid = nn.Sigmoid()

    def forward(self, x):
        x = self.relu(self.fc1(x))
        x = self.sigmoid(self.fc2(x))
        return x

model = FNN(X_train.shape[1])
criterion = nn.BCELoss()
optimizer = optim.Adam(model.parameters(), lr=0.01)

for epoch in range(500):  # Train for 100 epochs
    optimizer.zero_grad()
    outputs = model(X_train_tensor)
    loss = criterion(outputs, y_train_tensor)
    loss.backward()
    optimizer.step()

    if epoch % 50 == 0:
        print(f"Epoch {epoch}, Loss: {loss.item():.4f}")

with torch.no_grad():
    y_pred_fnn = model(X_test_tensor).numpy().round()
    fnn_acc = accuracy_score(y_test, y_pred_fnn)
    fnn_f1 = f1_score(y_test, y_pred_fnn)

print(f"FNN Test Accuracy: {fnn_acc:.4f}, F1-Score: {fnn_f1:.4f}")
print("Confusion Matrix:\n", confusion_matrix(y_test, y_pred_fnn))


Epoch 0, Loss: 0.7321
Epoch 50, Loss: 0.2943
Epoch 100, Loss: 0.2397
Epoch 150, Loss: 0.2016
Epoch 200, Loss: 0.1588
Epoch 250, Loss: 0.1193
Epoch 300, Loss: 0.0882
Epoch 350, Loss: 0.0651
Epoch 400, Loss: 0.0486
Epoch 450, Loss: 0.0355
FNN Test Accuracy: 0.8315, F1-Score: 0.8410
Confusion Matrix:
 [[71 11]
 [20 82]]


And here is the comparison between the three of them:

In [18]:
print(f"KNN - Accuracy: {knn_acc:.4f}, F1-Score: {knn_f1:.4f}")
print(f"Logistic Regression - Accuracy: {log_acc:.4f}, F1-Score: {log_f1:.4f}")
print(f"FNN - Accuracy: {fnn_acc:.4f}, F1-Score: {fnn_f1:.4f}")

KNN - Accuracy: 0.8967, F1-Score: 0.9073
Logistic Regression - Accuracy: 0.8641, F1-Score: 0.8744
FNN - Accuracy: 0.8315, F1-Score: 0.8410
