MLDL week 3 onwards

In [1]:
# DECISION_TREE_MODEL.ipynb - Part 1

## Imports
import numpy as np
import pandas as pd
from collections import Counter
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score
from sklearn.model_selection import KFold  # For cross-validation/hyperparameter tuning
from sklearn.tree import DecisionTreeClassifier
from sklearn.model_selection import GridSearchCV

## Load Cleaned Data
# Both CSVs are read from the current working directory.
train_cleaned = pd.read_csv("cardio_train_cleaned.csv")
test_cleaned = pd.read_csv("cardio_test_cleaned.csv")

# NOTE(review): the rendered frames below show an "Unnamed: 0" header. If that
# is a real column (an index written by to_csv) rather than an export artifact,
# it is being fed to the models as a feature -- confirm and drop it if so.

# The frames mix boolean dummy columns (cholesterol_*, gluc_*) with float/int
# columns, so a plain `.values` yields an object-dtype array. Casting to float
# keeps the feature matrices numeric; True/False become 1.0/0.0, which leaves
# every `<=` / `>` comparison unchanged.
X_train = train_cleaned.drop('cardio', axis=1).to_numpy(dtype=float)
y_train = train_cleaned['cardio'].to_numpy()
X_test = test_cleaned.drop('cardio', axis=1).to_numpy(dtype=float)
y_test = test_cleaned['cardio'].to_numpy()


In [2]:
# Show the cleaned training DataFrame as this cell's output.
train_cleaned

Unnamed: 0,age,gender,height,weight,ap_hi,ap_lo,smoke,alco,active,cholesterol_2,cholesterol_3,gluc_2,gluc_3,cardio
0,0.997653,0,0.072978,-0.640172,-0.401468,-0.140036,0,0,1,False,False,False,False,1
1,-0.335337,0,-0.310332,3.735009,0.198765,-0.140036,0,0,1,False,False,False,False,0
2,0.553323,0,0.072978,-0.428470,1.399229,-0.140036,0,0,1,False,False,False,False,1
3,-1.372106,0,-1.076953,-0.993009,-1.001700,-1.202733,0,0,1,False,False,False,False,1
4,0.701433,0,-1.843574,-2.192656,-1.001700,-1.202733,0,0,1,True,False,False,False,1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
54795,-1.075886,1,-0.310332,-0.216768,-0.401468,-0.140036,0,0,1,False,False,False,False,0
54796,-1.520216,1,2.117301,2.182525,0.318811,1.135201,1,0,1,True,False,False,True,0
54797,-0.927776,1,1.350680,0.559474,4.400392,3.048057,1,0,1,False,False,False,False,1
54798,-1.075886,0,-0.310332,0.418339,1.999462,-0.140036,1,0,1,False,False,False,False,1


In [5]:
# Show the cleaned test DataFrame as this cell's output.
test_cleaned

Unnamed: 0,age,gender,height,weight,ap_hi,ap_lo,smoke,alco,active,cholesterol_2,cholesterol_3,gluc_2,gluc_3,cardio
0,-0.483447,1,-0.310332,-0.569605,-0.401468,-0.140036,1,1,1,False,True,False,True,0
1,-0.187227,0,-1.588034,0.136070,-0.401468,-0.140036,0,0,1,False,True,False,True,1
2,-0.187227,0,0.584059,-0.216768,-0.401468,-0.140036,0,0,0,False,False,False,False,0
3,-0.483447,0,-0.565873,-1.063577,-1.001700,-1.202733,0,0,1,False,False,False,False,0
4,-0.631557,0,-0.438102,-1.769251,-1.601933,-1.202733,0,0,0,False,False,False,False,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
13696,0.108993,1,0.456289,0.136070,-0.401468,0.391313,0,0,1,False,True,False,False,1
13697,0.997653,1,0.328519,0.065502,0.198765,-0.140036,0,0,1,False,False,False,False,0
13698,0.108993,1,1.222910,2.535362,1.399229,1.985359,1,1,1,False,False,False,False,1
13699,-0.187227,0,-0.054792,-0.993009,-0.401468,-0.140036,0,0,1,False,False,False,False,0


In [5]:
# DECISION_TREE_MODEL.ipynb - Part 2

def calculate_gini_impurity(y):
    """Return the Gini impurity of the label collection ``y``.

    The impurity is ``1 - sum(p_k ** 2)`` where ``p_k`` is the fraction of
    entries carrying label ``k``. An empty collection yields ``0``.
    """
    total = len(y)
    if total == 0:
        return 0

    # Accumulate the squared class frequencies in first-seen label order.
    squared_sum = 0
    for occurrences in Counter(y).values():
        squared_sum += (occurrences / total) ** 2
    return 1.0 - squared_sum

def calculate_information_gain(parent_y, left_y, right_y):
    """Return the drop in Gini impurity achieved by splitting a parent into two children.

    The gain is the parent's Gini impurity minus the size-weighted average of
    the children's impurities, where each weight is the child's share of the
    parent's sample count.
    """
    total = len(parent_y)
    left_weight = len(left_y) / total
    right_weight = len(right_y) / total

    children_impurity = left_weight * calculate_gini_impurity(left_y) + \
                        right_weight * calculate_gini_impurity(right_y)
    return calculate_gini_impurity(parent_y) - children_impurity


In [7]:
# DECISION_TREE_MODEL.ipynb - Part 3

class Node:
    """One vertex of the decision tree.

    An internal node carries a split rule (``feature_idx``, ``threshold``) and
    two children (``left``, ``right``); a leaf carries only ``value``.
    """
    def __init__(self, feature_idx=None, threshold=None, left=None, right=None, value=None):
        # Split rule used when routing a sample through this node.
        self.feature_idx, self.threshold = feature_idx, threshold
        # Child subtrees.
        self.left, self.right = left, right
        # Stored prediction; remains None on internal nodes.
        self.value = value

    def is_leaf(self):
        """Report whether this node stores a prediction (``value`` is set)."""
        return not (self.value is None)

class CustomDecisionTree:
    """Decision Tree Classifier implemented from scratch.

    Splits are binary tests of the form ``feature <= threshold`` chosen to
    maximise ``calculate_information_gain``. Growth stops when a node is pure,
    when ``max_depth`` is reached, when a node holds fewer than
    ``min_samples_split`` samples, or when no threshold separates the samples.

    Parameters
    ----------
    max_depth : int, default 5
        Depth at which nodes are turned into leaves (the root has depth 0).
    min_samples_split : int, default 2
        Minimum number of samples a node needs to be considered for splitting.

    Attributes
    ----------
    root : Node or None
        Root of the fitted tree; ``None`` until ``fit`` has been called.
    """
    def __init__(self, max_depth=5, min_samples_split=2):
        self.max_depth = max_depth
        self.min_samples_split = min_samples_split
        self.root = None

    @staticmethod
    def _majority_leaf(y):
        """Return a leaf predicting the most frequent label in ``y``."""
        return Node(value=Counter(y).most_common(1)[0][0])

    def _split_data(self, X, y, feature_idx, threshold):
        """Partition ``X``/``y`` into rows with ``feature <= threshold`` and ``feature > threshold``.

        Returns ``(X_left, y_left, X_right, y_right)``.
        """
        column = X[:, feature_idx]
        # Two explicit comparisons (not a negated mask) so that values that
        # compare False both ways, such as NaN, land in neither partition.
        left_mask = column <= threshold
        right_mask = column > threshold
        return X[left_mask], y[left_mask], X[right_mask], y[right_mask]

    def _find_best_split(self, X, y):
        """Search every feature/threshold pair for the highest-gain split.

        Returns ``(best_split, best_gain)``. ``best_split`` is a dict with keys
        ``feature_idx``, ``threshold``, ``X_left``, ``y_left``, ``X_right`` and
        ``y_right``, or ``None`` when no threshold yields two non-empty sides
        (in which case ``best_gain`` is ``-1``). Ties keep the first candidate
        encountered (lowest feature index, then lowest threshold).
        """
        best_gain = -1
        best_feature = None
        best_threshold = None

        for feature_idx in range(X.shape[1]):
            column = X[:, feature_idx]
            # The largest unique value always sends every sample left, so it
            # can never produce a valid split; skip it.
            for threshold in np.unique(column)[:-1]:
                # Only the labels are needed to score a candidate; the feature
                # rows are partitioned once, for the winning split, below.
                y_left = y[column <= threshold]
                y_right = y[column > threshold]
                if len(y_left) == 0 or len(y_right) == 0:
                    continue
                gain = calculate_information_gain(y, y_left, y_right)
                if gain > best_gain:
                    best_gain = gain
                    best_feature = feature_idx
                    best_threshold = threshold

        if best_feature is None:
            return None, best_gain

        X_left, y_left, X_right, y_right = self._split_data(X, y, best_feature, best_threshold)
        best_split = {
            'feature_idx': best_feature,
            'threshold': best_threshold,
            'X_left': X_left, 'y_left': y_left,
            'X_right': X_right, 'y_right': y_right
        }
        return best_split, best_gain

    def _build_tree(self, X, y, depth=0):
        """Recursively grow the subtree for samples ``X``/``y`` located at ``depth``."""
        n_samples = X.shape[0]

        # Pure node: every sample already shares one label.
        if len(np.unique(y)) == 1:
            return Node(value=y[0])
        # Pre-pruning limits.
        if depth >= self.max_depth or n_samples < self.min_samples_split:
            return self._majority_leaf(y)

        best_split, best_gain = self._find_best_split(X, y)
        # No usable split (e.g. all feature values identical in this node).
        if best_split is None or best_gain < 0:
            return self._majority_leaf(y)

        left_child = self._build_tree(best_split['X_left'], best_split['y_left'], depth + 1)
        right_child = self._build_tree(best_split['X_right'], best_split['y_right'], depth + 1)

        return Node(
            feature_idx=best_split['feature_idx'],
            threshold=best_split['threshold'],
            left=left_child,
            right=right_child
        )

    def fit(self, X, y):
        """Grow the tree from training data.

        Parameters
        ----------
        X : array-like of shape (n_samples, n_features)
            Feature matrix; converted with ``np.asarray``.
        y : array-like of shape (n_samples,)
            Class labels; converted with ``np.asarray``.

        Returns
        -------
        CustomDecisionTree
            ``self``, so calls can be chained.

        Raises
        ------
        ValueError
            If ``X`` is not 2-D, ``y`` is not 1-D, their lengths differ, or
            there are no samples.
        """
        X = np.asarray(X)
        y = np.asarray(y)
        if X.ndim != 2:
            raise ValueError(f"X must be 2-dimensional, got {X.ndim} dimension(s).")
        if y.ndim != 1:
            raise ValueError(f"y must be 1-dimensional, got {y.ndim} dimension(s).")
        if X.shape[0] != y.shape[0]:
            raise ValueError(
                f"X and y have inconsistent lengths: {X.shape[0]} != {y.shape[0]}."
            )
        if X.shape[0] == 0:
            raise ValueError("Cannot fit a decision tree on an empty dataset.")

        self.root = self._build_tree(X, y)
        return self

    def _traverse_tree(self, x, node):
        """Route one sample ``x`` from ``node`` down to a leaf and return that leaf's value."""
        while not node.is_leaf():
            if x[node.feature_idx] <= node.threshold:
                node = node.left
            else:
                node = node.right
        return node.value

    def predict(self, X):
        """Predict a label for every row of ``X``.

        Parameters
        ----------
        X : array-like of shape (n_samples, n_features)

        Returns
        -------
        numpy.ndarray
            One predicted label per row.

        Raises
        ------
        RuntimeError
            If the tree has not been fitted yet.
        """
        if self.root is None:
            raise RuntimeError(
                "CustomDecisionTree is not fitted yet; call fit() before predict()."
            )
        return np.array([self._traverse_tree(row, self.root) for row in np.asarray(X)])


In [9]:
# DECISION_TREE_MODEL.ipynb - Part 4

# Fit the from-scratch tree on the training split
dt_custom = CustomDecisionTree(max_depth=5, min_samples_split=20)
dt_custom.fit(X_train, y_train)

# Score both splits up front; everything below is reporting only
y_pred = dt_custom.predict(X_test)
y_train_pred = dt_custom.predict(X_train)
train_accuracy = accuracy_score(y_train, y_train_pred)

# Held-out metrics
print("## Model Evaluation (Custom Decision Tree) ##")
print(f"Accuracy: {accuracy_score(y_test, y_pred):.4f}")
print(f"Precision: {precision_score(y_test, y_pred):.4f}")
print(f"Recall: {recall_score(y_test, y_pred):.4f}")
print(f"F1 Score: {f1_score(y_test, y_pred):.4f}")

# Train vs. test accuracy side by side: a large gap points to overfitting,
# two low scores point to underfitting
print("\n## Overfitting/Underfitting Check ##")
print(f"Training Accuracy: {train_accuracy:.4f}")
print(f"Test Accuracy: {accuracy_score(y_test, y_pred):.4f}")


## Model Evaluation (Custom Decision Tree) ##
Accuracy: 0.7279
Precision: 0.7741
Recall: 0.6349
F1 Score: 0.6976

## Overfitting/Underfitting Check ##
Training Accuracy: 0.7317
Test Accuracy: 0.7279


In [10]:
# DECISION_TREE_MODEL.ipynb - Part 5

# Baseline scikit-learn tree; random_state pins tie-breaking between equally good splits.
dt_sk = DecisionTreeClassifier(random_state=42)

# 5 depths x 4 split sizes x 1 criterion = 20 candidates.
param_grid = {
    'max_depth': [3, 5, 7, 9, 11],
    'min_samples_split': [10, 50, 100, 200],
    'criterion': ['gini']
}

grid_search = GridSearchCV(
    estimator=dt_sk,
    param_grid=param_grid,
    cv=3,
    scoring='accuracy',
    n_jobs=-1,
    verbose=1
)

# Fit and score on DataFrames with identical columns. The estimator records
# the training column names, so predicting on a bare NumPy array would trigger
# scikit-learn's "X does not have valid feature names" warning and silently
# depend on positional column order.
X_train_full = train_cleaned.drop('cardio', axis=1)
y_train_full = train_cleaned['cardio']
X_test_full = test_cleaned.drop('cardio', axis=1)

grid_search.fit(X_train_full, y_train_full)

print("\n## Hyperparameter Tuning Results ##")
print(f"Best Parameters: {grid_search.best_params_}")
print(f"Best Cross-Validation Score: {grid_search.best_score_:.4f}")

# best_estimator_ is the winning configuration refit on the full training set.
best_dt = grid_search.best_estimator_
y_test_tuned_pred = best_dt.predict(X_test_full)
print(f"Test Accuracy (Tuned Model): {accuracy_score(y_test, y_test_tuned_pred):.4f}")


Fitting 3 folds for each of 20 candidates, totalling 60 fits

## Hyperparameter Tuning Results ##
Best Parameters: {'criterion': 'gini', 'max_depth': 5, 'min_samples_split': 50}
Best Cross-Validation Score: 0.7307
Test Accuracy (Tuned Model): 0.7279




In [11]:
import joblib

# Persist the tuned tree next to the notebook; dump() returns the list of
# written file names, which becomes this cell's output.
joblib.dump(value=best_dt, filename='cardio_model.pkl')

['cardio_model.pkl']