In [None]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from sklearn.svm import SVC
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import recall_score
from sklearn.svm import SVC
from sklearn.tree import DecisionTreeClassifier
from xgboost import XGBClassifier
import pickle
# Uncomment these on first run to be able to access src
# import sys
# sys.path.append("../src")
from src.utils import load_or_download_data
from src.features.preprocess_data import DataProcessor
from src.models.train_model import ModelTrainer, HyperparameterTuner

%load_ext autoreload
%autoreload 2


The autoreload extension is already loaded. To reload it, use:
  %reload_ext autoreload


In [None]:
# import sys
# print(sys.path)


['/home/lqueiros/.local/share/uv/python/cpython-3.12.7-linux-x86_64-gnu/lib/python312.zip', '/home/lqueiros/.local/share/uv/python/cpython-3.12.7-linux-x86_64-gnu/lib/python3.12', '/home/lqueiros/.local/share/uv/python/cpython-3.12.7-linux-x86_64-gnu/lib/python3.12/lib-dynload', '', '/home/lqueiros/projects/ml_zoomcamp_2024_midterm_project/heart_disease_risk_prediction/.venv/lib/python3.12/site-packages', '..', '..', '..', '..', '..', '../', '../utils', '../src']


### About The Dataset :

**age**: Age of the patient

**sex**: Sex of the patient
 - 1 = Male
 
 - 0 = Female

**cp**: Chest pain type
 - 1 = Typical Angina

 - 2 = Atypical Angina

 - 3 = Non-anginal Pain

 - 4 = Asymptomatic

**trtbps**: Resting blood pressure (in mm Hg)

**chol**: Cholestoral in mg/dl fetched via BMI sensor

**fbs**: (fasting blood sugar > 120 mg/dl)
 - 1 = True

 - 0 = False

**restecg**: Resting electrocardiographic results
 - 0 = Normal 

 - 1 = ST-T wave normality

 - 2 = Left ventricular hypertrophy

**thalachh**: Maximum heart rate achieved

**oldpeak**: Previous peak. ST depression induced by exercise relative to rest

**slp**: Slope. The slope of the peak exercise ST segment
 - 0 = unsloping

 - 1 = flat

 - 2 = downsloping

**ca**: Number of major vessels ~ (0,3)

**thall**: Thalassemia. Thalium Stress Test result ~ (0,3)
 - 0 = null

 - 1 = fixed defect

 - 2 = normal

 - 3 = reversable defect

**exang**: Exercise induced angina 
 - 1 = Yes

 - 0 = No

**num**: Target variable. Diagnosis of risk for heart disease (angiographic disease status).

Original values range from 0 to 4, considering the significance of 0 to be "no risk" and 1,2,3,4 to be "at risk" with the higher the number corresponding to a higher risk. However, following the example of other researchers, a simplification is performed to change the target output to binary:

 - 0 = < 50% diameter narrowing. less chance of heart disease

 - 1 = > 50% diameter narrowing. more chance of heart disease

## Loading the data


In [56]:
# This function will look for the data in the expected folder, if not present, it will download it from its source
heart_df =  load_or_download_data()
print(heart_df.shape)
heart_df.head()

File found at ../data/raw/heart_disease_original_data.csv. Loading data...
(303, 14)


Unnamed: 0,age,sex,cp,trestbps,chol,fbs,restecg,thalach,exang,oldpeak,slope,ca,thal,num
0,63,1,1,145,233,1,2,150,0,2.3,3,0.0,6.0,0
1,67,1,4,160,286,0,2,108,1,1.5,2,3.0,3.0,2
2,67,1,4,120,229,0,2,129,1,2.6,2,2.0,7.0,1
3,37,1,3,130,250,0,0,187,0,3.5,3,0.0,3.0,0
4,41,0,2,130,204,0,2,172,0,1.4,1,0.0,3.0,0


## Preprocessing the data

In [57]:
# Define the numerical columns
numerical_cols = ['age', 'trestbps', 'chol', 'thalach', 'oldpeak']

# Define the categorical columns by excluding numerical ones
categorical_cols = [col for col in heart_df.columns if col not in numerical_cols]

print("Categorical Columns:", categorical_cols)
print("Numerical Columns:", numerical_cols)

Categorical Columns: ['sex', 'cp', 'fbs', 'restecg', 'exang', 'slope', 'ca', 'thal', 'num']
Numerical Columns: ['age', 'trestbps', 'chol', 'thalach', 'oldpeak']


In [None]:
# Instantiating the data processing class
data_processor = DataProcessor(heart_df, numerical_cols, categorical_cols, 'num')

In [59]:
# Dropping rows with missing values
original_number_of_rows = heart_df.shape[0]
heart_df_cleaned = data_processor.clean_data().data

In [61]:
rows_dropped = original_number_of_rows - heart_df_cleaned.shape[0]
print(f"There were {rows_dropped} rows dropped due to containing missing data")

There were 6 rows dropped due to containing missing data


In [62]:
# Binarizing the target and changing the categorical features data type
heart_df_processed = data_processor.preprocess_data().data

In [64]:
heart_df_processed.info()

<class 'pandas.core.frame.DataFrame'>
Index: 297 entries, 0 to 301
Data columns (total 14 columns):
 #   Column    Non-Null Count  Dtype   
---  ------    --------------  -----   
 0   age       297 non-null    int64   
 1   sex       297 non-null    category
 2   cp        297 non-null    category
 3   trestbps  297 non-null    int64   
 4   chol      297 non-null    int64   
 5   fbs       297 non-null    category
 6   restecg   297 non-null    category
 7   thalach   297 non-null    int64   
 8   exang     297 non-null    category
 9   oldpeak   297 non-null    float64 
 10  slope     297 non-null    category
 11  ca        297 non-null    category
 12  thal      297 non-null    category
 13  num       297 non-null    category
dtypes: category(9), float64(1), int64(4)
memory usage: 17.8 KB


## Modelling

### Spliting the data into train, test and validation

In [23]:
X_train, y_train, X_test, y_test, X_val, y_val = data_processor.split_data()

Train set: (177, 13), Validation set: (60, 13), Test set: (60, 13)


### Training baseline models

In [24]:
# Define the models in a dictionary
models = {
    'Logistic Regression': LogisticRegression(),
    'Support Vector Classifier': SVC(),
    'XGBoost Classifier': XGBClassifier(eval_metric="logloss", enable_categorical=True),
    'Decision Tree': DecisionTreeClassifier()
}

In [None]:
# Instantiating the model trainer class
ModelTrainer(models).train_models(X_train=X_train, y_train=y_train).evaluate_models(X_test=X_test, y_test=y_test)

Training Logistic Regression...
Training Support Vector Classifier...
Training XGBoost Classifier...


STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


Training Decision Tree...
************
All models trained! Next step is performance evaluation.
************
************
Evaluating Logistic Regression...
************
Confusion Matrix for Logistic Regression: 
[[28  2]
 [ 6 24]]
Accuracy of Logistic Regression: 86.67%
Precision of Logistic Regression: 92.31%
Recall of Logistic Regression: 80.00%
F1 Score of Logistic Regression: 85.71%


Evaluating Support Vector Classifier...
************
Confusion Matrix for Support Vector Classifier: 
[[25  5]
 [18 12]]
Accuracy of Support Vector Classifier: 61.67%
Precision of Support Vector Classifier: 70.59%
Recall of Support Vector Classifier: 40.00%
F1 Score of Support Vector Classifier: 51.06%


Evaluating XGBoost Classifier...
************
Confusion Matrix for XGBoost Classifier: 
[[25  5]
 [ 7 23]]
Accuracy of XGBoost Classifier: 80.00%
Precision of XGBoost Classifier: 82.14%
Recall of XGBoost Classifier: 76.67%
F1 Score of XGBoost Classifier: 79.31%


Evaluating Decision Tree...
**********

### Hyperparameter Tuning

In [None]:
# Defining the parameter grid
model_param_grids = {
    "Logistic Regression": {
        "model": LogisticRegression(),
        "params": {
            "penalty": ["l1", "l2"],
            "C": [0.01, 0.1, 1, 5, 10],
            "solver": ["liblinear"],# "saga"],
        },
    },
    "Support Vector Classifier": {
        "model": SVC(),
        "params": {
            "C": [0.01, 0.1, 1, 10],
            "kernel": ["linear", "rbf", "poly", "sigmoid"],
            "gamma": ["scale", "auto"],
        },
    },
    "XGBoost Classifier": {
    "model": XGBClassifier(eval_metric="logloss", enable_categorical=True),
    "params": {
        "learning_rate": [0.02, 0.05, 0.1],  # Smaller learning rates for better generalization
        "n_estimators": [50, 100],    # Number of boosting rounds
        "max_depth": [3, 5, 10],      # Limit depth to control model complexity
        "min_child_weight": [1, 3],   # Minimum sum of instance weights in a child
        "gamma": [0, 0.1, 0.3],       # Minimum loss reduction to make a split
        "subsample": [0.8, 1.0],      # Fraction of samples for each tree
        "colsample_bytree": [0.8, 1.0],  # Fraction of features for each tree
        "reg_alpha": [0, 0.1, 1.0],   # L1 regularization
        "reg_lambda": [1.0, 2.0],     # L2 regularization
        },
    },
    "Decision Tree": {
        "model": DecisionTreeClassifier(),
        "params": {
            "criterion": ["gini", "entropy"],
            "max_depth": [5, 10, 20],
            "min_samples_split": [2, 5, 10],
            "min_samples_leaf": [1, 2, 5],
        },
    },
}

In [None]:
# Instantiating the hyperparameter tuner class
model_tuner = HyperparameterTuner(model_param_grids)

#### Scoring metrics

Because this is a medical problematic, the best score to tune the models to is recall, in order to reduce the number of false negatives

In [None]:
# Tuning the models and retrieving the best performant ones on recall
best_models = model_tuner.tune_models(X_train=X_train, y_train=y_train)

Tuning hyperparameters for Logistic Regression...
Fitting 5 folds for each of 10 candidates, totalling 50 fits


Best Parameters for Logistic Regression: {'C': 10, 'penalty': 'l1', 'solver': 'liblinear'}
Best Recall Score for Logistic Regression: 0.772

Tuning hyperparameters for Support Vector Classifier...
Fitting 5 folds for each of 32 candidates, totalling 160 fits
Best Parameters for Support Vector Classifier: {'C': 0.1, 'gamma': 'scale', 'kernel': 'linear'}
Best Recall Score for Support Vector Classifier: 0.736

Tuning hyperparameters for XGBoost Classifier...
Fitting 5 folds for each of 2592 candidates, totalling 12960 fits
Best Parameters for XGBoost Classifier: {'colsample_bytree': 0.8, 'gamma': 0, 'learning_rate': 0.05, 'max_depth': 3, 'min_child_weight': 3, 'n_estimators': 50, 'reg_alpha': 0, 'reg_lambda': 1.0, 'subsample': 0.8}
Best Recall Score for XGBoost Classifier: 0.784

Tuning hyperparameters for Decision Tree...
Fitting 5 folds for each of 54 candidates, totalling 270 fits
Best Parameters for Decision Tree: {'criterion': 'entropy', 'max_depth': 20, 'min_samples_leaf': 1, 'min_s

In [None]:
# Looking into the trained models
best_models

{'Logistic Regression': {'best_estimator': LogisticRegression(C=10, penalty='l1', solver='liblinear'),
  'best_params': {'C': 10, 'penalty': 'l1', 'solver': 'liblinear'},
  'best_score': np.float64(0.7720588235294118)},
 'Support Vector Classifier': {'best_estimator': SVC(C=0.1, kernel='linear'),
  'best_params': {'C': 0.1, 'gamma': 'scale', 'kernel': 'linear'},
  'best_score': np.float64(0.7360294117647059)},
 'XGBoost Classifier': {'best_estimator': XGBClassifier(base_score=None, booster=None, callbacks=None,
                colsample_bylevel=None, colsample_bynode=None,
                colsample_bytree=0.8, device=None, early_stopping_rounds=None,
                enable_categorical=True, eval_metric='logloss',
                feature_types=None, gamma=0, grow_policy=None,
                importance_type=None, interaction_constraints=None,
                learning_rate=0.05, max_bin=None, max_cat_threshold=None,
                max_cat_to_onehot=None, max_delta_step=None, max_depth=3

In [None]:
# Selecting the best performing model
best_model = model_tuner.get_best_models()

Best Model: XGBoost Classifier
Best Estimator: XGBClassifier(base_score=None, booster=None, callbacks=None,
              colsample_bylevel=None, colsample_bynode=None,
              colsample_bytree=0.8, device=None, early_stopping_rounds=None,
              enable_categorical=True, eval_metric='logloss',
              feature_types=None, gamma=0, grow_policy=None,
              importance_type=None, interaction_constraints=None,
              learning_rate=0.05, max_bin=None, max_cat_threshold=None,
              max_cat_to_onehot=None, max_delta_step=None, max_depth=3,
              max_leaves=None, min_child_weight=3, missing=nan,
              monotone_constraints=None, multi_strategy=None, n_estimators=50,
              n_jobs=None, num_parallel_tree=None, random_state=None, ...)
Best Params: {'colsample_bytree': 0.8, 'gamma': 0, 'learning_rate': 0.05, 'max_depth': 3, 'min_child_weight': 3, 'n_estimators': 50, 'reg_alpha': 0, 'reg_lambda': 1.0, 'subsample': 0.8}
Best Score: 0.783

In [70]:
print(f"The best performing model after tuning is {best_model.best_model[0]}")

The best performing model after tuning is XGBoost Classifier


#### Saving the model

In [43]:
model_tuner.save_best_model_to_pickle(filepath="../models/trained_model")

Best model saved to ../models/trained_model.pkl


#### Loading the model

In [None]:
# Load the saved model for validation purposes only
with open("../models/trained_model.pkl", "rb") as f:
    best_model = pickle.load(f)


In [74]:
# Predict on the validation set
predictions = best_model.predict(X_val)

In [75]:

# Example: Calculate recall
recall = recall_score(y_val, predictions)
print(f"Recall on validation set: {recall:.3f}")

Recall on validation set: 0.833
