In [None]:
# Mount Google Drive into the Colab filesystem so files under /content/drive can be read
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [None]:
import pandas as pd
import numpy as np
import math
import matplotlib.pyplot as plt
import sklearn
from sklearn.model_selection import train_test_split
from sklearn import tree
from sklearn.metrics import classification_report
from sklearn.tree import DecisionTreeClassifier
from sklearn.metrics import f1_score, roc_curve, precision_recall_curve, auc, make_scorer, recall_score, accuracy_score, precision_score, confusion_matrix

pd.set_option('display.max_rows', None)
pd.set_option('display.max_columns', None)

%matplotlib inline

In [None]:
# Load the heart dataset from the mounted Google Drive folder
# (the drive-mount cell must have been run first, otherwise the path does not exist)
heart = pd.read_csv('/content/drive/MyDrive/HSM_564/heart.csv')

# Display the first few rows as a quick sanity check of the columns and values
heart.head()

Unnamed: 0,Age,Sex,ChestPainType,RestingBP,Cholesterol,FastingBS,RestingECG,MaxHR,ExerciseAngina,Oldpeak,ST_Slope,HeartDisease
0,40,M,ATA,140,289,0,Normal,172,N,0.0,Up,0
1,49,F,NAP,160,180,0,Normal,156,N,1.0,Flat,1
2,37,M,ATA,130,283,0,ST,98,N,0.0,Up,0
3,48,F,ASY,138,214,0,Normal,108,Y,1.5,Flat,1
4,54,M,NAP,150,195,0,Normal,122,N,0.0,Up,0


In [None]:
# Dataset dimensions as a (rows, columns) tuple
heart.shape

(918, 12)

In [None]:
import sys
# Make the helper module importable by adding its Drive folder to the module search path.
# NOTE(review): function.py is presumably stored in this HSM_564 folder (not a separate
# "Functions" folder) — confirm the location on Drive.
# The membership check keeps the path from being appended again when the cell is re-run.
if '/content/drive/MyDrive/HSM_564' not in sys.path:
  sys.path.append('/content/drive/MyDrive/HSM_564')

In [None]:
import function as fun

In [None]:
# Check the data type of every column; object-dtype columns hold strings
# and need encoding before they can be passed to the models
heart.dtypes

Age                 int64
Sex                object
ChestPainType      object
RestingBP           int64
Cholesterol         int64
FastingBS           int64
RestingECG         object
MaxHR               int64
ExerciseAngina     object
Oldpeak           float64
ST_Slope           object
HeartDisease        int64
dtype: object

In [None]:
# Per-column summary produced by the project helper module `function`
# (its output lists datatype, missing percentage, unique count and descriptive statistics;
# the helper's implementation is not visible here — see function.py for details)
fun.metadata(heart)

Unnamed: 0,column_name,datatype,missing_percent,unique,mean,std,min,25%,50%,75%,max
0,Age,int64,0.0,50,53.510893,9.432617,28.0,47.0,54.0,60.0,77.0
1,Sex,object,0.0,2,,,,,,,
2,ChestPainType,object,0.0,4,,,,,,,
3,RestingBP,int64,0.0,67,132.396514,18.514154,0.0,120.0,130.0,140.0,200.0
4,Cholesterol,int64,0.0,222,198.799564,109.384145,0.0,173.25,223.0,267.0,603.0
5,FastingBS,int64,0.0,2,,,,,,,
6,RestingECG,object,0.0,3,,,,,,,
7,MaxHR,int64,0.0,119,136.809368,25.460334,60.0,120.0,138.0,156.0,202.0
8,ExerciseAngina,object,0.0,2,,,,,,,
9,Oldpeak,float64,0.0,53,0.887364,1.06657,-2.6,0.0,0.6,1.5,6.2


In [None]:
# List the column labels to pick out the feature columns and the target (HeartDisease)
heart.columns

Index(['Age', 'Sex', 'ChestPainType', 'RestingBP', 'Cholesterol', 'FastingBS',
       'RestingECG', 'MaxHR', 'ExerciseAngina', 'Oldpeak', 'ST_Slope',
       'HeartDisease'],
      dtype='object')

In [None]:
from sklearn.preprocessing import LabelEncoder

# Encode only the string-valued (object dtype) columns as integer labels.
# Numeric columns are deliberately left untouched: label-encoding 'Age' would
# replace real ages with opaque rank codes (and fail on an unseen age), and
# 'FastingBS' is already stored as 0/1.
categorical_columns = ['Sex', 'ChestPainType', 'RestingECG',
                       'ExerciseAngina', 'ST_Slope']

# Work on a copy so the raw `heart` frame stays available for inspection
heart_data_encoded = heart.copy()

# One fitted encoder per column, kept so the integer codes can be mapped back
# to the original category names later (e.g. label_encoders['Sex'].classes_)
label_encoders = {}
for column in categorical_columns:
    label_encoder = LabelEncoder()
    heart_data_encoded[column] = label_encoder.fit_transform(heart[column])
    label_encoders[column] = label_encoder

# Sanity check: no string-valued column may remain before modelling
assert heart_data_encoded.select_dtypes(include='object').empty, \
    "unencoded categorical columns remain"

heart_data_encoded.head()

Unnamed: 0,Age,Sex,ChestPainType,RestingBP,Cholesterol,FastingBS,RestingECG,MaxHR,ExerciseAngina,Oldpeak,ST_Slope,HeartDisease
0,12,1,1,140,289,0,1,172,0,0.0,2,0
1,21,0,2,160,180,0,1,156,0,1.0,1,1
2,9,1,1,130,283,0,2,98,0,0.0,2,0
3,20,0,0,138,214,0,1,108,1,1.5,1,1
4,26,1,2,150,195,0,1,122,0,0.0,2,0


# Gradient Boosting Model

In [None]:
import pandas as pd
from sklearn.model_selection import train_test_split, RandomizedSearchCV
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.metrics import precision_score, confusion_matrix, classification_report
from scipy.stats import randint, uniform

# Model input: the label-encoded heart data
data = heart_data_encoded

# 80/20 hold-out split with a fixed seed; HeartDisease is the target column
X_train, X_test, y_train, y_test = train_test_split(
    data.drop('HeartDisease', axis=1),
    data['HeartDisease'],
    test_size=0.2,
    random_state=42,
)

# Sampling distributions for the randomized search.
# scipy's randint(low, high) excludes `high`; uniform(loc, scale) spans [loc, loc + scale].
param_dist = dict(
    n_estimators=randint(50, 200),
    learning_rate=uniform(0.01, 0.2),
    max_depth=randint(2, 10),
    min_samples_split=randint(2, 20),
    min_samples_leaf=randint(1, 20),
)

# Base estimator whose hyperparameters are tuned
gb_clf = GradientBoostingClassifier(random_state=42)

# 100 sampled candidates, each scored by 5-fold cross-validated precision
random_search = RandomizedSearchCV(
    estimator=gb_clf,
    param_distributions=param_dist,
    n_iter=100,
    cv=5,
    verbose=1,
    n_jobs=-1,
    random_state=42,
    scoring='precision',
)

# Run the search on the training split and keep the winning model
best_gb_clf = random_search.fit(X_train, y_train).best_estimator_

# Predict on the training split first, then on the held-out split
train_predictions, test_predictions = map(best_gb_clf.predict, (X_train, X_test))

# Precision on each split
train_precision = precision_score(y_train, train_predictions)
test_precision = precision_score(y_test, test_predictions)
print(
    f'Training Precision: {train_precision:.2f}',
    f'Test Precision: {test_precision:.2f}',
    sep='\n',
)

# Confusion matrices (rows are true classes, columns are predicted classes)
print("Confusion Matrix - Training Data", confusion_matrix(y_train, train_predictions), sep='\n')
print("Confusion Matrix - Test Data", confusion_matrix(y_test, test_predictions), sep='\n')

# Per-class precision / recall / F1
print("Classification Report - Training Data", classification_report(y_train, train_predictions), sep='\n')
print("Classification Report - Test Data", classification_report(y_test, test_predictions), sep='\n')

# Spot check on positional rows 1 through 20 of the full encoded frame
# (these rows are taken from `data`, not specifically from the held-out split)
prediction_data = data.iloc[1:21]
actual_outputs = prediction_data['HeartDisease'].values
predictions = best_gb_clf.predict(prediction_data.drop('HeartDisease', axis=1))

# Show each prediction next to the recorded label
print("\nPredictions on selected rows:")
for idx, prediction, actual in zip(range(1, len(predictions) + 1), predictions, actual_outputs):
    print(f'Row {idx}: Predicted: {prediction}, Actual: {actual}')


Fitting 5 folds for each of 100 candidates, totalling 500 fits
Training Precision: 0.92
Test Precision: 0.90
Confusion Matrix - Training Data
[[300  33]
 [ 22 379]]
Confusion Matrix - Test Data
[[67 10]
 [16 91]]
Classification Report - Training Data
              precision    recall  f1-score   support

           0       0.93      0.90      0.92       333
           1       0.92      0.95      0.93       401

    accuracy                           0.93       734
   macro avg       0.93      0.92      0.92       734
weighted avg       0.93      0.93      0.92       734

Classification Report - Test Data
              precision    recall  f1-score   support

           0       0.81      0.87      0.84        77
           1       0.90      0.85      0.88       107

    accuracy                           0.86       184
   macro avg       0.85      0.86      0.86       184
weighted avg       0.86      0.86      0.86       184


Predictions on selected rows:
Row 1: Predicted: 0, Actual: 1

In the results above, training performance exceeds test performance by more than 0.05 (accuracy of 0.93 on the training data versus 0.86 on the test data), which indicates overfitting.

So I decided to do another run with narrower hyperparameter ranges to avoid overly complex models.

In [None]:
import pandas as pd
from sklearn.model_selection import train_test_split, RandomizedSearchCV
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.metrics import precision_score, confusion_matrix, classification_report
from scipy.stats import randint, uniform

# Load your dataset
data = heart_data_encoded

# Split the data into training and testing sets (80/20, fixed seed)
X_train, X_test, y_train, y_test = train_test_split(data.drop('HeartDisease', axis=1), data['HeartDisease'], test_size=0.2, random_state=42)

# Narrower search space than the first run to favour simpler models.
# scipy's randint(low, high) excludes `high`; uniform(loc, scale) spans [loc, loc + scale].
param_dist = {
    'n_estimators': randint(50, 100),
    'learning_rate': uniform(0.01, 0.1),
    'max_depth': randint(2, 4),
    'min_samples_split': randint(10, 50),
    'min_samples_leaf': randint(10, 50),
    'max_features': ['sqrt', 'log2', 1.0],
    'subsample': uniform(0.5, 0.5)  # subsample is drawn from [0.5, 1.0]
}

# Early stopping is configured on the base estimator so that every candidate is
# fitted and cross-validated under it. Setting it on the winning model after the
# search (and refitting) would evaluate a different configuration from the one
# the search actually selected, and would leave best_score_ / best_params_
# describing a model that is no longer the one being reported.
gb_clf = GradientBoostingClassifier(random_state=42, n_iter_no_change=10, validation_fraction=0.2)

# Instantiate the RandomizedSearchCV object with cross-validation
random_search = RandomizedSearchCV(gb_clf, param_distributions=param_dist, n_iter=100, cv=5, verbose=1, n_jobs=-1, random_state=42, scoring='precision')

# Fit the RandomizedSearchCV object to the training data
random_search.fit(X_train, y_train)

# The search refits the best candidate on the whole training split (refit=True
# is the default), so this estimator already includes early stopping
best_gb_clf = random_search.best_estimator_

# Report what the search selected so the runs can be compared
print(f'Best CV Precision: {random_search.best_score_:.2f}')
print(f'Best Parameters: {random_search.best_params_}')

# Make predictions on the training data
train_predictions = best_gb_clf.predict(X_train)

# Make predictions on the testing data
test_predictions = best_gb_clf.predict(X_test)

# Calculate precision
train_precision = precision_score(y_train, train_predictions)
test_precision = precision_score(y_test, test_predictions)

# Print precision scores
print(f'Training Precision: {train_precision:.2f}')
print(f'Test Precision: {test_precision:.2f}')

# Print confusion matrices
print("Confusion Matrix - Training Data")
print(confusion_matrix(y_train, train_predictions))

print("Confusion Matrix - Test Data")
print(confusion_matrix(y_test, test_predictions))

# Print classification reports
print("Classification Report - Training Data")
print(classification_report(y_train, train_predictions))

print("Classification Report - Test Data")
print(classification_report(y_test, test_predictions))

# Spot check on positional rows 1 through 20 of the full encoded frame
# (these rows are taken from `data`, not specifically from the held-out split)
prediction_data = data.iloc[1:21]

# Make predictions on the selected rows
predictions = best_gb_clf.predict(prediction_data.drop('HeartDisease', axis=1))

# Get the actual outputs
actual_outputs = prediction_data['HeartDisease'].values

# Print the predictions and actual values for the selected rows
print("\nPredictions on selected rows:")
for idx, (prediction, actual) in enumerate(zip(predictions, actual_outputs), start=1):
    print(f'Row {idx}: Predicted: {prediction}, Actual: {actual}')


Fitting 5 folds for each of 100 candidates, totalling 500 fits
Training Precision: 0.89
Test Precision: 0.90
Confusion Matrix - Training Data
[[289  44]
 [ 34 367]]
Confusion Matrix - Test Data
[[67 10]
 [15 92]]
Classification Report - Training Data
              precision    recall  f1-score   support

           0       0.89      0.87      0.88       333
           1       0.89      0.92      0.90       401

    accuracy                           0.89       734
   macro avg       0.89      0.89      0.89       734
weighted avg       0.89      0.89      0.89       734

Classification Report - Test Data
              precision    recall  f1-score   support

           0       0.82      0.87      0.84        77
           1       0.90      0.86      0.88       107

    accuracy                           0.86       184
   macro avg       0.86      0.86      0.86       184
weighted avg       0.87      0.86      0.86       184


Predictions on selected rows:
Row 1: Predicted: 0, Actual: 1

# HistGradientBoostingClassifier with hyperparameter tuning

In [None]:
import pandas as pd
from sklearn.model_selection import train_test_split, RandomizedSearchCV
from sklearn.ensemble import HistGradientBoostingClassifier
from sklearn.metrics import precision_score, confusion_matrix, classification_report
from scipy.stats import randint, uniform

# Model input: the label-encoded heart data
data = heart_data_encoded

# 80/20 hold-out split with a fixed seed; HeartDisease is the target column
X_train, X_test, y_train, y_test = train_test_split(
    data.drop('HeartDisease', axis=1),
    data['HeartDisease'],
    test_size=0.2,
    random_state=42,
)

# Constrained search space intended to limit overfitting.
# scipy's randint(low, high) excludes `high`; uniform(loc, scale) spans [loc, loc + scale].
# The single-element lists pin early stopping to one fixed setting for every candidate.
param_dist = dict(
    max_iter=randint(50, 150),
    learning_rate=uniform(0.01, 0.05),      # small learning rates: [0.01, 0.06]
    max_depth=randint(2, 6),                # shallow trees: depth 2 to 5
    min_samples_leaf=randint(20, 50),       # at least 20 samples per leaf
    max_leaf_nodes=randint(20, 50),         # cap on leaves per tree
    l2_regularization=uniform(0.5, 1.5),    # L2 penalty in [0.5, 2.0]
    max_bins=randint(200, 255),             # histogram bins: 200 to 254
    early_stopping=[True],                  # always stop early
    validation_fraction=[0.2],              # share of training data held out for early stopping
    n_iter_no_change=[10],                  # patience, in iterations
)

# Base estimator whose hyperparameters are tuned
hist_gb_clf = HistGradientBoostingClassifier(random_state=42)

# 100 sampled candidates, each scored by 5-fold cross-validated precision
random_search = RandomizedSearchCV(
    estimator=hist_gb_clf,
    param_distributions=param_dist,
    n_iter=100,
    cv=5,
    verbose=1,
    n_jobs=-1,
    random_state=42,
    scoring='precision',
)

# Run the search on the training split and keep the winning model
best_hist_gb_clf = random_search.fit(X_train, y_train).best_estimator_

# Predict on the training split first, then on the held-out split
train_predictions, test_predictions = map(best_hist_gb_clf.predict, (X_train, X_test))

# Precision on each split
train_precision = precision_score(y_train, train_predictions)
test_precision = precision_score(y_test, test_predictions)
print(
    f'Training Precision: {train_precision:.2f}',
    f'Test Precision: {test_precision:.2f}',
    sep='\n',
)

# Confusion matrices (rows are true classes, columns are predicted classes)
print("Confusion Matrix - Training Data", confusion_matrix(y_train, train_predictions), sep='\n')
print("Confusion Matrix - Test Data", confusion_matrix(y_test, test_predictions), sep='\n')

# Per-class precision / recall / F1
print("Classification Report - Training Data", classification_report(y_train, train_predictions), sep='\n')
print("Classification Report - Test Data", classification_report(y_test, test_predictions), sep='\n')

# Spot check on positional rows 1 through 20 of the full encoded frame
# (these rows are taken from `data`, not specifically from the held-out split)
prediction_data = data.iloc[1:21]
actual_outputs = prediction_data['HeartDisease'].values
predictions = best_hist_gb_clf.predict(prediction_data.drop('HeartDisease', axis=1))

# Show each prediction next to the recorded label
print("\nPredictions on selected rows:")
for idx, prediction, actual in zip(range(1, len(predictions) + 1), predictions, actual_outputs):
    print(f'Row {idx}: Predicted: {prediction}, Actual: {actual}')


Fitting 5 folds for each of 100 candidates, totalling 500 fits
Training Precision: 0.89
Test Precision: 0.92
Confusion Matrix - Training Data
[[289  44]
 [ 28 373]]
Confusion Matrix - Test Data
[[69  8]
 [14 93]]
Classification Report - Training Data
              precision    recall  f1-score   support

           0       0.91      0.87      0.89       333
           1       0.89      0.93      0.91       401

    accuracy                           0.90       734
   macro avg       0.90      0.90      0.90       734
weighted avg       0.90      0.90      0.90       734

Classification Report - Test Data
              precision    recall  f1-score   support

           0       0.83      0.90      0.86        77
           1       0.92      0.87      0.89       107

    accuracy                           0.88       184
   macro avg       0.88      0.88      0.88       184
weighted avg       0.88      0.88      0.88       184


Predictions on selected rows:
Row 1: Predicted: 0, Actual: 1

# CatBoost Model

In [None]:
import pandas as pd
from sklearn.model_selection import train_test_split, RandomizedSearchCV, cross_val_score
from sklearn.metrics import precision_score, confusion_matrix, classification_report
from catboost import CatBoostClassifier, Pool
from scipy.stats import randint, uniform

# Model input: the label-encoded heart data
data = heart_data_encoded

# 80/20 hold-out split with a fixed seed; HeartDisease is the target column
X_train, X_test, y_train, y_test = train_test_split(
    data.drop('HeartDisease', axis=1),
    data['HeartDisease'],
    test_size=0.2,
    random_state=42,
)

# Sampling distributions for the randomized search.
# scipy's randint(low, high) excludes `high`; uniform(loc, scale) spans [loc, loc + scale].
param_dist = dict(
    iterations=randint(50, 200),
    learning_rate=uniform(0.01, 0.1),
    depth=randint(2, 6),              # shallow trees (2 to 5) to limit overfitting
    l2_leaf_reg=uniform(1, 10),
    border_count=randint(32, 128),
    subsample=uniform(0.5, 0.5),      # drawn from [0.5, 1.0]
)

# Base CatBoost estimator whose hyperparameters are tuned.
# NOTE(review): early_stopping_rounds is set, but no eval_set is passed to fit()
# anywhere in this cell — confirm against the CatBoost docs whether early
# stopping has any effect without a validation set.
cat_clf = CatBoostClassifier(
    loss_function='Logloss',
    eval_metric='Precision',
    random_seed=42,
    verbose=0,
    early_stopping_rounds=10,
)

# 100 sampled candidates, each scored by 5-fold cross-validated precision
random_search = RandomizedSearchCV(
    estimator=cat_clf,
    param_distributions=param_dist,
    n_iter=100,
    cv=5,
    verbose=1,
    n_jobs=-1,
    random_state=42,
    scoring='precision',
)

# Run the search on the training split and keep the winning model
best_cat_clf = random_search.fit(X_train, y_train).best_estimator_

# Predict on the training split first, then on the held-out split
train_predictions, test_predictions = map(best_cat_clf.predict, (X_train, X_test))

# Precision on each split
train_precision = precision_score(y_train, train_predictions)
test_precision = precision_score(y_test, test_predictions)
print(
    f'Training Precision: {train_precision:.2f}',
    f'Test Precision: {test_precision:.2f}',
    sep='\n',
)

# Confusion matrices (rows are true classes, columns are predicted classes)
print("Confusion Matrix - Training Data", confusion_matrix(y_train, train_predictions), sep='\n')
print("Confusion Matrix - Test Data", confusion_matrix(y_test, test_predictions), sep='\n')

# Per-class precision / recall / F1
print("Classification Report - Training Data", classification_report(y_train, train_predictions), sep='\n')
print("Classification Report - Test Data", classification_report(y_test, test_predictions), sep='\n')

# Re-score the selected model with a fresh 5-fold cross-validation on the training split
cv_scores = cross_val_score(best_cat_clf, X_train, y_train, cv=5, scoring='precision')
print(f'Cross-validated precision scores: {cv_scores}')
print(f'Mean cross-validated precision: {cv_scores.mean():.2f}')

# Spot check on positional rows 1 through 20 of the full encoded frame
# (these rows are taken from `data`, not specifically from the held-out split)
prediction_data = data.iloc[1:21]
actual_outputs = prediction_data['HeartDisease'].values
predictions = best_cat_clf.predict(prediction_data.drop('HeartDisease', axis=1))

# Show each prediction next to the recorded label
print("\nPredictions on selected rows:")
for idx, prediction, actual in zip(range(1, len(predictions) + 1), predictions, actual_outputs):
    print(f'Row {idx}: Predicted: {prediction}, Actual: {actual}')

Fitting 5 folds for each of 100 candidates, totalling 500 fits
Training Precision: 0.89
Test Precision: 0.89
Confusion Matrix - Training Data
[[287  46]
 [ 24 377]]
Confusion Matrix - Test Data
[[66 11]
 [14 93]]
Classification Report - Training Data
              precision    recall  f1-score   support

           0       0.92      0.86      0.89       333
           1       0.89      0.94      0.92       401

    accuracy                           0.90       734
   macro avg       0.91      0.90      0.90       734
weighted avg       0.91      0.90      0.90       734

Classification Report - Test Data
              precision    recall  f1-score   support

           0       0.82      0.86      0.84        77
           1       0.89      0.87      0.88       107

    accuracy                           0.86       184
   macro avg       0.86      0.86      0.86       184
weighted avg       0.87      0.86      0.86       184

Cross-validated precision scores: [0.92207792 0.9        0.85