In [265]:
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt


from sklearn.preprocessing import StandardScaler
from sklearn.tree import DecisionTreeClassifier

from sklearn.svm import SVC
from sklearn.model_selection import (train_test_split,
                                     GridSearchCV,
                                     StratifiedKFold, KFold)

from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import (RandomForestClassifier, 
                              GradientBoostingClassifier, 
                              AdaBoostClassifier,
                              StackingClassifier)
from xgboost import XGBClassifier  
from lightgbm import LGBMClassifier
from sklearn.metrics import (accuracy_score,
                             roc_auc_score)

import warnings
warnings.filterwarnings('ignore')


In [266]:
# Load the dataset
train = pd.read_csv('Train Dataset.csv')
test = pd.read_csv('Test Dataset.csv')
sub = pd.read_csv('Sample Submission.csv')
var = pd.read_csv('Variable_Definitions.csv')

In [267]:
# Change column names to lowercase in the training dataset
train.columns = train.columns.str.lower()

# Change column names to lowercase in the testing dataset
test.columns = test.columns.str.lower()

In [268]:
# View the first five rows of the train dataset
train.head()

Unnamed: 0,id,age,sex,cp,trestbps,chol,fbs,restecg,thalach,exang,oldpeak,slope,ca,thal,target
0,16167,33,0,1,158,205,1,0,154,0,1.5,1,4,1,1
1,11275,53,1,2,198,154,0,1,104,0,0.8,2,1,0,0
2,13251,37,1,2,101,202,1,0,155,0,2.1,1,3,1,1
3,19921,75,0,0,113,306,1,2,88,1,4.9,0,2,2,1
4,11293,35,1,2,139,419,1,1,166,1,0.9,2,4,0,1


In [269]:
train['ca'].value_counts()

2    1505
4    1488
3    1448
1    1439
0    1423
Name: ca, dtype: int64

In [270]:
train.describe()

Unnamed: 0,id,age,sex,cp,trestbps,chol,fbs,restecg,thalach,exang,oldpeak,slope,ca,thal,target
count,7303.0,7303.0,7303.0,7303.0,7303.0,7303.0,7303.0,7303.0,7303.0,7303.0,7303.0,7303.0,7303.0,7303.0,7303.0
mean,15021.535396,53.172669,0.499658,1.502533,147.447487,342.80597,0.493085,1.013008,136.506093,0.503218,3.129851,0.99151,2.019033,1.502259,0.813501
std,2886.02608,14.18597,0.500034,1.115594,31.099538,127.291998,0.499986,0.815806,38.141966,0.500024,1.79116,0.817291,1.410546,1.113137,0.389535
min,10001.0,29.0,0.0,0.0,94.0,126.0,0.0,0.0,71.0,0.0,0.0,0.0,0.0,0.0,0.0
25%,12521.5,41.0,0.0,1.0,120.0,231.0,0.0,0.0,104.0,0.0,1.6,0.0,1.0,1.0,1.0
50%,15054.0,53.0,0.0,1.0,148.0,341.0,0.0,1.0,137.0,1.0,3.1,1.0,2.0,1.0,1.0
75%,17513.5,65.0,1.0,3.0,174.0,450.0,1.0,2.0,170.0,1.0,4.7,2.0,3.0,2.0,1.0
max,19998.0,77.0,1.0,3.0,200.0,564.0,1.0,2.0,202.0,1.0,6.2,2.0,4.0,3.0,1.0


In [271]:
# View the first five rows of the test dataset
test.head()

Unnamed: 0,id,age,sex,cp,trestbps,chol,fbs,restecg,thalach,exang,oldpeak,slope,ca,thal
0,16501,70,1,0,163,495,0,2,170,1,2.0,1,0,1
1,10444,61,1,0,131,238,0,2,74,1,4.9,2,2,2
2,14288,53,1,0,95,558,1,1,73,1,0.7,1,1,0
3,10409,37,0,1,178,287,0,1,192,1,5.7,1,0,0
4,17330,35,0,3,104,281,0,0,122,0,1.3,1,4,3


In [272]:
test.describe()

Unnamed: 0,id,age,sex,cp,trestbps,chol,fbs,restecg,thalach,exang,oldpeak,slope,ca,thal
count,2697.0,2697.0,2697.0,2697.0,2697.0,2697.0,2697.0,2697.0,2697.0,2697.0,2697.0,2697.0,2697.0,2697.0
mean,14939.832036,53.064516,0.488691,1.504264,146.384872,344.027809,0.497219,1.021135,135.993326,0.484613,3.142714,1.011494,2.032258,1.473489
std,2888.940621,14.350978,0.499965,1.112747,30.64058,127.265038,0.500085,0.81159,38.419913,0.499856,1.810003,0.814976,1.397353,1.120744
min,10000.0,29.0,0.0,0.0,94.0,126.0,0.0,0.0,71.0,0.0,0.0,0.0,0.0,0.0
25%,12455.0,41.0,0.0,1.0,121.0,233.0,0.0,0.0,103.0,0.0,1.6,0.0,1.0,0.0
50%,14854.0,53.0,0.0,1.0,147.0,344.0,0.0,1.0,136.0,0.0,3.2,1.0,2.0,1.0
75%,17465.0,66.0,1.0,3.0,172.0,455.0,1.0,2.0,170.0,1.0,4.7,2.0,3.0,2.0
max,19999.0,77.0,1.0,3.0,200.0,564.0,1.0,2.0,202.0,1.0,6.2,2.0,4.0,3.0


In [273]:
train.head()

Unnamed: 0,id,age,sex,cp,trestbps,chol,fbs,restecg,thalach,exang,oldpeak,slope,ca,thal,target
0,16167,33,0,1,158,205,1,0,154,0,1.5,1,4,1,1
1,11275,53,1,2,198,154,0,1,104,0,0.8,2,1,0,0
2,13251,37,1,2,101,202,1,0,155,0,2.1,1,3,1,1
3,19921,75,0,0,113,306,1,2,88,1,4.9,0,2,2,1
4,11293,35,1,2,139,419,1,1,166,1,0.9,2,4,0,1


In [274]:
train.dtypes

id            int64
age           int64
sex           int64
cp            int64
trestbps      int64
chol          int64
fbs           int64
restecg       int64
thalach       int64
exang         int64
oldpeak     float64
slope         int64
ca            int64
thal          int64
target        int64
dtype: object

In [275]:
# Cholesterol-to-Age Ratio
train['chol_age_ratio'] = train['chol'] / train['age']
test['chol_age_ratio'] = test['chol'] / test['age']

# Blood Pressure-to-Age Ratio
train['bp_age_ratio'] = train['trestbps'] / train['age']
test['bp_age_ratio'] = test['trestbps'] / test['age']

# Heart Rate Reserve
train['heart_rate_reserve'] = 220 - train['age'] - train['thalach']
test['heart_rate_reserve'] = 220 - test['age'] - test['thalach']

# Cholesterol-Blood Pressure Interaction
train['chol_trestbps_interaction'] = train['chol'] * train['trestbps']
test['chol_trestbps_interaction'] = test['chol'] * test['trestbps']

# ST Depression Severity
train['st_depression_severity'] = train['oldpeak'] * train['exang']
test['st_depression_severity'] = test['oldpeak'] * test['exang']

# Vessels Blocked Proportion
train['vessels_blocked_proportion'] = train['ca'] / 3
test['vessels_blocked_proportion'] = test['ca'] / 3

# Cholesterol-to-Heart Rate Ratio
train['chol_heart_rate_ratio'] = train['chol'] / train['thalach']
test['chol_heart_rate_ratio'] = test['chol'] / test['thalach']

# Resting ECG Abnormality Indicator
train['restecg_abnormal'] = train['restecg'].apply(lambda x: 1 if x != 0 else 0)
test['restecg_abnormal'] = test['restecg'].apply(lambda x: 1 if x != 0 else 0)

# 9. BMI Estimate (Cholesterol-to-Age Ratio)
train['bmi_estimate'] = train['chol'] / train['age']
test['bmi_estimate'] = test['chol'] / test['age']

# 10. Exercise Impact Index (based on heart rate, angina, oldpeak)
train['exercise_impact_index'] = (train['thalach'] - (train['exang'] * train['oldpeak'])) / 10
test['exercise_impact_index'] = (test['thalach'] - (test['exang'] * test['oldpeak'])) / 10


In [276]:
train.head()

Unnamed: 0,id,age,sex,cp,trestbps,chol,fbs,restecg,thalach,exang,...,chol_age_ratio,bp_age_ratio,heart_rate_reserve,chol_trestbps_interaction,st_depression_severity,vessels_blocked_proportion,chol_heart_rate_ratio,restecg_abnormal,bmi_estimate,exercise_impact_index
0,16167,33,0,1,158,205,1,0,154,0,...,6.212121,4.787879,33,32390,0.0,1.333333,1.331169,0,6.212121,15.4
1,11275,53,1,2,198,154,0,1,104,0,...,2.90566,3.735849,63,30492,0.0,0.333333,1.480769,1,2.90566,10.4
2,13251,37,1,2,101,202,1,0,155,0,...,5.459459,2.72973,28,20402,0.0,1.0,1.303226,0,5.459459,15.5
3,19921,75,0,0,113,306,1,2,88,1,...,4.08,1.506667,57,34578,4.9,0.666667,3.477273,1,4.08,8.31
4,11293,35,1,2,139,419,1,1,166,1,...,11.971429,3.971429,19,58241,0.9,1.333333,2.524096,1,11.971429,16.51


In [277]:
# Specify the columns to scale
columns_to_scale = [
    'age', 'trestbps', 'chol', 'thalach', 'chol_age_ratio',
    'bp_age_ratio', 'heart_rate_reserve', 'st_depression_severity',
    'vessels_blocked_proportion', 'chol_heart_rate_ratio', 'bmi_estimate',
    'exercise_impact_index'
]

# Initialize the scaler
scaler = StandardScaler()

# Fit and transform the training data
train[columns_to_scale] = scaler.fit_transform(train[columns_to_scale])

# Transform the test data
test[columns_to_scale] = scaler.transform(test[columns_to_scale])


In [278]:
missing_values_train = train.isnull().sum()
missing_values_train = missing_values_train[missing_values_train > 0]
print(missing_values_train) 

missing_values_test = test.isnull().sum()
missing_values_test = missing_values_train[missing_values_train > 0]
print(missing_values_test) 

Series([], dtype: int64)
Series([], dtype: int64)


In [279]:
# Separate features and target variables
X = train.drop(columns=['id', 'target'])
y = train['target']

#### Best model

In [280]:
# Define base models
base_models = [
    ('lr', LogisticRegression(max_iter=1000)),
    ('rf', RandomForestClassifier(n_estimators=100, random_state=42)),
    ('gb', GradientBoostingClassifier(n_estimators=100, random_state=42)),
    ('dt', DecisionTreeClassifier(random_state=42)) 
]

# Use RandomForest as the meta-model
meta_model = RandomForestClassifier(n_estimators=100, random_state=42)

# Set up KFold cross-validation
fold = KFold(n_splits=12, shuffle=True, random_state=2023)

# Initialize variables to store results
accuracies = []
roc_aucs = []

best_model = None
best_accuracy = 0
best_auc = 0

# Track models and their performance for each fold
fold_number = 1
for train_index, test_index in fold.split(X, y):
    X_train_fold, X_val_fold = X.iloc[train_index], X.iloc[test_index]
    y_train_fold, y_val_fold = y.iloc[train_index], y.iloc[test_index]
    
    # Create stacking model
    stacked_model = StackingClassifier(estimators=base_models, final_estimator=meta_model, cv=5)
    
    # Train the model on the current fold
    stacked_model.fit(X_train_fold, y_train_fold)
    
    # Make predictions for validation set
    y_pred_fold = stacked_model.predict(X_val_fold)
    y_pred_proba_fold = stacked_model.predict_proba(X_val_fold)[:, 1]
    
    # Calculate accuracy and ROC AUC for the current fold
    accuracy = accuracy_score(y_val_fold, y_pred_fold)
    roc_auc = roc_auc_score(y_val_fold, y_pred_proba_fold)
    
    
    accuracies.append(accuracy)
    roc_aucs.append(roc_auc)
    
    print(f"Fold {fold_number} - Accuracy: {accuracy}, ROC AUC: {roc_auc}")
    
    # Update the best model if current fold performs better
    if accuracy > best_accuracy or roc_auc > best_auc:
        best_accuracy = accuracy
        best_auc = roc_auc
        best_model = stacked_model  # Save the best model from this fold
    
    fold_number += 1

print(f"Best Model - Accuracy: {best_accuracy}, ROC AUC: {best_auc}")

# Calculate and print mean accuracy and mean ROC AUC
mean_accuracy = sum(accuracies) / len(accuracies)
mean_roc_auc = sum(roc_aucs) / len(roc_aucs)

print(f"Mean Accuracy: {mean_accuracy}, Mean ROC AUC: {mean_roc_auc}")

Fold 1 - Accuracy: 0.8243021346469622, ROC AUC: 0.8879697372796808
Fold 2 - Accuracy: 0.8243021346469622, ROC AUC: 0.8905712186255172
Fold 3 - Accuracy: 0.825944170771757, ROC AUC: 0.8812933254109725
Fold 4 - Accuracy: 0.819376026272578, ROC AUC: 0.8913121586662833
Fold 5 - Accuracy: 0.8029556650246306, ROC AUC: 0.8771642808452624
Fold 6 - Accuracy: 0.8407224958949097, ROC AUC: 0.9125867311140563
Fold 7 - Accuracy: 0.7914614121510674, ROC AUC: 0.8762917830814286
Fold 8 - Accuracy: 0.8092105263157895, ROC AUC: 0.8881055581666579
Fold 9 - Accuracy: 0.8108552631578947, ROC AUC: 0.8860806863818912
Fold 10 - Accuracy: 0.8108552631578947, ROC AUC: 0.8987930600955494
Fold 11 - Accuracy: 0.7861842105263158, ROC AUC: 0.8726478319146344
Fold 12 - Accuracy: 0.8092105263157895, ROC AUC: 0.8712673075227093
Best Model - Accuracy: 0.8407224958949097, ROC AUC: 0.9125867311140563
Mean Accuracy: 0.8129483190735458, Mean ROC AUC: 0.8861736399253869


In [281]:
# Use the best model to make predictions on the test dataset
y_test_pred = best_model.predict(test.drop(columns='id', axis=1))
y_test_pred_proba = best_model.predict_proba(test.drop(columns='id', axis=1))[:, 1] 

In [282]:
sub.head()

Unnamed: 0,Id
0,16501
1,10444
2,14288
3,10409
4,17330


In [283]:
sub['target'] =  y_test_pred

In [284]:
sub.head()

Unnamed: 0,Id,target
0,16501,1
1,10444,1
2,14288,1
3,10409,1
4,17330,1


In [285]:
# Save to CSV
sub.to_csv('best_fold_stacking_submission23.csv', index=False)

print("Submission file created using best fold model.")

Submission file created using best fold model.
