In [129]:
# Import necessary libraries

import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
import tensorflow as tf
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import OneHotEncoder, StandardScaler
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix
from sklearn.model_selection import cross_val_score
from sklearn.ensemble import RandomForestClassifier
from imblearn.over_sampling import SMOTENC
from sklearn.metrics import roc_auc_score
import lightgbm as lgb
from sklearn.ensemble import VotingClassifier
from sklearn.svm import SVC

In [130]:
# Loading the dataset
df = pd.read_excel('/content/va_dataset_full.xlsx')

In [131]:
# Identifying duplicates
duplicates = df.duplicated().sum()
print(f"Number of duplicate rows: {duplicates}")

# Removing duplicates
df = df.drop_duplicates()
duplicates = df.duplicated().sum()
print(f"Number of duplicate rows: {duplicates}")

Number of duplicate rows: 1503
Number of duplicate rows: 0


In [132]:
# Select numerical columns
numerical_cols = df.select_dtypes(include=['int64', 'float64']).columns

# Select categorical columns
categorical_cols = df.select_dtypes(include=['object']).columns

In [133]:
# Shape and head of the dataset
print("Dataset shape: ",df.shape,"\n")
df.head()

Dataset shape:  (6338, 21) 



Unnamed: 0,age,had_diabetes,had_heart_disease,had_hypertension,had_obesity,had_stroke,had_blue_lips,had_ankle_swelling,had_puffiness,had_diff_breathing,...,fast_breathing,had_wheezed,had_chest_pain,chest_pain_duration,physical_action_painful,pain_location,urine_stop,had_lost_consciousness,had_confusion,heart_disease
0,76,No,Yes,No,No,No,No,No,No,Yes,...,Yes,No,No,Don't Know,No,Don't Know,No,No,No,1
1,50,No,No,Yes,No,Yes,No,No,No,Yes,...,Don't Know,No,No,Don't Know,No,Don't Know,No,Yes,No,1
2,61,Yes,Yes,Yes,No,No,No,Yes,No,Yes,...,Yes,No,No,Don't Know,No,Don't Know,Yes,No,No,1
3,78,No,No,No,No,No,No,No,No,Yes,...,No,No,No,Don't Know,No,Don't Know,No,Yes,No,1
4,70,No,Yes,Yes,No,No,No,No,No,Yes,...,Yes,Yes,Yes,0.5-24 hours,Yes,Upper/middle chest,No,No,No,1


In [134]:
df.info()

<class 'pandas.core.frame.DataFrame'>
Index: 6338 entries, 0 to 7840
Data columns (total 21 columns):
 #   Column                   Non-Null Count  Dtype 
---  ------                   --------------  ----- 
 0   age                      6338 non-null   object
 1   had_diabetes             6334 non-null   object
 2   had_heart_disease        6335 non-null   object
 3   had_hypertension         6335 non-null   object
 4   had_obesity              6333 non-null   object
 5   had_stroke               6333 non-null   object
 6   had_blue_lips            6337 non-null   object
 7   had_ankle_swelling       6335 non-null   object
 8   had_puffiness            6338 non-null   object
 9   had_diff_breathing       6338 non-null   object
 10  breathing_on_off         6338 non-null   object
 11  fast_breathing           6326 non-null   object
 12  had_wheezed              6325 non-null   object
 13  had_chest_pain           6328 non-null   object
 14  chest_pain_duration      6338 non-null   obje

In [135]:
# Print unique values for each categorical column
for column in categorical_cols:
    print('Column:', column)
    print(df[column].unique())
    print()

Column: age
[76 50 61 78 70 82 54 62 52 45 60 "Don't Know" 69 75 49 43 71 33 59 64 90
 55 65 81 42 80 84 53 72 74 36 73 57 48 51 67 34 38 79 58 66 40 87 35 68
 92 77 56 63 85 89 41 22 37 30 96 25 91 102 46 32 44 47 17 28 39 83 19 86
 24 95 26 88 93 98 16 20 18 23 94 15 31 29 21 13 27 12 14 97 100 99 11]

Column: had_diabetes
['No' 'Yes' "Don't Know" nan 'Refused to Answer']

Column: had_heart_disease
['Yes' 'No' "Don't Know" 'Refused to Answer' nan]

Column: had_hypertension
['No' 'Yes' "Don't Know" 'Refused to Answer' nan]

Column: had_obesity
['No' 'Yes' "Don't Know" nan 'Refused to Answer']

Column: had_stroke
['No' 'Yes' "Don't Know" nan]

Column: had_blue_lips
['No' 'Yes' "Don't Know" nan]

Column: had_ankle_swelling
['No' 'Yes' "Don't Know" nan]

Column: had_puffiness
['No' 'Yes' "Don't Know" 'Refused to Answer']

Column: had_diff_breathing
['Yes' 'No' "Don't Know"]

Column: breathing_on_off
['Continuous' 'On and Off' "Don't Know"]

Column: fast_breathing
['Yes' "Don't Know" 'No'

In [136]:
# Check the distribution of the target variable
target_distribution = df['heart_disease'].value_counts()

# Display the class imbalance
print("Class Distribution in Target Variable (y):")
print(target_distribution)

Class Distribution in Target Variable (y):
heart_disease
0    5345
1     993
Name: count, dtype: int64


In [137]:
# Checking for null values
print(df.isnull().sum())

print("\nThere are no null values in this dataset")

age                         0
had_diabetes                4
had_heart_disease           3
had_hypertension            3
had_obesity                 5
had_stroke                  5
had_blue_lips               1
had_ankle_swelling          3
had_puffiness               0
had_diff_breathing          0
breathing_on_off            0
fast_breathing             12
had_wheezed                13
had_chest_pain             10
chest_pain_duration         0
physical_action_painful     0
pain_location               0
urine_stop                  5
had_lost_consciousness      1
had_confusion               3
heart_disease               0
dtype: int64

There are no null values in this dataset


In [138]:
for col in categorical_cols:
    mode_value = df[col].mode()[0]
    df[col].fillna(mode_value, inplace=True)

print("\nNull values in categorical columns replaced with the most frequent class")



Null values in categorical columns replaced with the most frequent class


The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  df[col].fillna(mode_value, inplace=True)


In [139]:
df['age'] = df['age'].replace("Don't Know", '-1')

# Convert 'class' column to integers
df['age'] = df['age'].astype(int)

In [140]:
df.info()

<class 'pandas.core.frame.DataFrame'>
Index: 6338 entries, 0 to 7840
Data columns (total 21 columns):
 #   Column                   Non-Null Count  Dtype 
---  ------                   --------------  ----- 
 0   age                      6338 non-null   int64 
 1   had_diabetes             6338 non-null   object
 2   had_heart_disease        6338 non-null   object
 3   had_hypertension         6338 non-null   object
 4   had_obesity              6338 non-null   object
 5   had_stroke               6338 non-null   object
 6   had_blue_lips            6338 non-null   object
 7   had_ankle_swelling       6338 non-null   object
 8   had_puffiness            6338 non-null   object
 9   had_diff_breathing       6338 non-null   object
 10  breathing_on_off         6338 non-null   object
 11  fast_breathing           6338 non-null   object
 12  had_wheezed              6338 non-null   object
 13  had_chest_pain           6338 non-null   object
 14  chest_pain_duration      6338 non-null   obje

In [141]:
# Extract features (X) and target (y)
X = df.drop('heart_disease', axis=1)
y = df['heart_disease']

In [142]:
# Split the data into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42, stratify = y)

In [143]:
# Select numerical columns
numerical_cols = X_train.select_dtypes(include=['int64', 'float64']).columns

# Select categorical columns
categorical_cols = X_train.select_dtypes(include=['object']).columns

In [144]:
# Get categorical column indices
categorical_indices = [X_train.columns.get_loc(col) for col in categorical_cols]

# Apply SMOTENC to handle imbalanced classes
smote_nc = SMOTENC(categorical_features=categorical_indices, random_state=42)
X_train, y_train = smote_nc.fit_resample(X_train, y_train)

# Check the new distribution
balanced_distribution = pd.Series(y_train).value_counts()
print("\nClass Distribution After Applying SMOTE:")
print(balanced_distribution)


Class Distribution After Applying SMOTE:
heart_disease
0    4276
1    4276
Name: count, dtype: int64


In [145]:
if len(categorical_cols) > 0:
    encoder = OneHotEncoder(sparse_output=False, handle_unknown='ignore')

    # Fit the encoder on the training data and transform it
    X_train_encoded = encoder.fit_transform(X_train[categorical_cols])
    X_train_encoded = pd.DataFrame(X_train_encoded, columns=encoder.get_feature_names_out(categorical_cols))

    # Transform the testing data using the same encoder
    X_test_encoded = encoder.transform(X_test[categorical_cols])
    X_test_encoded = pd.DataFrame(X_test_encoded, columns=encoder.get_feature_names_out(categorical_cols))

    # Combine encoded categorical features with numerical ones for both X_train and X_test
    X_train = pd.concat([X_train.drop(columns=categorical_cols).reset_index(drop=True), X_train_encoded], axis=1)
    X_test = pd.concat([X_test.drop(columns=categorical_cols).reset_index(drop=True), X_test_encoded], axis=1)
else:
    # If no categorical columns, just reset the index for both
    X_train = X_train.reset_index(drop=True)
    X_test = X_test.reset_index(drop=True)

X_train.head()

Unnamed: 0,age,had_diabetes_Don't Know,had_diabetes_No,had_diabetes_Yes,had_heart_disease_Don't Know,had_heart_disease_No,had_heart_disease_Refused to Answer,had_heart_disease_Yes,had_hypertension_Don't Know,had_hypertension_No,...,urine_stop_Don't Know,urine_stop_No,urine_stop_Refused to Answer,urine_stop_Yes,had_lost_consciousness_Don't Know,had_lost_consciousness_No,had_lost_consciousness_Yes,had_confusion_Don't Know,had_confusion_No,had_confusion_Yes
0,51,0.0,1.0,0.0,0.0,1.0,0.0,0.0,0.0,1.0,...,0.0,1.0,0.0,0.0,0.0,1.0,0.0,0.0,1.0,0.0
1,62,0.0,1.0,0.0,0.0,1.0,0.0,0.0,0.0,1.0,...,0.0,1.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,1.0
2,16,0.0,1.0,0.0,0.0,1.0,0.0,0.0,0.0,1.0,...,0.0,1.0,0.0,0.0,0.0,0.0,1.0,0.0,1.0,0.0
3,22,0.0,1.0,0.0,0.0,1.0,0.0,0.0,0.0,1.0,...,0.0,1.0,0.0,0.0,0.0,0.0,1.0,0.0,1.0,0.0
4,65,0.0,1.0,0.0,0.0,1.0,0.0,0.0,0.0,1.0,...,0.0,1.0,0.0,0.0,0.0,1.0,0.0,0.0,1.0,0.0


In [146]:
# Initialize the LightGBM model (with class_weight set as before)
lgb_model = lgb.LGBMClassifier(
    boosting_type='gbdt',
    objective='binary',
    metric='auc',
    learning_rate=0.1,
    max_depth=20,
    n_estimators=200,
    num_leaves=70,
    random_state=42,
    class_weight='balanced'
)

# Initialize the Random Forest model (with class_weight set as before)
rf_model = RandomForestClassifier(
    random_state=42,
    n_jobs=-1,
    criterion='gini',
    max_depth=20,
    min_samples_leaf=1,
    min_samples_split=5,
    max_features='sqrt',
    n_estimators=200,
    bootstrap=True,
    class_weight='balanced',
    min_impurity_decrease=0.01
)

# Initialize the Support Vector Machine (SVM) model with probability=True to enable soft voting
svm_model = SVC(kernel='rbf', probability=True, class_weight='balanced', random_state=42)

# Create the ensemble model using soft voting
ensemble_model = VotingClassifier(
    estimators=[
        ('lgb', lgb_model),
        ('rf', rf_model),
        ('svm', svm_model)
    ],
    voting='soft'  # Change to 'soft' for probability-based voting
)

# Train the ensemble model
ensemble_model.fit(X_train, y_train)

# Perform cross-validation for the ensemble model
cv_scores = cross_val_score(ensemble_model, X_train, y_train, cv=5)
mean_cv_score = np.mean(cv_scores)

print(f"Ensemble Model Mean CV Score: {mean_cv_score:.2f}")

# Evaluate the ensemble model
y_train_pred = ensemble_model.predict(X_train)
y_test_pred = ensemble_model.predict(X_test)
y_test_prob = ensemble_model.predict_proba(X_test)[:, 1]  # Probability for ROC-AUC

# Evaluation Metrics
train_acc = accuracy_score(y_train, y_train_pred)
test_acc = accuracy_score(y_test, y_test_pred)
roc_auc = roc_auc_score(y_test, y_test_prob)
conf_matrix = confusion_matrix(y_test, y_test_pred)

# Print Detailed Performance Report
print("\nEnsemble Model Performance:\n")
print(f"Accuracy: {test_acc:.2f}\n")

print("Classification Report:")
print(classification_report(y_test, y_test_pred))

print(f"ROC-AUC Score: {roc_auc:.2f}\n")

print("Confusion Matrix:")
print(conf_matrix)


[LightGBM] [Info] Number of positive: 4276, number of negative: 4276
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.003165 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 206
[LightGBM] [Info] Number of data points in the train set: 8552, number of used features: 58
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.500000 -> initscore=0.000000
[LightGBM] [Info] Number of positive: 3421, number of negative: 3420
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.002458 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 203
[LightGBM] [Info] Number of data points in the train set: 6841, number of used features: 58
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.500000 -> initscore=0.000000
[L