### Key facts according to WHO (World Health Organaizations)

   - Cardiovascular diseases (CVDs) are the leading cause of death globally.  
   - An estimated 17.9 million people died from CVDs in 2019, representing 32% of all global deaths. Of these deaths,
    85% were due to heart attack and stroke.  
   - Over three quarters of CVD deaths take place in low- and middle-income countries.  
   - Out of the 17 million premature deaths (under the age of 70) due to noncommunicable diseases in 2019, 38% were caused by CVDs.  
   - Most cardiovascular diseases can be prevented by addressing behavioural risk factors such as tobacco use, unhealthy diet and obesity, physical inactivity and harmful use of alcohol.  
   - It is important to detect cardiovascular disease as early as possible so that management with counselling and medicines can begin.  



In [None]:
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import classification_report, accuracy_score, confusion_matrix
from sklearn.model_selection import train_test_split, cross_validate
from sklearn.model_selection import GridSearchCV
from sklearn.ensemble import VotingClassifier
from xgboost import XGBClassifier
import numpy as np
import pandas as pd

In [51]:
data = pd.read_csv("../2_Preprocessing/preprocessed_data.csv")
data.head()

Unnamed: 0,age,sex,trestbps,chol,fbs,thalch,exang,oldpeak,ca,num,...,slope_flat,slope_upsloping,thal_fixed defect,thal_normal,thal_reversable defect,Category_bps_Elevated,Category_bps_Hypertension Stage 1,Category_bps_Hypertension Stage 2,Category_bps_Hypertensive Crisis,Category_bps_Normal
0,1.007024,1,0.717289,-0.236151,1,0.490336,0,1.368309,0.0,0,...,0,0,1,0,0,0,0,1,0,0
1,1.431447,1,1.553893,0.787312,0,-1.180391,1,0.612046,3.0,2,...,1,0,0,1,0,0,0,1,0,0
2,1.431447,1,-0.677052,-0.313393,0,-0.345027,1,1.651908,2.0,1,...,1,0,0,0,1,1,0,0,0,0
3,-1.751727,1,-0.119315,0.09213,0,1.962168,0,2.502703,0.0,0,...,0,0,0,1,0,0,1,0,0,0
4,-1.327304,0,-0.119315,-0.796158,0,1.365479,0,0.517513,0.0,0,...,0,1,0,1,0,0,1,0,0,0


In [52]:
print(data['num'].value_counts())

num
0    411
1    265
2    109
3    106
4     28
Name: count, dtype: int64


##### There are 4 level of Cardic Disease, we have to deduce them into 1 for disease and 0 for  no disease we are doing this because the data set is small 

In [53]:
data['target'] = data['num'].apply(lambda x: 1 if x >= 1 else 0)
data['target'].value_counts()

target
1    508
0    411
Name: count, dtype: int64

In [54]:
data = data.drop('num', axis=1)
data.head()

Unnamed: 0,age,sex,trestbps,chol,fbs,thalch,exang,oldpeak,ca,cp_asymptomatic,...,slope_upsloping,thal_fixed defect,thal_normal,thal_reversable defect,Category_bps_Elevated,Category_bps_Hypertension Stage 1,Category_bps_Hypertension Stage 2,Category_bps_Hypertensive Crisis,Category_bps_Normal,target
0,1.007024,1,0.717289,-0.236151,1,0.490336,0,1.368309,0.0,0,...,0,1,0,0,0,0,1,0,0,0
1,1.431447,1,1.553893,0.787312,0,-1.180391,1,0.612046,3.0,1,...,0,0,1,0,0,0,1,0,0,1
2,1.431447,1,-0.677052,-0.313393,0,-0.345027,1,1.651908,2.0,1,...,0,0,0,1,1,0,0,0,0,1
3,-1.751727,1,-0.119315,0.09213,0,1.962168,0,2.502703,0.0,0,...,0,0,1,0,0,1,0,0,0,0
4,-1.327304,0,-0.119315,-0.796158,0,1.365479,0,0.517513,0.0,0,...,1,0,1,0,0,1,0,0,0,0


### Split data into train and test

In [55]:
X = data.drop('target', axis=1)
y = data['target']
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

## Logistic Regression and Random Forest 

In [56]:
# logistic regression
logreg = LogisticRegression(max_iter=1000)
logreg.fit(X_train, y_train)
y_pred_logreg = logreg.predict(X_test)
accuracy_logreg = accuracy_score(y_test, y_pred_logreg)
print(f"Accuracy of logistic Regression: {accuracy_logreg*100}")
# Random Forest
rf = RandomForestClassifier(n_estimators=100, random_state=42)
rf.fit(X_train, y_train)
y_pred_rf = rf.predict(X_test)
accuracy_rf = accuracy_score(y_test, y_pred_rf)
print(f"Accuracy of Random Forest: {accuracy_rf*100}")

Accuracy of logistic Regression: 79.8913043478261
Accuracy of Random Forest: 82.6086956521739


In [57]:
# Evaluate Logistic Regression
print("=== Logistic Regression ===")
print(f"Accuracy: {accuracy_logreg}")
print("Confusion Matrix:")
print(confusion_matrix(y_test, y_pred_logreg))
print("Classification Report:")
print(classification_report(y_test, y_pred_logreg))

# Evaluate Decision Tree
print("=== Decision Tree ===")
print(f"Accuracy: {accuracy_rf}")
print("Confusion Matrix:")
print(confusion_matrix(y_test, y_pred_rf))
print("Classification Report:")
print(classification_report(y_test, y_pred_rf))

=== Logistic Regression ===
Accuracy: 0.7989130434782609
Confusion Matrix:
[[57 18]
 [19 90]]
Classification Report:
              precision    recall  f1-score   support

           0       0.75      0.76      0.75        75
           1       0.83      0.83      0.83       109

    accuracy                           0.80       184
   macro avg       0.79      0.79      0.79       184
weighted avg       0.80      0.80      0.80       184

=== Decision Tree ===
Accuracy: 0.8260869565217391
Confusion Matrix:
[[60 15]
 [17 92]]
Classification Report:
              precision    recall  f1-score   support

           0       0.78      0.80      0.79        75
           1       0.86      0.84      0.85       109

    accuracy                           0.83       184
   macro avg       0.82      0.82      0.82       184
weighted avg       0.83      0.83      0.83       184



### Cross Validation 

In [58]:
scoring = ['accuracy', 'precision', 'recall', 'f1']

lr_cv_result = cross_validate(logreg, X, y, cv=5, scoring=scoring)
print("LogisticRegression")
print(f"Cross-validation Accuracy: {np.mean(lr_cv_result['test_accuracy'])}")
print(f"Cross-validation Precision: {np.mean(lr_cv_result['test_precision'])}")
print(f"Cross-validation Recall: {np.mean(lr_cv_result['test_recall'])}")
print(f"Cross-validation F1-Score: {np.mean(lr_cv_result['test_f1'])}")

LogisticRegression
Cross-validation Accuracy: 0.7920705630791163
Cross-validation Precision: 0.8184366254673169
Cross-validation Recall: 0.8269850514463212
Cross-validation F1-Score: 0.8164920506792693


In [59]:
rf_cv_results = cross_validate(rf, X, y, cv=5, scoring=scoring)
print("Random Forest")
print(f"Cross-validation Accuracy: {np.mean(rf_cv_results['test_accuracy'])}")
print(f"Cross-validation Precision: {np.mean(rf_cv_results['test_precision'])}")
print(f"Cross-validation Recall: {np.mean(rf_cv_results['test_recall'])}")
print(f"Cross-validation F1-score: {np.mean(rf_cv_results['test_f1'])}")


Random Forest
Cross-validation Accuracy: 0.7615942028985507
Cross-validation Precision: 0.7889543595241768
Cross-validation Recall: 0.8132207338380898
Cross-validation F1-score: 0.7920845926728279


### For Heart Disease Detection:

 #### Recall is the most critical metric

### Why?

- Recall = True Positives / (True Positives + False Negatives)

- It tells us how many actual heart disease cases we successfully caught.

- In healthcare, missing a disease case (false negative) can be dangerous.

#### Model
Logistic Regression  Recall *0.8269* 

Random Forest Recall *0.8132*



## XGBoost

In [60]:
xgb = XGBClassifier(eval_metric='logloss', random_state=42)

cv_results = cross_validate(xgb, X, y, cv=5, scoring=scoring)

print("=== XGBoost Cross-Validation Results ===")
print(f"Accuracy: {cv_results['test_accuracy'].mean():.4f}")
print(f"Precision: {cv_results['test_precision'].mean():.4f}")
print(f"Recall: {cv_results['test_recall'].mean():.4f}")
print(f"F1 Score: {cv_results['test_f1'].mean():.4f}")

=== XGBoost Cross-Validation Results ===
Accuracy: 0.7137
Precision: 0.7505
Recall: 0.7680
F1 Score: 0.7493


since the best performing model is logistic regression we will now hyper tune the logistic regression for better performance 

### Hyper Tunning model

In [61]:
param_grid = {
     'C': [0.01, 0.1, 1, 10, 100],
    'penalty': ['l2'],
    'solver': ['lbfgs', 'liblinear', 'newton-cg'],
    'max_iter': [100, 200, 500]
}

grid = GridSearchCV(estimator=logreg, param_grid=param_grid, 
                    scoring='recall', cv=5, n_jobs=-1)

grid.fit(X, y)

print("Best Params:", grid.best_params_)
print("Best Recall Score:", grid.best_score_)

Best Params: {'C': 0.01, 'max_iter': 100, 'penalty': 'l2', 'solver': 'liblinear'}
Best Recall Score: 0.8308677926616191


### Ensemble tech

In [65]:
lr_best = LogisticRegression(C=0.01, max_iter=100, penalty='l2', solver='liblinear', random_state=42)
ensemble = VotingClassifier(
    estimators=[
    ('lr', lr_best),
    ('rf', rf),
    ('xgb', xgb)]
    ,voting='soft')

all_cv_results = cross_validate(ensemble, X, y, cv=5, scoring=scoring)

print("=== Ensemble VotingClassifier Results ===")
print(f"Accuracy: {all_cv_results['test_accuracy'].mean():.4f}")
print(f"Precision: {all_cv_results['test_precision'].mean():.4f}")
print(f"Recall: {all_cv_results['test_recall'].mean():.4f}")
print(f"F1 Score: {all_cv_results['test_f1'].mean():.4f}")

=== Ensemble VotingClassifier Results ===
Accuracy: 0.7453
Precision: 0.7774
Recall: 0.8015
F1 Score: 0.7784


#### Increasing Recall using SMOTE (Synthetic Minority Oversampling Technique) can help increase recall by balancing the dataset

In [67]:
from imblearn.over_sampling import SMOTE
from sklearn.model_selection import train_test_split

# First, split your data
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42, stratify=y)

# Apply SMOTE on training data
smote = SMOTE(random_state=42)
X_resampled, y_resampled = smote.fit_resample(X_train, y_train)

print("Before SMOTE:", y_train.value_counts())
print("After SMOTE:", y_resampled.value_counts())


Before SMOTE: target
1    406
0    329
Name: count, dtype: int64
After SMOTE: target
1    406
0    406
Name: count, dtype: int64


In [None]:

lr_model = LogisticRegression(C=0.01, max_iter=100, penalty='l2', solver='liblinear', random_state=42)
lr_model.fit(X_resampled, y_resampled)
y_pred_best_lr = lr_model.predict(X_test)
print(classification_report(y_test, y_pred_best_lr))


              precision    recall  f1-score   support

           0       0.79      0.74      0.77        82
           1       0.80      0.84      0.82       102

    accuracy                           0.80       184
   macro avg       0.80      0.79      0.80       184
weighted avg       0.80      0.80      0.80       184



the recall increased only by 1%

### Training the best model and saving it...

In [69]:
best_logreg = LogisticRegression(C=0.01, max_iter=100, penalty='l2', solver='liblinear', random_state=42)
best_logreg.fit(X_train, y_train)

y_pred_best_lr = logreg.predict(X_test)
print(classification_report(y_test, y_pred_best_lr))


              precision    recall  f1-score   support

           0       0.82      0.77      0.79        82
           1       0.82      0.86      0.84       102

    accuracy                           0.82       184
   macro avg       0.82      0.82      0.82       184
weighted avg       0.82      0.82      0.82       184



In [70]:
import joblib 
joblib.dump(best_logreg, "logistic_regression.pkl")
print("Model Saved Sucessfully")

Model Saved Sucessfully
