In [149]:
import pandas as pd
import numpy as np
import os

RANDOM_STATE = 404

In [150]:
df = pd.read_csv('../data/cardio_train.csv', delimiter=';')
df.drop(columns=['id'], inplace=True)
df.head()

Unnamed: 0,age,gender,height,weight,ap_hi,ap_lo,cholesterol,gluc,smoke,alco,active,cardio
0,18393,2,168,62.0,110,80,1,1,0,0,1,0
1,20228,1,156,85.0,140,90,3,1,0,0,1,1
2,18857,1,165,64.0,130,70,3,1,0,0,0,1
3,17623,2,169,82.0,150,100,1,1,0,0,1,1
4,17474,1,156,56.0,100,60,1,1,0,0,0,0


In [151]:
X = df.drop(columns=['cardio'])
y = df['cardio']

In [152]:
from sklearn.preprocessing import LabelEncoder

label_encoder = LabelEncoder()
y = label_encoder.fit_transform(y) 
y

array([0, 1, 1, ..., 1, 1, 0], dtype=int64)

In [153]:
from sklearn.preprocessing import StandardScaler

# Select columns for scaling (excluding binary or categorical features)
numeric_columns = ['age', 'height', 'weight', 'ap_hi', 'ap_lo']

# Initialize StandardScaler
scaler = StandardScaler()

# Fit and transform your data (only for numeric columns)
X_scaled = X.copy()
X_scaled[numeric_columns] = scaler.fit_transform(X[numeric_columns])

# Apply one-hot encoding to categorical columns
categorical_columns = ['gender', 'cholesterol', 'gluc', 'smoke', 'alco', 'active']
X_encoded = pd.get_dummies(X[categorical_columns])

# Concatenate scaled numeric columns with one-hot encoded categorical columns
X_final = pd.concat([X_scaled[numeric_columns], X_encoded], axis=1)

# Ensure X_final is a DataFrame
X_final = pd.DataFrame(X_final)
X = X_final

X

Unnamed: 0,age,height,weight,ap_hi,ap_lo,gender,cholesterol,gluc,smoke,alco,active
0,-0.436062,0.443452,-0.847873,-0.122182,-0.088238,2,1,1,0,0,1
1,0.307686,-1.018168,0.749831,0.072610,-0.035180,1,3,1,0,0,1
2,-0.247997,0.078047,-0.708942,0.007679,-0.141297,1,3,1,0,0,0
3,-0.748152,0.565254,0.541435,0.137541,0.017879,2,1,1,0,0,1
4,-0.808543,-1.018168,-1.264666,-0.187113,-0.194356,1,1,1,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...
69995,-0.092762,0.443452,0.124642,-0.057251,-0.088238,2,1,1,1,0,1
69996,1.269492,-0.774565,3.597913,0.072610,-0.035180,1,2,2,0,0,1
69997,-0.163286,2.270477,2.139139,0.332333,-0.035180,2,3,1,0,1,0
69998,1.200589,-0.165556,-0.153219,0.040145,-0.088238,1,1,2,0,0,0


In [154]:
from sklearn.model_selection import train_test_split

# Split into train and test
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=RANDOM_STATE)

### Random Forest classifier

In [155]:
from sklearn.ensemble import RandomForestClassifier
from sklearn import metrics

rfModel = RandomForestClassifier(random_state=RANDOM_STATE)

rfModel.fit(X_train, y_train)

rf_pred = rfModel.predict(X_test)

rf_accuracy = metrics.accuracy_score(y_test, rf_pred)*100
print(f"Accuracy without CV: {rf_accuracy:.2f}")

Accuracy without CV: 71.36


In [156]:
from sklearn.model_selection import GridSearchCV

param_grid = {
    'n_estimators': [100, 200, 300, 500],
    'max_depth': [None, 10, 20, 30],
    'min_samples_split': [2, 5, 10, 20],
    'min_samples_leaf': [1, 2, 4, 8],
    'max_features': ['sqrt', 'log2', None],
}

#  Best parameters for RF
rf_best_params = {
    'n_estimators': [100],
    'max_depth': [10],
    'min_samples_split': [10],
    'min_samples_leaf': [1],
    'max_features': [None],
}

# Create grid search
rf_gridsearch = GridSearchCV(estimator=rfModel,param_grid=rf_best_params, cv=5, scoring='accuracy',n_jobs=-1)

# Fit grid search
rf_gridsearch.fit(X_train, y_train)

In [157]:
best_params = rf_gridsearch.best_params_
best_estimator = rf_gridsearch.best_estimator_
rf_pred_CV = best_estimator.predict(X_test)
rf_accuracy_cv = metrics.accuracy_score(y_test, rf_pred_CV)*100
print(f"Best Accuracy: {rf_accuracy_cv:.2f}")

Best Accuracy: 73.46


In [158]:
from sklearn.metrics import classification_report

classification_report_str = classification_report(y_test, rf_pred_CV, digits=4)

print("Classification Report for RF with CV:\n", classification_report_str)

Classification Report for RF with CV:
               precision    recall  f1-score   support

           0     0.7235    0.7645    0.7435      7041
           1     0.7473    0.7044    0.7252      6959

    accuracy                         0.7346     14000
   macro avg     0.7354    0.7345    0.7343     14000
weighted avg     0.7353    0.7346    0.7344     14000



### MLP

In [159]:
from sklearn.neural_network import MLPClassifier

mlpModel = MLPClassifier(random_state=1)
mlpModel.fit(X_train, y_train)
mlp_pred = mlpModel.predict(X_test)

mlp_accuracy = metrics.accuracy_score(y_test, mlp_pred)*100
print(f"Accuracy without CV: {mlp_accuracy:.2f}")

Accuracy without CV: 73.51


In [160]:
mlp_best_params = {
    'activation': ['tanh'],
    'alpha': [0.01],
    'hidden_layer_sizes': [(50, 50)],
    'max_iter': [300],
    'solver': ['adam'],
}

mlp_gridsearch = GridSearchCV(estimator=mlpModel, param_grid=mlp_best_params, cv=5, scoring='accuracy', n_jobs=-1)
mlp_gridsearch.fit(X_train, y_train)

In [161]:
mlp_best_estimator = mlp_gridsearch.best_estimator_
mlp_pred_CV = mlp_best_estimator.predict(X_test)
mlp_accuracy_cv = metrics.accuracy_score(y_test, mlp_pred_CV)*100
print(f"Best Accuracy: {mlp_accuracy_cv:.2f}")  

Best Accuracy: 73.68


In [162]:
classification_report_str = classification_report(y_test, mlp_pred_CV, digits=4)

print("Classification Report for MLP with CV:\n", classification_report_str)

Classification Report for MLP with CV:
               precision    recall  f1-score   support

           0     0.7237    0.7709    0.7466      7041
           1     0.7518    0.7023    0.7262      6959

    accuracy                         0.7368     14000
   macro avg     0.7378    0.7366    0.7364     14000
weighted avg     0.7377    0.7368    0.7365     14000



### SVM

In [163]:
from sklearn.svm import SVC

svmModel = SVC(random_state=1)
svmModel.fit(X_train, y_train)
svm_pred = mlpModel.predict(X_test)

svm_accuracy = metrics.accuracy_score(y_test, svm_pred)*100
print(f"Accuracy without CV: {svm_accuracy:.2f}")

Accuracy without CV: 73.51


In [164]:
svm_param_grid = {
    'C': [0.1, 1, 10],
    'kernel': ['linear', 'rbf', 'poly'],
    'gamma': ['scale', 'auto'],
}

svm_best_params = {
    'C': [10],
    'kernel': ['rbf'],
    'gamma': ['auto'],
}

# Create grid search
svm_gridsearch = GridSearchCV(estimator=svmModel, param_grid=svm_best_params, cv=5, scoring='accuracy', n_jobs=-1)

# Fit grid search
svm_gridsearch.fit(X_train, y_train)

In [165]:
svm_best_params = svm_gridsearch.best_params_
svm_best_estimator = svm_gridsearch.best_estimator_
svm_pred_CV = svm_best_estimator.predict(X_test)
svm_accuracy_cv = metrics.accuracy_score(y_test, svm_pred_CV)*100
print(f"Best Accuracy: {svm_accuracy_cv:.2f}")

Best Accuracy: 73.24


In [166]:
classification_report_str = classification_report(y_test, svm_pred_CV, digits=4)

print("Classification Report for SVM with CV:\n", classification_report_str)

Classification Report for SVM with CV:
               precision    recall  f1-score   support

           0     0.7259    0.7516    0.7385      7041
           1     0.7393    0.7129    0.7259      6959

    accuracy                         0.7324     14000
   macro avg     0.7326    0.7322    0.7322     14000
weighted avg     0.7326    0.7324    0.7322     14000



In [167]:
print(f"RF accuracy without CV : {rf_accuracy:.2f}")
print(f"RF accuracy with CV    : {rf_accuracy_cv:.2f}")
print(f"MLP accuracy without CV : {mlp_accuracy:.2f}")
print(f"MLP accuracy with CV   : {mlp_accuracy_cv:.2f}")
print(f"SVM accuracy without CV : {svm_accuracy:.2f}")
print(f"SVM accuracy with CV   : {svm_accuracy_cv:.2f}")

RF accuracy without CV : 71.36
RF accuracy with CV    : 73.46
MLP accuracy without CV : 73.51
MLP accuracy with CV   : 73.68
SVM accuracy without CV : 73.51
SVM accuracy with CV   : 73.24


In [168]:
from joblib import dump

# Save best model to a file
dump(svm_best_estimator, 'cardio_classifier.h5')

['cardio_classifier.h5']