In [25]:
import pandas as pd
import numpy as np

RANDOM_STATE = 404

# Dictionary to store models and their results
models_dict = {}

In [26]:
df = pd.read_csv('../../data/cardio_train.csv', delimiter=';')
df.drop(columns=['id'], inplace=True)
df.head()

Unnamed: 0,age,gender,height,weight,ap_hi,ap_lo,cholesterol,gluc,smoke,alco,active,cardio
0,18393,2,168,62.0,110,80,1,1,0,0,1,0
1,20228,1,156,85.0,140,90,3,1,0,0,1,1
2,18857,1,165,64.0,130,70,3,1,0,0,0,1
3,17623,2,169,82.0,150,100,1,1,0,0,1,1
4,17474,1,156,56.0,100,60,1,1,0,0,0,0


In [27]:
X = df.drop(columns=['cardio'])
y = df['cardio']

## Data manipulation

In [28]:
from sklearn.preprocessing import LabelEncoder

label_encoder = LabelEncoder()
y = label_encoder.fit_transform(y) 
y

array([0, 1, 1, ..., 1, 1, 0], dtype=int64)

In [29]:
from sklearn.preprocessing import StandardScaler

# Select columns to be scaled
numeric_columns = ['age', 'height', 'weight', 'ap_hi', 'ap_lo']
categorical_columns = ['gender', 'cholesterol', 'gluc', 'smoke', 'alco', 'active']

# Fit and transform your data (only for numeric columns)
scaler = StandardScaler()
X[numeric_columns] = scaler.fit_transform(X[numeric_columns])

X.head(5)

Unnamed: 0,age,gender,height,weight,ap_hi,ap_lo,cholesterol,gluc,smoke,alco,active
0,-0.436062,2,0.443452,-0.847873,-0.122182,-0.088238,1,1,0,0,1
1,0.307686,1,-1.018168,0.749831,0.07261,-0.03518,3,1,0,0,1
2,-0.247997,1,0.078047,-0.708942,0.007679,-0.141297,3,1,0,0,0
3,-0.748152,2,0.565254,0.541435,0.137541,0.017879,1,1,0,0,1
4,-0.808543,1,-1.018168,-1.264666,-0.187113,-0.194356,1,1,0,0,0


In [30]:
from sklearn.model_selection import train_test_split

# Split into train and test
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=RANDOM_STATE)

## Random Forest classifier

In [31]:
from sklearn.ensemble import RandomForestClassifier
from sklearn import metrics

rfModel = RandomForestClassifier(random_state=RANDOM_STATE)

rfModel.fit(X_train, y_train)

rf_pred = rfModel.predict(X_test)

rf_accuracy = metrics.accuracy_score(y_test, rf_pred)*100

models_dict['Random Forest'] = {'model': rfModel, 'accuracy': rf_accuracy}

print(f"Accuracy without CV: {rf_accuracy:.2f}")

Accuracy without CV: 71.81


In [32]:
from sklearn.model_selection import GridSearchCV
from sklearn.metrics import classification_report

param_grid = {
    'n_estimators': [100, 200, 300, 500],
    'max_depth': [None, 10, 20, 30],
    'min_samples_split': [2, 5, 10, 20],
    'min_samples_leaf': [1, 2, 4, 8],
    'max_features': ['sqrt', 'log2', None],
}

rf_best_params = {
    'n_estimators': [100],
    'max_depth': [10],
    'min_samples_split': [10],
    'min_samples_leaf': [1],
    'max_features': [None],
}

rf_gridsearch = GridSearchCV(estimator=rfModel,param_grid=rf_best_params, cv=5, scoring='accuracy',n_jobs=-1)
rf_gridsearch.fit(X_train, y_train)

rf_best_params = rf_gridsearch.best_params_
rf_best_estimator = rf_gridsearch.best_estimator_
rf_pred_CV = rf_best_estimator.predict(X_test)
rf_accuracy_cv = metrics.accuracy_score(y_test, rf_pred_CV)*100
classification_report_str = classification_report(y_test, rf_pred_CV, digits=4)

models_dict['Random Forest CV'] = {'model': rf_best_estimator, 'accuracy': rf_accuracy_cv}

print(f"Best Accuracy: {rf_accuracy_cv:.2f}")
print("Classification Report for RF with CV:\n", classification_report_str)

Best Accuracy: 73.47
Classification Report for RF with CV:
               precision    recall  f1-score   support

           0     0.7236    0.7647    0.7435      7041
           1     0.7474    0.7044    0.7253      6959

    accuracy                         0.7347     14000
   macro avg     0.7355    0.7345    0.7344     14000
weighted avg     0.7354    0.7347    0.7345     14000



## MLP

In [33]:
from sklearn.neural_network import MLPClassifier

mlpModel = MLPClassifier(random_state=RANDOM_STATE)
mlpModel.fit(X_train, y_train)
mlp_pred = mlpModel.predict(X_test)

mlp_accuracy = metrics.accuracy_score(y_test, mlp_pred)*100

models_dict['MLP'] = {'model': mlpModel, 'accuracy': mlp_accuracy}

print(f"Accuracy without CV: {mlp_accuracy:.2f}")

Accuracy without CV: 73.52


In [34]:
mlp_best_params = {
    'activation': ['tanh'],
    'alpha': [0.01],
    'hidden_layer_sizes': [(50, 50)],
    'max_iter': [300],
    'solver': ['adam'],
}

mlp_gridsearch = GridSearchCV(estimator=mlpModel, param_grid=mlp_best_params, cv=5, scoring='accuracy', n_jobs=-1)
mlp_gridsearch.fit(X_train, y_train)

mlp_best_estimator = mlp_gridsearch.best_estimator_
mlp_pred_CV = mlp_best_estimator.predict(X_test)
mlp_accuracy_cv = metrics.accuracy_score(y_test, mlp_pred_CV)*100
classification_report_str = classification_report(y_test, mlp_pred_CV, digits=4)

models_dict['MLP CV'] = {'model': mlp_best_estimator, 'accuracy': mlp_accuracy_cv}

print(f"Best Accuracy: {mlp_accuracy_cv:.2f}")  
print("Classification Report for MLP with CV:\n", classification_report_str)

Best Accuracy: 73.57
Classification Report for MLP with CV:
               precision    recall  f1-score   support

           0     0.7322    0.7482    0.7401      7041
           1     0.7395    0.7231    0.7312      6959

    accuracy                         0.7357     14000
   macro avg     0.7358    0.7356    0.7356     14000
weighted avg     0.7358    0.7357    0.7357     14000



## SVM

In [35]:
from sklearn.svm import SVC

svmModel = SVC(random_state=RANDOM_STATE)
svmModel.fit(X_train, y_train)
svm_pred = svmModel.predict(X_test)

svm_accuracy = metrics.accuracy_score(y_test, svm_pred)*100

models_dict['SVM'] = {'model': svmModel, 'accuracy': svm_accuracy}

print(f"Accuracy without CV: {svm_accuracy:.2f}")

Accuracy without CV: 73.03


In [36]:
svm_param_grid = {
    'C': [0.1, 1, 10],
    'kernel': ['linear', 'rbf', 'poly'],
    'gamma': ['scale', 'auto'],
}

svm_best_params = {
    'C': [10],
    'kernel': ['rbf'],
    'gamma': ['auto'],
}

svm_gridsearch = GridSearchCV(estimator=svmModel, param_grid=svm_best_params, cv=5, scoring='accuracy', n_jobs=-1)

svm_gridsearch.fit(X_train, y_train)

svm_best_params = svm_gridsearch.best_params_
svm_best_estimator = svm_gridsearch.best_estimator_
svm_pred_CV = svm_best_estimator.predict(X_test)
svm_accuracy_cv = metrics.accuracy_score(y_test, svm_pred_CV)*100
classification_report_str = classification_report(y_test, svm_pred_CV, digits=4)

models_dict['SVM CV'] = {'model': svm_best_estimator, 'accuracy': svm_accuracy_cv}

print(f"Best Accuracy: {svm_accuracy_cv:.2f}")
print("Classification Report for SVM with CV:\n", classification_report_str)

Best Accuracy: 73.24
Classification Report for SVM with CV:
               precision    recall  f1-score   support

           0     0.7259    0.7516    0.7385      7041
           1     0.7393    0.7129    0.7259      6959

    accuracy                         0.7324     14000
   macro avg     0.7326    0.7322    0.7322     14000
weighted avg     0.7326    0.7324    0.7322     14000



## Comparison of results

In [37]:
for key, value in models_dict.items():
    print(f"{key}: {value['accuracy']:.2f}")

Random Forest: 71.81
Random Forest CV: 73.47
MLP: 73.52
MLP CV: 73.57
SVM: 73.03
SVM CV: 73.24


In [39]:
from joblib import dump

# Select model with best accuracy
best_result = max(models_dict, key=lambda x: models_dict[x]['accuracy'])
best_model = models_dict[best_result]['model']
print(f"Best model: {best_result} with accuracy: {models_dict[best_result]['accuracy']:.2f}")

# Save best model to a file
dump(best_model, 'cardio_classifier.h5')

Best model: MLP CV with accuracy: 73.57


['cardio_classifier.h5']