In [12]:
import pandas as pd
import numpy as np
import os

RANDOM_STATE = 404

In [13]:
df = pd.read_csv('../data/cardio_train.csv', delimiter=';')
df.drop(columns=['id'], inplace=True)
df.head()

Unnamed: 0,age,gender,height,weight,ap_hi,ap_lo,cholesterol,gluc,smoke,alco,active,cardio
0,18393,2,168,62.0,110,80,1,1,0,0,1,0
1,20228,1,156,85.0,140,90,3,1,0,0,1,1
2,18857,1,165,64.0,130,70,3,1,0,0,0,1
3,17623,2,169,82.0,150,100,1,1,0,0,1,1
4,17474,1,156,56.0,100,60,1,1,0,0,0,0


In [14]:
X = df.drop(columns=['cardio'])
y = df['cardio']

In [15]:
from sklearn.preprocessing import LabelEncoder

label_encoder = LabelEncoder()
y = label_encoder.fit_transform(y) 
y

array([0, 1, 1, ..., 1, 1, 0], dtype=int64)

In [16]:
from sklearn.preprocessing import StandardScaler

# Select columns for scaling (excluding binary or categorical features)
numeric_columns = ['age', 'height', 'weight', 'ap_hi', 'ap_lo']

# Initialize StandardScaler
scaler = StandardScaler()

# Fit and transform your data (only for numeric columns)
X_scaled = X.copy()
X_scaled[numeric_columns] = scaler.fit_transform(X[numeric_columns])

# Apply one-hot encoding to categorical columns
categorical_columns = ['gender', 'cholesterol', 'gluc', 'smoke', 'alco', 'active']
X_encoded = pd.get_dummies(X[categorical_columns])

# Concatenate scaled numeric columns with one-hot encoded categorical columns
X_final = pd.concat([X_scaled[numeric_columns], X_encoded], axis=1)

# Ensure X_final is a DataFrame
X_final = pd.DataFrame(X_final)
X = X_final

X.describe()

Unnamed: 0,age,height,weight,ap_hi,ap_lo,gender,cholesterol,gluc,smoke,alco,active
count,70000.0,70000.0,70000.0,70000.0,70000.0,70000.0,70000.0,70000.0,70000.0,70000.0,70000.0
mean,5.272227e-16,1.450116e-15,-2.905105e-16,7.623108000000001e-17,1.7459050000000003e-17,1.349571,1.366871,1.226457,0.088129,0.053771,0.803729
std,1.000007,1.000007,1.000007,1.000007,1.000007,0.476838,0.68025,0.57227,0.283484,0.225568,0.397179
min,-3.514407,-13.32014,-4.460075,-1.810381,-0.8841161,1.0,1.0,1.0,0.0,0.0,0.0
25%,-0.7315341,-0.652763,-0.639477,-0.05725127,-0.0882385,1.0,1.0,1.0,0.0,0.0,1.0
50%,0.09489744,0.07804703,-0.1532192,-0.05725127,-0.0882385,1.0,1.0,1.0,0.0,0.0,1.0
75%,0.7531244,0.6870554,0.5414349,0.07261016,-0.03517999,2.0,2.0,1.0,0.0,0.0,1.0
max,1.720199,10.43119,8.738353,103.1826,57.85165,2.0,3.0,3.0,1.0,1.0,1.0


In [17]:
from sklearn.model_selection import train_test_split

# Split into train and test
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=RANDOM_STATE)

### Random Forest classifier

In [18]:
from sklearn.ensemble import RandomForestClassifier
from sklearn import metrics

rfModel = RandomForestClassifier(random_state=RANDOM_STATE)

rfModel.fit(X_train, y_train)

rf_pred = rfModel.predict(X_test)

rf_accuracy = metrics.accuracy_score(y_test, rf_pred)*100
print(f"Accuracy without CV: {rf_accuracy:.2f}")

Accuracy without CV: 71.36


In [19]:
from sklearn.model_selection import GridSearchCV

param_grid = {
    'n_estimators': [100, 200, 300, 500],
    'max_depth': [None, 10, 20, 30],
    'min_samples_split': [2, 5, 10, 20],
    'min_samples_leaf': [1, 2, 4, 8],
    'max_features': ['sqrt', 'log2', None],
}

#  Best parameters for RF
rf_best_params = {
    'n_estimators': [100],
    'max_depth': [10],
    'min_samples_split': [10],
    'min_samples_leaf': [1],
    'max_features': [None],
}

# Create grid search
rf_gridsearch = GridSearchCV(estimator=rfModel,param_grid=rf_best_params, cv=5, scoring='accuracy',n_jobs=-1)

# Fit grid search
rf_gridsearch.fit(X_train, y_train)

In [20]:
best_params = rf_gridsearch.best_params_
best_estimator = rf_gridsearch.best_estimator_
rf_pred_CV = best_estimator.predict(X_test)
rf_accuracy_cv = metrics.accuracy_score(y_test, rf_pred_CV)*100
print(f"Best Accuracy: {rf_accuracy_cv:.2f}")

Best Accuracy: 73.46


In [21]:
from sklearn.metrics import classification_report

classification_report_str = classification_report(y_test, rf_pred_CV, digits=4)

print("Classification Report for RF with CV:\n", classification_report_str)

Classification Report for RF with CV:
               precision    recall  f1-score   support

           0     0.7235    0.7645    0.7435      7041
           1     0.7473    0.7044    0.7252      6959

    accuracy                         0.7346     14000
   macro avg     0.7354    0.7345    0.7343     14000
weighted avg     0.7353    0.7346    0.7344     14000



### MLP

In [22]:
from sklearn.neural_network import MLPClassifier

mlpModel = MLPClassifier(random_state=1)
mlpModel.fit(X_train, y_train)
mlp_pred = mlpModel.predict(X_test)

mlp_accuracy = metrics.accuracy_score(y_test, mlp_pred)*100
print(f"Accuracy without CV: {mlp_accuracy:.2f}")

Accuracy without CV: 73.51


In [23]:
mlp_best_params = {
    'activation': ['tanh'],
    'alpha': [0.01],
    'hidden_layer_sizes': [(50, 50)],
    'max_iter': [300],
    'solver': ['adam'],
}

mlp_gridsearch = GridSearchCV(estimator=mlpModel, param_grid=mlp_best_params, cv=5, scoring='accuracy', n_jobs=-1)
mlp_gridsearch.fit(X_train, y_train)

In [24]:
mlp_best_estimator = mlp_gridsearch.best_estimator_
mlp_pred_CV = mlp_best_estimator.predict(X_test)
mlp_accuracy_cv = metrics.accuracy_score(y_test, mlp_pred_CV)*100
print(f"Best Accuracy: {mlp_accuracy_cv:.2f}")  

Best Accuracy: 73.68


In [25]:
classification_report_str = classification_report(y_test, mlp_pred_CV, digits=4)

print("Classification Report for MLP with CV:\n", classification_report_str)

Classification Report for MLP with CV:
               precision    recall  f1-score   support

           0     0.7237    0.7709    0.7466      7041
           1     0.7518    0.7023    0.7262      6959

    accuracy                         0.7368     14000
   macro avg     0.7378    0.7366    0.7364     14000
weighted avg     0.7377    0.7368    0.7365     14000



### SVM

In [26]:
from sklearn.svm import SVC

svmModel = SVC(random_state=1)
svmModel.fit(X_train, y_train)
svm_pred = mlpModel.predict(X_test)

svm_accuracy = metrics.accuracy_score(y_test, svm_pred)*100
print(f"Accuracy without CV: {svm_accuracy:.2f}")

Accuracy without CV: 73.51


In [27]:
svm_param_grid = {
    'C': [0.1, 1, 10],
    'kernel': ['linear', 'rbf', 'poly'],
    'gamma': ['scale', 'auto'],
}

svm_best_params = {
    'C': [10],
    'kernel': ['rbf'],
    'gamma': ['auto'],
}

# Create grid search
svm_gridsearch = GridSearchCV(estimator=svmModel, param_grid=svm_best_params, cv=5, scoring='accuracy', n_jobs=-1)

# Fit grid search
svm_gridsearch.fit(X_train, y_train)

In [28]:
svm_best_params = svm_gridsearch.best_params_
svm_best_estimator = svm_gridsearch.best_estimator_
svm_pred_CV = svm_best_estimator.predict(X_test)
svm_accuracy_cv = metrics.accuracy_score(y_test, svm_pred_CV)*100
print(f"Best Accuracy: {svm_accuracy_cv:.2f}")

Best Accuracy: 73.24


In [29]:
classification_report_str = classification_report(y_test, svm_pred_CV, digits=4)

print("Classification Report for SVM with CV:\n", classification_report_str)

Classification Report for SVM with CV:
               precision    recall  f1-score   support

           0     0.7259    0.7516    0.7385      7041
           1     0.7393    0.7129    0.7259      6959

    accuracy                         0.7324     14000
   macro avg     0.7326    0.7322    0.7322     14000
weighted avg     0.7326    0.7324    0.7322     14000



### NN

In [37]:
import tensorflow as tf
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense, Dropout

# Initialize a sequential model
model = Sequential()

# Add input layer
model.add(Dense(units=64, activation='relu', input_dim=X_train.shape[1]))

model.add(Dense(units=32, activation='relu'))
model.add(Dropout(0.2))
model.add(Dense(units=1, activation='sigmoid'))

# Compile the model
model.compile(optimizer='adam', loss='binary_crossentropy', metrics=['accuracy'])

# Train the model
model.fit(X_train, y_train, epochs=10, batch_size=32, validation_split=0.2)

# Evaluate the model
test_loss, test_accuracy = model.evaluate(X_test, y_test)
print(f"Test Accuracy: {test_accuracy:.4f}")

Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10
Test Accuracy: 0.7328


### Comparison

In [38]:
print(f"RF accuracy without CV: {rf_accuracy:.2f}")
print(f"RF accuracy with CV: {rf_accuracy_cv:.2f}")
print(f"MLP accuracy without CV: {mlp_accuracy:.2f}")
print(f"MLP accuracy with CV: {mlp_accuracy_cv:.2f}")
print(f"SVM accuracy without CV: {svm_accuracy:.2f}")
print(f"SVM accuracy with CV: {svm_accuracy_cv:.2f}")
print(f"NN accuracy: {test_accuracy:.2f}")

RF accuracy without CV: 71.36
RF accuracy with CV: 73.46
MLP accuracy without CV: 73.51
MLP accuracy with CV: 73.68
SVM accuracy without CV: 73.51
SVM accuracy with CV: 73.24
NN accuracy: 0.73


In [39]:
from joblib import dump

# Save best model to a file
dump(svm_best_estimator, 'cardio_classifier.h5')

['cardio_classifier.h5']