In [4]:
import pandas as pd
import numpy as np

RANDOM_STATE = 404

In [5]:
df = pd.read_csv('../../data/cardio_train.csv', delimiter=';')
df.drop(columns=['id'], inplace=True)
df.head()

Unnamed: 0,age,gender,height,weight,ap_hi,ap_lo,cholesterol,gluc,smoke,alco,active,cardio
0,18393,2,168,62.0,110,80,1,1,0,0,1,0
1,20228,1,156,85.0,140,90,3,1,0,0,1,1
2,18857,1,165,64.0,130,70,3,1,0,0,0,1
3,17623,2,169,82.0,150,100,1,1,0,0,1,1
4,17474,1,156,56.0,100,60,1,1,0,0,0,0


In [6]:
X = df.drop(columns=['cardio'])
y = df['cardio']

In [7]:
from sklearn.preprocessing import LabelEncoder

label_encoder = LabelEncoder()
y = label_encoder.fit_transform(y) 
y

array([0, 1, 1, ..., 1, 1, 0], dtype=int64)

In [8]:
from sklearn.preprocessing import StandardScaler, MinMaxScaler, OneHotEncoder

# Select columns to be scaled
numeric_columns = ['age', 'height', 'weight', 'ap_hi', 'ap_lo']
categorical_columns = ['gender', 'cholesterol', 'gluc', 'smoke', 'alco', 'active']

# Fit and transform your data (only for numeric columns)
scaler = MinMaxScaler()
X[numeric_columns] = scaler.fit_transform(X[numeric_columns])

# # Apply one-hot encoding to categorical columns
# encoder = OneHotEncoder(sparse_output=False)  # Create the encoder
# X_encoded = encoder.fit_transform(X[categorical_columns])  # Fit and transform the categorical data
# column_names = encoder.get_feature_names_out(categorical_columns)  # Get new column names for encoded features
# X_encoded = pd.DataFrame(X_encoded, columns=column_names)  # Create a DataFrame with the new column names

# # Drop original categorical columns and concatenate the new encoded DataFrame
# X = X.drop(categorical_columns, axis=1)
# X = pd.concat([X, X_encoded], axis=1)
    
X.head(5)

Unnamed: 0,age,gender,height,weight,ap_hi,ap_lo,cholesterol,gluc,smoke,alco,active
0,0.588076,2,0.579487,0.273684,0.016079,0.01355,1,1,0,0,1
1,0.730159,1,0.517949,0.394737,0.017934,0.014453,3,1,0,0,1
2,0.624003,1,0.564103,0.284211,0.017316,0.012647,3,1,0,0,0
3,0.528455,2,0.584615,0.378947,0.018553,0.015357,1,1,0,0,1
4,0.516918,1,0.517949,0.242105,0.015461,0.011743,1,1,0,0,0


In [9]:
# Dictionary to store models and their results
models_dict = {}

In [10]:
from sklearn.model_selection import train_test_split

# Split into train and test
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=RANDOM_STATE)

### Random Forest classifier

In [11]:
from sklearn.ensemble import RandomForestClassifier
from sklearn import metrics

rfModel = RandomForestClassifier(random_state=RANDOM_STATE)

rfModel.fit(X_train, y_train)

rf_pred = rfModel.predict(X_test)

rf_accuracy = metrics.accuracy_score(y_test, rf_pred)*100

models_dict['Random Forest'] = {'model': rfModel, 'accuracy': rf_accuracy}

print(f"Accuracy without CV: {rf_accuracy:.2f}")

Accuracy without CV: 71.89


In [12]:
from sklearn.model_selection import GridSearchCV
from sklearn.metrics import classification_report

param_grid = {
    'n_estimators': [100, 200, 300, 500],
    'max_depth': [None, 10, 20, 30],
    'min_samples_split': [2, 5, 10, 20],
    'min_samples_leaf': [1, 2, 4, 8],
    'max_features': ['sqrt', 'log2', None],
}

rf_best_params = {
    'n_estimators': [100],
    'max_depth': [10],
    'min_samples_split': [10],
    'min_samples_leaf': [1],
    'max_features': [None],
}

rf_gridsearch = GridSearchCV(estimator=rfModel,param_grid=rf_best_params, cv=5, scoring='accuracy',n_jobs=-1)
rf_gridsearch.fit(X_train, y_train)

rf_best_params = rf_gridsearch.best_params_
rf_best_estimator = rf_gridsearch.best_estimator_
rf_pred_CV = rf_best_estimator.predict(X_test)
rf_accuracy_cv = metrics.accuracy_score(y_test, rf_pred_CV)*100
classification_report_str = classification_report(y_test, rf_pred_CV, digits=4)

models_dict['Random Forest CV'] = {'model': rf_best_estimator, 'accuracy': rf_accuracy_cv}

print(f"Best Accuracy: {rf_accuracy_cv:.2f}")
print("Classification Report for RF with CV:\n", classification_report_str)

Best Accuracy: 73.45
Classification Report for RF with CV:
               precision    recall  f1-score   support

           0     0.7234    0.7644    0.7433      7041
           1     0.7471    0.7043    0.7251      6959

    accuracy                         0.7345     14000
   macro avg     0.7352    0.7343    0.7342     14000
weighted avg     0.7352    0.7345    0.7342     14000



### MLP

In [13]:
from sklearn.neural_network import MLPClassifier

mlpModel = MLPClassifier(random_state=1)
mlpModel.fit(X_train, y_train)
mlp_pred = mlpModel.predict(X_test)

mlp_accuracy = metrics.accuracy_score(y_test, mlp_pred)*100

models_dict['MLP'] = {'model': mlpModel, 'accuracy': mlp_accuracy}

print(f"Accuracy without CV: {mlp_accuracy:.2f}")

Accuracy without CV: 71.99


In [14]:
mlp_best_params = {
    'activation': ['tanh'],
    'alpha': [0.01],
    'hidden_layer_sizes': [(50, 50)],
    'max_iter': [300],
    'solver': ['adam'],
}

mlp_gridsearch = GridSearchCV(estimator=mlpModel, param_grid=mlp_best_params, cv=5, scoring='accuracy', n_jobs=-1)
mlp_gridsearch.fit(X_train, y_train)

mlp_best_estimator = mlp_gridsearch.best_estimator_
mlp_pred_CV = mlp_best_estimator.predict(X_test)
mlp_accuracy_cv = metrics.accuracy_score(y_test, mlp_pred_CV)*100
classification_report_str = classification_report(y_test, mlp_pred_CV, digits=4)

models_dict['MLP CV'] = {'model': mlp_best_estimator, 'accuracy': mlp_accuracy_cv}

print(f"Best Accuracy: {mlp_accuracy_cv:.2f}")  
print("Classification Report for MLP with CV:\n", classification_report_str)

Best Accuracy: 71.44
Classification Report for MLP with CV:
               precision    recall  f1-score   support

           0     0.6778    0.8235    0.7436      7041
           1     0.7718    0.6040    0.6776      6959

    accuracy                         0.7144     14000
   macro avg     0.7248    0.7137    0.7106     14000
weighted avg     0.7245    0.7144    0.7108     14000



### SVM

In [15]:
from sklearn.svm import SVC

svmModel = SVC(random_state=1)
svmModel.fit(X_train, y_train)
svm_pred = mlpModel.predict(X_test)

svm_accuracy = metrics.accuracy_score(y_test, svm_pred)*100

models_dict['SVM'] = {'model': svmModel, 'accuracy': svm_accuracy}

print(f"Accuracy without CV: {svm_accuracy:.2f}")

Accuracy without CV: 71.99


In [16]:
svm_param_grid = {
    'C': [0.1, 1, 10],
    'kernel': ['linear', 'rbf', 'poly'],
    'gamma': ['scale', 'auto'],
}

svm_best_params = {
    'C': [10],
    'kernel': ['rbf'],
    'gamma': ['auto'],
}

svm_gridsearch = GridSearchCV(estimator=svmModel, param_grid=svm_best_params, cv=5, scoring='accuracy', n_jobs=-1)

svm_gridsearch.fit(X_train, y_train)

svm_best_params = svm_gridsearch.best_params_
svm_best_estimator = svm_gridsearch.best_estimator_
svm_pred_CV = svm_best_estimator.predict(X_test)
svm_accuracy_cv = metrics.accuracy_score(y_test, svm_pred_CV)*100
classification_report_str = classification_report(y_test, svm_pred_CV, digits=4)

models_dict['SVM CV'] = {'model': svm_best_estimator, 'accuracy': svm_accuracy_cv}

print(f"Best Accuracy: {svm_accuracy_cv:.2f}")
print("Classification Report for SVM with CV:\n", classification_report_str)

Best Accuracy: 64.09
Classification Report for SVM with CV:
               precision    recall  f1-score   support

           0     0.6365    0.6670    0.6514      7041
           1     0.6459    0.6146    0.6299      6959

    accuracy                         0.6409     14000
   macro avg     0.6412    0.6408    0.6406     14000
weighted avg     0.6412    0.6409    0.6407     14000



### NN

In [17]:
import tensorflow as tf
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense, Dropout

# Initialize a sequential model
model = Sequential()

# Add input layer
model.add(Dense(units=64, activation='relu', input_dim=X_train.shape[1]))
model.add(Dense(units=32, activation='relu'))
model.add(Dropout(0.2))
model.add(Dense(units=1, activation='sigmoid'))

# Compile the model
model.compile(optimizer='adam', loss='binary_crossentropy', metrics=['accuracy'])

# Train the model
model.fit(X_train, y_train, epochs=10, batch_size=32, validation_split=0.2)

# Evaluate the model
test_loss, test_accuracy = model.evaluate(X_test, y_test)

models_dict['NN'] = {'model': model, 'accuracy': test_accuracy*100}

print(f"Test Accuracy: {test_accuracy:.4f}")




Epoch 1/10


Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10
Test Accuracy: 0.6383


### Comparison of results

In [18]:
for key, value in models_dict.items():
    print(f"{key}: {value['accuracy']:.2f}")    

Random Forest: 71.89
Random Forest CV: 73.45
MLP: 71.99
MLP CV: 71.44
SVM: 71.99
SVM CV: 64.09
NN: 63.83


In [19]:
from joblib import dump

# Select model with best accuracy
best_result = max(models_dict, key=lambda x: models_dict[x]['accuracy'])
best_model = models_dict[best_result]['model']
print(f"Best model: {best_result} with accuracy: {models_dict[best_result]['accuracy']:.2f}")

# Save best model to a file
dump(model, 'cardio_classifier.h5')

Best model: Random Forest CV with accuracy: 73.45


['cardio_classifier.h5']