## 1. Import the cleaned dataset

In [1]:
import pandas as pd

In [2]:
# Load the pre-cleaned cardio CSV into a DataFrame. The bare `df` on the last
# line makes the notebook render a (truncated) preview of the frame.
csv_path = 'cleaned_cardio.csv'
df = pd.read_csv(csv_path)
df

Unnamed: 0,age,gender,height,weight,ap_hi,ap_lo,cholesterol,gluc,smoke,alco,active,cardio
0,18393,2,168,62.0,110,80,1,1,0,0,1,0
1,20228,1,156,85.0,140,90,3,1,0,0,1,1
2,18857,1,165,64.0,130,70,3,1,0,0,0,1
3,17623,2,169,82.0,150,100,1,1,0,0,1,1
4,17474,1,156,56.0,100,60,1,1,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...
68542,19240,2,168,76.0,120,80,1,1,1,0,1,0
68543,22601,1,158,126.0,140,90,2,2,0,0,1,1
68544,19066,2,183,105.0,180,90,3,1,0,1,0,1
68545,22431,1,163,72.0,135,80,1,2,0,0,0,1


## 2. Convert the 'age' column from days to years in a new column named 'age_years'
Print the min, max and mean so we can detect outliers.

In [3]:
# Work on an independent (deep) copy so the frame loaded from disk is not
# modified by the feature-engineering steps applied to `df2`.
df2 = df.copy(deep=True)

In [4]:
# Derive the age in whole years from the age-in-days column using floor
# division by 365 (leap days are ignored), then print the extremes and the
# mean so implausible ages are easy to spot.
df2['age_years'] = df2['age'].floordiv(365)

age_years = df2['age_years']
print("Min: ", age_years.min())
print("Max: ", age_years.max())
print("Mean: ", age_years.mean())

Min:  29
Max:  64
Mean:  52.83016032795017


## 3. From the 'weight' and 'height' columns, generate a new 'bmi' column using the formula weight / (height / 100)^2
Print the min, max and mean so we can detect outliers.

In [5]:
# Body-mass index: weight divided by the squared height, where the height is
# first divided by 100 (presumably cm -> m; confirm the dataset's units).
height_scaled = df2['height'].div(100)
df2['bmi'] = df2['weight'].div(height_scaled.pow(2))

# Print min / max / mean of the new column to look for outliers.
for label, stat_name in (("Min: ", "min"), ("Max: ", "max"), ("Mean: ", "mean")):
    print(label, getattr(df2['bmi'], stat_name)())

Min:  10.726643598615919
Max:  108.16984681537221
Mean:  27.44538875860487


## 4. Remove the outliers by keeping only realistic values
A BMI in the range 15 <= bmi <= 60 is considered medically realistic.\
Also re-check the min, max and mean.

In [6]:
# Keep only the rows whose BMI lies in the plausible range (both bounds
# inclusive) and drop everything else as an outlier.
BMI_MIN, BMI_MAX = 15, 60

# `.copy()` makes the filtered frame an independent object. Without it, pandas
# tracks the boolean-indexed result as a possible copy of the old frame, and
# adding a new column to `df2` afterwards can emit SettingWithCopyWarning.
df2 = df2[df2['bmi'].between(BMI_MIN, BMI_MAX)].copy()

# Re-check the statistics after filtering.
print("Min: ",df2['bmi'].min())
print("Max: ",df2['bmi'].max())
print("Mean: ",df2['bmi'].mean())

Min:  15.012197410395949
Max:  59.52380952380953
Mean:  27.436717582998327


## 5. Create a new column 'bp_diff' from the 'ap_hi' and 'ap_lo' columns, holding the blood-pressure difference (ap_hi - ap_lo)
Print the min, max and mean so we can detect outliers.

In [7]:
# Difference between the two blood-pressure readings: ap_hi minus ap_lo.
df2['bp_diff'] = df2['ap_hi'].sub(df2['ap_lo'])

# Print min / max / mean of the new column to look for outliers.
bp_diff = df2['bp_diff']
print("Min: ", bp_diff.min())
print("Max: ", bp_diff.max())
print("Mean: ", bp_diff.mean())

Min:  5
Max:  140
Mean:  45.37076880409873


## 6. Remove the outliers by keeping only realistic values
A blood-pressure difference in the range 10 <= bp_diff <= 80 is considered medically realistic.\
Also re-check the min, max and mean.

In [8]:
# Keep only rows whose blood-pressure difference is within [10, 80]
# (`between` is inclusive on both ends by default).
bp_in_range = df2['bp_diff'].between(10, 80)
df2 = df2.loc[bp_in_range]

# Re-check the statistics after filtering.
for label, stat_name in (("Min: ", "min"), ("Max: ", "max"), ("Mean: ", "mean")):
    print(label, getattr(df2['bp_diff'], stat_name)())

Min:  10
Max:  80
Mean:  44.90876675998232


In [9]:
# (rows, columns) of the frame after both outlier filters.
n_rows, n_cols = df2.shape
(n_rows, n_cols)

(67870, 15)

## 7. Split the data into train and test parts.

In [10]:
from sklearn.model_selection import train_test_split

# Features: every column except the target ('cardio') and the raw columns
# that the derived 'age_years' and 'bmi' columns were computed from.
excluded_columns = ['cardio', 'age', 'weight', 'height']
x = df2.drop(columns=excluded_columns)
y = df2['cardio']

# 80/20 split with a fixed seed; stratifying on `y` keeps the class
# proportions the same in both parts.
X_train, X_test, y_train, y_test = train_test_split(
    x,
    y,
    test_size=0.2,
    random_state=42,
    stratify=y,
)

for split in (X_train, X_test):
    print(split.shape)

(54296, 11)
(13574, 11)


## 8. Create the model and use hyperparameter tuning to find the best model
A random forest classifier is used for the prediction.

In [11]:
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import GridSearchCV

In [12]:
# Base estimator: seeded random forest with balanced class weights, using
# all available cores.
rf = RandomForestClassifier(
    class_weight='balanced',
    n_jobs=-1,
    random_state=42,
)

# Search space: 2 x 2 x 2 x 2 = 16 hyperparameter combinations.
param_grid = dict(
    n_estimators=[200, 300],
    max_depth=[10, 14],
    min_samples_split=[10, 20],
    min_samples_leaf=[5, 10],
)

# Exhaustive 5-fold cross-validated search; candidates are ranked by recall.
grid = GridSearchCV(
    rf,
    param_grid,
    scoring='recall',
    cv=5,
    n_jobs=-1,
    verbose=1,
).fit(X_train, y_train)

# Estimator built with the winning hyperparameter combination.
best_model = grid.best_estimator_

print("Best Parameters:", grid.best_params_)
print("Best CV Recall:", grid.best_score_)

Fitting 5 folds for each of 16 candidates, totalling 80 fits
Best Parameters: {'max_depth': 14, 'min_samples_leaf': 5, 'min_samples_split': 20, 'n_estimators': 300}
Best CV Recall: 0.6909329336830273


## 9. Predict on the test split and also on the train split so we can compare.
Accuracy\
Recall and F1-score\
Check for overfitting and underfitting\
Confusion matrix for the test data

In [13]:
# Predict on both splits with the tuned model so train and test metrics can
# be compared side by side.
y_train_pred, y_test_pred = (
    best_model.predict(features) for features in (X_train, X_test)
)

In [14]:
from sklearn.metrics import accuracy_score, recall_score, f1_score, confusion_matrix, precision_score

def _print_metrics(title, y_true, y_pred):
    """Print a title line followed by accuracy, recall, F1 and precision for one split."""
    print(title)
    for label, metric in (
        ("Accuracy:", accuracy_score),
        ("Recall:", recall_score),
        ("F1:", f1_score),
        ("Precision:", precision_score),
    ):
        print(label, metric(y_true, y_pred))


# Same four metrics on both splits; a large train/test gap would point to
# overfitting.
_print_metrics("TRAIN METRICS", y_train, y_train_pred)
_print_metrics("\nTEST METRICS", y_test, y_test_pred)
print("Confusion matrix of test data:\n", confusion_matrix(y_test, y_test_pred))

TRAIN METRICS
Accuracy: 0.7594666273758657
Recall: 0.7151742225552642
F1: 0.7451011007885081

TEST METRICS
Accuracy: 0.7302195373508178
Recall: 0.6813549160671463
F1: 0.7128743923474988
Confusion matrix of test data:
 [[5366 1536]
 [2126 4546]]


## 10. Save the best model for further use.
The joblib library is used for that.

In [15]:
import joblib

# Persist the tuned estimator to disk. `joblib.dump` returns the list of
# file names it wrote, which the notebook displays as the cell output.
joblib.dump(value=best_model, filename="cardio_rf_model.pkl")

['cardio_rf_model.pkl']

## 11. Check that the saved model can be loaded and still works.

In [16]:
# Reload the estimator that was just written to disk.
# NOTE: joblib files are pickle-based; only load files from a trusted source.
model = joblib.load(filename="cardio_rf_model.pkl")

In [17]:
# Score the reloaded model on the test split; the recall shown here should
# equal the test recall of `best_model` if the save/load round trip worked.
y_model_pred = model.predict(X_test)
recall_score(y_true=y_test, y_pred=y_model_pred)

0.6813549160671463