In [209]:
import numpy as np
import pandas as pd
import torch

import matplotlib.pyplot as plt

from sklearn import model_selection
from sklearn.ensemble import RandomForestClassifier

# Processing Data

In [210]:
def processing_data(file_path):
  """Load the cardiovascular Excel sheet and return a cleaned, encoded frame.

  Steps, in order: read the workbook, drop duplicate rows, drop rows that
  contain any missing value, drop the columns listed in ``columns_to_drop``,
  recode ``gender`` (2 -> 0, anything else -> 1) and one-hot encode
  ``bp_category``, ``cholesterol`` and ``gluc`` into integer 0/1 columns.

  Parameters
  ----------
  file_path : str
      Path of the Excel file; handed unchanged to ``pd.read_excel``.

  Returns
  -------
  pandas.DataFrame
      The remaining original columns plus the ``bp_category_*``, ``chol_*``
      and ``gluc_*`` indicator columns.

  Raises
  ------
  ValueError
      If ``file_path`` is not a ``str``, or if ``cholesterol`` / ``gluc``
      holds a code that is not a key of ``labels`` (1, 2 or 3).
  """
  if not isinstance(file_path, str):
    raise ValueError("Not a string.")

  # Reading Excel File, then removing duplicate rows and rows with empty cells
  df_heart_dis = pd.read_excel(file_path).drop_duplicates().dropna()

  # Removing unnecessary columns (returns a new frame instead of mutating in place)
  columns_to_drop = ["height", "weight", "age", "bp_category_encoded", "id"]
  df_heart_dis = df_heart_dis.drop(columns = columns_to_drop)

  # Converting Male(2) = 0 & Female(1) = 1
  df_heart_dis["gender"] = df_heart_dis["gender"].ne(2).astype(int)

  # One Hot-Encoding
  labels = {
    1: "normal",
    2: "above_normal",
    3: "well_above_normal"
  }

  for column in ("cholesterol", "gluc"):
    mapped = df_heart_dis[column].map(labels)
    unknown = mapped.isna()

    # Missing values were already dropped above, so a NaN here can only come
    # from a code that is absent from `labels`. Left alone it would become an
    # all-zero one-hot row, so fail loudly instead.
    if unknown.any():
      bad_codes = df_heart_dis.loc[unknown, column].unique().tolist()
      raise ValueError("Unexpected {} code(s): {}".format(column, bad_codes))

    df_heart_dis[column] = mapped

  # dtype=int makes the indicator columns 1/0 directly (instead of True/False)
  df_heart_dis = pd.get_dummies(
    df_heart_dis,
    columns = ["bp_category", "cholesterol", "gluc"],
    prefix = ["bp_category", "chol", "gluc"],
    dtype = int
  )

  return df_heart_dis

In [211]:
# Build the cleaned frame, then show the target balance and the first rows.
df_heart_dis = processing_data("cardiovascular_dataset.xlsx")

class_counts = df_heart_dis["cardio"].value_counts()

print("Class Distribution:\n", class_counts, "\n")
print(df_heart_dis.head())

Class Distribution:
 cardio
0    34533
1    33672
Name: count, dtype: int64 

   gender  ap_hi  ap_lo  smoke  alco  active  cardio  age_years        bmi  \
0       0    110     80      0     0       1       0         50  21.967120   
1       1    140     90      0     0       1       1         55  34.927679   
2       1    130     70      0     0       0       1         51  23.507805   
3       0    150    100      0     0       1       1         48  28.710479   
4       1    100     60      0     0       0       0         47  23.011177   

   bp_category_Elevated  bp_category_Hypertension Stage 1  \
0                     0                                 1   
1                     0                                 0   
2                     0                                 1   
3                     0                                 0   
4                     0                                 0   

   bp_category_Hypertension Stage 2  bp_category_Normal  chol_above_normal  \
0       

# Training, Validation & Testing

---



In [212]:
# Target is the "cardio" column; every other column is a feature.
y_target = df_heart_dis["cardio"]
X_data = df_heart_dis.drop(columns = "cardio")

# First cut: hold out 20% of all rows for testing, stratified on the target.
X_train, X_test, y_train, y_test = model_selection.train_test_split(
    X_data, y_target,
    test_size = 0.2,
    random_state = 42,
    stratify = y_target
)

# Second cut: 12.5% of the remaining 80% (i.e. 10% of all rows) for validation.
X_train, X_val, y_train, y_val = model_selection.train_test_split(
    X_train, y_train,
    test_size = 0.125,
    random_state = 42,
    stratify = y_train
)

# Convert each split to a float tensor.
x_train, x_val, x_test = (
    torch.from_numpy(features.values).float()
    for features in (X_train, X_val, X_test)
)

y_train, y_val, y_test = (
    torch.from_numpy(targets.values).float()
    for targets in (y_train, y_val, y_test)
)


print(f"Number of Features: {x_train.shape[1]}")
print(x_train[0])
print(y_train[0])

print(f"\nNumber of Examples:{len(df_heart_dis)}")
print(f"\nTraining Examples:{len(x_train)}")
print(f"Validation Examples:{len(x_val)}")
print(f"Testing Examples:{len(x_test)}")

Number of Features: 18
tensor([  1.0000, 120.0000,  80.0000,   0.0000,   0.0000,   1.0000,  51.0000,
         21.9363,   0.0000,   1.0000,   0.0000,   0.0000,   0.0000,   1.0000,
          0.0000,   0.0000,   1.0000,   0.0000])
tensor(0.)

Number of Examples:68205

Training Examples:47743
Validation Examples:6821
Testing Examples:13641


# Classification Model

In [213]:
# Random-forest hyperparameters, collected in one place.
rf_params = {
    "n_estimators": 250,       # number of trees
    "max_depth": 25,           # maximum depth of each tree
    "min_samples_split": 38,   # samples required before a node may be split
    "random_state": 42,        # fixed seed
}

model = RandomForestClassifier(**rf_params)

# Training Model

In [214]:
def compute_metrics(true_labels, pred_labels):
  """Compute binary-classification metrics from paired labels.

  The two iterables are walked in parallel with ``zip`` (so iteration stops
  at the shorter one). Each pair is counted as a true positive (1, 1), true
  negative (0, 0) or false positive (0, 1); every other pair is counted as a
  false negative.

  Parameters
  ----------
  true_labels : iterable
      Ground-truth labels, compared against 0 and 1.
  pred_labels : iterable
      Predicted labels, compared against 0 and 1.

  Returns
  -------
  tuple
      ``(accuracy, precision, recall, f1_score)``. Any metric whose
      denominator is zero is returned as 0; in particular, empty input
      yields ``(0, 0, 0, 0)`` instead of raising ``ZeroDivisionError``.
  """
  tp, fp = 0, 0
  tn, fn = 0, 0

  for true, pred in zip(true_labels, pred_labels):

    if true == 1 and pred == 1:
      tp += 1
    elif true == 0 and pred == 0:
      tn += 1
    elif true == 0 and pred == 1:
      fp += 1
    else:
      fn += 1

  total = tp + tn + fp + fn

  # Guard accuracy the same way as the other ratios so empty input is safe.
  accuracy = (tp + tn) / total if total > 0 else 0
  precision = tp / (tp + fp) if (tp + fp) > 0 else 0
  recall = tp / (tp + fn) if (tp + fn) > 0 else 0
  f1_score = (2 * precision * recall) / (precision + recall) if (precision + recall) > 0 else 0

  return accuracy, precision, recall, f1_score

In [215]:
# Fit on the training split, then score the validation split.
model.fit(x_train, y_train)
val_preds = model.predict(x_val)

accuracy, precision, recall, f1_score = compute_metrics(y_val, val_preds)

# Accuracy is shown as a percentage; the other metrics stay as fractions.
for metric_name, metric_value in (
    ("Accuracy", accuracy * 100),
    ("Precision", precision),
    ("Recall", recall),
    ("F1-Score", f1_score),
):
  print(f"{metric_name}:{metric_value}")

Accuracy:72.92185896496115
Precision:0.7408111533586819
Recall:0.6943866943866944
F1-Score:0.7168480760386325


# Evaluating Model

In [216]:
# Score the fitted model on the test split.
test_preds = model.predict(x_test)

accuracy, precision, recall, f1_score = compute_metrics(y_test, test_preds)

# Accuracy is shown as a percentage; the other metrics stay as fractions.
for metric_name, metric_value in (
    ("Accuracy", accuracy * 100),
    ("Precision", precision),
    ("Recall", recall),
    ("F1-Score", f1_score),
):
  print(f"{metric_name}:{metric_value}")

Accuracy:73.35239351953669
Precision:0.7510124736756845
Recall:0.6884466884466884
F1-Score:0.7183698768110328


# Saving Classifier


In [217]:
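# NOTE(review): saving is currently disabled. Both lines below are commented
# out, so running this cell does not write "random_forest_model.joblib"; any
# output shown under this cell comes from an earlier run. Uncomment both
# lines to persist the fitted model.
#import joblib

#joblib.dump(model, "random_forest_model.joblib")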

['random_forest_model.joblib']