In [96]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import warnings

warnings.filterwarnings('ignore')

In [97]:
df = pd.read_csv('mit_dataset3.csv')

df.head()

Unnamed: 0,HR,RR_mean,RR_std,Quality,RMSSD,pNN50,CV,SDSD,RR_Range,Label
0,103.846869,0.746667,0.209419,0.826844,0.256013,44.444444,0.280471,0.244591,0.830556,Suspicious
1,74.63396,0.811389,0.078227,0.927448,0.136827,33.333333,0.096412,0.136827,0.341667,Suspicious
2,74.86758,0.808889,0.078507,0.927208,0.137025,33.333333,0.097055,0.137002,0.341667,Suspicious
3,75.240104,0.805,0.07926,0.926561,0.136937,33.333333,0.098459,0.136924,0.341667,Suspicious
4,74.786788,0.81,0.079662,0.926216,0.138725,44.444444,0.098348,0.138626,0.341667,Suspicious


In [98]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 105569 entries, 0 to 105568
Data columns (total 10 columns):
 #   Column    Non-Null Count   Dtype  
---  ------    --------------   -----  
 0   HR        105569 non-null  float64
 1   RR_mean   105569 non-null  float64
 2   RR_std    105569 non-null  float64
 3   Quality   105569 non-null  float64
 4   RMSSD     105569 non-null  float64
 5   pNN50     105569 non-null  float64
 6   CV        105569 non-null  float64
 7   SDSD      105569 non-null  float64
 8   RR_Range  105569 non-null  float64
 9   Label     105569 non-null  object 
dtypes: float64(9), object(1)
memory usage: 8.1+ MB


In [99]:
df.isnull().sum()

HR          0
RR_mean     0
RR_std      0
Quality     0
RMSSD       0
pNN50       0
CV          0
SDSD        0
RR_Range    0
Label       0
dtype: int64

In [100]:
df['Label'].value_counts()

Label
Normal        67681
Critical      31221
Suspicious     6667
Name: count, dtype: int64

In [101]:
# Independent Feature
X = df.drop('Label', axis=1)

# Dependent Feature
y = df['Label']

In [102]:
X.head()

Unnamed: 0,HR,RR_mean,RR_std,Quality,RMSSD,pNN50,CV,SDSD,RR_Range
0,103.846869,0.746667,0.209419,0.826844,0.256013,44.444444,0.280471,0.244591,0.830556
1,74.63396,0.811389,0.078227,0.927448,0.136827,33.333333,0.096412,0.136827,0.341667
2,74.86758,0.808889,0.078507,0.927208,0.137025,33.333333,0.097055,0.137002,0.341667
3,75.240104,0.805,0.07926,0.926561,0.136937,33.333333,0.098459,0.136924,0.341667
4,74.786788,0.81,0.079662,0.926216,0.138725,44.444444,0.098348,0.138626,0.341667


In [103]:
y.head()

0    Suspicious
1    Suspicious
2    Suspicious
3    Suspicious
4    Suspicious
Name: Label, dtype: object

# Train Test Split

In [104]:
from sklearn.model_selection import train_test_split

X_train, X_test, y_train, y_test = train_test_split(X, y, stratify=y, test_size=0.2, random_state=42)

# Random Forest Classifier

In [105]:
from sklearn.ensemble import RandomForestClassifier

rf = RandomForestClassifier(n_estimators=200, random_state=42, n_jobs=-1)

In [106]:
rf.fit(X_train, y_train)

In [107]:
y_test_pred = rf.predict(X_test)

# Performance Metrics for Random Forest

In [108]:
from sklearn.metrics import accuracy_score, confusion_matrix, classification_report, precision_score, recall_score, f1_score, roc_auc_score

print("Performance Metrics using Random Forest Classifier \n")

print("Accuracy Score: ")
print(accuracy_score(y_test, y_test_pred), "\n")

print("Confusion Matrix: ")
print(confusion_matrix(y_test, y_test_pred), "\n")

print("Classification Report: ")
print(classification_report(y_test, y_test_pred))

Performance Metrics using Random Forest Classifier 

Accuracy Score: 
0.9143222506393862 

Confusion Matrix: 
[[ 5540   661    43]
 [  675 12823    38]
 [  225   167   942]] 

Classification Report: 
              precision    recall  f1-score   support

    Critical       0.86      0.89      0.87      6244
      Normal       0.94      0.95      0.94     13536
  Suspicious       0.92      0.71      0.80      1334

    accuracy                           0.91     21114
   macro avg       0.91      0.85      0.87     21114
weighted avg       0.91      0.91      0.91     21114



In [109]:
rf_model = RandomForestClassifier(
    n_estimators=400,
    class_weight="balanced",
    random_state=42,
    n_jobs=-1
)

In [110]:
rf_model.fit(X_train, y_train)

In [111]:
y_test_pred_2 = rf_model.predict(X_test)

In [112]:
print("Accuracy Score: ")
print(accuracy_score(y_test, y_test_pred_2), "\n")

print("Confusion Matrix: ")
print(confusion_matrix(y_test, y_test_pred_2), "\n")

print("Classification Report: ")
print(classification_report(y_test, y_test_pred_2))

Accuracy Score: 
0.9158378327176281 

Confusion Matrix: 
[[ 5537   664    43]
 [  646 12855    35]
 [  227   162   945]] 

Classification Report: 
              precision    recall  f1-score   support

    Critical       0.86      0.89      0.88      6244
      Normal       0.94      0.95      0.94     13536
  Suspicious       0.92      0.71      0.80      1334

    accuracy                           0.92     21114
   macro avg       0.91      0.85      0.87     21114
weighted avg       0.92      0.92      0.92     21114



# Encoding Labels

In [113]:
from sklearn.preprocessing import LabelEncoder

le = LabelEncoder()

y_train_enc = le.fit_transform(y_train)
y_test_enc = le.transform(y_test)


# Compute Class Weights

In [114]:
from sklearn.utils.class_weight import compute_class_weight

classes = np.unique(y_train)

weights = compute_class_weight(
    class_weight='balanced',
    classes=classes,
    y=y_train
)

class_weights = dict(zip(classes, weights))
print(class_weights)

{'Critical': np.float64(1.1271036019804888), 'Normal': np.float64(0.5199310493428141), 'Suspicious': np.float64(5.278767422963935)}


In [115]:
sample_weights = np.array([class_weights[i] for i in y_train])

# XGBoost Classifier

In [116]:
from xgboost import XGBClassifier

xgb = XGBClassifier(
    n_estimators=400,
    learning_rate=0.05,
    max_depth=6,
    subsample=0.8,
    colsample_bytree=0.8,
    random_state=42,
    eval_metric='mlogloss'
)

xgb.fit(
    X_train,
    y_train_enc,
    sample_weight = sample_weights
)

In [117]:
y_pred_xgb = xgb.predict(X_test)

# Performance Metrics using XGBoost Classifier

In [118]:
print("Performance Metrics using XGBoost Classifier \n")

print("Accuracy Score: ")
print(accuracy_score(y_test_enc, y_pred_xgb), "\n")

print("Confusion Matrix: ")
print(confusion_matrix(y_test_enc, y_pred_xgb), "\n")

print("Classification Report: ")
print(classification_report(y_test_enc, y_pred_xgb))

Performance Metrics using XGBoost Classifier 

Accuracy Score: 
0.8701809226105901 

Confusion Matrix: 
[[ 5497   302   445]
 [ 1351 11768   417]
 [  150    76  1108]] 

Classification Report: 
              precision    recall  f1-score   support

           0       0.79      0.88      0.83      6244
           1       0.97      0.87      0.92     13536
           2       0.56      0.83      0.67      1334

    accuracy                           0.87     21114
   macro avg       0.77      0.86      0.81     21114
weighted avg       0.89      0.87      0.88     21114



# Gradient Boosting

In [119]:
from sklearn.ensemble import GradientBoostingClassifier

gb = GradientBoostingClassifier(
    n_estimators=300,
    learning_rate=0.05,
    max_depth=3,
    random_state=42
)

gb.fit(X_train, y_train, sample_weight=sample_weights)


In [120]:
y_pred_gb = gb.predict(X_test)

# Performance Metrics using Gradient Boosting

In [121]:
print("Performance Metrics using Gradient Boosting \n")

print("Accuracy Score: ")
print(accuracy_score(y_test, y_pred_gb), "\n")

print("Confusion Matrix: ")
print(confusion_matrix(y_test, y_pred_gb), "\n")

print("Classification Report: ")
print(classification_report(y_test, y_pred_gb))

Performance Metrics using Gradient Boosting 

Accuracy Score: 
0.8257554229421237 

Confusion Matrix: 
[[ 5206   291   747]
 [ 1588 11216   732]
 [  238    83  1013]] 

Classification Report: 
              precision    recall  f1-score   support

    Critical       0.74      0.83      0.78      6244
      Normal       0.97      0.83      0.89     13536
  Suspicious       0.41      0.76      0.53      1334

    accuracy                           0.83     21114
   macro avg       0.70      0.81      0.74     21114
weighted avg       0.87      0.83      0.84     21114



In [122]:
rf.feature_importances_

array([0.10798825, 0.09754441, 0.0822581 , 0.07750123, 0.13412816,
       0.04925304, 0.17136714, 0.16621555, 0.11374413])

The order is

[HR,
 RR_mean,
 RR_std,
 Quality,
 RMSSD,
 pNN50,
 CV,
 SDSD,
 RR_range]

# Save the model in a .pkl file

In [123]:
import pickle

with open("cardioguard_rf_model.pkl", "wb") as file:
    pickle.dump(rf, file)

# Test Loading the model

In [133]:
with open("cardioguard_rf_model.pkl","rb") as file:
    loaded_model = pickle.load(file)

loaded_model.predict(X_test[:5])

array(['Critical', 'Critical', 'Normal', 'Normal', 'Critical'],
      dtype=object)

In [142]:
sample_input = np.array([
    103.846, 0.811, 0.078, 0.926, 0.138, 44.44, 0.097, 0.126, 0.222
]).reshape(1, -1)

prediction = loaded_model.predict(sample_input)
print("Prediction: ")
print(prediction)

print("\nProbability values of each class: ")
print("[[Critical   Normal  Suspicious]]")
print(loaded_model.predict_proba(sample_input))

Prediction: 
['Critical']

Probability values of each class: 
[[Critical   Normal  Suspicious]]
[[0.51 0.13 0.36]]
