# SCRIPT FOR ABNORMAL VITAL SIGNS PREDICTION

In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
%matplotlib inline
np.random.seed(42)

In [2]:
vital_data = pd.read_csv("Human_vital_signs_R.csv", sep=',')
vital_data.head()

Unnamed: 0.1,Unnamed: 0,Time (s),HR (BPM),RESP (BPM),SpO2 (%),TEMP (*C),OUTPUT
0,0,0,94.0,21.0,97.0,36.2,Normal
1,1,1,94.0,25.0,97.0,36.2,Normal
2,2,2,101.0,25.0,93.0,38.0,Abnormal
3,3,3,55.0,11.0,100.0,35.0,Abnormal
4,4,4,93.0,26.0,95.0,37.0,Normal


In [3]:
vital_data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 25493 entries, 0 to 25492
Data columns (total 7 columns):
 #   Column       Non-Null Count  Dtype  
---  ------       --------------  -----  
 0   Unnamed: 0   25493 non-null  int64  
 1   Time (s)     25493 non-null  int64  
 2    HR (BPM)    25488 non-null  float64
 3    RESP (BPM)  25346 non-null  float64
 4    SpO2 (%)    25366 non-null  float64
 5   TEMP (*C)    25493 non-null  float64
 6   OUTPUT       25493 non-null  object 
dtypes: float64(4), int64(2), object(1)
memory usage: 1.4+ MB


In [4]:
vital_data.describe()

Unnamed: 0.1,Unnamed: 0,Time (s),HR (BPM),RESP (BPM),SpO2 (%),TEMP (*C)
count,25493.0,25493.0,25488.0,25346.0,25366.0,25493.0
mean,240.0,239.981132,89.127943,17.640496,96.716471,37.590123
std,138.855163,138.85523,13.220448,3.589381,3.323381,5.211265
min,0.0,-1.0,44.0,0.0,83.0,21.0
25%,120.0,120.0,81.0,16.0,95.0,34.0
50%,240.0,240.0,89.0,18.0,97.0,38.0
75%,360.0,360.0,95.0,20.0,99.0,41.0
max,480.0,480.0,139.0,34.0,111.0,49.0


In [5]:
# First lets check how many 0 values are in these columns 
featureList = [' HR (BPM)', ' RESP (BPM)', ' SpO2 (%)', 'TEMP (*C)']
vital_data[featureList].isin([0]).sum()

 HR (BPM)        0
 RESP (BPM)    219
 SpO2 (%)        0
TEMP (*C)        0
dtype: int64

Let's replace zeros with the mean.

In [13]:
def clean_dataset(df):
    assert isinstance(df, pd.DataFrame), "df needs to be a pd.DataFrame"
    df.dropna(inplace=True)
    indices_to_keep = ~df.isin([np.nan, np.inf, -np.inf]).any(1)
    return df[indices_to_keep].astype(np.float64)

In [14]:
vital_data["Unnamed: 0"] = vital_data["Unnamed: 0"].astype(np.float32)
vital_data["Time (s)"] = vital_data["Time (s)"].astype(np.float32)

# For Glucose
vital_data[" HR (BPM)"] = vital_data[" HR (BPM)"].replace({ 0 : vital_data[" HR (BPM)"].mean()})
vital_data[" HR (BPM)"] = vital_data[" HR (BPM)"].astype(np.float32)

# For BloodPressure
vital_data[" RESP (BPM)"] = vital_data[" RESP (BPM)"].replace({ 0 : vital_data[" RESP (BPM)"].mean()})
vital_data[" RESP (BPM)"] = vital_data[" RESP (BPM)"].astype(np.float32)

# For SkinThickness
vital_data[" SpO2 (%)"] = vital_data[" SpO2 (%)"].replace({ 0 : vital_data[" SpO2 (%)"].mean()})
vital_data[" SpO2 (%)"] = vital_data[" SpO2 (%)"].astype(np.float32)

# For BMI
vital_data["TEMP (*C)"] = vital_data["TEMP (*C)"].replace({ 0 : vital_data["TEMP (*C)"].mean()})
vital_data["TEMP (*C)"] = vital_data["TEMP (*C)"].astype(np.float32)

# Substitute normal with 0 and abnormal with 1
vital_data = vital_data.replace({'Normal': 0, 'Abnormal': 1})
vital_data["OUTPUT"] = vital_data["OUTPUT"].astype(np.float32)

# Lets checkif it worked
vital_data[featureList].isin([0]).sum()
print(vital_data.dtypes)
print(vital_data['TEMP (*C)'].head)
vital_data.drop([0])
vital_data = vital_data.reset_index()
vital_data = clean_dataset(vital_data)

index            int64
Unnamed: 0     float32
Time (s)       float32
 HR (BPM)      float32
 RESP (BPM)    float32
 SpO2 (%)      float32
TEMP (*C)      float32
OUTPUT         float32
dtype: object
<bound method NDFrame.head of 0        36.200001
1        36.200001
2        38.000000
3        35.000000
4        37.000000
           ...    
25488    33.000000
25489    36.400002
25490    36.200001
25491    37.000000
25492    37.000000
Name: TEMP (*C), Length: 25493, dtype: float32>


Now we do the train/test split.

In [15]:
print(vital_data.shape)
X = vital_data.drop(["OUTPUT"], axis=1)
y = vital_data["OUTPUT"]

print(X.shape)
print(y.shape)

(25214, 9)
(25214, 8)
(25214,)


In [16]:
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(X,y)
# train set 75% + test set 25%

In [17]:
from sklearn.dummy import DummyClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.naive_bayes import GaussianNB
from sklearn.svm import SVC
from sklearn.ensemble import RandomForestClassifier

We can find out which model has the best score without considering the parameters yet.

In [18]:
from sklearn.model_selection import KFold, cross_val_score
for model in [ 
    DummyClassifier,
    DecisionTreeClassifier,
    KNeighborsClassifier,
    GaussianNB,
    SVC,
    RandomForestClassifier]:
    
    cls = model()
    kf = KFold(n_splits = 5)
    score = cross_val_score(cls, X_train, y_train, cv = kf, scoring="roc_auc", error_score='raise')
    
    print(f"{model.__name__:22}  AUC: \t {score.mean():.3f} STD: {score.std():.2f}")

DummyClassifier         AUC: 	 0.500 STD: 0.00
DecisionTreeClassifier  AUC: 	 0.998 STD: 0.00
KNeighborsClassifier    AUC: 	 0.937 STD: 0.00
GaussianNB              AUC: 	 0.989 STD: 0.00
SVC                     AUC: 	 0.737 STD: 0.01
RandomForestClassifier  AUC: 	 1.000 STD: 0.00


In [19]:
# Fitting the modle model
cls = RandomForestClassifier()

# Fitting the model
cls.fit(X_train, y_train)

# Prediction
y_preds = cls.predict(X_test)

In [20]:
from sklearn.metrics import classification_report
print(classification_report(y_test, y_preds))

              precision    recall  f1-score   support

         0.0       1.00      1.00      1.00      1394
         1.0       1.00      1.00      1.00      4910

    accuracy                           1.00      6304
   macro avg       1.00      1.00      1.00      6304
weighted avg       1.00      1.00      1.00      6304



In [21]:
from sklearn.metrics import roc_auc_score
print(roc_auc_score(y_test, y_preds))

0.9992826398852224


## FINE TUNING

In [22]:
from sklearn.model_selection import RandomizedSearchCV

# Define a grid of hyperparameters
grid = {"n_estimators": [10, 100, 200, 500, 1000, 1200],
        "max_depth": [None, 5, 10, 20, 30],
        "max_features": ["auto", "sqrt"],
        "min_samples_split": [2, 4, 6],
        "min_samples_leaf": [1, 2, 4]}

# Split into train and test sets
X_train, X_test, y_train, y_test = train_test_split(X, y)

# Set n_jobs to -1 to use all cores (NOTE: n_jobs=-1 is broken as of 8 Dec 2019, using n_jobs=1 works)
clf = RandomForestClassifier(n_jobs=1)

# Setup RandomizedSearchCV
rs_clf = RandomizedSearchCV(estimator=clf,
                            param_distributions=grid,
                            n_iter=10, # try 10 models total
                            cv=5, # 5-fold cross-validation
                            verbose=2) # print out results

# Fit the RandomizedSearchCV version of clf
rs_clf.fit(X_train, y_train);

# Find the best hyperparameters
print(rs_clf.best_params_)

# Scoring automatically uses the best hyperparameters
rs_clf.score(X_test, y_test)

Fitting 5 folds for each of 10 candidates, totalling 50 fits
[CV] END max_depth=20, max_features=sqrt, min_samples_leaf=1, min_samples_split=2, n_estimators=100; total time=   1.4s
[CV] END max_depth=20, max_features=sqrt, min_samples_leaf=1, min_samples_split=2, n_estimators=100; total time=   1.3s
[CV] END max_depth=20, max_features=sqrt, min_samples_leaf=1, min_samples_split=2, n_estimators=100; total time=   1.4s
[CV] END max_depth=20, max_features=sqrt, min_samples_leaf=1, min_samples_split=2, n_estimators=100; total time=   1.4s
[CV] END max_depth=20, max_features=sqrt, min_samples_leaf=1, min_samples_split=2, n_estimators=100; total time=   1.4s
[CV] END max_depth=None, max_features=sqrt, min_samples_leaf=2, min_samples_split=6, n_estimators=200; total time=   2.7s
[CV] END max_depth=None, max_features=sqrt, min_samples_leaf=2, min_samples_split=6, n_estimators=200; total time=   2.7s
[CV] END max_depth=None, max_features=sqrt, min_samples_leaf=2, min_samples_split=6, n_estimato

0.9998413705583756

In [23]:
print(f"Final Score: {rs_clf.score(X_test, y_test)}")

Final Score: 0.9998413705583756


## SAVE THE MODEL

In [25]:
import joblib
import pickle

In [27]:
filename = "iot_model.joblib"
joblib.dump(model, filename)

['iot_model.joblib']