In [1]:
import pandas as pd
import numpy as np

df = pd.read_csv('dataset_1.7L.csv')
df.head()

Unnamed: 0,sex,age,inmsupr,pneumonia,diabetes,asthma,copd,hypertension,cardiovascular,renal_chronic,obesity,tobacco,days_prior_to_treatment,intubed,icu,dead
0,0,56,0,1,0,0,0,0,0,0,0,0,8,0,0,0
1,0,55,0,0,0,0,0,0,0,0,0,0,4,0,0,0
2,0,75,0,1,0,0,0,0,0,0,0,0,8,0,0,0
3,0,47,0,1,1,0,0,0,0,0,0,0,0,0,0,1
4,1,42,0,1,0,0,0,1,0,1,0,0,7,0,0,1


In [2]:
X = df.drop(['intubed','icu','dead'], axis=1)
y = df['intubed']

In [3]:
from sklearn.preprocessing import StandardScaler

scaler = StandardScaler()
X = scaler.fit_transform(X)

In [4]:
import pickle

pickle.dump(scaler, open('intubed_scaler.sav', 'wb'))

In [5]:


from imblearn.over_sampling import SMOTE
from collections import Counter

print("Before oversampling: ",Counter(y))
SMOTE = SMOTE()
X, y = SMOTE.fit_resample(X, y)
print("After oversampling: ",Counter(y))

Before oversampling:  Counter({0: 146074, 1: 31008})
After oversampling:  Counter({0: 146074, 1: 146074})


In [6]:
from sklearn.model_selection import train_test_split

X_train, X_test, Y_train, Y_test = train_test_split(X, y, test_size=0.2)

In [7]:
from sklearn.svm import SVC

clf_svm = SVC()
clf_svm.fit(X_train, Y_train)

SVC()

In [8]:
pickle.dump(clf_svm, open('intubed_svm.sav', 'wb'))

In [9]:
from sklearn.metrics import accuracy_score, confusion_matrix, classification_report

preds_svm = clf_svm.predict(X_test)

print("train Accuracy = {}".format(accuracy_score(Y_train, clf_svm.predict(X_train))))
print("test Accuracy = {}".format(accuracy_score(Y_test, preds_svm)))
print("Confusion Matrix")
print(confusion_matrix(Y_test, preds_svm))
print("Classification Report")
print(classification_report(Y_test, preds_svm))

train Accuracy = 0.6070777603778913
test Accuracy = 0.6055793256888584
Confusion Matrix
[[13827 15356]
 [ 7690 21557]]
Classification Report
              precision    recall  f1-score   support

           0       0.64      0.47      0.55     29183
           1       0.58      0.74      0.65     29247

    accuracy                           0.61     58430
   macro avg       0.61      0.61      0.60     58430
weighted avg       0.61      0.61      0.60     58430



In [10]:
from sklearn.ensemble import RandomForestClassifier

clf_rf = RandomForestClassifier()
clf_rf.fit(X_train, Y_train)

RandomForestClassifier()

In [11]:
pickle.dump(clf_rf, open('intubed_rf.sav', 'wb'))

In [12]:
preds_rf = clf_rf.predict(X_test)

print("train Accuracy = {}".format(accuracy_score(Y_train, clf_rf.predict(X_train))))
print("test Accuracy = {}".format(accuracy_score(Y_test, preds_rf)))
print("Confusion Matrix")
print(confusion_matrix(Y_test, preds_rf))
print("Classification Report")
print(classification_report(Y_test, preds_rf))

train Accuracy = 0.8409365132338973
test Accuracy = 0.7414855382508985
Confusion Matrix
[[22043  7140]
 [ 7965 21282]]
Classification Report
              precision    recall  f1-score   support

           0       0.73      0.76      0.74     29183
           1       0.75      0.73      0.74     29247

    accuracy                           0.74     58430
   macro avg       0.74      0.74      0.74     58430
weighted avg       0.74      0.74      0.74     58430



In [13]:
from sklearn.linear_model import LogisticRegression

clf_lr = LogisticRegression()
clf_lr.fit(X_train, Y_train)

LogisticRegression()

In [14]:
pickle.dump(clf_lr, open('intubed_lr.sav', 'wb'))

In [15]:
preds_lr = clf_lr.predict(X_test)

print("train Accuracy = {}".format(accuracy_score(Y_train, clf_lr.predict(X_train))))
print("test Accuracy = {}".format(accuracy_score(Y_test, preds_lr)))
print("Confusion Matrix")
print(confusion_matrix(Y_test, preds_lr))
print("Classification Report")
print(classification_report(Y_test, preds_lr))

train Accuracy = 0.5970314652701119
test Accuracy = 0.5981858634263221
Confusion Matrix
[[14353 14830]
 [ 8648 20599]]
Classification Report
              precision    recall  f1-score   support

           0       0.62      0.49      0.55     29183
           1       0.58      0.70      0.64     29247

    accuracy                           0.60     58430
   macro avg       0.60      0.60      0.59     58430
weighted avg       0.60      0.60      0.59     58430



In [16]:
from xgboost import XGBClassifier

clf_xgb = XGBClassifier()
clf_xgb.fit(X_train, Y_train)





XGBClassifier(base_score=0.5, booster='gbtree', colsample_bylevel=1,
              colsample_bynode=1, colsample_bytree=1, gamma=0, gpu_id=-1,
              importance_type='gain', interaction_constraints='',
              learning_rate=0.300000012, max_delta_step=0, max_depth=6,
              min_child_weight=1, missing=nan, monotone_constraints='()',
              n_estimators=100, n_jobs=8, num_parallel_tree=1, random_state=0,
              reg_alpha=0, reg_lambda=1, scale_pos_weight=1, subsample=1,
              tree_method='exact', validate_parameters=1, verbosity=None)

In [17]:
pickle.dump(clf_xgb, open('intubed_xgb.sav', 'wb'))

In [18]:
preds_xgb = clf_xgb.predict(X_test)

print("train Accuracy = {}".format(accuracy_score(Y_train, clf_xgb.predict(X_train))))
print("test Accuracy = {}".format(accuracy_score(Y_test, preds_xgb)))
print("Confusion Matrix")
print(confusion_matrix(Y_test, preds_xgb))
print("Classification Report")
print(classification_report(Y_test, preds_xgb))

train Accuracy = 0.7718147511103124
test Accuracy = 0.7671401677220606
Confusion Matrix
[[24257  4926]
 [ 8680 20567]]
Classification Report
              precision    recall  f1-score   support

           0       0.74      0.83      0.78     29183
           1       0.81      0.70      0.75     29247

    accuracy                           0.77     58430
   macro avg       0.77      0.77      0.77     58430
weighted avg       0.77      0.77      0.77     58430

