This notebook builds a model that predicts whether a patient will eventually be admitted to the ICU, using only the data recorded in the first 0-2 hours after hospital admission.

In [1]:
!pip install openpyxl

**Import Libraries**

In [2]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegressionCV
from sklearn.pipeline import make_pipeline
from sklearn.naive_bayes import GaussianNB
from sklearn.tree import DecisionTreeClassifier
from sklearn import tree
from sklearn import metrics
# Lift pandas' display limits so wide/long frames are shown in full.
for _display_option in ("display.max_columns", "display.max_rows"):
    pd.set_option(_display_option, None)

**Import Dataset**

In [3]:
# Load the ICU prediction workbook with the openpyxl reader engine.
dataset = pd.read_excel(
    io="../input/covid19/Kaggle_Sirio_Libanes_ICU_Prediction.xlsx",
    engine="openpyxl",
)

**Exploratory Data Analysis**

In [4]:
# Preview the first five rows of the raw dataset.
dataset.head(5)

In [5]:
# Print the summary of the raw dataset (columns, dtypes, non-null counts).
dataset.info()

In [6]:
# Total number of patients: count the distinct visit identifiers instead of
# relying on `max() + 1`, which is only right when the identifiers are
# zero-based and contiguous.
print(f"The total number of patients from the dataset is {dataset['PATIENT_VISIT_IDENTIFIER'].nunique()}")

In [7]:
# Proportion of patients admitted / not admitted to the ICU, read from the
# rows of the 'ABOVE_12' window.
ICU_prop = dataset.loc[dataset['WINDOW'].eq('ABOVE_12')]
# One row per ICU value, holding how many rows carry that value.
ICU_prop_main = (
    ICU_prop
    .groupby('ICU')['PATIENT_VISIT_IDENTIFIER']
    .count()
    .reset_index()
)

# Labels are positional (groupby sorts its keys), so this assumes ICU is
# coded 0 = not admitted, 1 = admitted -- TODO confirm.
labels = ["not-admitted", "admitted"]
plt.title('ICU admissions proportion', fontdict={'fontsize': 16}, pad=75)
plt.pie(
    ICU_prop_main['PATIENT_VISIT_IDENTIFIER'],
    labels=labels,
    radius=2,
    startangle=90,
    textprops={'fontsize': 12},
    # Wedge text: the percentage plus the absolute count it corresponds to.
    autopct=lambda pct: '{:.2f}%  ({:,.0f}patients)'.format(
        pct, pct * sum(ICU_prop_main['PATIENT_VISIT_IDENTIFIER']) / 100
    ),
)
plt.show()

In [8]:
# Proportion of patients below / above 65, counted on the 'ABOVE_12' rows
# held in ICU_prop.
prop_65 = (
    ICU_prop
    .groupby('AGE_ABOVE65')['PATIENT_VISIT_IDENTIFIER']
    .count()
    .reset_index()
)

# Labels are positional (groupby sorts its keys), so this assumes
# AGE_ABOVE65 is coded 0 = below, 1 = above -- TODO confirm.
labels = ["Below-65", "Above-65"]
plt.title('Proportion of ages; below/above 65', fontdict={'fontsize': 16}, pad=75)
plt.pie(
    prop_65['PATIENT_VISIT_IDENTIFIER'],
    labels=labels,
    radius=2,
    startangle=90,
    textprops={'fontsize': 12},
    # Wedge text: the percentage plus the absolute count it corresponds to.
    autopct=lambda pct: '{:.2f}%  ({:,.0f}patients)'.format(
        pct, pct * sum(prop_65['PATIENT_VISIT_IDENTIFIER']) / 100
    ),
)
plt.show()

In [9]:
# Age split (below / above 65) among the 'ABOVE_12' rows with ICU == 1.
AGE_65_ICU = (
    ICU_prop.loc[ICU_prop['ICU'].eq(1)]
    .groupby('AGE_ABOVE65')['PATIENT_VISIT_IDENTIFIER']
    .count()
    .reset_index()
)

# Labels are positional (groupby sorts its keys), so this assumes
# AGE_ABOVE65 is coded 0 = below, 1 = above -- TODO confirm.
labels = ["Below-65", "Above-65"]
plt.title('ICU admissions proportion for ages below/above 65', fontdict={'fontsize': 16}, pad=75)
plt.pie(
    AGE_65_ICU['PATIENT_VISIT_IDENTIFIER'],
    labels=labels,
    radius=2,
    startangle=90,
    textprops={'fontsize': 12},
    # Wedge text: the percentage plus the absolute count it corresponds to.
    autopct=lambda pct: '{:.2f}%  ({:,.0f}patients)'.format(
        pct, pct * sum(AGE_65_ICU['PATIENT_VISIT_IDENTIFIER']) / 100
    ),
)
plt.show()


In [10]:
# Number of 'ABOVE_12' rows with ICU == 1 in each AGE_PERCENTIL bucket.
AGE_prop_percentil = (
    ICU_prop.loc[ICU_prop['ICU'].eq(1)]
    .groupby('AGE_PERCENTIL')['PATIENT_VISIT_IDENTIFIER']
    .count()
    .reset_index()
)

plt.figure(figsize=(8, 5))
plt.bar(
    AGE_prop_percentil["AGE_PERCENTIL"],
    AGE_prop_percentil["PATIENT_VISIT_IDENTIFIER"],
)
plt.title('ICU admissions proportion according to AGE_PERCENTILE', fontdict={'fontsize': 14})
plt.xlabel("Percentile")
plt.ylabel("Patients Count")
# Rotate the bucket names so they do not overlap.
plt.xticks(rotation=70)
plt.show()

**Data PreProcessing & Feature Engineering**

In [11]:
# (rows, columns) of the raw dataset, before any preprocessing.
dataset.shape

In [12]:
# Build one row per patient with ICU_SUM = 1 when the patient's ICU values
# sum to more than zero (i.e. the patient was in the ICU in some window),
# and 0 otherwise.
df_admitted = (
    dataset
    .groupby("PATIENT_VISIT_IDENTIFIER")["ICU"]
    .sum()
    .gt(0)
    .astype("int64")
    .rename("ICU_SUM")
    .reset_index()
)

In [13]:
# Attach the per-patient ICU_SUM flag to every row of that patient.
dataset_admitted = dataset.merge(df_admitted, on="PATIENT_VISIT_IDENTIFIER")

In [14]:
# Count the missing values in each column. Note that this inspects the raw
# `dataset`; the filling in the next step is applied to `dataset_admitted`.
dataset.isna().sum()

In [15]:
# Fill missing values patient by patient: forward-fill from the patient's
# earlier rows, then back-fill what is still missing from the same patient's
# later rows. Filling the whole frame at once would carry measurements across
# patient boundaries, and `fillna(method=...)` is deprecated in recent pandas.
# NOTE(review): assumes each patient's rows are already in chronological
# window order -- TODO confirm.
patient_ids = dataset_admitted["PATIENT_VISIT_IDENTIFIER"]
measurements = dataset_admitted.drop(columns="PATIENT_VISIT_IDENTIFIER")
measurements = measurements.groupby(patient_ids).ffill()
measurements = measurements.groupby(patient_ids).bfill()
# A value that is still NaN means the patient has no recorded value for that
# column in any row; drop those rows rather than borrow another patient's data.
dataset_admitted = (
    pd.concat([patient_ids, measurements], axis=1)
    .loc[:, dataset_admitted.columns]  # restore the original column order
    .dropna()
    .reset_index(drop=True)
)

In [16]:
# Keep only the rows recorded while the patient was not in the ICU (ICU == 0):
# the dataset author stipulates that data must not be used once the target
# variable is present.
dataset_ = (
    dataset_admitted
    .loc[dataset_admitted["ICU"].eq(0)]
    .reset_index(drop=True)
)

In [17]:
# Restrict the data to the rows of the "0-2" window.
dataset_ = (
    dataset_
    .loc[dataset_["WINDOW"].eq("0-2")]
    .reset_index(drop=True)
)

In [18]:
# Remove the identifier, the window label and the per-row ICU flag; the
# per-patient ICU_SUM column stays and is used as the target later on.
final_data = dataset_.drop(columns=["PATIENT_VISIT_IDENTIFIER", "WINDOW", "ICU"])

In [19]:
# One-hot encode every object-dtype (categorical) column.
cat_columns = final_data.select_dtypes(include=object).columns
final_data = pd.get_dummies(data=final_data, columns=cat_columns)

In [20]:
# Drop duplicated columns, i.e. columns whose values are all equal to those of
# an earlier column (the first occurrence is kept).
# The transpose is used only to build the boolean mask: round-tripping the
# data itself through `.T ... .T` casts every column to one common dtype
# (object for a frame mixing numeric and bool columns), which loses the
# numeric dtypes needed by the correlation and modelling steps.
duplicated_columns = final_data.T.duplicated()
final_data = final_data.loc[:, ~duplicated_columns.to_numpy()]

In [21]:
# Preview the encoded, de-duplicated feature table.
final_data.head()

In [22]:
# (rows, columns) left after preprocessing.
final_data.shape

In [23]:
# Row and column positions of every cell that is still null; both arrays are
# empty when nothing is missing.
np.where(final_data.isna())

**Feature Selection**

In [24]:
# Reduce the number of variables by checking each column's correlation with
# the target column ICU_SUM.
# NOTE(review): the correlations are computed on all rows, before the
# train/test split, so the held-out rows also influence which features are kept.
corr_data = final_data.corrwith(final_data["ICU_SUM"])
print(corr_data)


In [25]:
# Summary statistics of the correlations; presumably used to choose the
# cut-offs applied in the selection step.
corr_data.describe()

In [26]:
# Select a column when its correlation with ICU_SUM is above 0.04 or below
# -0.02. NaN and zero correlations fail both comparisons, so those columns
# are left out.
np_corr_data = corr_data.to_numpy()
selection = (np_corr_data > 0.04) | (np_corr_data < -0.02)
columns = selection.tolist()

# Total number of columns, followed by how many of them were selected.
print(len(columns), columns.count(True))
selected_final_data = final_data.loc[:, selection]
selected_final_data.head()

**MODEL AND EVALUATION**

In [27]:
# Features: every selected column except the target.
X_data = selected_final_data.drop(columns=['ICU_SUM'])
# Target: kept as a one-column DataFrame.
Y_data = selected_final_data.loc[:, ['ICU_SUM']]

In [28]:
# Shapes of the feature matrix and of the target, one per line.
print(X_data.shape, Y_data.shape, sep="\n")

In [29]:
# Hold out 30% of the rows for testing; the fixed seed makes the split repeatable.
X_train, X_test, Y_train, Y_test = train_test_split(
    X_data,
    Y_data,
    test_size=0.30,
    random_state=1,
)

In [30]:
# Fit a shallow decision tree (entropy criterion, depth <= 4, at most 10 leaves).
# random_state is fixed so the fitted tree -- and the metrics reported from it --
# are the same on every run; without it the estimator's internal randomness
# can produce a different tree each time.
model = tree.DecisionTreeClassifier(
    criterion='entropy',
    max_depth=4,
    max_leaf_nodes=10,
    random_state=1,
)
model.fit(X_train, Y_train)

In [31]:
# Predict the target for the held-out rows.
y_pred = model.predict(X=X_test)

In [32]:
# Report accuracy, precision and recall on the held-out rows, one per line.
metric_report = (
    ("Accuracy:{:.6f}", metrics.accuracy_score),
    ("Precision:{:.6f}", metrics.precision_score),
    ("Recall:{:.6f}", metrics.recall_score),
)
for line_template, score_fn in metric_report:
    print(line_template.format(score_fn(Y_test, y_pred)))