In [None]:
import pandas as pd
import numpy as np
from sklearn.tree import DecisionTreeClassifier
from sklearn.model_selection import train_test_split

## Regular learning using Tree and GridSearchCV

In [4]:
dataset = pd.read_csv("Covid Data.csv")
# https://www.kaggle.com/datasets/meirnizri/covid19-dataset

dataset['DATE_DIED'] = np.where(dataset['DATE_DIED'] == '9999-99-99', 1, 0)

In [5]:
X = dataset.drop(['DATE_DIED'], axis=1)
y = dataset.DATE_DIED
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.25)

In [6]:
clf = DecisionTreeClassifier().fit(X_train,y_train)
clf.score(X_test, y_test)

0.936126708984375

In [7]:
from sklearn.model_selection import GridSearchCV
tree = DecisionTreeClassifier()
grid_search_cv = GridSearchCV(tree, {'max_depth': [5,10,15]})
grid_search_cv.fit(X_train, y_train)
predicted = grid_search_cv.predict(X_test)

In [8]:
from sklearn.metrics import accuracy_score, recall_score, f1_score, precision_score
print(f'Accuracy {accuracy_score(y_test, predicted)} \nRecall: {recall_score(y_test, predicted)} \nPrecission: {precision_score(y_test, predicted)}')
grid_search_cv.score(X_test, y_test)

Accuracy 0.9499931335449219 
Recall: 0.9849643556858032 
Precission: 0.9619873049819303


0.9499931335449219

In [25]:
feature_importance_df = pd.DataFrame({'features': list(X_train), 'features_importance': grid_search_cv.best_estimator_.feature_importances_})
feature_importance_df.sort_values('features_importance', ascending=False)

Unnamed: 0,features,features_importance
4,INTUBED,0.335844
3,PATIENT_TYPE,0.184196
19,ICU,0.127995
5,PNEUMONIA,0.117332
6,AGE,0.108805
18,CLASIFFICATION_FINAL,0.045614
8,DIABETES,0.025126
1,MEDICAL_UNIT,0.023038
12,HIPERTENSION,0.011216
2,SEX,0.004191


## Random Forest

In [9]:
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import GridSearchCV

In [10]:
forest_clf = RandomForestClassifier()

In [11]:
parameters = {'n_estimators': [10,20,30], 'max_depth': [2,5,7,10]}

In [12]:
grid_search_cv = GridSearchCV(forest_clf, parameters)
grid_search_cv.fit(X_train, y_train)

GridSearchCV(estimator=RandomForestClassifier(),
             param_grid={'max_depth': [2, 5, 7, 10],
                         'n_estimators': [10, 20, 30]})

In [16]:
best_clf = grid_search_cv.best_estimator_
predicted = best_clf.predict(X_test)
print(f'Accuracy {accuracy_score(y_test, predicted)} \nRecall: {recall_score(y_test, predicted)} \nPrecission: {precision_score(y_test, predicted)}')
grid_search_cv.score(X_test, y_test)

Accuracy 0.9498252868652344 
Recall: 0.9894013730881311 
Precission: 0.9578496885945513


0.9498252868652344

### Most important features

In [21]:
feature_importance_df = pd.DataFrame({'features': list(X_train), 'feature_importance': best_clf.feature_importances_ })
# grid_search_cv.best_estimator_.feature_importances_
feature_importance_df.sort_values('feature_importance', ascending=False)

Unnamed: 0,features,feature_importance
4,INTUBED,0.335844
3,PATIENT_TYPE,0.184196
19,ICU,0.127995
5,PNEUMONIA,0.117332
6,AGE,0.108805
18,CLASIFFICATION_FINAL,0.045614
8,DIABETES,0.025126
1,MEDICAL_UNIT,0.023038
12,HIPERTENSION,0.011216
2,SEX,0.004191
