In [2]:
import pandas as pd
import numpy as np
from sklearn.tree import DecisionTreeClassifier
from sklearn.model_selection import train_test_split

## Regular learning using Tree and GridSearchCV

In [3]:
dataset = pd.read_csv("../Datasets/Covid+Data.csv")
# https://www.kaggle.com/datasets/meirnizri/covid19-dataset

dataset['DATE_DIED'] = np.where(dataset['DATE_DIED'] == '9999-99-99', 1, 0)

In [4]:
X = dataset.drop(['DATE_DIED'], axis=1)
y = dataset.DATE_DIED
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.25)

In [5]:
clf = DecisionTreeClassifier().fit(X_train,y_train)
clf.score(X_test, y_test)

0.9359397888183594

In [6]:
from sklearn.model_selection import GridSearchCV
tree = DecisionTreeClassifier()
grid_search_cv = GridSearchCV(tree, {'max_depth': [5,10,15]})
grid_search_cv.fit(X_train, y_train)
predicted = grid_search_cv.predict(X_test)

In [7]:
from sklearn.metrics import accuracy_score, recall_score, f1_score, precision_score
print(f'Accuracy {accuracy_score(y_test, predicted)} \nRecall: {recall_score(y_test, predicted)} \nPrecission: {precision_score(y_test, predicted)}')
grid_search_cv.score(X_test, y_test)

Accuracy 0.9494743347167969 
Recall: 0.9855121404803479 
Precission: 0.9609409194765353


0.9494743347167969

In [8]:
feature_importance_df = pd.DataFrame({'features': list(X_train), 'features_importance': grid_search_cv.best_estimator_.feature_importances_})
feature_importance_df.sort_values('features_importance', ascending=False)

Unnamed: 0,features,features_importance
3,PATIENT_TYPE,0.535593
4,INTUBED,0.220269
6,AGE,0.12175
18,CLASIFFICATION_FINAL,0.049788
5,PNEUMONIA,0.029921
1,MEDICAL_UNIT,0.019832
19,ICU,0.00522
8,DIABETES,0.003323
16,RENAL_CHRONIC,0.003158
0,USMER,0.002484


## Random Forest

In [9]:
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import GridSearchCV

In [10]:
forest_clf = RandomForestClassifier()

In [11]:
parameters = {'n_estimators': [10,20,30], 'max_depth': [2,5,7,10]}

In [12]:
grid_search_cv = GridSearchCV(forest_clf, parameters)
grid_search_cv.fit(X_train, y_train)

GridSearchCV(estimator=RandomForestClassifier(),
             param_grid={'max_depth': [2, 5, 7, 10],
                         'n_estimators': [10, 20, 30]})

In [13]:
best_clf = grid_search_cv.best_estimator_
predicted = best_clf.predict(X_test)
print(f'Accuracy {accuracy_score(y_test, predicted)} \nRecall: {recall_score(y_test, predicted)} \nPrecission: {precision_score(y_test, predicted)}')
grid_search_cv.score(X_test, y_test)

Accuracy 0.9495582580566406 
Recall: 0.988522551312885 
Precission: 0.9583348305006927


0.9495582580566406

### Most important features

In [14]:
feature_importance_df = pd.DataFrame({'features': list(X_train), 'feature_importance': best_clf.feature_importances_ })
# grid_search_cv.best_estimator_.feature_importances_
feature_importance_df.sort_values('feature_importance', ascending=False)

Unnamed: 0,features,feature_importance
4,INTUBED,0.297561
3,PATIENT_TYPE,0.184189
19,ICU,0.176515
6,AGE,0.122133
5,PNEUMONIA,0.101184
18,CLASIFFICATION_FINAL,0.050366
1,MEDICAL_UNIT,0.020555
8,DIABETES,0.013857
12,HIPERTENSION,0.012145
0,USMER,0.005182
