In [1]:
import numpy as np
import pandas as pd

In [2]:
# Read the preprocessed table of US COVID-19 deaths by sex and age group.
df = pd.read_csv('US_COVID-19_Deaths_by_Sex_and_Age.csv')

# Months left out of the analysis ('Month' is a running index: 1 = 2020.1, ..., 24 = 2021.12):
#   1, 2 -> 2020.1 and 2020.2: COVID-19 was not yet a pandemic in the US, so most values are zero
#   24   -> 2021.12: the current month, only about one week old, so it has too few data points
excluded_months = [1, 2, 24]
df = df[~df['Month'].isin(excluded_months)]
df

Unnamed: 0,Data As Of,Start Date,End Date,Group,Year,Month,State,Sex,Age Group,COVID-19 Deaths,Total Deaths,Pneumonia Deaths,Pneumonia and COVID-19 Deaths,Influenza Deaths,"Pneumonia, Influenza, or COVID-19 Deaths",Rank
32,12/08/2021,01/01/2021,01/31/2021,By Month,2021,13,United States,-1,2.5,1,135,3,0,0,4,0
33,12/08/2021,01/01/2021,01/31/2021,By Month,2021,13,United States,1,2.5,4,138,4,1,0,7,1
34,12/08/2021,01/01/2021,01/31/2021,By Month,2021,13,United States,-1,0.5,5,754,6,0,0,11,2
35,12/08/2021,01/01/2021,01/31/2021,By Month,2021,13,United States,1,0.5,5,913,9,0,0,14,3
36,12/08/2021,01/01/2021,01/31/2021,By Month,2021,13,United States,-1,9.5,7,178,4,1,1,11,4
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
731,12/08/2021,12/01/2020,12/31/2020,By Month,2020,12,United States,-1,79.5,12150,43337,7605,5881,20,13886,27
732,12/08/2021,12/01/2020,12/31/2020,By Month,2020,12,United States,1,69.5,12657,43611,8999,7120,25,14546,28
733,12/08/2021,12/01/2020,12/31/2020,By Month,2020,12,United States,1,85.0,14145,45730,8719,6628,15,16244,29
734,12/08/2021,12/01/2020,12/31/2020,By Month,2020,12,United States,1,79.5,15987,49558,10992,8782,33,18213,30


In [3]:
# Build the feature matrix and the two targets used in this project.
#   X  : the new Month column, Sex, and Age Group
#   y1 : COVID-19 Deaths
#   y2 : Rank
# We first train our model on only the COVID-19 Deaths in the United States.
X = df[['Month', 'Sex', 'Age Group']]
y1 = df['COVID-19 Deaths']
y2 = df['Rank']

# Split the dataset into training and testing data.
# All three arrays go through ONE train_test_split call so that X_train / X_test
# are row-aligned with both y1_* and y2_* by construction. Two separate calls
# would overwrite X_train / X_test and stay aligned only as long as both calls
# happened to share the same random_state and test_size.
from sklearn.model_selection import train_test_split
X_train, X_test, y1_train, y1_test, y2_train, y2_test = train_test_split(
    X, y1, y2, test_size=0.2, random_state=0
)
# show X_train
X_train

Unnamed: 0,Month,Sex,Age Group
583,10,1,8.5
379,18,-1,69.5
204,4,1,23.5
295,17,1,8.5
591,10,1,64.5
...,...,...,...
41,13,1,19.5
423,19,1,8.5
256,5,-1,9.5
693,23,1,49.5


In [4]:
# Tune the depth of a decision tree regressor for the COVID-19 deaths target (y1)
# with a cross-validated grid search, then print the best mean CV score and the
# depth that achieved it.
from sklearn.tree import DecisionTreeRegressor
from sklearn.metrics import mean_squared_error
from sklearn.model_selection import cross_val_score
from sklearn.model_selection import GridSearchCV
from matplotlib import pyplot as plt

# candidate tree depths: 3, 4, ..., 19
parameters = {'max_depth': range(3, 20)}
# random_state is fixed because the tree permutes features at every split; without
# a seed, equally good splits can be resolved differently from run to run, which
# makes the selected depth and score non-reproducible.
clf = GridSearchCV(DecisionTreeRegressor(random_state=0), parameters)
clf.fit(X_train, y1_train)
# best_estimator_ is the winning configuration refitted by GridSearchCV (default refit=True)
tree_model = clf.best_estimator_
print(clf.best_score_, clf.best_params_)

0.8188588451138864 {'max_depth': 8}


In [5]:
# Fit the decision tree used to predict deaths for each (Month, Sex, Age Group)
# row, then show its score on the held-out test split.
# random_state is fixed so that refitting the tree gives the same model every run.
# NOTE(review): max_depth=12 is hard-coded, while the saved output of the grid
# search in the previous cell reports max_depth=8 as best — confirm 12 is intended.
DTR_death = DecisionTreeRegressor(max_depth=12, random_state=0).fit(X_train, y1_train)
DTR_death.score(X_test, y1_test)

0.9052179352499811

In [6]:
# Predict deaths for every row of X (training and test rows together).
deathpred = DTR_death.predict(X)

# Put the predictions next to the true counts, on a fresh 0..n-1 index, so the
# groups can be sorted to find the most vulnerable ones.
death = (
    X.assign(**{'Predicted Deaths': deathpred, 'True Deaths': y1})
     .reset_index(drop=True)
)

# Order rows from highest to lowest predicted deaths and show the top ten.
deathsorted = death.sort_values(by='Predicted Deaths', ascending=False)
deathsorted.nlargest(10, 'Predicted Deaths')

Unnamed: 0,Month,Sex,Age Group,Predicted Deaths,True Deaths
30,13,1,79.5,17169.0,17169
670,12,1,79.5,15987.0,15987
29,13,1,69.5,14738.0,14738
669,12,1,85.0,13717.0,14145
28,13,1,85.0,13717.0,13717
671,12,-1,85.0,12893.0,18664
27,13,-1,79.5,12893.0,12893
667,12,-1,79.5,12893.0,12150
31,13,-1,85.0,12893.0,17390
668,12,1,69.5,12657.0,12657


In [7]:
# For each month index 3..23, print the three rows with the highest predicted deaths.
for month in range(3, 24):
    in_month = death['Month'] == month
    deathsort_byMonth = death[in_month].sort_values(by='Predicted Deaths', ascending=False)
    print(deathsort_byMonth.nlargest(3, 'Predicted Deaths'))

    Month  Sex  Age Group  Predicted Deaths  True Deaths
95      3    1       79.5            1116.0         1116
94      3    1       69.5            1048.0         1048
93      3    1       57.0             938.0          938
     Month  Sex  Age Group  Predicted Deaths  True Deaths
159      4   -1       85.0           12602.0        12602
155      4   -1       79.5           12602.0         7865
158      4    1       79.5            9844.0         9844
     Month  Sex  Age Group  Predicted Deaths  True Deaths
223      5   -1       85.0            8854.0         8854
221      5    1       85.0            4876.0         4876
222      5    1       79.5            4693.0         5169
     Month  Sex  Age Group  Predicted Deaths  True Deaths
283      6    1       85.0           3487.00         2046
287      6   -1       85.0           3470.00         3470
282      6    1       57.0           2501.75         2008
     Month  Sex  Age Group  Predicted Deaths  True Deaths
351      7   -1   

In [None]:
# Repeat the depth grid search for the rank target (y2).
# Note: this rebinds `parameters`, `clf` and `tree_model` from the deaths search.
# candidate tree depths: 3, 4, ..., 19
parameters = {'max_depth': range(3, 20)}
# random_state is fixed so that ties between equally good splits are resolved the
# same way on every run, keeping the selected depth and score reproducible.
clf = GridSearchCV(DecisionTreeRegressor(random_state=0), parameters)
clf.fit(X_train, y2_train)
# best_estimator_ is the winning configuration refitted by GridSearchCV (default refit=True)
tree_model = clf.best_estimator_
print(clf.best_score_, clf.best_params_)