In [20]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import os
import urllib.request
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split
from sklearn import metrics
from sklearn.metrics import accuracy_score
from sklearn import datasets
from sklearn.tree import DecisionTreeClassifier

%matplotlib inline
import plotnine as p9

In [60]:
# Load only the first 7,000 rows of the facility-level CSV. Passing nrows lets
# the parser stop early instead of reading the entire file and then discarding
# everything past row 7,000.
# NOTE(review): the saved output of `len(data)` in this notebook reports 400000,
# so this cell was evidently last executed with a different row limit --
# re-run the notebook from a fresh kernel to refresh the stored outputs.
data = pd.read_csv(
    'COVID-19_Reported_Patient_Impact_and_Hospital_Capacity_by_Facility.csv',
    nrows=7000,
)
data.head()



Unnamed: 0,hospital_pk,collection_week,state,ccn,hospital_name,address,city,zip,hospital_subtype,fips_code,...,previous_day_admission_pediatric_covid_confirmed_unknown_7_day_sum,staffed_icu_pediatric_patients_confirmed_covid_7_day_avg,staffed_icu_pediatric_patients_confirmed_covid_7_day_coverage,staffed_icu_pediatric_patients_confirmed_covid_7_day_sum,staffed_pediatric_icu_bed_occupancy_7_day_avg,staffed_pediatric_icu_bed_occupancy_7_day_coverage,staffed_pediatric_icu_bed_occupancy_7_day_sum,total_staffed_pediatric_icu_beds_7_day_avg,total_staffed_pediatric_icu_beds_7_day_coverage,total_staffed_pediatric_icu_beds_7_day_sum
0,52052,2020/05/29,CA,52052,KINDRED HOSPITAL RIVERSIDE,2224 MEDICAL CENTER DRIVE,PERRIS,92571.0,Long Term,6065.0,...,,,0,,,0,,,0,
1,70038,2020/04/24,CT,70038,"CONNECTICUT HOSPICE INC,THE",100 DOUBLE BEACH ROAD,BRANFORD,6405.0,Short Term,9009.0,...,,,0,,,0,,,0,
2,100034,2020/05/01,FL,100034,"MOUNT SINAI MEDICAL CENTER OF FLORIDA, INC",4300 ALTON RD,MIAMI BEACH,33140.0,Short Term,12086.0,...,,,0,,,0,,,0,
3,100069,2020/05/15,FL,100069,ADVENTHEALTH CARROLLWOOD,7171 N DALE MABRY HWY,TAMPA,33614.0,Short Term,12057.0,...,,,0,,,0,,,0,
4,100070,2020/04/24,FL,100070,SHOREPOINT HEALTH VENICE,540 THE RIALTO,VENICE,34285.0,Short Term,12115.0,...,,,0,,,0,,,0,


In [61]:
# Number of rows currently loaded.
data.shape[0]

400000

In [62]:
# List every column name so the feature subset can be picked by hand.
list(data.columns)

['hospital_pk',
 'collection_week',
 'state',
 'ccn',
 'hospital_name',
 'address',
 'city',
 'zip',
 'hospital_subtype',
 'fips_code',
 'is_metro_micro',
 'total_beds_7_day_avg',
 'all_adult_hospital_beds_7_day_avg',
 'all_adult_hospital_inpatient_beds_7_day_avg',
 'inpatient_beds_used_7_day_avg',
 'all_adult_hospital_inpatient_bed_occupied_7_day_avg',
 'inpatient_beds_used_covid_7_day_avg',
 'total_adult_patients_hospitalized_confirmed_and_suspected_covid_7_day_avg',
 'total_adult_patients_hospitalized_confirmed_covid_7_day_avg',
 'total_pediatric_patients_hospitalized_confirmed_and_suspected_covid_7_day_avg',
 'total_pediatric_patients_hospitalized_confirmed_covid_7_day_avg',
 'inpatient_beds_7_day_avg',
 'total_icu_beds_7_day_avg',
 'total_staffed_adult_icu_beds_7_day_avg',
 'icu_beds_used_7_day_avg',
 'staffed_adult_icu_bed_occupancy_7_day_avg',
 'staffed_icu_adult_patients_confirmed_and_suspected_covid_7_day_avg',
 'staffed_icu_adult_patients_confirmed_covid_7_day_avg',
 'total_p

In [63]:
def ifZero(x):
    """Floor a value at zero.

    Returns 0 when ``x`` is negative; otherwise returns ``x`` unchanged
    (including zero and positive values).
    """
    return 0 if x < 0 else x

In [64]:
# Columns (all 7-day sums) pulled out of the full dataset for modelling.
features = [
    'inpatient_beds_used_covid_7_day_sum',
    'all_adult_hospital_inpatient_beds_7_day_sum',
    'total_beds_7_day_sum',
    'inpatient_beds_used_7_day_sum',
    'inpatient_beds_7_day_sum',
    'previous_day_admission_adult_covid_confirmed_7_day_sum',
    'previous_day_admission_adult_covid_suspected_7_day_sum',
]

In [65]:
# Restrict the frame to the selected feature columns (all rows kept).
sliced = data.loc[:, features]
sliced

Unnamed: 0,inpatient_beds_used_covid_7_day_sum,all_adult_hospital_inpatient_beds_7_day_sum,total_beds_7_day_sum,inpatient_beds_used_7_day_sum,inpatient_beds_7_day_sum,previous_day_admission_adult_covid_confirmed_7_day_sum,previous_day_admission_adult_covid_suspected_7_day_sum
0,0.0,,280.0,268.0,280.0,,
1,0.0,,,,,,
2,436.0,,,,,,
3,71.0,,2676.0,422.0,679.0,,
4,35.0,,1736.0,536.0,1554.0,,
...,...,...,...,...,...,...,...
399995,47.0,,1172.0,449.0,551.0,,
399996,29.0,140.0,140.0,91.0,140.0,-999999.0,0.0
399997,89.0,1161.0,1161.0,1110.0,1161.0,8.0,9.0
399998,262.0,1558.0,1901.0,1668.0,1798.0,32.0,-999999.0


In [66]:
# Keep only rows where every selected feature is present, then floor negative
# entries at zero. The displayed data contains -999999.0 entries (e.g. row
# 399996), which this step turns into 0.0.
# NOTE(review): -999999 looks like a sentinel rather than a real count --
# confirm that mapping it to 0 is the intended treatment.
# clip(lower=0) is the vectorised equivalent of applying ifZero to every cell;
# it avoids a per-element Python call and DataFrame.applymap, which is
# deprecated in recent pandas releases.
psliced = sliced.dropna().clip(lower=0)
psliced

Unnamed: 0,inpatient_beds_used_covid_7_day_sum,all_adult_hospital_inpatient_beds_7_day_sum,total_beds_7_day_sum,inpatient_beds_used_7_day_sum,inpatient_beds_7_day_sum,previous_day_admission_adult_covid_confirmed_7_day_sum,previous_day_admission_adult_covid_suspected_7_day_sum
39,0.0,154.0,154.0,40.0,154.0,0.0,0.0
167,0.0,77.0,112.0,36.0,77.0,0.0,0.0
188,0.0,70.0,100.0,0.0,70.0,0.0,0.0
192,0.0,0.0,0.0,0.0,0.0,0.0,0.0
201,0.0,175.0,266.0,72.0,175.0,0.0,0.0
...,...,...,...,...,...,...,...
399993,0.0,196.0,280.0,6.0,196.0,0.0,0.0
399994,29.0,770.0,770.0,438.0,770.0,0.0,0.0
399996,29.0,140.0,140.0,91.0,140.0,0.0,0.0
399997,89.0,1161.0,1161.0,1110.0,1161.0,8.0,9.0


In [67]:
# Rows remaining after dropping incomplete records.
psliced.shape[0]

288386

In [68]:
# Build the model arrays as (n, 1) column vectors: one predictor column and
# one target column taken from the cleaned frame.
# NOTE(review): `input` shadows the Python built-in of the same name; it is
# kept here because the train/test split cell refers to it.
predictor_col = 'previous_day_admission_adult_covid_confirmed_7_day_sum'
target_col = 'inpatient_beds_used_covid_7_day_sum'

input = psliced[predictor_col].to_numpy().reshape(-1, 1)
output = psliced[target_col].to_numpy().reshape(-1, 1)

In [69]:
# Hold out 30% of the rows for evaluation; the fixed seed makes the split
# repeatable across runs.
xtrain, xtest, ytrain, ytest = train_test_split(
    input,
    output,
    test_size=0.3,
    random_state=404,
)

In [70]:
# Fit a decision tree on the training split, seeded for repeatability.
# NOTE(review): the target is a 7-day bed-count sum, which a classifier treats
# as a set of discrete class labels -- confirm that classification (rather
# than regression) is what is wanted here.
dtree = DecisionTreeClassifier(random_state=404)
dtree.fit(X=xtrain, y=ytrain)

DecisionTreeClassifier(random_state=404)

In [71]:
# Predict on both splits, then report exact-match accuracy for each.
y_pred = dtree.predict(xtest)
train_pred = dtree.predict(xtrain)

train_accuracy = accuracy_score(ytrain, train_pred)
test_accuracy = accuracy_score(ytest, y_pred)

print("Train data accuracy:", train_accuracy)
print("Test data accuracy:", test_accuracy)

Train data accuracy: 0.31759052855798287
Test data accuracy: 0.31277451569651854
