In [74]:
import numpy as np
import pandas as pd

In [75]:
df = pd.read_csv('./us-states.csv')
df = df[df.date == max(df.date)]
df = df[(df.state != 'Guam') & (df.state != 'Puerto Rico') & (df.state != 'Virgin Islands')]
df.drop(['fips', 'date'], axis=1, inplace=True)
df.columns = ['State', 'Infected', 'Deaths']
df.head(3)

Unnamed: 0,State,Infected,Deaths
1281,Alabama,538,3
1282,Alaska,102,0
1283,Arizona,508,8


### Live Testing data
(could maybe use this data for number of infections and deaths per state)  
http://coronavirusapi.com/states.csv

In [76]:
states = {
        'AK': 'Alaska',
        'AL': 'Alabama',
        'AR': 'Arkansas',
        'AS': 'American Samoa',
        'AZ': 'Arizona',
        'CA': 'California',
        'CO': 'Colorado',
        'CT': 'Connecticut',
        'DC': 'District of Columbia',
        'DE': 'Delaware',
        'FL': 'Florida',
        'GA': 'Georgia',
        'GU': 'Guam',
        'HI': 'Hawaii',
        'IA': 'Iowa',
        'ID': 'Idaho',
        'IL': 'Illinois',
        'IN': 'Indiana',
        'KS': 'Kansas',
        'KY': 'Kentucky',
        'LA': 'Louisiana',
        'MA': 'Massachusetts',
        'MD': 'Maryland',
        'ME': 'Maine',
        'MI': 'Michigan',
        'MN': 'Minnesota',
        'MO': 'Missouri',
        'MP': 'Northern Mariana Islands',
        'MS': 'Mississippi',
        'MT': 'Montana',
        'NA': 'National',
        'NC': 'North Carolina',
        'ND': 'North Dakota',
        'NE': 'Nebraska',
        'NH': 'New Hampshire',
        'NJ': 'New Jersey',
        'NM': 'New Mexico',
        'NV': 'Nevada',
        'NY': 'New York',
        'OH': 'Ohio',
        'OK': 'Oklahoma',
        'OR': 'Oregon',
        'PA': 'Pennsylvania',
        'PR': 'Puerto Rico',
        'RI': 'Rhode Island',
        'SC': 'South Carolina',
        'SD': 'South Dakota',
        'TN': 'Tennessee',
        'TX': 'Texas',
        'UT': 'Utah',
        'VA': 'Virginia',
        'VI': 'Virgin Islands',
        'VT': 'Vermont',
        'WA': 'Washington',
        'WI': 'Wisconsin',
        'WV': 'West Virginia',
        'WY': 'Wyoming'
}

In [77]:
df_test = pd.read_csv('http://coronavirusapi.com/states.csv')
df_test['name,'] = df_test['name,'].apply(lambda r: states[r])
df_test = df_test[['name,', 'tested']]
df_test.columns = ['State', 'Tested']
df_test.head(3)

Unnamed: 0,State,Tested
0,Alaska,3334
1,Alabama,4755
2,Arkansas,3398


In [78]:
df = df.merge(df_test, on='State', how='inner')

### Gini coefficient

In [79]:
df = df.merge(pd.read_csv('./predictor_data/gini.csv'), on='State', how='inner')

### Total Population, ICU beds, percentage and amount age 60+

In [80]:
cols_of_interest = ['state', 'Total_pop', 'all_icu', '60plus', '60plus_per_each_icu_bed']
df_icu = pd.read_excel('./predictor_data/icu_beds.xlsx', usecols=cols_of_interest)
df_icu = df_icu.groupby('state').sum().reset_index()
df_icu['60plus_pct'] = df_icu['60plus'] / df_icu['Total_pop']
df_icu.columns = ['State', 'ICU Beds', 'Total Population', '60plus', '60plus per bed', '60plus pct pop']
df_icu.head(3)

Unnamed: 0,State,ICU Beds,Total Population,60plus,60plus per bed,60plus pct pop
0,Alabama,1533,4850771,1065625,53854.0,0.219682
1,Alaska,119,738565,117047,6424.0,0.158479
2,Arizona,1559,6809946,1502688,21304.0,0.220661


In [81]:
df = df.merge(df_icu, on='State', how='inner')

### Income per capita

In [82]:
df = df.merge(pd.read_csv('./predictor_data/income_per_capita.csv'), on='State', how='inner')
df.shape

(51, 11)

### GDP

In [83]:
df = df.merge(pd.read_csv('./predictor_data/state_gdp.csv'), on='State', how='inner')

### Unemployment

In [84]:
df = df.merge(pd.read_csv('./predictor_data/unemployment.csv'), on='State', how='inner')

### Sex

In [85]:
df_sex = pd.read_csv('./predictor_data/sex.csv')
df_sex['Sex Ratio'] = df_sex.Male / df_sex.Female 
df_sex = df_sex[['Location', 'Sex Ratio']]
df_sex.columns = ['State', 'Sex Ratio']

In [86]:
df = df.merge(df_sex, on='State', how='inner')

### Smoking

In [87]:
df_smoke = pd.read_csv('./predictor_data/smoking.csv')
df_smoke.columns = ['State', 'Smoking Rate']

In [88]:
df = df.merge(df_smoke, on='State', how='inner')

### Median Age

In [96]:
df_age = pd.read_csv('./predictor_data/age.csv')
df_age = df_age[['State', 'MedianAge', 'MedianAgeMale', 'MedianAgeFemale']]
df_age.columns = ['State', 'Median Age','Median Age M', 'Median Age F']

In [97]:
df = df.merge(df_age, on='State', how='inner')

### 

### 

In [99]:
df.head()
# df.to_csv('COVID19_state.csv', index=False)

Unnamed: 0,State,Infected,Deaths,Tested,Gini,ICU Beds,Total Population,60plus,60plus per bed,60plus pct pop,Income Per Capita,GDP per capita,Unemployment,Sex Ratio,Smoking Rate,Median Age,Median Age M,Median Age F
0,Alabama,538,3,4755,0.4847,1533,4850771,1065625,53854.0,0.219682,42334,45219,2.7,0.930145,20.9,38.9,37.4,40.3
1,Alaska,102,0,3334,0.4081,119,738565,117047,6424.0,0.158479,59687,73205,5.8,1.054688,21.0,34.0,33.6,34.5
2,Arizona,508,8,1164,0.4713,1559,6809946,1502688,21304.0,0.220661,43650,48055,4.5,0.966965,15.6,37.4,36.1,38.7
3,Arkansas,351,3,3398,0.4719,732,2977944,655552,27536.0,0.220136,42566,42454,3.5,0.95663,22.3,37.9,36.6,39.3
4,California,4060,82,89600,0.4899,7338,38982847,7292299,68758.0,0.187064,62586,74205,3.9,0.975113,11.3,36.3,35.2,37.5
