In [1]:
import json
import os
import pandas as pd
import math

# Data loading and dictionary building

In [2]:
# Run this cell only once: parsing the file can take a very long time.
# Load the COVID-19 county-level data.

json_file_path = 'coronavirus-covid-19-pandemic-usa-counties.json'

# Pass the encoding explicitly so decoding does not depend on the platform's
# default locale (assumes the export is UTF-8, as JSON interchange requires).
with open(json_file_path, 'r', encoding='utf-8') as f:
    data = json.load(f)

# Each record keeps its payload under the 'fields' key; only that part is used.
dataset = [row['fields'] for row in data]
data = None # save memory: drop the reference to the outer record wrappers
df = pd.DataFrame(dataset)

In [3]:
# Load population data (2016 presidential election results by county).
json_file_path = 'usa-2016-presidential-election-by-county.json'

# Pass the encoding explicitly so decoding does not depend on the platform's
# default locale (assumes the export is UTF-8, as JSON interchange requires).
with open(json_file_path, 'r', encoding='utf-8') as f:
    data = json.load(f)

# `population` is the list of raw records; later cells read row['fields'] from it.
population = data

Unnamed: 0,province_state,admin2,fips,location,tot_death,date,tot_confirmed
0,Montana,Flathead,30029,"[48.29575866, -114.0520569]",0,2020-02-27,0
1,Montana,Pondera,30073,"[48.22773388, -112.2252703]",0,2020-02-27,0
2,Montana,Yellowstone,30111,"[45.93955949, -108.2691486]",0,2020-02-27,0
3,Nebraska,Blaine,31009,"[41.91311716, -99.97677845]",0,2020-02-27,0
4,Nebraska,Dixon,31051,"[42.49188363, -96.86782408]",0,2020-02-27,0
...,...,...,...,...,...,...,...
986395,Mississippi,Lawrence,28077,"[31.55147224, -90.10841127]",14,2020-10-03,441
986396,Mississippi,Neshoba,28099,"[32.75339664, -89.11726492]",106,2020-10-03,1645
986397,Mississippi,Newton,28101,"[32.40021118, -89.11843336]",25,2020-10-03,793
986398,Mississippi,Rankin,28121,"[32.26469147, -89.94537876]",80,2020-10-03,3431


In [27]:
# County string mapping
# Given a county string, e.g. 'Dane County, Wisconsin', return a list whose first
# element is the state and whose second element is the county,
# e.g. ['Wisconsin', 'Dane'].
def county_mapping(county_column):
    """Split an election-data county string into ``[state, county]``.

    Parameters
    ----------
    county_column : str
        A string of the form ``'<county> County, <state>'``.

    Returns
    -------
    list of str
        ``[state_name, county_name]``: the text after the first comma with
        surrounding whitespace removed, and the text before the first comma
        with every ``" County"`` occurrence removed and surrounding
        whitespace removed.

    Raises
    ------
    IndexError
        If ``county_column`` contains no comma.
    """
    parts = county_column.split(',')
    county_name = parts[0].replace(" County", "").strip()
    # Strip whitespace instead of blindly dropping the first character, so that
    # 'Pike County,Georgia' and 'Pike County,  Georgia' both yield 'Georgia'.
    state_name = parts[1].strip()
    return [state_name, county_name]

In [29]:
# Build the mapping dictionary: raw election-data county string -> [state, county],
# as produced by county_mapping.
mapping_dict = {
    record['fields']['county']: county_mapping(record['fields']['county'])
    for record in population
}

In [30]:
# Build the population dictionary: (state, county) tuple -> adjusted population.
# Every recorded total_population is scaled by 328/323 and rounded down
# (presumably to rescale the election-data population to a newer national
# total -- confirm the origin of these constants).
adjustment_ratio = 328/323
population_mapping_dict = {}
for record in population:
    fields = record['fields']
    location_key = tuple(county_mapping(fields['county']))
    population_mapping_dict[location_key] = math.floor(fields['total_population']*adjustment_ratio)

In [232]:
# Nationwide totals for the US.
# The summed county populations are scaled by 328/305 (presumably to make up
# for counties missing from the population data -- confirm).
usa_total_population = sum(population_mapping_dict.values())*(328/305)

# Rows of the most recent date present in df; the "_11_16" suffix presumably
# refers to that date (2020-11-16) -- the code itself just takes the maximum.
latest_day_rows = df[df['date'] == df['date'].max()]

usa_total_cases_11_16 = latest_day_rows['tot_confirmed'].sum()
usa_total_deaths_11_16 = latest_day_rows['tot_death'].sum()

# Nationwide cases / deaths per inhabitant, used as seriousness thresholds.
usa_total_cases_ratio_11_16 = usa_total_cases_11_16/usa_total_population
usa_total_deaths_ratio_11_16 = usa_total_deaths_11_16/usa_total_population

# Label generating function

In [40]:
# This dataset has 300 days of coronavirus data, from 2020-01-22 to 2020-11-16.
# To see the details of the dataset, use df.head().

# Interval-based label types:
#   label name -> (cumulative-count column, report the increase per interval?)
_INTERVAL_LABEL_SPECS = {
    "normalized_cases": ("tot_confirmed", False),
    "normalized_deaths": ("tot_death", False),
    "cases_increase_rates": ("tot_confirmed", True),
    "deaths_increase_rates": ("tot_death", True),
}

# Label types that compare the county's latest ratio with the nationwide ratio.
_SERIOUSNESS_LABELS = ("seriousness_label_cases", "seriousness_label_deaths")


def _interval_labels(cumulative, group_date, county_population, as_increase):
    """Cut a date-ordered cumulative series into intervals and normalize by population."""
    num_cuts = len(cumulative)/group_date
    assert(num_cuts >= 1), "group_date is too big"
    full_intervals = math.floor(num_cuts)

    # Positional index of the last row of every full interval ...
    end_positions = [(i + 1)*group_date - 1 for i in range(full_intervals)]
    # ... plus the very last row when the leftover rows form a shorter interval.
    if full_intervals != num_cuts:
        end_positions.append(len(cumulative) - 1)

    totals = [cumulative.iloc[position] for position in end_positions]

    if not as_increase:
        return [total/county_population for total in totals]

    # The first interval reports its end value; every later interval reports
    # the difference to the previous interval's end value.
    increases = [totals[0]/county_population]
    for previous, current in zip(totals, totals[1:]):
        increases.append((current - previous)/county_population)
    return increases


def y_label_generator(state, county, label_type, group_date = 300):
    """Return the list of labels for one county.

    The rows of the global ``df`` whose ``province_state`` / ``admin2`` match
    ``state`` / ``county`` are sorted by ``date``; the label is computed from
    their ``tot_confirmed`` or ``tot_death`` column and from the county's entry
    in the global ``population_mapping_dict``.

    ``state`` and ``county`` can be taken from ``mapping_dict`` or from the
    list returned by ``county_mapping`` (first and second element).

    Parameters
    ----------
    state : str
        Value compared against ``df['province_state']``.
    county : str
        Value compared against ``df['admin2']``. The string ``'nan'`` selects
        the rows whose ``admin2`` is missing.
    label_type : str
        One of:
        1. ``"normalized_cases"``: cumulative cases at the end of each
           interval, divided by the county population.
        2. ``"normalized_deaths"``: same, for deaths.
        3. ``"deaths_increase_rates"``: (d1 - d0) / population per interval.
        4. ``"cases_increase_rates"``: (c1 - c0) / population per interval.
        5. ``"seriousness_label_cases"``: ``[1]`` if the county's latest
           cases / population exceeds ``usa_total_cases_ratio_11_16``, else
           ``[0]``. Not affected by ``group_date``.
        6. ``"seriousness_label_deaths"``: same, for deaths and
           ``usa_total_deaths_ratio_11_16``.
    group_date : int, default 300
        Number of rows per interval. Leftover rows at the end form one extra,
        shorter interval.

    Returns
    -------
    list
        One value per interval for label types 1-4; ``[0]`` or ``[1]`` for
        label types 5 and 6.

    Raises
    ------
    ValueError
        If ``label_type`` is not one of the six supported values.
    AssertionError
        For label types 1-4, if the county has fewer than ``group_date`` rows.
    KeyError
        If ``(state, county)`` is not a key of ``population_mapping_dict``.
    """
    # Fail loudly on a misspelled label type instead of silently returning None.
    if label_type not in _INTERVAL_LABEL_SPECS and label_type not in _SERIOUSNESS_LABELS:
        raise ValueError(
            "unknown label_type %r; expected one of %s"
            % (label_type, sorted(list(_INTERVAL_LABEL_SPECS) + list(_SERIOUSNESS_LABELS)))
        )

    state_rows = df[df['province_state'] == state]
    if county != 'nan':
        county_rows = state_rows[state_rows['admin2'] == county]
    else:
        county_rows = state_rows[state_rows['admin2'].isna()]
    # All rows of the requested county, in time order.
    county_rows = county_rows.sort_values(by = 'date')

    county_population = population_mapping_dict[(state, county)] # total population, for normalization

    if label_type in _INTERVAL_LABEL_SPECS:
        column, as_increase = _INTERVAL_LABEL_SPECS[label_type]
        return _interval_labels(county_rows[column], group_date, county_population, as_increase)

    # Seriousness labels: compare the latest county ratio with the nationwide
    # ratio. The thresholds are read at call time from the notebook globals.
    if label_type == "seriousness_label_cases":
        county_ratio = county_rows['tot_confirmed'].iloc[-1]/county_population
        threshold = usa_total_cases_ratio_11_16
    else:
        county_ratio = county_rows['tot_death'].iloc[-1]/county_population
        threshold = usa_total_deaths_ratio_11_16
    return [1] if county_ratio > threshold else [0]
    

# Sample usage

## County mapping function

Because the county format in the election dataset differs slightly from the one in the COVID dataset, I wrote a mapping function that converts a county string from the election data into the state and county names used in the COVID data. A county in the election data looks like this:

In [235]:
population[0]['fields']['county']

'Pike County, Georgia'

We can apply the county mapping function to the string above to convert it into the inputs used below:

In [236]:
county_mapping(population[0]['fields']['county'])

['Georgia', 'Pike']

In [237]:
county_mapping(population[2]['fields']['county'])

['Washington', 'Franklin']

Note that the first element of the returned list is the state, and the second is the county.

#  y_label_generator(state, county, label_type, group_date = 300)

The inputs of the y label generator are shown above. Note that state and county are the first and second elements of the list returned by the county_mapping function.

label_type takes one of 6 values:
1. "normalized_cases" 
2. "normalized_deaths" 
3. "deaths_increase_rates"  (increase in deaths over each interval, divided by the county population) 
4. "cases_increase_rates"   (increase in cases over each interval, divided by the county population) 
5. "seriousness_label_cases" (compared to the whole US; not affected by group_date) (1 means more serious than
    average) 
6. "seriousness_label_deaths" (compared to the whole US; not affected by group_date) (1 means more serious than
    average) 

group_date is the length of each interval that the dataset is cut into. The default value is 300, because there are 300 days of data for each county. For example, if group_date is 30, there will be 300/30 = 10 intervals, and the function will return a list of length 10 holding the value at the end of each interval (label_type 1 and 2) or the increase over each interval (label_type 3 and 4). Note that label_type 5 and 6 are not affected by group_date, since they only compare the seriousness of the total cases or deaths at the last date.
(P.S. It is better to use a proper divisor of 300 to avoid a remainder. The function still works otherwise, because the leftover days form one extra, shorter interval, but I recommend choosing a divisor.)

In [37]:
y_label_generator('Georgia', 'Pike', "normalized_cases", group_date = 300)

[0.03764329512130098]

In [38]:
y_label_generator('Washington', 'Franklin', "cases_increase_rates", group_date = 30)

[0.0,
 0.06666666666666667,
 6.3,
 9.166666666666666,
 21.566666666666666,
 54.666666666666664,
 33.5,
 12.966666666666667,
 16.1,
 29.7]

In [39]:
y_label_generator('Washington', 'Franklin', "normalized_deaths", group_date = 300)

[0.0008800935413706829]

In [251]:
y_label_generator('Wisconsin', 'Dane', "seriousness_label_cases", group_date = 30)

[1]

In [252]:
y_label_generator('Wisconsin', 'Dane', "seriousness_label_deaths", group_date = 30)

[0]