In [1]:
import pandas as pd
from sklearn import preprocessing
# Shared scaler instance. get_scaled_county, get_scaled_spread and
# covid_predictions each call fit_transform on it, so it is refit on every use
# and never carries a fit from one data set over to another.
min_max_scaler = preprocessing.MinMaxScaler()

In [2]:
# p(B): min-max scale the county-strength indicators and combine them into 'pB'
def get_scaled_county(countystrength):
    """Return the county-strength table scaled column-wise, plus a 'pB' column.

    The input must contain a 'fips_code' column (used as the index) and the
    columns 'county_health', 'social_score' and 'preparedness_score'. Every
    remaining column is passed through the shared ``min_max_scaler``.
    """
    indexed = countystrength.set_index('fips_code')
    scaled = pd.DataFrame(
        min_max_scaler.fit_transform(indexed.values),
        index=indexed.index,
        columns=indexed.columns,
    )

    # tune here: pB is a plain sum of three scaled terms (social_score is
    # inverted). Variants tried earlier dropped either the county_health term
    # or the social_score term.
    # NOTE(review): as a sum of three terms pB is presumably not bounded to
    # [0, 1]; confirm this is intended where it is later used as (1 - pB).
    scaled['pB'] = (
        scaled['county_health']
        + (1 - scaled['social_score'])
        + scaled['preparedness_score']
    )

    return scaled


In [3]:
# p(A): min-max scale the spread inputs for hubs (i.e. neighbours) and derive 'pA'
def get_scaled_spread(prob_spread):
    """Return the spread table scaled column-wise, plus a 'pA' column.

    The input must contain a 'fips_code' column (used as the index) and the
    columns 'covid_rate' and 'social_score'. Every remaining column is passed
    through the shared ``min_max_scaler``.
    """
    by_fips = prob_spread.set_index('fips_code')
    scaled = pd.DataFrame(
        min_max_scaler.fit_transform(by_fips.values),
        index=by_fips.index,
        columns=by_fips.columns,
    )

    # tune here: pA is the product of the scaled covid rate and social score
    scaled['pA'] = scaled['covid_rate'] * scaled['social_score']

    return scaled
    

In [14]:
# Code for extracting list of neighbours
def get_neighbours(adjacency_source='https://www2.census.gov/geo/docs/reference/county_adjacency.txt'):
    """Build a table of neighbouring counties from a county adjacency file.

    Parameters
    ----------
    adjacency_source : str or path-like or file-like, optional
        Location of a tab-delimited, header-less adjacency file with four
        columns (county name, county code, neighbour name, neighbour code).
        Defaults to the Census Bureau URL, so existing zero-argument calls
        behave as before; pass a local path to work offline or from a cache.

    Returns
    -------
    pandas.DataFrame
        One column per county code (as int, columns axis named 'county1').
        Each row keeps the index of the adjacency-file row it came from and
        holds that row's neighbour code in its county's column (NaN elsewhere).
    """
    adjacency = pd.read_csv(adjacency_source, header=None, delimiter='\t', encoding="ISO-8859-1")
    # Rows after the first of each group leave the county columns blank
    # (presumably by file design), so carry the last seen value forward.
    # DataFrame.ffill() replaces the deprecated fillna(method='ffill').
    adjacency = adjacency.ffill()
    adjacency = adjacency.rename(columns={0: 'countyname1', 1: 'county1', 2: 'countyname2', 3: 'county2'})
    # Drop rows that pair a county with itself. The explicit copy lets the
    # column assignment below act on an independent frame instead of a slice
    # (avoids SettingWithCopyWarning).
    adjacency = adjacency[adjacency['county1'] != adjacency['county2']].copy()
    adjacency['county1'] = adjacency['county1'].astype(int)
    neighbours = (
        adjacency[['county1', 'county2']]
        .pivot(columns='county1', values='county2')
        .dropna(how='all')
    )

    return neighbours
    

In [5]:
# Look up the Prob(A) spread score of every neighbour listed for each county
def get_probA(prob_spread_scaled, neighbours):
    """Append one '<county>pA' column per county column in ``neighbours``.

    Each new column holds the 'pA' value (taken from ``prob_spread_scaled``,
    keyed by its index) of the neighbour code found in the matching county
    column. Neighbour codes with no 'pA' entry map to NaN. The original county
    columns are kept in front of the new ones.
    """
    spread_by_fips = prob_spread_scaled['pA'].to_dict()
    pa_columns = {
        str(fips) + 'pA': neighbours[fips].map(spread_by_fips)
        for fips in neighbours.columns
    }
    # assign() copies the frame and then inserts the columns in dict order
    return neighbours.assign(**pa_columns)

In [6]:
# For each county, reduce its neighbours' prob_spread scores to a single value.
# The reduction currently used is the maximum over the neighbours.
### alternatively, use the mean or the sum of the neighbours (then scale again)
def get_spread_scores(neighboursprob, neighbours):
    """Return one 'prob_spread' value per county, indexed by 'fips_code'.

    Only the '<county>pA' columns are reduced: the county columns named in
    ``neighbours`` are dropped first. The county code is recovered as the text
    before the first 'p' of each column label, so the resulting index holds
    strings. The columns axis must be named 'county1', because that name
    becomes the column read back after ``reset_index``.
    """
    scores = (
        neighboursprob.drop(columns=neighbours.columns)
        .max()  # tune here
        .reset_index()
        .rename(columns={0: 'prob_spread'})
    )
    scores['fips_code'] = scores['county1'].str.split('p', expand=True)[0]

    return scores.set_index('fips_code')


In [7]:
# Score of a county getting covid: neighbour spread weighted by (1 - pB)
def get_covid_prob(countystrength_scaled, mean):
    """Return ``{county: prob_spread * (1 - pB)}`` for counties present in both inputs.

    ``mean`` is indexed by the county code as a string, so each county label
    from ``countystrength_scaled`` is converted with ``str`` before the lookup.
    Counties without a spread score are left out of the result.
    """
    scored = mean.index
    return {
        fips: mean.at[str(fips), "prob_spread"] * (1 - countystrength_scaled.at[fips, "pB"])  # tune here
        for fips in countystrength_scaled.index
        if str(fips) in scored
    }

In [8]:
def covid_predictions(pn, pred_month):
    """Rank the county scores, export them, and return the 100 highest.

    ``pn`` maps county code -> score. The scores are sorted in descending
    order, rescaled with the shared ``min_max_scaler`` and written to
    '<pred_month>_predictions.csv' (columns 'fips_code', 'probability_covid').
    The file name is printed. The returned frame holds the first 100 rows of
    the sorted, *unscaled* scores, indexed by county code.
    """
    ranked = (
        pd.DataFrame.from_dict(pn, orient='index')
        .sort_values(0, ascending=False)
        .rename(columns={0: 'probability_covid'})
    )

    rescaled = pd.DataFrame(
        min_max_scaler.fit_transform(ranked.values),
        index=ranked.index,
        columns=ranked.columns,
    )
    export = rescaled.reset_index().rename(columns={"index": "fips_code"})

    filename = pred_month + '_predictions.csv'
    print(filename)
    export.to_csv(filename, index=False)

    return ranked[0:100]


In [9]:
# Ground truth used for comparison: counties with the most new cases in a month
def groundtruth(pred_month):
    """Return the 100 rows with the highest ``pred_month`` value.

    Reads 'monthly_rural_new_cases.csv' from the working directory and keeps
    only the 'fips_code' and ``pred_month`` columns, sorted in descending
    order of ``pred_month``.
    """
    monthly_cases = pd.read_csv('monthly_rural_new_cases.csv')
    ranked = monthly_cases[['fips_code', pred_month]].sort_values(pred_month, ascending=False)
    return ranked.head(100)

# def getcommon(top20, top20pred):
#     common = top20.fips_code.isin(top20pred.index)
#     return common

def getcommonall(top, top_pred):
    """Flag the rows of ``top`` whose 'fips_code' also appears in ``top_pred``'s index.

    Returns a boolean Series aligned with ``top``.
    """
    predicted_codes = top_pred.index
    return top['fips_code'].isin(predicted_codes)

In [10]:
def get_predictions(prob_spread, countystrength):
    """Run the scoring pipeline and return the per-county score dict.

    Steps: scale both input tables, load the county adjacency table, look up
    each neighbour's 'pA', reduce those to one spread score per county, and
    combine that with the county's own 'pB'.
    """
    strength_scaled = get_scaled_county(countystrength)
    spread_scaled = get_scaled_spread(prob_spread)

    adjacency = get_neighbours()
    neighbour_spread = get_probA(spread_scaled, adjacency)
    spread_scores = get_spread_scores(neighbour_spread, adjacency)
    return get_covid_prob(strength_scaled, spread_scores)


In [11]:
def get_accuracy(pn, pred_month="July"):
    """Compare the 100 top-scored counties with the 100 counties with most new cases.

    Returns ``(mask, actual)``: ``actual`` is the ground-truth top-100 table
    for ``pred_month`` and ``mask`` is a boolean Series over its rows marking
    counties that are also in the predicted top 100. Calling this also writes
    the predictions CSV (a side effect of ``covid_predictions``).
    """
    predicted_top = covid_predictions(pn, pred_month)
    actual_top = groundtruth(pred_month)
    return getcommonall(actual_top, predicted_top), actual_top


In [12]:
def main(prob_spread, countystrength, pred_month="July"):
    """Score every county from the two input tables and evaluate against ``pred_month``.

    Returns the ``(mask, actual)`` pair produced by ``get_accuracy``: the
    ground-truth top-100 table and a boolean mask of which of its rows were
    also predicted.
    """
    scores = get_predictions(prob_spread, countystrength)
    return get_accuracy(scores, pred_month)

# START HERE:

In [15]:
# Each entry is [month of the input data, month being predicted].
# To run a single pair, cut the list down,
# e.g. [['july', 'August']] or [['august', 'September']].
months = [['june', 'July'], ['july', 'August'], ['august', 'September']]

for month in months:
    # Inputs are '<month>-h2n.csv' (spread data) and '<month>-n.csv' (county strength)
    h2nfile = f"{month[0]}-h2n.csv"
    nfile = f"{month[0]}-n.csv"
    prob_spread_jul = pd.read_csv(h2nfile)
    countystrength_jul = pd.read_csv(nfile)

    # Despite the names, these hold the top-100 mask and the top-100 table
    top20common, top20 = main(prob_spread_jul, countystrength_jul, month[1])
    matched = top20[top20common]
    print(month)
    print(matched.count())
    print(matched)


July_predictions.csv
['june', 'July']
fips_code    35
July         35
dtype: int64
      fips_code    July
715     12063.0  1203.0
1531    28151.0   954.0
294     13069.0   759.0
1192     5115.0   728.0
449      1053.0   685.0
118     28011.0   656.0
1409    12121.0   632.0
1514    13299.0   631.0
1132    28107.0   623.0
924     48313.0   615.0
1405    28133.0   613.0
905     28087.0   497.0
218     19033.0   473.0
1190    28115.0   468.0
290     28027.0   449.0
419     37061.0   445.0
530     47053.0   437.0
592     28043.0   429.0
1524    12133.0   382.0
324      1039.0   372.0
1297    37163.0   371.0
857     28083.0   345.0
965     40089.0   344.0
938     28091.0   341.0
1483    28145.0   331.0
1254    22083.0   325.0
933      1093.0   323.0
1022    28095.0   320.0
1591    28159.0   320.0
487      1059.0   319.0
494     22041.0   318.0
65      45009.0   310.0
421     47045.0   301.0
40      13001.0   297.0
838     47097.0   292.0
August_predictions.csv
['july', 'August']
fips_code  

In [16]:
# Same evaluation on the '-change' variants of the input files, for July -> August only.
# Other pairs that can be listed here: ['june', 'July'], ['august', 'September'].
months = [['july', 'August']]

for month in months:
    # Inputs are '<month>-h2n-change.csv' and '<month>-n-change.csv'
    h2nfile = f"{month[0]}-h2n-change.csv"
    nfile = f"{month[0]}-n-change.csv"
    prob_spread_jul = pd.read_csv(h2nfile)
    countystrength_jul = pd.read_csv(nfile)

    # Despite the names, these hold the top-100 mask and the top-100 table
    top20common, top20 = main(prob_spread_jul, countystrength_jul, month[1])
    matched = top20[top20common]
    print(month)
    print(matched.count())
    print(matched)


August_predictions.csv
['july', 'August']
fips_code    34
August       34
dtype: int64
      fips_code  August
852     28081.0  1015.0
715     12063.0   770.0
1549    47183.0   613.0
530     47053.0   578.0
118     28011.0   575.0
1531    28151.0   570.0
1524    12133.0   511.0
294     13069.0   494.0
1455    13279.0   480.0
619     47069.0   478.0
650     47077.0   458.0
1093    47131.0   452.0
421     47045.0   449.0
857     28083.0   421.0
1101    28105.0   400.0
34      48005.0   399.0
646     47075.0   381.0
1571    47185.0   378.0
1483    28145.0   378.0
443     13107.0   367.0
1405    28133.0   367.0
1022    28095.0   362.0
1190    28115.0   360.0
290     28027.0   352.0
905     28087.0   344.0
1582    45089.0   340.0
40      13001.0   331.0
1059    48347.0   328.0
838     47097.0   314.0
624     47071.0   314.0
1206    28117.0   306.0
494     22041.0   301.0
588     51081.0   292.0
730     13161.0   292.0
