In [1]:
import pandas as pd
from sklearn import preprocessing
# One MinMaxScaler (default settings) shared by the scaling helpers below.
# Each of them calls fit_transform, so the scaler is re-fitted on every use.
min_max_scaler = preprocessing.MinMaxScaler()

In [2]:
# p(A): scale and process the probability of spread at hubs (i.e. neighbours)
def get_scaled_spread(prob_spread):
    """Min-max scale the spread features and derive the combined ``pA`` score.

    Args:
        prob_spread: DataFrame with a ``fips_code`` column plus feature columns;
            ``covid_rate`` and ``social_score`` must be among them.

    Returns:
        DataFrame indexed by ``fips_code`` containing every feature column
        rescaled by the shared ``min_max_scaler``, plus a ``pA`` column equal to
        the product of the scaled ``covid_rate`` and ``social_score``.
    """
    indexed = prob_spread.set_index('fips_code')
    scaled = pd.DataFrame(
        min_max_scaler.fit_transform(indexed.values),
        index=indexed.index,
        columns=indexed.columns,
    )

    # Combination formula for p(A); this is the place to tune the model.
    scaled['pA'] = scaled['covid_rate'] * scaled['social_score']

    return scaled

# Code for extracting list of neighbours
def get_neighbours(source='https://www2.census.gov/geo/docs/reference/county_adjacency.txt'):
    """Load the county adjacency list and pivot it into per-county neighbour columns.

    Args:
        source: Anything ``pd.read_csv`` accepts (URL, path or file-like object)
            that yields the tab-separated, header-less adjacency table.
            Defaults to the Census Bureau county adjacency file.

    Returns:
        DataFrame with one column per ``county1`` code (as int; the column axis
        is named ``county1``). Each row keeps its row label from the adjacency
        table and holds the neighbouring ``county2`` code in the column of the
        county it belongs to, with NaN everywhere else.
    """
    adjacency = pd.read_csv(source, header=None, delimiter='\t', encoding="ISO-8859-1")
    # Presumably only the first row of each county's group carries the county1
    # name/code, so the blanks are forward-filled -- confirm against the file.
    # .ffill() replaces the deprecated fillna(method='ffill').
    adjacency = adjacency.ffill()
    adjacency = adjacency.rename(
        columns={0: 'countyname1', 1: 'county1', 2: 'countyname2', 3: 'county2'}
    )
    # Drop rows where a county is listed as adjacent to itself. Copy so the
    # assignment below modifies an independent frame rather than a slice.
    adjacency = adjacency[adjacency['county1'] != adjacency['county2']].copy()
    adjacency['county1'] = adjacency['county1'].astype(int)
    neighbours = (
        adjacency[['county1', 'county2']]
        .pivot(columns='county1', values='county2')
        .dropna(how='all')
    )

    return neighbours

# Collecting the Prob(A) probabilities of neighbours spreading covid
def get_probA(prob_spread, neighbours):
    """Attach each neighbour's ``pA`` score next to the neighbour codes.

    Args:
        prob_spread: Raw spread features, passed to ``get_scaled_spread``.
        neighbours: Frame whose columns are county codes and whose cells hold
            neighbouring county codes (NaN where there is none).

    Returns:
        A copy of ``neighbours`` with one extra column per original column,
        labelled ``"<county>pA"``, holding the ``pA`` value looked up for each
        neighbour code (NaN where the code has no ``pA`` entry).
    """
    # Mapping of fips_code -> pA taken from the scaled spread table.
    pa_lookup = get_scaled_spread(prob_spread)['pA'].to_dict()

    neighboursprob = neighbours.copy()
    for county, adjacent_codes in neighbours.items():
        neighboursprob[f'{county!s}pA'] = adjacent_codes.map(pa_lookup)

    return neighboursprob

In [3]:
# For each county, get average of neighbour's prob_spread scores
def get_spread_scores(prob_spread):
    """Average the ``pA`` scores of every county's neighbours.

    Args:
        prob_spread: Raw spread features, forwarded to ``get_probA``.

    Returns:
        DataFrame indexed by ``fips_code`` (string form of the county code)
        with a ``county1`` column holding the ``"<county>pA"`` labels and a
        ``prob_spread`` column holding the mean neighbour ``pA``.
    """
    neighbours = get_neighbours()
    with_scores = get_probA(prob_spread, neighbours)

    # Keep only the "<county>pA" columns that get_probA added.
    pa_only = with_scores.drop(columns=neighbours.columns)

    # Column-wise mean; this aggregation is the place to tune the model.
    # reset_index() is expected to expose the column labels as 'county1',
    # the axis name produced by the pivot in get_neighbours.
    scores = pa_only.mean().reset_index()

    # "<county>pA" -> "<county>": keep the text before the first 'p'.
    scores['fips_code'] = scores['county1'].str.split('p', expand=True)[0]

    return scores.rename(columns={0: 'prob_spread'}).set_index('fips_code')


In [4]:
# p(B) Scale and process for county strength prob(B)
def get_scaled_county(countystrength):
    """Min-max scale the county strength features and derive the ``pB`` score.

    Args:
        countystrength: DataFrame with a ``fips_code`` column plus feature
            columns; ``county_health``, ``social_score`` and
            ``preparedness_score`` must be among them.

    Returns:
        DataFrame indexed by ``fips_code`` containing every feature column
        rescaled by the shared ``min_max_scaler``, plus a ``pB`` column.
    """
    indexed = countystrength.set_index('fips_code')
    scaled = pd.DataFrame(
        min_max_scaler.fit_transform(indexed.values),
        index=indexed.index,
        columns=indexed.columns,
    )

    # Combination formula for p(B); this is the place to tune the model.
    # NOTE(review): pB is a sum of three scaled terms, so it can exceed 1 --
    # confirm that is intended where callers compute 1 - pB.
    inverted_social = 1 - scaled['social_score']
    scaled['pB'] = scaled['county_health'] + inverted_social + scaled['preparedness_score']

    return scaled

# Probability of a county getting covid
def get_covid_prob(countystrength,mean):
    """Combine neighbour spread and county strength into a per-county score.

    Args:
        countystrength: Raw county strength features, passed to
            ``get_scaled_county``.
        mean: Frame with a ``prob_spread`` column, indexed by the string form
            of the county code.

    Returns:
        dict mapping county code (as found in the scaled strength index) to
        ``prob_spread * (1 - pB)``. Counties whose string code is missing from
        ``mean.index`` are left out.
    """
    strength = get_scaled_county(countystrength)
    scored_codes = mean.index
    # The scoring formula below is the place to tune the model.
    return {
        county: mean.at[str(county), "prob_spread"] * (1 - strength.at[county, "pB"])
        for county in strength.index
        if str(county) in scored_codes
    }

In [5]:
def get_predictions(prob_spread,countystrength):
    """Run the scoring pipeline: neighbour spread scores, then per-county scores.

    Args:
        prob_spread: Raw spread features, passed to ``get_spread_scores``.
        countystrength: Raw county strength features, passed to
            ``get_covid_prob``.

    Returns:
        The dict produced by ``get_covid_prob``.
    """
    neighbour_scores = get_spread_scores(prob_spread)
    return get_covid_prob(countystrength, neighbour_scores)


In [6]:
def covid_predictions(pn,pred_month):
    """Rank the county scores, save the rescaled ranking and return the top 100.

    Args:
        pn: dict mapping county code to its raw score.
        pred_month: Month name used to build the output file name.

    Returns:
        The first 100 rows of the ranking (highest score first), indexed by
        county code, with the *unscaled* score in ``probability_covid``.

    Side effects:
        Prints the output path and writes the min-max scaled ranking to
        ``Predictions/<pred_month>_predictions.csv``. The ``Predictions``
        directory is created if it does not exist yet.
    """
    import os

    ranked = (
        pd.DataFrame.from_dict(pn, orient='index')
        .sort_values(0, ascending=False)
        .rename(columns={0: 'probability_covid'})
    )

    # Only the exported copy is rescaled; the returned frame keeps raw scores.
    ranked_scaled = pd.DataFrame(
        min_max_scaler.fit_transform(ranked.values),
        index=ranked.index,
        columns=ranked.columns,
    )
    ranked_scaled = ranked_scaled.reset_index().rename(columns={"index": "fips_code"})

    # to_csv does not create missing directories, so make sure it exists.
    os.makedirs('Predictions', exist_ok=True)
    filename = 'Predictions/' + pred_month + '_predictions.csv'
    print(filename)
    ranked_scaled.to_csv(filename, index=False)

    return ranked[0:100]

# comparing with some ground truth
def groundtruth(pred_month):
    """Return the 100 rows with the most new cases in ``pred_month``.

    Reads ``monthly_rural_new_cases.csv`` from the working directory.

    Args:
        pred_month: Name of the month column to rank by.

    Returns:
        DataFrame with columns ``fips_code`` and ``pred_month``, sorted by the
        month column in descending order and cut to the first 100 rows.
    """
    monthly_cases = pd.read_csv('monthly_rural_new_cases.csv')
    month_only = monthly_cases[['fips_code', pred_month]]
    ranked = month_only.sort_values(pred_month, ascending=False)
    return ranked.iloc[:100]

def get_accuracy(pn, pred_month):
    """Flag which of the top ground-truth counties were also predicted.

    Args:
        pn: dict of county scores, passed to ``covid_predictions``.
        pred_month: Month name passed to both ``groundtruth`` and
            ``covid_predictions``.

    Returns:
        Tuple ``(hits, actual_top)``: ``actual_top`` is the ground-truth frame
        and ``hits`` is a boolean Series aligned with it, True where the
        row's ``fips_code`` appears in the index of the predicted top rows.
    """
    actual_top = groundtruth(pred_month)
    predicted_top = covid_predictions(pn, pred_month)
    hits = actual_top['fips_code'].isin(predicted_top.index)
    return hits, actual_top

In [7]:
def main(prob_spread,countystrength,pred_month="July"):
    """Score the counties and compare the top predictions with ground truth.

    Args:
        prob_spread: Raw spread features, passed to ``get_predictions``.
        countystrength: Raw county strength features, passed to
            ``get_predictions``.
        pred_month: Month to evaluate against; defaults to ``"July"``.

    Returns:
        The ``(hits, actual_top)`` pair produced by ``get_accuracy``.
    """
    scores = get_predictions(prob_spread, countystrength)
    hits, actual_top = get_accuracy(scores, pred_month)
    return hits, actual_top

In [8]:
def getruralstrength(nfile):
    """Load county strength data and keep only the rural counties.

    Reads ``final_rural_counties.csv`` from the working directory to get the
    set of rural county codes.

    Args:
        nfile: Path of the county strength CSV; it must have a ``fips_code``
            column.

    Returns:
        The rows of ``nfile`` whose ``fips_code`` (cast to int) appears in the
        rural counties file.
    """
    strength = pd.read_csv(nfile).assign(
        fips_code=lambda frame: frame['fips_code'].astype(int)
    )

    rural = pd.read_csv('final_rural_counties.csv')
    rural_codes = rural['fips_code'].astype(int).unique()

    is_rural = strength['fips_code'].isin(rural_codes)
    return strength[is_rural]
    

# START HERE:

In [9]:
# Pairs of consecutive months: the first names the input files to load, the
# second is the month passed to main() for the ground-truth comparison.
months = [['April','May'],['May','June'],['June','July'],['July','August'],['August','September'],['September','October']]

for month in months:
    # Both input files are keyed by the first month of the pair.
    h2nfile = 'all_counties_model/' + month[0] + '-h2n-rate.csv'
    nfile = 'all_counties_model/' + month[0] + '-n.csv'
    prob_spread = pd.read_csv(h2nfile)
    prob_spread['fips_code'] = prob_spread['fips_code'].astype(int)
    # County strength data restricted to rural counties.
    countystrength = getruralstrength(nfile)
    
    # top100 is the ground-truth frame; top100common is the boolean mask of its
    # rows that also appear among the predictions.
    top100common,top100 = main(prob_spread,countystrength,month[1])
    print(month)
    # Count of matching rows, followed by the matching rows themselves.
    print(top100[top100common].count())
    print(top100[top100common])
    
# The predictions got weaker in the later months, possibly because rural counties were already experiencing covid by then, so the external factors no longer influenced the spread.


Predictions/May_predictions.csv
['April', 'May']
fips_code    28
May          28
dtype: int64
      fips_code     May
973     35031.0  1388.0
419     37061.0   739.0
1066    28099.0   459.0
1297    37163.0   416.0
1055    21177.0   360.0
161      1013.0   358.0
1320    28123.0   334.0
668     28051.0   296.0
848     28079.0   223.0
54      28007.0   199.0
1072    28101.0   190.0
190     24011.0   188.0
1570    18181.0   188.0
369      1049.0   171.0
1582    45089.0   169.0
905     28087.0   157.0
853     45061.0   155.0
237     45025.0   145.0
1419     1123.0   142.0
1443    13277.0   139.0
897     21141.0   135.0
70       1005.0   128.0
931      1091.0   121.0
857     28083.0   116.0
1257    37153.0   116.0
398     45033.0   113.0
267      1025.0   112.0
247      1023.0   110.0
Predictions/June_predictions.csv
['May', 'June']
fips_code    31
June         31
dtype: int64
      fips_code   June
1297    37163.0  561.0
1443    13277.0  494.0
1450    48449.0  348.0
1409    12121.0  332.0
1