In [1]:
import pandas as pd
from sklearn import preprocessing
min_max_scaler = preprocessing.MinMaxScaler()

In [2]:
# p(B) Scale and process for county strength prob(B)
def get_scaled_county(countystrength):
    countystrength = countystrength.set_index('fips_code')
    countystrength_vals = countystrength.values
#     min_max_scaler = preprocessing.MinMaxScaler()
    countystrength_scaled_vals = min_max_scaler.fit_transform(countystrength_vals)
    countystrength_scaled = pd.DataFrame(countystrength_scaled_vals, index=countystrength.index, columns=countystrength.columns)

    countystrength_scaled['pB'] = countystrength_scaled['county_health'] + (1-countystrength_scaled['social_score']) + countystrength_scaled['preparedness_score'] ## tune here
#     countystrength_scaled['pB'] =  (1-countystrength_scaled['social_score']) + countystrength_scaled['preparedness_score'] ## tune here
#     countystrength_scaled['pB'] = countystrength_scaled['county_health'] + countystrength_scaled['preparedness_score'] ## tune here

    return countystrength_scaled


In [3]:
# p(A) Scale and process for probability of spread at hubs(i.e. neighbours)
def get_scaled_spread(prob_spread):
    prob_spread = prob_spread.set_index('fips_code')
    prob_spread_vals = prob_spread.values
#     min_max_scaler = preprocessing.MinMaxScaler()
    prob_spread_scaled_vals = min_max_scaler.fit_transform(prob_spread_vals)
    prob_spread_scaled = pd.DataFrame(prob_spread_scaled_vals, index=prob_spread.index, columns=prob_spread.columns)

    prob_spread_scaled['pA'] = prob_spread_scaled['covid_rate'] * prob_spread_scaled['social_score'] ## tune here

    return prob_spread_scaled
    

In [4]:
# Code for extracting list of neighbours
def get_neighbours():
    adjacency = pd.read_csv('https://www2.census.gov/geo/docs/reference/county_adjacency.txt', header=None, delimiter='\t', encoding = "ISO-8859-1")
    adjacency = adjacency.fillna(method='ffill')
    adjacency = adjacency.rename(columns={0:'countyname1',1:'county1',2:'countyname2',3:'county2'})
    adjacency = adjacency[adjacency['county1'] != adjacency['county2']]
    adjacency['county1'] = adjacency['county1'].astype(int)
    neighbours = adjacency[['county1','county2']].pivot(columns='county1',values='county2').dropna(how='all')

    return neighbours
    

In [5]:
# Collecting the Prob(A) probabilites of neighbours spreading covid
def get_probA(prob_spread_scaled, neighbours):
    pA_dict = prob_spread_scaled.to_dict()
    neighboursprob = neighbours.copy()
    for county in neighbours.columns:
        newcol = str(county) + 'pA'
        neighboursprob[newcol] = neighbours[county].map(pA_dict['pA'])
        
    return neighboursprob

In [6]:
# For each county, get average of neighbour's prob_spread scores
### alternatively, we can do max of neighbour or sum of neighbours (then scale again)
def get_spread_scores(neighboursprob, neighbours):
    neighboursprob1 = neighboursprob.drop(columns = neighbours.columns)
    mean = neighboursprob1.max().reset_index() #tune here
    mean['fips_code'] = mean['county1'].str.split('p', expand=True)[0]
    mean = mean.rename(columns={0:'prob_spread'})
    mean = mean.set_index('fips_code')
    
    return mean


In [7]:
# Probability of a county getting covid

def get_covid_prob(countystrength_scaled,mean):
    pn = {}
    for county in countystrength_scaled.index:
    #     print(county)
        if str(county) in mean.index :
            pn[county] = mean.at[str(county),"prob_spread"] * (1-countystrength_scaled.at[county,"pB"]) ##tune here
    return pn

In [8]:
def covid_predictions_old(pn,pred_month):
    jul_pred = pd.DataFrame.from_dict(pn, orient='index').sort_values(0, ascending=False)
    jul_pred = jul_pred.rename(columns={0:'probability_covid'})
    
    jul_pred_vals = jul_pred.values
    jul_pred_scaled_vals = min_max_scaler.fit_transform(jul_pred_vals)
    jul_pred_scaled = pd.DataFrame(jul_pred_scaled_vals, index=jul_pred.index, columns=jul_pred.columns)
    jul_pred_scaled = jul_pred_scaled.reset_index().rename(columns={"index":"fips_code"})

    filename = 'Predictions/' + pred_month + '_predictions.csv'
    print(filename)
    jul_pred_scaled.to_csv(filename, index=False)
    top100pred = jul_pred[0:100]
    return top100pred


In [9]:
def covid_predictions(pn,pred_month):
    pred = pd.DataFrame.from_dict(pn, orient='index').sort_values(0, ascending=False)
    pred = pred.rename(columns={0:'probability_covid'})
    pred = pred.reset_index()
    
    rural_counties = pd.read_csv('final_rural_counties.csv')
    rural_counties_l = rural_counties.fips_code.unique()
    rural_pred = pred[pred.index.isin(rural_counties_l)]
    
    pred_vals = rural_pred.values
    pred_scaled_vals = min_max_scaler.fit_transform(pred_vals)
    pred_scaled = pd.DataFrame(pred_scaled_vals, index=rural_pred.index, columns=rural_pred.columns)
    pred_scaled = pred_scaled.reset_index().rename(columns={"index":"fips_code"})
        
    filename = 'Predictions/' + pred_month + '_predictions.csv'
    print(filename)
    pred_scaled.to_csv(filename, index=False)
    top100pred = pred_scaled[0:100]
    return top100pred


In [10]:
# comparing with some ground truth
def groundtruth(pred_month):
    newcases = pd.read_csv('monthly_rural_new_cases.csv')
    newcasesjul = newcases[['fips_code',pred_month]].sort_values(pred_month, ascending=False)
    top100 = newcasesjul[0:100]
    return top100

def getcommonall(top, top_pred):
    common = top.fips_code.isin(top_pred.index)
    return common

In [11]:
def get_predictions(prob_spread,countystrength):
    countystrength_scaled = get_scaled_county(countystrength)
    prob_spread_scaled = get_scaled_spread(prob_spread)

    neighbours = get_neighbours()
    neighboursprob = get_probA(prob_spread_scaled, neighbours)
    mean = get_spread_scores(neighboursprob,neighbours)
    pn = get_covid_prob(countystrength_scaled,mean)
    return pn


In [12]:
def get_accuracy(pn, pred_month="July"):
    top100pred = covid_predictions(pn,pred_month)
    top100 = groundtruth(pred_month)
    top100common = getcommonall(top100, top100pred)

    return top100common, top100


In [13]:
def get_accuracy_old(pn, pred_month="July"):
    top100pred = covid_predictions_old(pn,pred_month)
    top100 = groundtruth(pred_month)
    top100common = getcommonall(top100, top100pred)

    return top100common, top100


In [14]:
def main(prob_spread,countystrength,pred_month="July"):
    
    pn = get_predictions(prob_spread,countystrength)

    top20common,top20 = get_accuracy(pn, pred_month)
    
    return  top20common,top20

In [15]:
def main_old(prob_spread,countystrength,pred_month="July"):
    
    pn = get_predictions(prob_spread,countystrength)

    top20common,top20 = get_accuracy_old(pn, pred_month)
    
    return  top20common,top20

In [16]:
def getruralstrength(nfile):
    countystrength1 = pd.read_csv(nfile)
    countystrength1['fips_code'] = countystrength1['fips_code'].astype(int)
    
    rural_counties = pd.read_csv('final_rural_counties.csv')
    rural_counties['fips_code'] = rural_counties['fips_code'].astype(int)
    rural_counties_l = rural_counties.fips_code.unique()
    rural_strength = countystrength1[countystrength1.fips_code.isin(rural_counties_l)]
    
    return rural_strength
    

# START HERE:

In [17]:
months = [['April','May'],['May','June'],['June','July'],['July','August'],['August','September'],['September','October']]
# months = [['july','August']]
# months = [['August','September']]

for month in months:
    h2nfile = 'all_counties_model/' + month[0] + '-h2n-rate.csv'
    nfile = 'all_counties_model/' + month[0] + '-n.csv'
    prob_spread = pd.read_csv(h2nfile)
    prob_spread['fips_code'] = prob_spread['fips_code'].astype(int)
    countystrength = getruralstrength(nfile)
    
    top20common,top20 = main(prob_spread,countystrength,month[1])
    print(month)
    print(top20[top20common].count())
    print(top20[top20common])


Predictions/May_predictions.csv
['April', 'May']
fips_code    0
May          0
dtype: int64
Empty DataFrame
Columns: [fips_code, May]
Index: []
Predictions/June_predictions.csv
['May', 'June']
fips_code    0
June         0
dtype: int64
Empty DataFrame
Columns: [fips_code, June]
Index: []
Predictions/July_predictions.csv
['June', 'July']
fips_code    0
July         0
dtype: int64
Empty DataFrame
Columns: [fips_code, July]
Index: []
Predictions/August_predictions.csv
['July', 'August']
fips_code    0
August       0
dtype: int64
Empty DataFrame
Columns: [fips_code, August]
Index: []
Predictions/September_predictions.csv
['August', 'September']
fips_code    0
September    0
dtype: int64
Empty DataFrame
Columns: [fips_code, September]
Index: []
Predictions/October_predictions.csv
['September', 'October']
fips_code    0
October      0
dtype: int64
Empty DataFrame
Columns: [fips_code, October]
Index: []


In [None]:
months = [['April','May'],['May','June'],['June','July'],['July','August'],['August','September'],['September','October']]
# months = [['july','August']]
# months = [['August','September']]

for month in months:
    h2nfile = 'all_counties_model/' + month[0] + '-h2n-rate.csv'
    nfile = 'all_counties_model/' + month[0] + '-n.csv'
    prob_spread = pd.read_csv(h2nfile)
    prob_spread['fips_code'] = prob_spread['fips_code'].astype(int)
    countystrength = getruralstrength(nfile)
    
    top20common,top20 = main_old(prob_spread,countystrength,month[1])
    print(month)
    print(top20[top20common].count())
    print(top20[top20common])


Predictions/May_predictions.csv
['April', 'May']
fips_code    24
May          24
dtype: int64
      fips_code    May
419     37061.0  739.0
1066    28099.0  459.0
1297    37163.0  416.0
1055    21177.0  360.0
161      1013.0  358.0
1320    28123.0  334.0
668     28051.0  296.0
848     28079.0  223.0
54      28007.0  199.0
594     48185.0  196.0
1072    28101.0  190.0
190     24011.0  188.0
1570    18181.0  188.0
369      1049.0  171.0
1582    45089.0  169.0
905     28087.0  157.0
1419     1123.0  142.0
1443    13277.0  139.0
897     21141.0  135.0
70       1005.0  128.0
931      1091.0  121.0
727     28061.0  121.0
857     28083.0  116.0
247      1023.0  110.0
Predictions/June_predictions.csv
['May', 'June']
fips_code    33
June         33
dtype: int64
      fips_code   June
849      5077.0  631.0
1297    37163.0  561.0
1443    13277.0  494.0
965     40089.0  436.0
1450    48449.0  348.0
1409    12121.0  332.0
369      1049.0  328.0
1543    28153.0  311.0
1066    28099.0  306.0
594    