In [1]:
import pandas as pd
from sklearn import preprocessing

In [2]:
# p(B) Scale and process for county strength prob(B)
def get_scaled_county(countystrength):
    """Scale the county-strength features and derive the ``pB`` score.

    Parameters
    ----------
    countystrength : pandas.DataFrame
        Must contain a ``fips_code`` column (used as the index) plus the
        columns ``county_health``, ``social_score`` and
        ``preparedness_score``. All remaining columns are passed through
        the scaler, so they are assumed to be numeric.

    Returns
    -------
    pandas.DataFrame
        Indexed by ``fips_code``; every input column rescaled by a
        default-configured ``MinMaxScaler``, plus a new ``pB`` column.
    """
    indexed = countystrength.set_index('fips_code')

    # Rebuild a labelled frame around the raw array returned by the scaler.
    scaled = pd.DataFrame(
        preprocessing.MinMaxScaler().fit_transform(indexed.values),
        index=indexed.index,
        columns=indexed.columns,
    )

    # pB = county_health * (1 - social_score) * preparedness_score
    health_and_social = scaled['county_health'] * (1 - scaled['social_score'])
    scaled['pB'] = health_and_social * scaled['preparedness_score']  ## tune here

    return scaled


In [3]:
# p(A): scale and process the probability of spread at hubs (i.e. neighbours)
def get_scaled_spread(prob_spread):
    """Scale the spread features and derive the ``pA`` score.

    Parameters
    ----------
    prob_spread : pandas.DataFrame
        Must contain a ``fips_code`` column (used as the index) plus the
        columns ``covid_rate`` and ``social_score``. All remaining columns
        are passed through the scaler, so they are assumed to be numeric.

    Returns
    -------
    pandas.DataFrame
        Indexed by ``fips_code``; every input column rescaled by a
        default-configured ``MinMaxScaler``, plus a new ``pA`` column equal
        to ``covid_rate * social_score`` (both scaled).
    """
    indexed = prob_spread.set_index('fips_code')

    # Rebuild a labelled frame around the raw array returned by the scaler.
    scaled = pd.DataFrame(
        preprocessing.MinMaxScaler().fit_transform(indexed.values),
        index=indexed.index,
        columns=indexed.columns,
    )

    scaled['pA'] = scaled['covid_rate'] * scaled['social_score']  ## tune here

    return scaled
    

In [4]:
# Code for extracting list of neighbours
def get_neighbours(path='data/sf12010countydistance25miles.csv'):
    """Load county pairs and spread them into one column per ``county1``.

    Parameters
    ----------
    path : str or path-like, optional
        CSV file with at least the columns ``county1`` and ``county2``.
        Defaults to the file this function originally hard-coded, so
        calling ``get_neighbours()`` with no arguments behaves as before.

    Returns
    -------
    pandas.DataFrame
        One column per distinct ``county1`` value (the columns index is
        named ``county1``). Rows keep the CSV's row positions; a cell holds
        the ``county2`` value of that row when the row belongs to the
        column's county and NaN otherwise. Rows that are NaN in every
        column are dropped.
    """
    pairs = pd.read_csv(path, encoding="ISO-8859-1")
    # No ``index`` is passed to pivot, so the existing row index is kept and
    # each county's neighbours land on different rows of its own column.
    wide = pairs[['county1', 'county2']].pivot(columns='county1', values='county2')
    return wide.dropna(how='all')
    

In [5]:
# Collecting the Prob(A) probabilities of neighbours spreading covid
def get_probA(prob_spread_scaled, neighbours):
    """Attach each neighbour's ``pA`` value next to the neighbour table.

    Parameters
    ----------
    prob_spread_scaled : pandas.DataFrame
        Frame with a ``pA`` column; its index supplies the lookup keys.
    neighbours : pandas.DataFrame
        One column per county, whose cells are looked up in the ``pA``
        mapping. The input is not modified.

    Returns
    -------
    pandas.DataFrame
        A copy of ``neighbours`` with one extra column per original column,
        named ``str(column) + 'pA'``, holding the mapped ``pA`` values
        (NaN where a cell has no match).
    """
    pa_lookup = prob_spread_scaled['pA'].to_dict()
    result = neighbours.copy()
    for county in neighbours.columns:
        result[f"{county}pA"] = neighbours[county].map(pa_lookup)

    return result

In [6]:
# For each county, get average of neighbour's prob_spread scores
### alternatively, we can do max of neighbour or sum of neighbours (then scale again)
def get_spread_scores(neighboursprob, neighbours):
    """Average the neighbour ``pA`` columns into one score per county.

    Parameters
    ----------
    neighboursprob : pandas.DataFrame
        Frame holding the columns of ``neighbours`` plus the derived
        ``<county>pA`` columns. Its columns index must be named
        ``county1`` (that name becomes a column after ``reset_index``).
    neighbours : pandas.DataFrame
        Its column labels identify the columns to discard before averaging.

    Returns
    -------
    pandas.DataFrame
        Indexed by ``fips_code`` (a string: the text before the first
        ``'p'`` in each remaining column label), with the column-wise mean
        in ``prob_spread`` and the original label in ``county1``.
    """
    # Keep only the derived probability columns, then average each of them.
    averaged = neighboursprob.drop(columns=neighbours.columns).mean().reset_index()

    # Strip the suffix: '<fips>pA' -> '<fips>'.
    label_parts = averaged['county1'].str.split('p', expand=True)
    averaged['fips_code'] = label_parts[0]

    return averaged.rename(columns={0: 'prob_spread'}).set_index('fips_code')


In [7]:
# Probability of a county getting covid

def get_covid_prob(countystrength_scaled,mean):
    """Combine neighbour spread with county strength into one score per county.

    Parameters
    ----------
    countystrength_scaled : pandas.DataFrame
        Indexed by county, with a ``pB`` column.
    mean : pandas.DataFrame
        Indexed by the *string* form of the county label, with a
        ``prob_spread`` column.

    Returns
    -------
    dict
        Maps each county label of ``countystrength_scaled`` that also
        appears (as a string) in ``mean.index`` to
        ``prob_spread * (1 - pB)``. Counties without a match are omitted.
    """
    scored_counties = mean.index
    return {
        county: mean.at[str(county), "prob_spread"]
        * (1 - countystrength_scaled.at[county, "pB"])  ##tune here
        for county in countystrength_scaled.index
        if str(county) in scored_counties
    }

In [8]:
def covid_predictions(pn):
    """Rank the county scores, save them, and return the top slice.

    Parameters
    ----------
    pn : dict
        Maps a county label to its score.

    Returns
    -------
    pandas.DataFrame
        The ``[0:20]`` slice of the ranking (highest score first), with a
        single ``probability_covid`` column and the county labels as index.

    Side effects
    ------------
    Writes the full ranking, index included, to ``jul_predictions.csv`` in
    the current working directory.
    """
    ranked = (
        pd.DataFrame.from_dict(pn, orient='index')
        .sort_values(0, ascending=False)
        .rename(columns={0: 'probability_covid'})
    )
    ranked.to_csv('jul_predictions.csv', index=True)
    return ranked[0:20]


In [9]:
# comparing with some ground truth
def groundtruth(pred_month):
    """Return the 20 rows with the largest values for one month.

    Parameters
    ----------
    pred_month : str
        Name of the column in ``monthly_rural_new_cases.csv`` to rank by.

    Returns
    -------
    pandas.DataFrame
        Columns ``fips_code`` and ``pred_month``, sorted by ``pred_month``
        in descending order and cut to the first 20 rows. The file is read
        from the current working directory.
    """
    monthly_cases = pd.read_csv('monthly_rural_new_cases.csv')
    ranked = monthly_cases[['fips_code', pred_month]].sort_values(
        pred_month, ascending=False
    )
    return ranked.head(20)

def getcommon(top20, top20pred):
    """Flag which ground-truth counties also appear among the predictions.

    Parameters
    ----------
    top20 : pandas.DataFrame
        Must contain a ``fips_code`` column.
    top20pred : pandas.DataFrame
        Its index holds the predicted county labels.

    Returns
    -------
    pandas.Series
        Boolean mask aligned with ``top20``: True where the row's
        ``fips_code`` is present in ``top20pred.index``.
    """
    predicted_counties = top20pred.index
    return top20['fips_code'].isin(predicted_counties)



In [11]:
def get_predictions(prob_spread,countystrength):
    """Run the scoring pipeline from raw inputs to per-county scores.

    Parameters
    ----------
    prob_spread : pandas.DataFrame
        Raw spread features, passed to ``get_scaled_spread``.
    countystrength : pandas.DataFrame
        Raw county-strength features, passed to ``get_scaled_county``.

    Returns
    -------
    dict
        The county -> score mapping produced by ``get_covid_prob``.
    """
    # Scale both inputs and derive pB / pA.
    strength_scaled = get_scaled_county(countystrength)
    spread_scaled = get_scaled_spread(prob_spread)

    # Look up pA for every neighbour, then average per county.
    neighbours = get_neighbours()
    neighbour_pa = get_probA(spread_scaled, neighbours)
    spread_scores = get_spread_scores(neighbour_pa, neighbours)

    return get_covid_prob(strength_scaled, spread_scores)


In [12]:
def get_accuracy(pn, pred_month="July"):
    """Compare the top predictions with the ground-truth top 20.

    Parameters
    ----------
    pn : dict
        County -> score mapping, passed to ``covid_predictions`` (which
        also writes the ranking to disk).
    pred_month : str, optional
        Ground-truth column to rank by; defaults to ``"July"``.

    Returns
    -------
    tuple
        ``(mask, top20)``: ``top20`` is the ground-truth frame from
        ``groundtruth`` and ``mask`` is the boolean result of ``getcommon``
        marking its rows that were also predicted.
    """
    predicted_top = covid_predictions(pn)
    top20 = groundtruth(pred_month)
    return getcommon(top20, predicted_top), top20


In [13]:
def main(prob_spread,countystrength,pred_month="July"):
    """Score every county and compare the result with the ground truth.

    Parameters
    ----------
    prob_spread : pandas.DataFrame
        Raw spread features, forwarded to ``get_predictions``.
    countystrength : pandas.DataFrame
        Raw county-strength features, forwarded to ``get_predictions``.
    pred_month : str, optional
        Ground-truth column to evaluate against; defaults to ``"July"``.

    Returns
    -------
    tuple
        ``(top20common, top20)`` exactly as returned by ``get_accuracy``.
    """
    pn = get_predictions(prob_spread, countystrength)

    # Forward the caller's month. The call used to pass the literal "July",
    # which silently ignored the ``pred_month`` argument.
    top20common, top20 = get_accuracy(pn, pred_month=pred_month)

    return top20common, top20

# START HERE:

In [14]:
# Load the two input tables from the current working directory; they are
# passed to main() as the spread and county-strength inputs respectively.
prob_spread_jun = pd.read_csv('june-h2n.csv')
countystrength_jun = pd.read_csv('june-n.csv')
# main() returns a boolean mask and the ground-truth top-20 frame for the
# requested month.
top20commonJul,top20Jul = main(prob_spread_jun,countystrength_jun,"July")
# Display only the ground-truth rows whose fips_code was also among the
# top predictions.
top20Jul[top20commonJul]

Unnamed: 0,fips_code,July
449,1053.0,685.0
