# TODO: Create Features for neighbour edges (H-N) and nodes (N)

### Currently we use the neighbour hub (H) features as a simplified stand-in for the edge (H-N) features.
### We need to prepare the feature dataframes below:

In [1]:
import pandas as pd
from sklearn import preprocessing

# Load Features here:

In [2]:
# Load 'june-h2n.csv' (presumably the June hub-to-neighbour features; confirm
# against the data source). Later cells read its 'fips_code', 'covid_rate'
# and 'social_score' columns.
prob_spread = pd.read_csv('june-h2n.csv')

In [3]:
# Load 'june-n.csv' (presumably the June per-county node features; confirm
# against the data source). Later cells read its 'fips_code', 'county_health',
# 'social_score' and 'preparedness_score' columns.
countystrength = pd.read_csv('june-n.csv')

# p(B): Scale and process the county-strength features to get prob(B)

In [4]:
# Key the county-strength table by FIPS code so the scaled copy below shares
# the same row labels.
countystrength = countystrength.set_index('fips_code')

# Rescale every column with a default-configured MinMaxScaler, then wrap the
# resulting array back into a DataFrame with the original labels.
min_max_scaler = preprocessing.MinMaxScaler()
countystrength_vals = countystrength.values
countystrength_scaled_vals = min_max_scaler.fit_transform(countystrength_vals)
countystrength_scaled = pd.DataFrame(
    data=countystrength_scaled_vals,
    index=countystrength.index,
    columns=countystrength.columns,
)

# pB = county_health * (1 - social_score) * preparedness_score, all on the
# scaled columns. The factors are multiplied left to right.
health_component = countystrength_scaled['county_health']
social_component = 1 - countystrength_scaled['social_score']
preparedness_component = countystrength_scaled['preparedness_score']
countystrength_scaled['pB'] = health_component * social_component * preparedness_component  ## tune here


# p(A): Scale and process the features for the probability of spread at hubs (i.e. neighbours)

In [5]:
# Key the spread-feature table by FIPS code so the scaled copy below shares
# the same row labels.
prob_spread = prob_spread.set_index('fips_code')

# Rescale every column with a fresh default-configured MinMaxScaler and
# rebuild a labelled DataFrame from the transformed array.
min_max_scaler = preprocessing.MinMaxScaler()
prob_spread_vals = prob_spread.values
prob_spread_scaled_vals = min_max_scaler.fit_transform(prob_spread_vals)
prob_spread_scaled = pd.DataFrame(
    data=prob_spread_scaled_vals,
    index=prob_spread.index,
    columns=prob_spread.columns,
)

# pA = covid_rate * social_score on the scaled columns.
scaled_covid_rate = prob_spread_scaled['covid_rate']
scaled_social_score = prob_spread_scaled['social_score']
prob_spread_scaled['pA'] = scaled_covid_rate * scaled_social_score  ## tune here


# Defining the list of neighbours 


In [6]:
# Extract each county's neighbours from the 25-mile county distance file.
mile25 = pd.read_csv('data/sf12010countydistance25miles.csv', encoding="ISO-8859-1")

# Pivot the (county1, county2) pairs into one column per county1. The pivot
# keeps the original row labels as its index, so each row holds the county2
# value of a single pair and NaN elsewhere. Rows with no value at all are
# dropped.
county_pairs = mile25[['county1', 'county2']]
neighbours = county_pairs.pivot(columns='county1', values='county2')
neighbours = neighbours.dropna(how='all')
# Alternative representation kept for reference (dict of name lists):
#neighbour_dict = mile25.groupby('countyname1')['countyname2'].apply(list).to_dict()

# Collecting the Prob(A) probabilities of neighbours spreading covid

In [7]:
# Attach, for every county column in `neighbours`, a companion '<county>pA'
# column holding the scaled spread probability pA of each listed neighbour.
# Neighbours missing from the lookup map to NaN.
pA_dict = prob_spread_scaled.to_dict() # running full scale will be SLOW
# pA_dict = prob_spread.to_dict() #used for testing: to be removed
pA_by_fips = pA_dict['pA']  # hoisted: the same lookup table serves every column

# Build all '<county>pA' columns first and attach them with a single concat.
# Inserting them one at a time into a copy of `neighbours` (one insert per
# county) fragments the frame and makes pandas emit a PerformanceWarning.
neighbour_pA = pd.DataFrame(
    {
        str(county) + 'pA': neighbours[county].map(pA_by_fips)
        for county in neighbours.columns
    },
    index=neighbours.index,
)
neighboursprob = pd.concat([neighbours, neighbour_pA], axis=1)

# concat discards the columns' name when its inputs disagree. Restore it,
# because the later `.mean().reset_index()` step names its label column
# after it.
neighboursprob.columns.name = neighbours.columns.name

In [8]:
# Spot-check county 1011: its neighbour codes next to the mapped pA values,
# hiding rows where both columns are empty.
neighboursprob.loc[:, [1011, '1011pA']].dropna(how='all')

county1,1011,1011pA
6,1087.0,0.055934
7,1005.0,0.08621
8,1109.0,


In [9]:
# Spot-check county 1005: its neighbour codes next to the mapped pA values,
# hiding rows where both columns are empty.
neighboursprob.loc[:, [1005, '1005pA']].dropna(how='all')

county1,1005,1005pA
2,13239.0,0.016593
3,1011.0,0.103765


# For each county, get the average of its neighbours' prob_spread scores
### Alternatively, we could take the max or the sum over neighbours (then scale again)

In [10]:
# Keep only the '<county>pA' columns by dropping the raw neighbour-code columns.
neighboursprob1 = neighboursprob.drop(columns=neighbours.columns)

# Average each pA column, then turn the column labels into a regular column
# (named 'county1') beside the averages (named 0).
column_means = neighboursprob1.mean()
mean = column_means.reset_index()

# Recover the FIPS code as a string by taking the text before the first 'p'
# of each '<county>pA' label.
label_parts = mean['county1'].str.split('p', expand=True)
mean['fips_code'] = label_parts[0]

mean = mean.rename(columns={0: 'prob_spread'}).set_index('fips_code')

In [12]:
# Show the averaged neighbour score for county '1005' (index labels are strings).
mean.loc[mean.index == '1005']

Unnamed: 0_level_0,county1,prob_spread
fips_code,Unnamed: 1_level_1,Unnamed: 2_level_1
1005,1005pA,0.060179


In [14]:
# Show the averaged neighbour score for county '1011' (index labels are strings).
mean.loc[mean.index == '1011']

Unnamed: 0_level_0,county1,prob_spread
fips_code,Unnamed: 1_level_1,Unnamed: 2_level_1
1011,1011pA,0.071072


# Probability of a county getting covid
#### P(n) = mean * (1 - countystrength), where countystrength is the scaled pB

In [15]:
# Combine the neighbour-spread average with each county's own strength:
# pn[county] = prob_spread * (1 - pB). Counties that have no row in `mean`
# are skipped.
pn = {}
for county in countystrength.index:
    # `mean` is indexed by string FIPS codes, so look it up via str(county).
    fips_key = str(county)
    if fips_key not in mean.index:
        continue
    neighbour_spread = mean.at[fips_key, "prob_spread"]
    county_pB = countystrength_scaled.at[county, "pB"]
    pn[county] = neighbour_spread * (1 - county_pB)  ##tune here
    

# We sort pn and find the counties with the highest probabilities
### E.g. if we used features for June 2020, these probabilities would be the prediction for July 2020

In [16]:
# Turn the {county: probability} mapping into a one-column frame (column 0),
# rank it from highest to lowest, give the column a readable name, and write
# it to disk together with the county index.
jul_pred = pd.DataFrame.from_dict(pn, orient='index')
jul_pred = jul_pred.sort_values(by=0, ascending=False)
jul_pred = jul_pred.rename(columns={0: 'probability_covid'})
jul_pred.to_csv('jul_predictions.csv', index=True)

In [17]:
# Check that it is saved right: read the exported predictions back from disk.
# NOTE(review): nothing here compares read_pred with jul_pred; the check is
# a manual inspection of read_pred.
read_pred = pd.read_csv('jul_predictions.csv')

In [22]:
# Show the 20 highest-ranked predictions (jul_pred is sorted in descending order).
jul_pred.iloc[:20]

Unnamed: 0,probability_covid
22123,0.272673
19035,0.204559
46015,0.200806
29143,0.194536
19041,0.17336
28055,0.150197
13307,0.149372
29155,0.148427
19161,0.147023
47045,0.140052


# TODO: We need some ground truth to compare against

### The above uses example features from June 2020, so we need to compare the output with the July 2020 per-county covid counts

In [21]:
# Load the monthly new-case counts and rank counties by their 'July' value,
# highest first.
newcases = pd.read_csv('monthly_rural_new_cases.csv')
july_cases = newcases[['fips_code', 'July']]
newcasesjul = july_cases.sort_values(by='July', ascending=False)
# Show the 20 counties with the largest July values.
newcasesjul.iloc[:20]

Unnamed: 0,fips_code,July
958,48323.0,1678.0
715,12063.0,1203.0
1488,48465.0,1172.0
33,48001.0,1164.0
34,48005.0,1130.0
369,1049.0,1052.0
1531,28151.0,954.0
294,13069.0,759.0
27,22003.0,732.0
1192,5115.0,728.0
