Build instructions are available here: https://github.com/eho-tacc/episimlab/issues/26

In [1]:
import multiprocessing as mp
import tempfile
from datetime import datetime

import numpy as np
import pandas as pd
import xarray as xr
import xsimlab as xs

from episimlab.partition import partition
from episimlab.setup.coords import InitDefaultCoords

# Load `travel.csv`

In [2]:
# Load the full 2020 travel table used for contact partitioning.
# NOTE(review): absolute, user-specific path; it will not resolve on another machine.
tr2020 = pd.read_csv('/Users/kpierce/COVID19/safegraph_mobility/2020_travel_for_contact_partitioning.csv')

In [3]:
tr2020.head()

Unnamed: 0.1,Unnamed: 0,source,destination,age,n,date,destination_type
0,0,76511,76511,<5,47.377358,2020-05-30,local
1,1,76511,76511,18-49,638.320755,2020-05-30,local
2,2,76511,76511,5-17,202.754717,2020-05-30,local
3,3,76511,76511,50-64,223.641509,2020-05-30,local
4,4,76511,76511,65+,164.037736,2020-05-30,local


In [4]:
# Keep only rows dated in March 2020: from 2020-03-01 (inclusive)
# up to, but not including, 2020-04-01.
march_start, april_start = '2020-03-01', '2020-04-01'
on_or_after_start = tr2020['date'] >= march_start
before_april = tr2020['date'] < april_start
mar2020_tr = tr2020[before_april & on_or_after_start]

In [5]:
mar2020_tr.head()

Unnamed: 0.1,Unnamed: 0,source,destination,age,n,date,destination_type
30555,30555,76511,76511,<5,35.053846,2020-03-11,local
30556,30556,76511,76511,18-49,472.284615,2020-03-11,local
30557,30557,76511,76511,5-17,150.015385,2020-03-11,local
30558,30558,76511,76511,50-64,165.469231,2020-03-11,local
30559,30559,76511,76511,65+,121.369231,2020-03-11,local


In [None]:
#mar2020_tr.to_csv('/Users/kpierce/episimlab/data/mar2020_travel.csv')

# Load `contact.csv`

In [6]:
# Load the 5-age-group contact matrix. header=None tells pandas not to treat
# the first row as column names; labels are assigned by hand in later cells.
polymod = pd.read_csv('~/COVID19/SEIR-city/data/Cities_Data/ContactMatrixAll_5AgeGroups.csv', header=None)

In [7]:
# Label the five matrix columns with their age groups.
polymod.columns = ['<5', '5-17', '18-49', '50-64', '65+']

In [8]:
# Add the row labels as an ordinary 'index' column so they can be used as
# id_vars when the matrix is melted to long format.
# NOTE(review): assumes the rows are stored in the same age order as the columns; confirm against the source file.
polymod['index'] = ['<5', '5-17', '18-49', '50-64', '65+']

In [9]:
# Reshape the square matrix to long format: one row per (row label, column label) pair.
age_columns = ['<5', '5-17', '18-49', '50-64', '65+']
polymod_melt = polymod.melt(value_vars=age_columns, id_vars='index')

In [10]:
# Give the melted columns descriptive names.
long_column_names = {
    'index': 'age1',
    'variable': 'age2',
    'value': 'daily_per_capita_contacts',
}
polymod_melt = polymod_melt.rename(columns=long_column_names)

In [11]:
polymod_melt.head(10)

Unnamed: 0,age1,age2,daily_per_capita_contacts
0,<5,<5,2.160941
1,5-17,<5,0.597341
2,18-49,<5,0.382203
3,50-64,<5,0.352397
4,65+,<5,0.189756
5,<5,5-17,2.164117
6,5-17,5-17,8.14697
7,18-49,5-17,2.431392
8,50-64,5-17,1.8851
9,65+,5-17,0.892909


In [None]:
#polymod_melt.to_csv('/Users/kpierce/episimlab/data/polymod_contacts.csv')

# Partition contacts

In [12]:
# Distinct dates present in the March subset, sorted in descending order.
date_list = sorted(mar2020_tr['date'].unique(), reverse=True)

In [13]:
# Single-day slice of the March travel table.
target_day = '2020-03-11'
on_target_day = mar2020_tr['date'] == target_day
travel_20200311 = mar2020_tr[on_target_day]

In [14]:
travel_20200311.head()

Unnamed: 0.1,Unnamed: 0,source,destination,age,n,date,destination_type
30555,30555,76511,76511,<5,35.053846,2020-03-11,local
30556,30556,76511,76511,18-49,472.284615,2020-03-11,local
30557,30557,76511,76511,5-17,150.015385,2020-03-11,local
30558,30558,76511,76511,50-64,165.469231,2020-03-11,local
30559,30559,76511,76511,65+,121.369231,2020-03-11,local


In [None]:
#travel_20200311.to_csv('/Users/kpierce/episimlab/data/20200311_travel.csv')

In [16]:
# Inputs for the partitioner: the single-day travel table and the long-format
# contact table. The `to_csv` calls that write these two files are commented
# out earlier in this notebook, so both files must already exist on disk.
travel_csv = '/Users/kpierce/episimlab/data/20200311_travel.csv'
contacts_csv = '/Users/kpierce/episimlab/data/polymod_contacts.csv'
part = partition.Partition2Contact(travel_fp=travel_csv, contacts_fp=contacts_csv)

In [18]:
# Initialise the partitioner for a single 24-hour step starting 2020-03-11.
init_window = {
    'step_delta': np.timedelta64(24, 'h'),
    'step_start': np.datetime64('2020-03-11T00:00:00.000000000'),
    'step_end': np.datetime64('2020-03-12T00:00:00.000000000'),
}
part.initialize(**init_window)

DEBUG:root:'load_travel_df' took 0.04 seconds
DEBUG:root:step_start: 2020-03-11T00:00:00.000000000
DEBUG:root:step_end: 2020-03-11T00:00:00.000000000
DEBUG:root:Starting dask merge at 2021-07-20 13:46:06.595431
DEBUG:root:Finishing dask merge at 2021-07-20 13:46:15.523958
DEBUG:root:Starting pandas merge 1 at 2021-07-20 13:46:15.524820
DEBUG:root:Starting pandas merge 2 at 2021-07-20 13:46:22.542231
DEBUG:root:Calculating contact probabilities on full dataframe starting at 2021-07-20 13:46:28.131331
DEBUG:root:'dask_partition' took 24.04 seconds
DEBUG:root:Building contact xarray at 2021-07-20 13:46:30.900185


In [19]:
# Time one run_step call over the 2020-03-11 -> 2020-03-12 window.
start = datetime.now()
step_window = {
    'step_delta': np.timedelta64(24, 'h'),
    'step_start': np.datetime64('2020-03-11T00:00:00.000000000'),
    'step_end': np.datetime64('2020-03-12T00:00:00.000000000'),
}
part.run_step(**step_window)
stop = datetime.now()
elapsed = stop - start
print('Stop time is {}'.format(stop))
print('Total time is {}'.format(elapsed))

DEBUG:root:step_start: 2020-03-11T00:00:00.000000000
DEBUG:root:step_end: 2020-03-12T00:00:00.000000000
DEBUG:root:Starting dask merge at 2021-07-20 13:48:22.914539
DEBUG:root:Finishing dask merge at 2021-07-20 13:48:30.614760
DEBUG:root:Starting pandas merge 1 at 2021-07-20 13:48:30.615669
DEBUG:root:Starting pandas merge 2 at 2021-07-20 13:48:36.221208
DEBUG:root:Calculating contact probabilities on full dataframe starting at 2021-07-20 13:48:41.419961
DEBUG:root:'dask_partition' took 20.72 seconds
DEBUG:root:Building contact xarray at 2021-07-20 13:48:43.865085


Stop time is 2021-07-20 13:48:43.929065
Total time is 0:00:21.042926


In [20]:
# Keep a reference to the partitioner's `contact_xr` attribute for the comparisons below.
# NOTE(review): presumably the contact xarray built during run_step; confirm against Partition2Contact.
faster_partition = part.contact_xr

In [None]:
#faster_partition.to_netcdf('/Users/kpierce/episimlab/data/20200311_contact_matrix_dask_xarray.nc')

In [21]:
faster_partition

The older version of the contact matrix is incorrect because of some bugs in my logic for contact partitioning.

In [None]:
# Open the contact matrix produced by the older partitioning code.
# This file is only written by the `to_netcdf` call in the last cell of this
# notebook, so on a fresh checkout this cell fails until that cell has been run.
slower_partition = xr.open_dataset('/Users/kpierce/episimlab/data/20200311_contact_matrix.nc')

In [None]:
slower_partition

In [None]:
# Turn the legacy Dataset into a bare DataArray and rename its dimensions.
# NOTE(review): the new names are presumably those used by `faster_partition`; confirm.
#   * to_array() stacks the data variables along a new 'variable' dimension;
#   * squeeze() removes length-1 dimensions, leaving 'variable' as a scalar coordinate;
#   * drop_vars('variable') discards that leftover coordinate. `drop_vars` is the
#     explicit replacement for the backward-compatibility `drop` alias.
slower_partition = (
    slower_partition
    .to_array()
    .squeeze()
    .drop_vars('variable')
    .rename(
        {
            'vertex_i': 'vertex1',
            'vertex_j': 'vertex2',
            'age_i': 'age_group1',
            'age_j': 'age_group2',
        }
    )
)

In [None]:
# Positional comparison of the raw arrays: no alignment on coordinate labels
# is performed here, so a differing coordinate order alone makes this False.
np.array_equal(slower_partition.data, faster_partition.data) 

In [None]:
def sort_coords(da):
    """Return `da` sorted along each of its dimensions in turn.

    `sortby` is applied once per name in `da.dims`, in the order the
    dimensions are listed, and the result of the final call is returned.
    """
    ordered = da
    for dim_name in tuple(da.dims):
        ordered = ordered.sortby(dim_name)
    return ordered

In [None]:
# Compare the two matrices after sorting every dimension, so that a differing
# coordinate order does not cause a mismatch. Raises AssertionError on mismatch.
xr.testing.assert_allclose(sort_coords(slower_partition), sort_coords(faster_partition))

In [None]:

def partition_dates(df, date):
    """Partition contacts for a single date of the travel table.

    The rows of `df` whose 'date' column equals `date` are written to a
    temporary CSV file, which is handed to `partition.Partition` as its
    travel file. After `initialize()` has run, the resulting
    `contact_partitions` table is tagged with the date and returned.

    Parameters
    ----------
    df : pandas.DataFrame
        Travel table with at least a 'date' column.
    date : str
        Value of the 'date' column to select.

    Returns
    -------
    The `contact_partitions` attribute of the initialized partitioner, with
    an added 'date' column set to `date`.
    """
    # get a dataframe subset for the requested date
    date_df = df[df['date'] == date]

    # Write the subset to a temporary file. The context manager guarantees the
    # file is closed (and therefore deleted) even if partitioning raises.
    # Text mode with newline='' is what pandas expects for a file object.
    with tempfile.NamedTemporaryFile(mode='w', newline='') as date_temp:
        date_df.to_csv(date_temp)
        # Flush so the complete CSV is on disk before it is re-opened by name.
        date_temp.flush()

        # partition dates
        part = partition.Partition(
            travel_fp=date_temp.name,
            contacts_fp='/Users/kpierce/episimlab/data/polymod_contacts.csv',
            age_group=['<5', '5-17', '18-49', '50-64', '65+'],
            risk_group=['high', 'low']
        )
        part.initialize()
        part.contact_partitions['date'] = date

    return part.contact_partitions

In [None]:
mar2020_tr['date'].unique()[0:4]

In [None]:
# Run `partition_dates` in a worker process and time it.
# `tasks` covers every date in the March subset; only `short_tasks`
# (the first date) is actually submitted here.
# NOTE(review): with the 'spawn' start method (the default on macOS since
# Python 3.8) a function defined in the notebook cannot be imported by the
# worker; confirm this runs in the target environment.
start = datetime.now()
print('Start time is {}'.format(start))
unique_dates = mar2020_tr['date'].unique()
tasks = [(mar2020_tr, i) for i in unique_dates]
short_tasks = [(mar2020_tr, unique_dates[0])]
# The context manager terminates the workers if collecting a result raises,
# so a failed task does not leave stray processes behind.
with mp.Pool(1) as pool:
    results = [pool.apply_async(partition_dates, t) for t in short_tasks]
    pool.close()
    partitioned_dfs = [r.get() for r in results]
    # Wait for the workers to exit cleanly once all results are collected.
    pool.join()
stop = datetime.now()
print('Stop time is {}'.format(stop))
print('Total time is {}'.format(stop-start))

In [None]:
partitioned_dfs

In [None]:
partitioned_df_final = pd.concat(partitioned_dfs)

In [None]:
partitioned_df_final.head()

In [None]:
partitioned_df_final[partitioned_df_final['age_i'] != partitioned_df_final['age_j']]

In [None]:
partitioned_df_final['date'].unique()

In [None]:
partitioned_df_final.to_csv('/Users/kpierce/episimlab/data/20200311_contact_matrix.csv')

In [None]:
# Build a second partitioner over the full 2020 travel table.
full_travel_csv = '/Users/kpierce/COVID19/safegraph_mobility/2020_travel_for_contact_partitioning.csv'
polymod_csv = '/Users/kpierce/episimlab/data/polymod_contacts.csv'
age_labels = ['<5', '5-17', '18-49', '50-64', '65+']
risk_labels = ['high', 'low']
repart = partition.Partition(
    travel_fp=full_travel_csv,
    contacts_fp=polymod_csv,
    age_group=age_labels,
    risk_group=risk_labels,
)

In [None]:
# assign some class attributes
# The partitions computed above are attached directly; `initialize()` is not
# called on this object anywhere in the notebook.
# NOTE(review): presumably these are the attributes contact_matrix() reads; confirm against Partition.
repart.contact_partitions = partitioned_df_final
repart.age_group = ['<5', '5-17', '18-49', '50-64', '65+']
repart.age_dims = ['source_age', 'destination_age']

In [None]:
contact_matrix = repart.contact_matrix()

In [None]:
contact_matrix

In [None]:
# Persist the contact matrix. This is the same path that is opened as
# `slower_partition` in an earlier cell of this notebook.
contact_matrix.to_netcdf('/Users/kpierce/episimlab/data/20200311_contact_matrix.nc')