In [1]:
import os
from pprint import pprint as pp
from glob import glob
import numpy as np
import pandas as pd

# 20210629_lccf

----

The purpose of this notebook is to generate test data for LCCF performance scaling experiments on the model run in the script `scripts/20210623_lccf.py`.

In this notebook, I will load the files `data/lccf/*.csv` and copy their rows to emulate large-scale SafeGraph data ingests. The starting point of this notebook looks like:

```bash
$ ls ../data/lccf
census_pop1_rows1.csv   contacts_pop1_rows1.csv travel_pop1_rows1.csv
```

The goal is to make `*rows2.csv`, `*rows4.csv`, etc. files that have 2x, 4x, etc. as many rows as the `*rows1.csv` files.

----

To test performance scaling as a function of population size, I will also make `*_pop2_rows1.csv`, `*_pop4_rows1.csv`, etc., which are equivalent to `*_pop1_rows1.csv` but with 2x, 4x, etc. greater contact/travel/census population. `pop1` is implied if no `pop` identifier is included in the file name.

In [2]:
# Shell out to list the CSV files currently present in the LCCF data directory.
!ls ../data/lccf

census_pop1_rows1.csv   contacts_pop1_rows1.csv travel_pop1_rows1.csv


In [3]:
def create_scaled_df(ref_df: pd.DataFrame, pop_mult=1, rows_mult=1, schema=None) -> pd.DataFrame:
    """Placeholder for building a population- or row-scaled copy of `ref_df`.

    Not implemented yet: every argument is ignored and the function returns
    ``None``, despite the ``pd.DataFrame`` return annotation.

    Args:
        ref_df: Reference dataframe to scale.
        pop_mult: Presumably the population multiplier (2 -> 2x) -- TODO confirm
            once implemented.
        rows_mult: Presumably the row-count multiplier (2 -> 2x) -- TODO confirm
            once implemented.
        schema: Presumably a key of the notebook's `schemas` registry -- TODO
            confirm once implemented.

    Returns:
        Currently always ``None``.
    """
    return None

In [4]:
# Registry of per-schema settings, keyed by schema name. Every entry has the
# same shape:
#   'value_col': not decided yet (None)
#   'usecols':   columns passed to `pd.read_csv(usecols=...)` by the parsers
#   'glob':      list of matching CSV paths, filled in by `create_scaled_csvs`
#   'keys':      None until generated keys are assigned later in the notebook
schemas = {
    schema_name: {
        'value_col': None,
        'usecols': columns,
        'glob': [],
        'keys': None,
    }
    for schema_name, columns in (
        ('census', ("GEOID", "age_bin", "group_pop")),
        ('contacts', ('age1', 'age2', 'daily_per_capita_contacts')),
        ('travel', ('source', 'destination', 'age', 'n', 'date', 'destination_type')),
    )
}

In [5]:
def create_scaled_csvs(dir_fp: str, scales: tuple = (2, 4, 8), verbose=2):
    """Locate the reference CSV for every schema inside `dir_fp`.

    For each key of the module-level `schemas` registry, globs
    `<dir_fp>/<schema>*.csv` and stores the list of hits under that schema's
    'glob' entry (the registry is mutated in place). No scaled CSVs are
    written yet, and `scales` is currently unused.

    Args:
        dir_fp: Directory expected to hold exactly one CSV per schema.
        scales: Scale factors; accepted but not used by the current body.
        verbose: When truthy, print the updated `schemas` registry.

    Raises:
        AssertionError: if `dir_fp` is not a directory, or if a schema's glob
            query does not match exactly one file.
    """
    assert os.path.isdir(dir_fp)
    for schema_name, spec in schemas.items():
        query = f"{schema_name}*.csv"
        hits = glob(os.path.join(dir_fp, query))
        assert len(hits) == 1, f"found {len(hits)} hits for glob query {query}, expected 1"
        spec['glob'] = hits
    if verbose:
        print('schemas:')
        pp(schemas)

In [6]:
# Find the single reference CSV per schema in ../data/lccf and print the
# updated `schemas` registry (verbose defaults to a truthy value).
create_scaled_csvs('../data/lccf')

schemas:
{'census': {'glob': ['../data/lccf/census_pop1_rows1.csv'],
            'keys': None,
            'usecols': ('GEOID', 'age_bin', 'group_pop'),
            'value_col': None},
 'contacts': {'glob': ['../data/lccf/contacts_pop1_rows1.csv'],
              'keys': None,
              'usecols': ('age1', 'age2', 'daily_per_capita_contacts'),
              'value_col': None},
 'travel': {'glob': ['../data/lccf/travel_pop1_rows1.csv'],
            'keys': None,
            'usecols': ('source',
                        'destination',
                        'age',
                        'n',
                        'date',
                        'destination_type'),
            'value_col': None}}


**Question**: what should we use as the `value_col` for each schema? Let's `head` the files, which also gives us a reminder of what each schema looks like...

In [7]:
# Shell out to show the first lines of every CSV in the LCCF data directory.
!head ../data/lccf/*.csv

==> ../data/lccf/census_pop1_rows1.csv <==
"","GEOID","NAME","age_bin","group_pop"
"1","75001","ZCTA5 75001","<5",794
"2","75001","ZCTA5 75001","18-49",9420
"3","75001","ZCTA5 75001","5-17",1404
"4","75001","ZCTA5 75001","50-64",2259
"5","75001","ZCTA5 75001","65+",1115
"6","75002","ZCTA5 75002","<5",4227
"7","75002","ZCTA5 75002","18-49",29659
"8","75002","ZCTA5 75002","5-17",15710
"9","75002","ZCTA5 75002","50-64",14706

==> ../data/lccf/contacts_pop1_rows1.csv <==
,age1,age2,daily_per_capita_contacts
0,<5,<5,2.160940833918119
1,5-17,<5,0.5973413405271149
2,18-49,<5,0.3822025191217617
3,50-64,<5,0.3523966597811896
4,65+,<5,0.18975609071541075
5,<5,5-17,2.164117384279739
6,5-17,5-17,8.146970087503425
7,18-49,5-17,2.431391745980527
8,50-64,5-17,1.885100325362032

==> ../data/lccf/travel_pop1_rows1.csv <==
,Unnamed: 0,source,destination,age,n,date,destination_type
30555,30555,76511,76511,<5,35.05384615384615,2020-03-11,local
30556,30556,76511,76511,18-49,472.2846153846154,2020-03-11,loc

We're going to need more than just copying rows. Specifically:
- Census data probably needs unique `primaryKey == [GEOID, age_bin]`
- Travel data needs n^2 rows where n is the number of GEOIDs

So what we really mean by "scaling rows" is that we're scaling by `n`, the number of GEOIDs in the census CSV. An easy way to do this is to prepend a non-zero integer to each GEOID.

----

## Parsers for each schema

In [8]:
def parse_census(csv_fp: str) -> pd.DataFrame:
    """Read the census CSV at `csv_fp`, keeping only the registered columns.

    The columns come from `schemas['census']['usecols']`.

    Args:
        csv_fp: Path to the census CSV file.

    Returns:
        DataFrame with the selected columns, one row per CSV record.

    Raises:
        AssertionError: if any selected column contains a null value; the
            error carries the per-column null flags.
    """
    census_cols = schemas['census']['usecols']
    census = pd.read_csv(csv_fp, usecols=census_cols)
    null_flags = census.isna().any()
    assert not null_flags.any(), ('found null values in df', null_flags)
    # Still disabled from the original draft: renaming GEOID/age_bin to
    # vertex/age_group, indexing on that pair, and filtering to the ZCTAs
    # modelled in the simulation.
    return census

In [9]:
def parse_contacts(csv_fp: str) -> pd.DataFrame:
    """Read the contacts CSV at `csv_fp`, keeping only the registered columns.

    The columns come from `schemas['contacts']['usecols']`. No validation is
    applied to the loaded data.
    """
    contact_cols = schemas['contacts']['usecols']
    contacts = pd.read_csv(csv_fp, usecols=contact_cols)
    return contacts

In [10]:
def parse_travel(csv_fp: str) -> pd.DataFrame:
    """Read the travel CSV at `csv_fp`, keeping only the registered columns.

    The columns come from `schemas['travel']['usecols']`. No validation is
    applied to the loaded data.
    """
    travel_cols = schemas['travel']['usecols']
    travel = pd.read_csv(csv_fp, usecols=travel_cols)
    return travel

In [11]:
# Attach each schema's parser under a 'parser' key, looking the function up
# by its `parse_<schema>` name in the notebook's global namespace.
for schema in schemas:
    schemas[schema].update(parser=globals()['parse_' + schema])
pp(schemas)

{'census': {'glob': ['../data/lccf/census_pop1_rows1.csv'],
            'keys': None,
            'parser': <function parse_census at 0x7f837ce609d0>,
            'usecols': ('GEOID', 'age_bin', 'group_pop'),
            'value_col': None},
 'contacts': {'glob': ['../data/lccf/contacts_pop1_rows1.csv'],
              'keys': None,
              'parser': <function parse_contacts at 0x7f837bb54d30>,
              'usecols': ('age1', 'age2', 'daily_per_capita_contacts'),
              'value_col': None},
 'travel': {'glob': ['../data/lccf/travel_pop1_rows1.csv'],
            'keys': None,
            'parser': <function parse_travel at 0x7f837ba72a60>,
            'usecols': ('source',
                        'destination',
                        'age',
                        'n',
                        'date',
                        'destination_type'),
            'value_col': None}}


## Generate unique GEOIDs for census data

In [12]:
def get_unique_geoid(ref: pd.Series, n=None) -> pd.Series:
    """Returns a pd.Series with `n` unique GEOID-like integers.

    Iterates over the distinct GEOIDs in `ref`, prepending a positive integer
    prefix (1, 2, 3, ...) to each one until `n` IDs have been produced.
    Candidates that would repeat an already generated ID are skipped; this can
    happen when GEOIDs differ in width (prefix 1 + "11001" equals
    prefix 11 + "1001").

    Args:
        ref: Series of reference GEOIDs; duplicate values are ignored.
        n: Number of IDs to return. Defaults to the number of distinct
            GEOIDs in `ref`.

    Returns:
        Integer Series holding `n` unique IDs (empty when `n` is 0 or less).

    Raises:
        ValueError: if IDs are requested but `ref` contains no GEOIDs, or if a
            prefixed GEOID cannot be converted to an integer.
    """
    ref_ids = ref.unique().astype(str)
    n = n if n is not None else len(ref_ids)
    if n > 0 and len(ref_ids) == 0:
        # Without this guard the loop below would never terminate.
        raise ValueError('cannot generate GEOIDs from an empty reference series')
    prefix = 1
    seen = set()
    out = list()
    while len(out) < n:
        for geoid in ref_ids:
            candidate = int(f'{prefix}{geoid}')
            if candidate not in seen:
                seen.add(candidate)
                out.append(candidate)
        prefix += 1
    # dtype=object keeps the empty case well-defined before the integer cast.
    as_ser = pd.Series(out[:n], dtype=object).astype(int)
    # Real uniqueness check; `.unique().all()` only tested that IDs were non-zero.
    assert as_ser.is_unique, 'generated GEOIDs are not unique'
    return as_ser

In [13]:
# Generate synthetic GEOIDs from the reference census file and store them as
# the census keys.
# NOTE(review): n=1936 is hard-coded and its origin is not documented -- confirm.
schemas['census']['keys'] = get_unique_geoid(parse_census('../data/lccf/census_pop1_rows1.csv')['GEOID'], n=1936)

## Generate unique pairwise combinations of GEOIDs for travel data

We assume the worst-case scenario here: an index of length n^2 for n GEOIDs. In reality there will probably be somewhat fewer rows, since the graph of travel between GEOIDs is not complete.

In [14]:
def get_unique_pairs(geoids: pd.Series) -> pd.DataFrame:
    """Return every ordered (source, destination) pairing of `geoids`.

    Builds the cross product of `geoids` with itself, so a Series of length
    n yields n * n rows (self-pairs included).

    Args:
        geoids: Series of GEOID values.

    Returns:
        DataFrame with columns 'source' and 'destination'.
    """
    sources = geoids.to_frame(name='source')
    destinations = geoids.to_frame(name='destination')
    return sources.merge(destinations, how='cross')

Testing:

In [17]:
# Preview the first rows of the source/destination cross product built from
# the generated census keys.
get_unique_pairs(schemas['census']['keys']).head()

Unnamed: 0,source,destination
0,175001,175001
1,175001,175002
2,175001,175006
3,175001,175007
4,175001,175009


# Sandbox

In [110]:
# Display the generated census keys.
# NOTE(review): this cell's execution count (In [110]) is out of sequence and
# the captured output reports Length: 19369, which does not match the n=1936
# used when the keys are assigned -- the output looks stale; re-run from a
# fresh kernel to confirm.
schemas['census']['keys']

0         175001
1         175002
2         175006
3         175007
4         175009
          ...   
19364    1175028
19365    1175032
19366    1175034
19367    1175035
19368    1175038
Length: 19369, dtype: int64