## Preparing the Synthetic Population

We will use the spc package for our synthetic population. To add it as a dependency in this virtual environment, I ran `poetry add git+https://github.com/alan-turing-institute/uatk-spc.git@55-output-formats-python#subdirectory=python`. The branch may change if the Python package is merged into the main spc branch. 

In [1]:
#import json
import pandas as pd

#https://github.com/alan-turing-institute/uatk-spc/blob/55-output-formats-python/python/examples/spc_builder_example.ipynb
from uatk_spc.builder import Builder

### Loading in the SPC synthetic population

I use the code in the `Quickstart` [here](https://github.com/alan-turing-institute/uatk-spc/blob/55-output-formats-python/python/README.md) to get a parquet file and convert it to JSON. 

You have two options:


1- Slow and memory-hungry: Download the pbf file directly from [here](https://alan-turing-institute.github.io/uatk-spc/using_england_outputs.html) and load in the pbf file with the python package

2- Faster: Convert the pbf file to parquet, and then load it using the python package. To convert to parquet, you need to:

a. Clone the [uatk-spc](https://github.com/alan-turing-institute/uatk-spc/tree/main/docs) repository

b. Run `cargo run --release -- --rng-seed 0 --flat-output config/England/west-yorkshire.txt --year 2020`, replacing `west-yorkshire` and `2020` with your preferred region and year
        

In [2]:
# Region whose SPC output has been saved, and the folder holding the raw files
region = "west-yorkshire"
path = "../data/spc_output/raw/"

#### People and household data

In [3]:
# Build the people table together with household information (pandas backend)
people_hh_builder = Builder(path, region, backend="pandas", input_type="parquet")
people_hh_builder = people_hh_builder.add_households()
# Unnest the health, employment and details fields; "demographics" is not
# unnested at this stage
people_hh_builder = people_hh_builder.unnest(["health", "employment", "details"])
spc_people_hh = people_hh_builder.build()

spc_people_hh.head()

Unnamed: 0,id,household,workplace,location,demographics,events,weekday_diaries,weekend_diaries,orig_pid,id_tus_hh,...,salary_yearly,salary_hourly,hid,nssec8,accommodation_type,communal_type,num_rooms,central_heat,tenure,num_cars
0,0,0,,"{'x': -1.7892179489135742, 'y': 53.91915130615...","{'sex': 1, 'age_years': 86, 'ethnicity': 1, 'n...","{'sport': 0.09000000357627869, 'rugby': 0.1133...","[1583, 13161]","[1582, 13160]",E02002183_0001_001,11291218,...,,,E02002183_0001,1.0,1.0,,2.0,True,2.0,2
1,1,1,,"{'x': -1.8262380361557007, 'y': 53.92028045654...","{'sex': 1, 'age_years': 74, 'ethnicity': 3, 'n...","{'sport': 0.23899999260902405, 'rugby': 0.3011...","[2900, 4948, 4972, 7424, 10284, 10586, 12199, ...","[2901, 4949, 4973, 7425, 10285, 10585, 12198, ...",E02002183_0002_001,17291219,...,,,E02002183_0002,1.0,3.0,,6.0,True,2.0,2
2,2,1,,"{'x': -1.8262380361557007, 'y': 53.92028045654...","{'sex': 2, 'age_years': 68, 'ethnicity': 1, 'n...","{'sport': 0.23899999260902405, 'rugby': 0.1768...","[3010, 6389, 9448, 10184, 11598]","[3011, 6388, 9447, 10183, 11599]",E02002183_0002_002,17070713,...,,,E02002183_0002,1.0,3.0,,6.0,True,2.0,2
3,3,2,56126.0,"{'x': -1.8749940395355225, 'y': 53.94298934936...","{'sex': 1, 'age_years': 27, 'ethnicity': 1, 'n...","{'sport': 0.2329999953508377, 'rugby': 0.14678...","[366, 867, 2096, 3678, 5212, 5450, 8145, 9254,...","[365, 868, 2097, 3677, 5213, 5451, 8146, 9253,...",E02002183_0003_001,20310313,...,32857.859375,14.360952,E02002183_0003,4.0,3.0,,6.0,True,2.0,1
4,4,2,,"{'x': -1.8749940395355225, 'y': 53.94298934936...","{'sex': 2, 'age_years': 26, 'ethnicity': 1, 'n...","{'sport': 0.2329999953508377, 'rugby': 0.08620...","[1289, 12528, 12870]","[1288, 12529, 12871]",E02002183_0003_002,13010909,...,18162.451172,9.439944,E02002183_0003,4.0,3.0,,6.0,True,2.0,1


In [4]:
# The demographics field is unnested here rather than in the Builder chain:
# it carries its own "nssec8" key, and unnesting it there raises an error
# because the table would end up with two "nssec8" columns.

# Guard so the cell can be re-run safely: once it has run, the nested
# "demographics" column is gone and there is nothing left to do.
if 'demographics' in spc_people_hh.columns:
    # Expand each demographics dict into one column per key
    demographics = pd.json_normalize(spc_people_hh['demographics'].tolist())
    # Reuse the table's own index so the axis=1 concat below pairs rows by
    # position instead of relying on both frames sharing a default index
    demographics.index = spc_people_hh.index

    # Drop the nested column and the existing "nssec8" (the "nssec8" from
    # demographics takes its place), then append the unnested columns
    spc_people_hh = pd.concat(
        [spc_people_hh.drop(columns=['demographics', 'nssec8']), demographics],
        axis=1,
    )

spc_people_hh.head()

Unnamed: 0,id,household,workplace,location,events,weekday_diaries,weekend_diaries,orig_pid,id_tus_hh,id_tus_p,...,accommodation_type,communal_type,num_rooms,central_heat,tenure,num_cars,sex,age_years,ethnicity,nssec8
0,0,0,,"{'x': -1.7892179489135742, 'y': 53.91915130615...","{'sport': 0.09000000357627869, 'rugby': 0.1133...","[1583, 13161]","[1582, 13160]",E02002183_0001_001,11291218,1,...,1.0,,2.0,True,2.0,2,1,86,1,1.0
1,1,1,,"{'x': -1.8262380361557007, 'y': 53.92028045654...","{'sport': 0.23899999260902405, 'rugby': 0.3011...","[2900, 4948, 4972, 7424, 10284, 10586, 12199, ...","[2901, 4949, 4973, 7425, 10285, 10585, 12198, ...",E02002183_0002_001,17291219,1,...,3.0,,6.0,True,2.0,2,1,74,3,1.0
2,2,1,,"{'x': -1.8262380361557007, 'y': 53.92028045654...","{'sport': 0.23899999260902405, 'rugby': 0.1768...","[3010, 6389, 9448, 10184, 11598]","[3011, 6388, 9447, 10183, 11599]",E02002183_0002_002,17070713,2,...,3.0,,6.0,True,2.0,2,2,68,1,2.0
3,3,2,56126.0,"{'x': -1.8749940395355225, 'y': 53.94298934936...","{'sport': 0.2329999953508377, 'rugby': 0.14678...","[366, 867, 2096, 3678, 5212, 5450, 8145, 9254,...","[365, 868, 2097, 3677, 5213, 5451, 8146, 9253,...",E02002183_0003_001,20310313,1,...,3.0,,6.0,True,2.0,1,1,27,1,4.0
4,4,2,,"{'x': -1.8749940395355225, 'y': 53.94298934936...","{'sport': 0.2329999953508377, 'rugby': 0.08620...","[1289, 12528, 12870]","[1288, 12529, 12871]",E02002183_0003_002,13010909,3,...,3.0,,6.0,True,2.0,1,2,26,1,6.0


In [7]:
# Write the people + household table to a parquet file named after the region
people_hh_file = f'../data/spc_output/{region}_people_hh.parquet'
spc_people_hh.to_parquet(people_hh_file)


In [None]:
# Histogram of the yearly salary column, split into 100 bins
yearly_salaries = spc_people_hh['salary_yearly']
yearly_salaries.hist(bins=100)

In [None]:
# Distinct values present in the yearly salary column
salary_column = spc_people_hh['salary_yearly']
salary_column.unique()


#### People and time-use data

In [None]:

# Non-time-use features to include and unnest, grouped by the nested field
# they belong to.
# Available features are listed in the schema:
# https://github.com/alan-turing-institute/uatk-spc/blob/main/synthpop.proto
features = dict(
    health=[
        "bmi",
        "has_cardiovascular_disease",
        "has_diabetes",
        "has_high_blood_pressure",
        "self_assessed_health",
        "life_satisfaction",
    ],
    demographics=["age_years", "ethnicity", "sex", "nssec8"],
    employment=["sic1d2007", "sic2d2007", "pwkstat", "salary_yearly"],
)

# Build the people + weekday time-use diary table (polars backend)
people_tu_builder = Builder(path, region, backend="polars", input_type="parquet")
people_tu_builder = people_tu_builder.add_households()
# Add the weekday diaries together with the selected feature subset
people_tu_builder = people_tu_builder.add_time_use_diaries(
    features, diary_type="weekday_diaries"
)
spc_people_tu = people_tu_builder.build()
spc_people_tu.head()



In [None]:
# Write the people + time-use table to a parquet file named after the region
people_tu_file = f'../data/spc_output/{region}_people_tu.parquet'
spc_people_tu.write_parquet(people_tu_file)