In [7]:
#import polars as pl

We will use the spc package for our synthetic population. To add it as a dependancy in this virtual environment, I ran `poetry add git+https://github.com/alan-turing-institute/uatk-spc.git@55-output-formats-python#subdirectory=python`. The branch may change if the python package is merged into the main spc branch. 

In [9]:
#https://github.com/alan-turing-institute/uatk-spc/blob/55-output-formats-python/python/examples/spc_builder_example.ipynb
from uatk_spc.builder import Builder

### Loading in the SPC synthetic population

I use the code in the `Quickstart` [here](https://github.com/alan-turing-institute/uatk-spc/blob/55-output-formats-python/python/README.md) to get a parquet file and convert it to JSON. 

You have two options:


1- Slow and memory-hungry: Download the pbf file directly from [here](https://alan-turing-institute.github.io/uatk-spc/using_england_outputs.html) and load in the pbf file with the python package

2- Faster: Covert the pbf file to parquet, and then load it using the python package. To convert to parquet, you need to:

a. clone the [uatk-spc](https://github.com/alan-turing-institute/uatk-spc/tree/main/docs) 
   
b. Run `cargo run --release -- --rng-seed 0 --flat-output config/England/west-yorkshire.txt --year 2020`  and replace `west-yorkshire` and `2020` with your preferred option
        

In [10]:
# Pick a region with SPC output saved
path = "../data/spc_output/raw/"
region = "west-yorkshire"

#### People and household data

In [41]:
# add people and households
spc_people_hh = (
    Builder(path, region, backend="polars", input_type="parquet")
    .add_households()
    .unnest(["health", "employment", "details"])
    .build()
)

spc_people_hh.head()

id,household,workplace,location,orig_pid,id_tus_hh,id_tus_p,pid_hs,demographics,sic1d2007,sic2d2007,soc2010,pwkstat,salary_yearly,salary_hourly,bmi,has_cardiovascular_disease,has_diabetes,has_high_blood_pressure,number_medications,self_assessed_health,life_satisfaction,events,weekday_diaries,weekend_diaries,msoa,oa,members,hid,nssec8,accommodation_type,communal_type,num_rooms,central_heat,tenure,num_cars
u64,u64,u64,struct[2],str,i64,i64,i64,struct[4],str,u64,u64,i32,f32,f32,f32,bool,bool,bool,u64,i32,i32,struct[7],list[u64],list[u64],str,str,list[u64],str,i32,i32,i32,u64,bool,i32,u64
0,0,,"{-1.789218,53.919151}","""E02002183_0001…",11291218,1,2905399,"{1,86,1,1}","""J""",58,1115,6,,,24.879356,False,False,False,,3,2.0,"{0.09,0.1134,2.9846e-31,1.2791e-31,0.000881,0.000377,0.10494}","[1583, 13161]","[1582, 13160]","""E02002183""","""E00053954""",[0],"""E02002183_0001…",1,1,,2,True,2,2
1,1,,"{-1.826238,53.92028}","""E02002183_0002…",17291219,1,2905308,"{1,74,3,1}","""C""",25,1121,6,,,27.491207,False,False,True,,3,,"{0.239,0.30114,2.2734e-20,9.7432e-21,0.051032,0.021871,0.13662}","[2900, 4948, … 15793]","[2901, 4949, … 15792]","""E02002183""","""E00053953""","[1, 2]","""E02002183_0002…",1,3,,6,True,2,2
2,1,,"{-1.826238,53.92028}","""E02002183_0002…",17070713,2,2907681,"{2,68,1,2}","""P""",85,2311,6,,,17.310829,False,True,True,,2,4.0,"{0.239,0.17686,3.6288e-16,8.4672e-16,0.098134,0.228979,0.15741}","[3010, 6389, … 11598]","[3011, 6388, … 11599]","""E02002183""","""E00053953""","[1, 2]","""E02002183_0002…",1,3,,6,True,2,2
3,2,56126.0,"{-1.874994,53.942989}","""E02002183_0003…",20310313,1,2902817,"{1,27,1,4}","""C""",31,3422,1,32857.859375,14.360952,20.852091,False,False,False,,2,1.0,"{0.233,0.14679,4.397019,1.884437,0.522664,0.223999,0.15741}","[366, 867, … 14534]","[365, 868, … 14533]","""E02002183""","""E00053689""","[3, 4]","""E02002183_0003…",4,3,,6,True,2,1
4,2,,"{-1.874994,53.942989}","""E02002183_0003…",13010909,3,2900884,"{2,26,1,6}","""J""",62,7214,1,18162.451172,9.439944,20.032526,False,False,False,1.0,2,3.0,"{0.233,0.08621,2.090329,4.877435,0.18608,0.434187,0.15741}","[1289, 12528, 12870]","[1288, 12529, 12871]","""E02002183""","""E00053689""","[3, 4]","""E02002183_0003…",4,3,,6,True,2,1


In [42]:
# save the output
spc_people_hh.write_parquet('../data/spc_output/' + region + '_people_hh.parquet')

#### People and time-use data

In [14]:

# Subset of (non-time-use) features to include and unnest

# The features can be found here: https://github.com/alan-turing-institute/uatk-spc/blob/main/synthpop.proto
features = {
    "health": [
        "bmi",
        "has_cardiovascular_disease",
        "has_diabetes",
        "has_high_blood_pressure",
        "self_assessed_health",
        "life_satisfaction",
    ],
    "demographics": ["age_years",
                     "ethnicity",
                     "sex",
                     "nssec8"
    ],
    "employment": ["sic1d2007",
                   "sic2d2007",
                   "pwkstat",
                   "salary_yearly"
    ]

}

# build the table
spc_people_tu = (
    Builder(path, region, backend="polars", input_type="parquet")
    .add_households()
    .add_time_use_diaries(features, diary_type="weekday_diaries")
    .build()
)
spc_people_tu.head()



: 

In [13]:
# save the output
spc_people_tu.write_parquet('../data/spc_output/' + region + '_people_tu.parquet')

In [19]:
spc_people_tu.columns


['id',
 'household',
 'bmi',
 'has_cardiovascular_disease',
 'has_diabetes',
 'has_high_blood_pressure',
 'self_assessed_health',
 'life_satisfaction',
 'age_years',
 'sex',
 'nssec8',
 'pwkstat',
 'salary_yearly',
 'weekday_diaries',
 'uid',
 'weekday',
 'day_type',
 'month',
 'pworkhome',
 'phomeother',
 'pwork',
 'pschool',
 'pshop',
 'pservices',
 'pleisure',
 'pescort',
 'ptransport',
 'phome_total',
 'pnothome_total',
 'punknown_total',
 'pmwalk',
 'pmcycle',
 'pmprivate',
 'pmpublic',
 'pmunknown',
 'age35g']

In [22]:
spc_people_hh['health']

health
struct[7]
"{24.879356,false,false,false,null,3,2}"
"{27.491207,false,false,true,null,3,null}"
"{17.310829,false,true,true,null,2,4}"
"{20.852091,false,false,false,null,2,1}"
"{20.032526,false,false,false,1,2,3}"
"{29.106817,false,false,true,null,1,3}"
"{25.621599,false,false,false,3,3,3}"
"{33.893459,true,false,true,3,1,3}"
"{null,false,false,false,null,1,null}"
"{24.492905,false,false,false,null,4,2}"
