From 96bbc334cf2414b4dc093d83508b0832389d3be4 Mon Sep 17 00:00:00 2001 From: PyMap Date: Tue, 28 Jul 2020 19:20:02 -0300 Subject: [PATCH] draw --- demos/Synthesis workflow.ipynb | 5848 +++++++++++++++++++++++++++----- 1 file changed, 5022 insertions(+), 826 deletions(-) diff --git a/demos/Synthesis workflow.ipynb b/demos/Synthesis workflow.ipynb index 69ffaaf..b5b8b2e 100644 --- a/demos/Synthesis workflow.ipynb +++ b/demos/Synthesis workflow.ipynb @@ -463,7 +463,7 @@ }, { "cell_type": "code", - "execution_count": null, + "execution_count": 7, "metadata": {}, "outputs": [], "source": [ @@ -473,9 +473,17 @@ }, { "cell_type": "code", - "execution_count": null, + "execution_count": 8, "metadata": {}, - "outputs": [], + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "['acsyear', 'c', 'county', 'get_available_geography_ids', 'get_geography_name', 'get_household_joint_dist_for_geography', 'get_household_marginal_for_geography', 'get_num_geographies', 'get_person_joint_dist_for_geography', 'get_person_marginal_for_geography', 'h_acs', 'h_acs_cat', 'h_pums_cols', 'p_acs', 'p_acs_cat', 'p_pums_cols', 'state', 'tract']\n" + ] + } + ], "source": [ "# what do we get from starter class?\n", "print([m for m in dir(starter) if not m.startswith('__')])" @@ -490,9 +498,20 @@ }, { "cell_type": "code", - "execution_count": null, + "execution_count": 9, "metadata": {}, - "outputs": [], + "outputs": [ + { + "data": { + "text/plain": [ + "('02', '290')" + ] + }, + "execution_count": 9, + "metadata": {}, + "output_type": "execute_result" + } + ], "source": [ "# ...state and county pair\n", "starter.state, starter.county" @@ -521,16 +540,143 @@ }, { "cell_type": "code", - "execution_count": null, + "execution_count": 10, "metadata": {}, - "outputs": [], + "outputs": [ + { + "data": { + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
NAMEB19001_001EB19001_002EB19001_003EB19001_004EB19001_005EB19001_006EB19001_007EB19001_008EB19001_009E...B08201_002EB08201_003EB08201_004EB08201_005EB08201_006EB08202_001EB08202_002EB08202_003EB08202_004EB08202_005E
0Block Group 1, Census Tract 1, Yukon-Koyukuk C...187521320112012118...1194016371878974212
1Block Group 2, Census Tract 1, Yukon-Koyukuk C...33668194220112789...2147130513336160133373
\n", + "

2 rows × 127 columns

\n", + "
" + ], + "text/plain": [ + " NAME B19001_001E \\\n", + "0 Block Group 1, Census Tract 1, Yukon-Koyukuk C... 187 \n", + "1 Block Group 2, Census Tract 1, Yukon-Koyukuk C... 336 \n", + "\n", + " B19001_002E B19001_003E B19001_004E B19001_005E B19001_006E \\\n", + "0 52 13 20 11 20 \n", + "1 68 19 42 20 11 \n", + "\n", + " B19001_007E B19001_008E B19001_009E ... B08201_002E B08201_003E \\\n", + "0 12 1 18 ... 119 40 \n", + "1 27 8 9 ... 214 71 \n", + "\n", + " B08201_004E B08201_005E B08201_006E B08202_001E B08202_002E \\\n", + "0 16 3 7 187 89 \n", + "1 30 5 13 336 160 \n", + "\n", + " B08202_003E B08202_004E B08202_005E \n", + "0 74 21 2 \n", + "1 133 37 3 \n", + "\n", + "[2 rows x 127 columns]" + ] + }, + "execution_count": 10, + "metadata": {}, + "output_type": "execute_result" + } + ], "source": [ "starter.h_acs.head(2)" ] }, { "cell_type": "code", - "execution_count": null, + "execution_count": 11, "metadata": {}, "outputs": [], "source": [ @@ -540,27 +686,57 @@ }, { "cell_type": "code", - "execution_count": null, + "execution_count": 12, "metadata": {}, - "outputs": [], + "outputs": [ + { + "data": { + "text/plain": [ + "" + ] + }, + "execution_count": 12, + "metadata": {}, + "output_type": "execute_result" + } + ], "source": [ "starter.c " ] }, { "cell_type": "code", - "execution_count": null, + "execution_count": 13, "metadata": {}, - "outputs": [], + "outputs": [ + { + "data": { + "text/plain": [ + "" + ] + }, + "execution_count": 13, + "metadata": {}, + "output_type": "execute_result" + } + ], "source": [ "c" ] }, { "cell_type": "code", - "execution_count": null, + "execution_count": 14, "metadata": {}, - "outputs": [], + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "['_get_fips_lookup', '_get_pums_relationship', '_query', '_read_csv', '_scale_and_merge', 'acsyear_files', 'base_url', 'block_group_and_tract_query', 'block_group_query', 'c', 'download_household_pums', 'download_population_pums', 'fips_df', 'fips_url', 'pums00_household_base_url', 'pums00_population_base_url', 'pums10_household_base_url', 'pums10_population_base_url', 'pums_cache', 'pums_household_state_base_url', 'pums_population_state_base_url', 'pums_relationship_df', 'pums_relationship_file_url', 'tract_query', 'tract_to_puma', 'try_fips_lookup']\n" + ] + } + ], "source": [ "# that has his own methods \n", "print([ m for m in dir(c) if not m.startswith('__')])" @@ -577,9 +753,20 @@ }, { "cell_type": "code", - "execution_count": null, + "execution_count": 15, "metadata": {}, - "outputs": [], + "outputs": [ + { + "data": { + "text/plain": [ + ">" + ] + }, + "execution_count": 15, + "metadata": {}, + "output_type": "execute_result" + } + ], "source": [ "# we create and merge both tables with:\n", "c.block_group_and_tract_query" @@ -596,9 +783,17 @@ }, { "cell_type": "code", - "execution_count": null, + "execution_count": 16, "metadata": {}, - "outputs": [], + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "['ALL', '_acs', 'acs', 'acs1', 'acs1dp', 'acs3', 'acs3dp', 'acs5', 'acs5dp', 'session', 'sf1', 'sf3']\n" + ] + } + ], "source": [ "# imported methods from census module\n", "print([ m for m in dir(c.c) if not m.startswith('__')])" @@ -613,7 +808,7 @@ }, { "cell_type": "code", - "execution_count": null, + "execution_count": 17, "metadata": {}, "outputs": [], "source": [ @@ -628,7 +823,7 @@ }, { "cell_type": "code", - "execution_count": null, + "execution_count": 18, "metadata": {}, "outputs": [], "source": [ @@ -639,16 +834,110 @@ }, { "cell_type": "code", - "execution_count": null, + "execution_count": 19, "metadata": {}, - "outputs": [], + "outputs": [ + { + "data": { + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
NAMEB11005_001EB11005_002EB11005_011Estatecountytract
0Census Tract 1, Yukon-Koyukuk Census Area, Alaska523140.0383.002290000100
1Census Tract 2, Yukon-Koyukuk Census Area, Alaska549155.0394.002290000200
2Census Tract 3, Yukon-Koyukuk Census Area, Alaska612251.0361.002290000300
3Census Tract 4, Yukon-Koyukuk Census Area, Alaska372151.0221.002290000400
\n", + "
" + ], + "text/plain": [ + " NAME B11005_001E B11005_002E \\\n", + "0 Census Tract 1, Yukon-Koyukuk Census Area, Alaska 523 140.0 \n", + "1 Census Tract 2, Yukon-Koyukuk Census Area, Alaska 549 155.0 \n", + "2 Census Tract 3, Yukon-Koyukuk Census Area, Alaska 612 251.0 \n", + "3 Census Tract 4, Yukon-Koyukuk Census Area, Alaska 372 151.0 \n", + "\n", + " B11005_011E state county tract \n", + "0 383.0 02 290 000100 \n", + "1 394.0 02 290 000200 \n", + "2 361.0 02 290 000300 \n", + "3 221.0 02 290 000400 " + ] + }, + "execution_count": 19, + "metadata": {}, + "output_type": "execute_result" + } + ], "source": [ "pd.DataFrame(tr)" ] }, { "cell_type": "code", - "execution_count": null, + "execution_count": 20, "metadata": {}, "outputs": [], "source": [ @@ -659,761 +948,1792 @@ }, { "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "pd.DataFrame(bg)" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "Both tables are merged at the highest disaggregation level (block group). To do it `scale and merge` Census method scales down from tract to block group level as it is shown below:" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "# tract variables\n", - "starter.h_acs[['NAME']+vehicle_columns]" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "Finally, we build the `acs` table for block group and tracts levels - this process is repeated for persons too, here we only demo households subject tables-." - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "# the None tract value correspond to all the tracts\n", - "h_acs = c.block_group_and_tract_query(block_group_columns=presence_of_children_columns,\n", - " tract_columns=vehicle_columns, \n", - " state='02', county='290',\n", - " merge_columns=['tract', 'county', 'state'],\n", - " block_group_size_attr=\"B11005_001E\",\n", - " tract_size_attr=\"B08201_001E\",\n", - " tract=None, year=2013)" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "h_acs.head(2)" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "### 2.2. `Categorize` subject tables " - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "Once we get the subject data for [households](https://github.com/UDST/synthpop/blob/0bc36f8fef036913b416e0d4eb8a3fda79fc70ad/synthpop/recipes/starter2.py#L52-L68) and [persons](https://github.com/UDST/synthpop/blob/0bc36f8fef036913b416e0d4eb8a3fda79fc70ad/synthpop/recipes/starter2.py#L131-L139) variables, the `acs` dataset is categorized by: " - ] - }, - { - "cell_type": "code", - "execution_count": null, + "execution_count": 21, "metadata": {}, - "outputs": [], - "source": [ - "from synthpop import categorizer as cat" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], + "outputs": [ + { + "data": { + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
NAMEB11005_001EB11005_002EB11005_011Estatecountytractblock group
0Block Group 1, Census Tract 1, Yukon-Koyukuk C...18758.0129.0022900001001
1Block Group 2, Census Tract 1, Yukon-Koyukuk C...33682.0254.0022900001002
2Block Group 1, Census Tract 2, Yukon-Koyukuk C...20857.0151.0022900002001
3Block Group 2, Census Tract 2, Yukon-Koyukuk C...34198.0243.0022900002002
4Block Group 1, Census Tract 3, Yukon-Koyukuk C...234108.0126.0022900003001
5Block Group 2, Census Tract 3, Yukon-Koyukuk C...19076.0114.0022900003002
6Block Group 3, Census Tract 3, Yukon-Koyukuk C...18867.0121.0022900003003
7Block Group 1, Census Tract 4, Yukon-Koyukuk C...18082.098.0022900004001
8Block Group 2, Census Tract 4, Yukon-Koyukuk C...19269.0123.0022900004002
\n", + "
" + ], + "text/plain": [ + " NAME B11005_001E B11005_002E \\\n", + "0 Block Group 1, Census Tract 1, Yukon-Koyukuk C... 187 58.0 \n", + "1 Block Group 2, Census Tract 1, Yukon-Koyukuk C... 336 82.0 \n", + "2 Block Group 1, Census Tract 2, Yukon-Koyukuk C... 208 57.0 \n", + "3 Block Group 2, Census Tract 2, Yukon-Koyukuk C... 341 98.0 \n", + "4 Block Group 1, Census Tract 3, Yukon-Koyukuk C... 234 108.0 \n", + "5 Block Group 2, Census Tract 3, Yukon-Koyukuk C... 190 76.0 \n", + "6 Block Group 3, Census Tract 3, Yukon-Koyukuk C... 188 67.0 \n", + "7 Block Group 1, Census Tract 4, Yukon-Koyukuk C... 180 82.0 \n", + "8 Block Group 2, Census Tract 4, Yukon-Koyukuk C... 192 69.0 \n", + "\n", + " B11005_011E state county tract block group \n", + "0 129.0 02 290 000100 1 \n", + "1 254.0 02 290 000100 2 \n", + "2 151.0 02 290 000200 1 \n", + "3 243.0 02 290 000200 2 \n", + "4 126.0 02 290 000300 1 \n", + "5 114.0 02 290 000300 2 \n", + "6 121.0 02 290 000300 3 \n", + "7 98.0 02 290 000400 1 \n", + "8 123.0 02 290 000400 2 " + ] + }, + "execution_count": 21, + "metadata": {}, + "output_type": "execute_result" + } + ], "source": [ - "cat.categorize" + "pd.DataFrame(bg)" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ - "This give us back a multindexed table with new names:" + "Both tables are merged at the highest disaggregation level (block group). To do it `scale and merge` Census method scales down from tract to block group level as it is shown below:" ] }, { "cell_type": "code", - "execution_count": null, + "execution_count": 22, "metadata": {}, - "outputs": [], + "outputs": [ + { + "data": { + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
NAMEB08201_001EB08201_002EB08201_003EB08201_004EB08201_005EB08201_006E
0Block Group 1, Census Tract 1, Yukon-Koyukuk C...187119401637
1Block Group 2, Census Tract 1, Yukon-Koyukuk C...3362147130513
2Block Group 1, Census Tract 2, Yukon-Koyukuk C...2085173502111
3Block Group 2, Census Tract 2, Yukon-Koyukuk C...34183120833418
4Block Group 1, Census Tract 3, Yukon-Koyukuk C...234127583584
5Block Group 2, Census Tract 3, Yukon-Koyukuk C...190103472873
6Block Group 3, Census Tract 3, Yukon-Koyukuk C...188102462873
7Block Group 1, Census Tract 4, Yukon-Koyukuk C...18091463252
8Block Group 2, Census Tract 4, Yukon-Koyukuk C...19298503553
\n", + "
" + ], + "text/plain": [ + " NAME B08201_001E \\\n", + "0 Block Group 1, Census Tract 1, Yukon-Koyukuk C... 187 \n", + "1 Block Group 2, Census Tract 1, Yukon-Koyukuk C... 336 \n", + "2 Block Group 1, Census Tract 2, Yukon-Koyukuk C... 208 \n", + "3 Block Group 2, Census Tract 2, Yukon-Koyukuk C... 341 \n", + "4 Block Group 1, Census Tract 3, Yukon-Koyukuk C... 234 \n", + "5 Block Group 2, Census Tract 3, Yukon-Koyukuk C... 190 \n", + "6 Block Group 3, Census Tract 3, Yukon-Koyukuk C... 188 \n", + "7 Block Group 1, Census Tract 4, Yukon-Koyukuk C... 180 \n", + "8 Block Group 2, Census Tract 4, Yukon-Koyukuk C... 192 \n", + "\n", + " B08201_002E B08201_003E B08201_004E B08201_005E B08201_006E \n", + "0 119 40 16 3 7 \n", + "1 214 71 30 5 13 \n", + "2 51 73 50 21 11 \n", + "3 83 120 83 34 18 \n", + "4 127 58 35 8 4 \n", + "5 103 47 28 7 3 \n", + "6 102 46 28 7 3 \n", + "7 91 46 32 5 2 \n", + "8 98 50 35 5 3 " + ] + }, + "execution_count": 22, + "metadata": {}, + "output_type": "execute_result" + } + ], "source": [ - "h_acs_cat = cat.categorize(h_acs, {(\"hh_children\", \"yes\"): \"B11005_002E\",\n", - " (\"hh_children\", \"no\"): \"B11005_011E\",\n", - " (\"hh_cars\", \"none\"): \"B08201_002E\",\n", - " (\"hh_cars\", \"one\"): \"B08201_003E\",\n", - " (\"hh_cars\", \"two or more\"):\n", - " \"B08201_004E + B08201_005E + B08201_006E\"},\n", - " index_cols=['state', 'county', 'tract', 'block group'])" + "# tract variables\n", + "starter.h_acs[['NAME']+vehicle_columns]" ] }, { - "cell_type": "code", - "execution_count": null, + "cell_type": "markdown", "metadata": {}, - "outputs": [], "source": [ - "h_acs_cat" + "Finally, we build the `acs` table for block group and tracts levels - this process is repeated for persons too, here we only demo households subject tables-." ] }, { - "cell_type": "markdown", + "cell_type": "code", + "execution_count": 23, "metadata": {}, + "outputs": [], "source": [ - "... that can also be obtained by calling it from the `starter` object:" + "# the None tract value correspond to all the tracts\n", + "h_acs = c.block_group_and_tract_query(block_group_columns=presence_of_children_columns,\n", + " tract_columns=vehicle_columns, \n", + " state='02', county='290',\n", + " merge_columns=['tract', 'county', 'state'],\n", + " block_group_size_attr=\"B11005_001E\",\n", + " tract_size_attr=\"B08201_001E\",\n", + " tract=None, year=2013)" ] }, { "cell_type": "code", - "execution_count": null, + "execution_count": 24, "metadata": {}, - "outputs": [], + "outputs": [ + { + "data": { + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
NAMEB11005_001EB11005_002EB11005_011Estatecountytractblock groupB08201_001EB08201_002EB08201_003EB08201_004EB08201_005EB08201_006E
0Block Group 1, Census Tract 1, Yukon-Koyukuk C...18758129022900001001187119401637
1Block Group 2, Census Tract 1, Yukon-Koyukuk C...336822540229000010023362147130513
\n", + "
" + ], + "text/plain": [ + " NAME B11005_001E \\\n", + "0 Block Group 1, Census Tract 1, Yukon-Koyukuk C... 187 \n", + "1 Block Group 2, Census Tract 1, Yukon-Koyukuk C... 336 \n", + "\n", + " B11005_002E B11005_011E state county tract block group B08201_001E \\\n", + "0 58 129 02 290 000100 1 187 \n", + "1 82 254 02 290 000100 2 336 \n", + "\n", + " B08201_002E B08201_003E B08201_004E B08201_005E B08201_006E \n", + "0 119 40 16 3 7 \n", + "1 214 71 30 5 13 " + ] + }, + "execution_count": 24, + "metadata": {}, + "output_type": "execute_result" + } + ], "source": [ - "# here the entire table built in starter2\n", - "starter.h_acs_cat" + "h_acs.head(2)" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ - "### 2.3. Geography relations for joint distributions" + "### 2.2. `Categorize` subject tables " ] }, { "cell_type": "markdown", "metadata": {}, "source": [ - "![pums](img/download_pums.png)" + "Once we get the subject data for [households](https://github.com/UDST/synthpop/blob/0bc36f8fef036913b416e0d4eb8a3fda79fc70ad/synthpop/recipes/starter2.py#L52-L68) and [persons](https://github.com/UDST/synthpop/blob/0bc36f8fef036913b416e0d4eb8a3fda79fc70ad/synthpop/recipes/starter2.py#L131-L139) variables, the `acs` dataset is categorized by: " ] }, { - "cell_type": "markdown", + "cell_type": "code", + "execution_count": 25, "metadata": {}, + "outputs": [], "source": [ - "As we can see in the diagram, one important method from `Census` constructor is the:" + "from synthpop import categorizer as cat" ] }, { "cell_type": "code", - "execution_count": null, + "execution_count": 26, "metadata": {}, - "outputs": [], + "outputs": [ + { + "data": { + "text/plain": [ + "" + ] + }, + "execution_count": 26, + "metadata": {}, + "output_type": "execute_result" + } + ], "source": [ - "c.tract_to_puma" + "cat.categorize" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ - "... which returns the correspondant puma id for a given tract:" + "This give us back a multindexed table with new names:" ] }, { "cell_type": "code", - "execution_count": null, + "execution_count": 27, "metadata": {}, "outputs": [], "source": [ - "for tract in ['000100','000200','000300','000400']:\n", - " print('puma10 id for tract {} : {}'.format(tract, c.tract_to_puma(state, county, tract)[0]))\n", - " print('puma00 id for tract {} : {}'.format(tract, c.tract_to_puma(state, county, tract)[1]))\n", - " puma10, puma00 = c.tract_to_puma(state, county, tract)" + "h_acs_cat = cat.categorize(h_acs, {(\"hh_children\", \"yes\"): \"B11005_002E\",\n", + " (\"hh_children\", \"no\"): \"B11005_011E\",\n", + " (\"hh_cars\", \"none\"): \"B08201_002E\",\n", + " (\"hh_cars\", \"one\"): \"B08201_003E\",\n", + " (\"hh_cars\", \"two or more\"):\n", + " \"B08201_004E + B08201_005E + B08201_006E\"},\n", + " index_cols=['state', 'county', 'tract', 'block group'])" ] }, { - "cell_type": "markdown", + "cell_type": "code", + "execution_count": 28, "metadata": {}, + "outputs": [ + { + "data": { + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
cat_namehh_carshh_children
cat_valuenoneonetwo or morenoyes
statecountytractblock group
022900001001119402612958
2214714825482
000200151738215157
28312013524398
00030011275847126108
2103473811476
3102463812167
00040019146399882
298504312369
\n", + "
" + ], + "text/plain": [ + "cat_name hh_cars hh_children \n", + "cat_value none one two or more no yes\n", + "state county tract block group \n", + "02 290 000100 1 119 40 26 129 58\n", + " 2 214 71 48 254 82\n", + " 000200 1 51 73 82 151 57\n", + " 2 83 120 135 243 98\n", + " 000300 1 127 58 47 126 108\n", + " 2 103 47 38 114 76\n", + " 3 102 46 38 121 67\n", + " 000400 1 91 46 39 98 82\n", + " 2 98 50 43 123 69" + ] + }, + "execution_count": 28, + "metadata": {}, + "output_type": "execute_result" + } + ], "source": [ - "This information will be mainly used to download `pums` from `aws` or `gcs` (depending on the acs requested year)." + "h_acs_cat" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ - "##### What we call slices?: the `PUMA` geographies " + "... that can also be obtained by calling it from the `starter` object:" ] }, { - "cell_type": "markdown", + "cell_type": "code", + "execution_count": 29, "metadata": {}, + "outputs": [ + { + "data": { + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
cat_namehh_age_of_headhh_carshh_childrenhh_income...hispanic_headseniorssf_detachedtenure_mover
cat_valuegt35-lt65gt65lt35noneonetwo or morenoyesgt100-lt150gt150...noyesnoyesnoyesown not recentown recentrent not recentrent recent
statecountytractblock group
022900001001122254011940261295860...187016225018713593112
221077492147148254823315...33152597720316222104658
000200112367185173821515782...2044138701519314744512
2229763683120135243985925...33922548744297240116921
00030011464345127584712610891...234018450123315094728
212144251034738114764116...1882143476184129132919
3112373910246381216780...188014741418413523318
0004001924939914639988237...180013149018012164211
2108463898504312369166...18841444871859786918
\n", + "

9 rows × 34 columns

\n", + "
" + ], + "text/plain": [ + "cat_name hh_age_of_head hh_cars \\\n", + "cat_value gt35-lt65 gt65 lt35 none one \n", + "state county tract block group \n", + "02 290 000100 1 122 25 40 119 40 \n", + " 2 210 77 49 214 71 \n", + " 000200 1 123 67 18 51 73 \n", + " 2 229 76 36 83 120 \n", + " 000300 1 146 43 45 127 58 \n", + " 2 121 44 25 103 47 \n", + " 3 112 37 39 102 46 \n", + " 000400 1 92 49 39 91 46 \n", + " 2 108 46 38 98 50 \n", + "\n", + "cat_name hh_children hh_income \\\n", + "cat_value two or more no yes gt100-lt150 \n", + "state county tract block group \n", + "02 290 000100 1 26 129 58 6 \n", + " 2 48 254 82 33 \n", + " 000200 1 82 151 57 8 \n", + " 2 135 243 98 59 \n", + " 000300 1 47 126 108 9 \n", + " 2 38 114 76 41 \n", + " 3 38 121 67 8 \n", + " 000400 1 39 98 82 3 \n", + " 2 43 123 69 16 \n", + "\n", + "cat_name ... hispanic_head seniors \\\n", + "cat_value gt150 ... no yes no yes \n", + "state county tract block group ... \n", + "02 290 000100 1 0 ... 187 0 162 25 \n", + " 2 15 ... 331 5 259 77 \n", + " 000200 1 2 ... 204 4 138 70 \n", + " 2 25 ... 339 2 254 87 \n", + " 000300 1 1 ... 234 0 184 50 \n", + " 2 16 ... 188 2 143 47 \n", + " 3 0 ... 188 0 147 41 \n", + " 000400 1 7 ... 180 0 131 49 \n", + " 2 6 ... 188 4 144 48 \n", + "\n", + "cat_name sf_detached tenure_mover \\\n", + "cat_value no yes own not recent own recent \n", + "state county tract block group \n", + "02 290 000100 1 0 187 135 9 \n", + " 2 20 316 222 10 \n", + " 000200 1 15 193 147 4 \n", + " 2 44 297 240 11 \n", + " 000300 1 1 233 150 9 \n", + " 2 6 184 129 13 \n", + " 3 4 184 135 2 \n", + " 000400 1 0 180 121 6 \n", + " 2 7 185 97 8 \n", + "\n", + "cat_name \n", + "cat_value rent not recent rent recent \n", + "state county tract block group \n", + "02 290 000100 1 31 12 \n", + " 2 46 58 \n", + " 000200 1 45 12 \n", + " 2 69 21 \n", + " 000300 1 47 28 \n", + " 2 29 19 \n", + " 3 33 18 \n", + " 000400 1 42 11 \n", + " 2 69 18 \n", + "\n", + "[9 rows x 34 columns]" + ] + }, + "execution_count": 29, + "metadata": {}, + "output_type": "execute_result" + } + ], "source": [ - "![pums_rel](img/pums_tracts.png)" + "# here the entire table built in starter2\n", + "starter.h_acs_cat" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ - "PUMS stands for Public Use Microdata Sample. These are individual records of survey responses with identifying\n", - "information removed.These files do not include every record of every person who responded to the ACS. Only a select few that in turn, are representative of the population. \n", - "\n", - "The ACS samples 3.5 million addresses per year. The 1-year ACS PUMS file contains about 1% of all of the US households. The 5-year ACS PUMS file is the equivalent of five 1-year files, so it includes about 5% of all of the US households.\n", - "\n", - "By contrast, in aggregated tables or summary data, the individual records are categorized and weighted to create an estimate for the larger population. These estimates contains a Margin of Error (which is, to put it into simple words, the percentage of times we do not hit a target population type when we randomly select cases: e.g. 5%, etc. ).\n", - "\n", - "This means that, since `PUMS` microdata provides a sample of the ACS records it is necessary to create an estimate of how many persons/households the raw records represent. This microdata has no geographies smaller than what we call `PUMAs`.\n", - "\n", - "PUMA is an area where the population of over 100,000 in – population of over 100,000, large enough to meet the disclosure avoidance requirements. It is identified by a five-digit code that is unique within each estate, and they nest within state or state equivalence.\n", - "\n", - "PUMAs are redefined after each Decennial Census. It is important to mention that PUMAs redefined after the 2010 Census were first used in 2012 ACS PUMS files. Multi-year files contain dual PUMAs vintages, for example, the 2010 to 2014 ACS PUMS files. PUMAs are built on Census Tracks and Counties, and can be combined to create rough approximations of towns, counties or cities for analysis." + "### 2.3. Geography relations for joint distributions" ] }, { - "cell_type": "code", - "execution_count": null, + "cell_type": "markdown", "metadata": {}, - "outputs": [], "source": [ - "# PUMS variables\n", - "h_pums_cols = ('serialno', 'PUMA00','PUMA10', 'RT', 'NP', 'TYPE',\n", - " 'R65', 'HINCP', 'VEH', 'MV', 'TEN', 'BLD', 'R18')" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "h_pums = c.download_household_pums(state, puma10, puma00, usecols=h_pums_cols)" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "h_pums" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "##### Using `PUMS` files" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "Joint distributions represents a total value for acs queried table. Since this last dataset contains aggregated data for tracts and block groups levels, and given that the PUMS are a representative sample of individual records - each serial number corresponds to a unique answer -, both tables will be used to build target values to be joined during the synthesis. " - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "cat.joint_distribution" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "First, we build a dataframe with the target categories that we obtained querying the `acs subject table`. This, by calling the `category_combinations` method from the categorizer that will return all possible combinations: " - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "cat.category_combinations(h_acs_cat.columns)" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "With this, the `joint_distribution` method will return a sample and categories dataframes. This by following next steps:" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "# mapping functions to return values depending on slices dataframes\n", - "def cars_cat(r):\n", - " if r.VEH == 0:\n", - " return \"none\"\n", - " elif r.VEH == 1:\n", - " return \"one\"\n", - " return \"two or more\"\n", - "\n", - "def children_cat(r):\n", - " if r.R18 == 1:\n", - " return \"yes\"\n", - " return \"no\"" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "h_pums, jd_households = cat.joint_distribution(h_pums,\n", - " cat.category_combinations(h_acs_cat.columns),\n", - " {\"hh_cars\": cars_cat,\n", - " \"hh_children\": children_cat,\n", - " })" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "- **1. Categories dataframe**\n", - "\n", - "This cointains the amount of cases (or frequencies) for each category combination within the PUMA geography. It is important to remark that this totals corresponds to all the tracts we passed to the `tract to puma` function. Given that PUMS returns individual answers for a bunch of variables, we can combine them based on acs subject table columns and get the totals of each combination for a group tracts. This will return a total value for a group of tracts (or PUMA). PUMAs are unique whitin a state." - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "jd_households" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "- **2. Sample dataframe**\n", - "\n", - "This is the microdata puma df we downloaded for tracts 100 to 400, with a new `cat_id` combination column:" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "h_pums" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "**Until this point, we...**:\n", - "1. Queried the `acs5` subject table to get persons and households variables at block group and tracts levels\n", - "2. Downloaded sample files (PUMS) containing the same variables with different standard names.\n", - "3. Matched the tracts of the `county`, `state` pair with a puma id.\n", - "4. Built combination of variables at puma level - or all the tracts that are cointained inside a puma id (e.g. households with no children and no cars)" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "### 2.4. Marginals, the block group level" + "![pums](img/download_pums.png)" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ - "Marginals are called from the `starter` object inside the `synthesize_all` function and returns..." - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "#...for a group of available geographies inside a county/state pair\n", - "list(starter.get_available_geography_ids())[0]" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "indexes = starter.get_available_geography_ids()" + "As we can see in the diagram, one important method from `Census` constructor is the:" ] }, { "cell_type": "code", - "execution_count": null, + "execution_count": 30, "metadata": {}, - "outputs": [], + "outputs": [ + { + "data": { + "text/plain": [ + ">" + ] + }, + "execution_count": 30, + "metadata": {}, + "output_type": "execute_result" + } + ], "source": [ - "for geog_id in indexes:\n", - " h_marg = starter.get_household_marginal_for_geography(geog_id)" + "c.tract_to_puma" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ - "***...the total value of a given variable for the block group geography levels:***" + "... which returns the correspondant puma id for a given tract:" ] }, { "cell_type": "code", - "execution_count": null, + "execution_count": 31, "metadata": {}, - "outputs": [], + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "puma10 id for tract 000100 : 00400\n", + "puma00 id for tract 000100 : 00400\n", + "puma10 id for tract 000200 : 00400\n", + "puma00 id for tract 000200 : 00400\n", + "puma10 id for tract 000300 : 00400\n", + "puma00 id for tract 000300 : 00400\n", + "puma10 id for tract 000400 : 00400\n", + "puma00 id for tract 000400 : 00400\n" + ] + } + ], "source": [ - "# This is the marginal table we stored for the last census tract (400)\n", - "h_marg" + "for tract in ['000100','000200','000300','000400']:\n", + " print('puma10 id for tract {} : {}'.format(tract, c.tract_to_puma(state, county, tract)[0]))\n", + " print('puma00 id for tract {} : {}'.format(tract, c.tract_to_puma(state, county, tract)[1]))\n", + " puma10, puma00 = c.tract_to_puma(state, county, tract)" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ - "For example, in our acs5 categorized table, in the block group 1 of the 000400 tract (we only stored the last geography since we ran a for loop) there are 82 households with childs and 46 with one car." - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "# can check these totals in our acs subject table...\n", - "h_acs_cat" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "# store the marginal values from the block group 1 of the 000400 census tract\n", - "hh_marginals_tract_400_block_gp_1 = h_acs_cat.loc[tuple(list(starter.get_available_geography_ids())[7])]" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "hh_marginals_tract_400_block_gp_1" + "This information will be mainly used to download `pums` from `aws` or `gcs` (depending on the acs requested year)." ] }, { "cell_type": "markdown", "metadata": {}, "source": [ - "## 3. Sinthesize all: using `Starter` outputs " + "##### What we call slices?: the `PUMA` geographies " ] }, { "cell_type": "markdown", "metadata": {}, "source": [ - "### 3.1 The iterative proportional fitting (IPF) procedure " + "![pums_rel](img/pums_tracts.png)" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ - "![constraints](img/constraints.png)" + "PUMS stands for Public Use Microdata Sample. These are individual records of survey responses with identifying\n", + "information removed.These files do not include every record of every person who responded to the ACS. Only a select few that in turn, are representative of the population. \n", + "\n", + "The ACS samples 3.5 million addresses per year. The 1-year ACS PUMS file contains about 1% of all of the US households. The 5-year ACS PUMS file is the equivalent of five 1-year files, so it includes about 5% of all of the US households.\n", + "\n", + "By contrast, in aggregated tables or summary data, the individual records are categorized and weighted to create an estimate for the larger population. These estimates contains a Margin of Error (which is, to put it into simple words, the percentage of times we do not hit a target population type when we randomly select cases: e.g. 5%, etc. ).\n", + "\n", + "This means that, since `PUMS` microdata provides a sample of the ACS records it is necessary to create an estimate of how many persons/households the raw records represent. This microdata has no geographies smaller than what we call `PUMAs`.\n", + "\n", + "PUMA is an area where the population of over 100,000 in – population of over 100,000, large enough to meet the disclosure avoidance requirements. It is identified by a five-digit code that is unique within each estate, and they nest within state or state equivalence.\n", + "\n", + "PUMAs are redefined after each Decennial Census. It is important to mention that PUMAs redefined after the 2010 Census were first used in 2012 ACS PUMS files. Multi-year files contain dual PUMAs vintages, for example, the 2010 to 2014 ACS PUMS files. PUMAs are built on Census Tracks and Counties, and can be combined to create rough approximations of towns, counties or cities for analysis." ] }, { "cell_type": "code", - "execution_count": null, + "execution_count": 32, "metadata": {}, "outputs": [], "source": [ - "from synthpop.ipf.ipf import calculate_constraints" + "# PUMS variables\n", + "h_pums_cols = ('serialno', 'PUMA00','PUMA10', 'RT', 'NP', 'TYPE',\n", + " 'R65', 'HINCP', 'VEH', 'MV', 'TEN', 'BLD', 'R18')" ] }, { "cell_type": "code", - "execution_count": null, + "execution_count": 33, "metadata": {}, - "outputs": [], + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Downloading households pums from https://s3-us-west-1.amazonaws.com/synthpop-data2/\n", + "Reading PUMS00 from https://s3-us-west-1.amazonaws.com/synthpop-data2/\n" + ] + } + ], "source": [ - "h_constraint, _ = calculate_constraints(hh_marginals_tract_400_block_gp_1, jd_households.frequency)" + "h_pums = c.download_household_pums(state, puma10, puma00, usecols=h_pums_cols)" ] }, { "cell_type": "code", - "execution_count": null, + "execution_count": 34, "metadata": {}, - "outputs": [], - "source": [ - "# this is our contraints table\n", - "h_constraint" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], + "outputs": [ + { + "data": { + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
serialnoRTpuma00puma10NPTYPEBLDTENVEHHINCPMVR18R65
02012000000643H-9400212.03.00.033360.01.00.00.0
22012000001889H-9400412.02.00.089420.07.00.02.0
32012000003083H-9400112.02.00.020800.07.00.00.0
42012000004753H-9400712.02.00.026300.06.01.00.0
52012000005450H-9400313.03.01.065300.01.01.00.0
..........................................
58572011001481195H400-9612.03.01.0165600.07.01.01.0
58582011001486432H400-9512.02.00.023900.05.01.00.0
58622011001495660H400-9312.02.02.029500.07.00.02.0
58652011001496296H400-9112.04.00.0900.07.00.00.0
58682011001498413H400-9412.03.01.019600.02.01.00.0
\n", + "

3625 rows × 13 columns

\n", + "
" + ], + "text/plain": [ + " serialno RT puma00 puma10 NP TYPE BLD TEN VEH HINCP MV \\\n", + "0 2012000000643 H -9 400 2 1 2.0 3.0 0.0 33360.0 1.0 \n", + "2 2012000001889 H -9 400 4 1 2.0 2.0 0.0 89420.0 7.0 \n", + "3 2012000003083 H -9 400 1 1 2.0 2.0 0.0 20800.0 7.0 \n", + "4 2012000004753 H -9 400 7 1 2.0 2.0 0.0 26300.0 6.0 \n", + "5 2012000005450 H -9 400 3 1 3.0 3.0 1.0 65300.0 1.0 \n", + "... ... .. ... ... .. ... ... ... ... ... ... \n", + "5857 2011001481195 H 400 -9 6 1 2.0 3.0 1.0 165600.0 7.0 \n", + "5858 2011001486432 H 400 -9 5 1 2.0 2.0 0.0 23900.0 5.0 \n", + "5862 2011001495660 H 400 -9 3 1 2.0 2.0 2.0 29500.0 7.0 \n", + "5865 2011001496296 H 400 -9 1 1 2.0 4.0 0.0 900.0 7.0 \n", + "5868 2011001498413 H 400 -9 4 1 2.0 3.0 1.0 19600.0 2.0 \n", + "\n", + " R18 R65 \n", + "0 0.0 0.0 \n", + "2 0.0 2.0 \n", + "3 0.0 0.0 \n", + "4 1.0 0.0 \n", + "5 1.0 0.0 \n", + "... ... ... \n", + "5857 1.0 1.0 \n", + "5858 1.0 0.0 \n", + "5862 0.0 2.0 \n", + "5865 0.0 0.0 \n", + "5868 1.0 0.0 \n", + "\n", + "[3625 rows x 13 columns]" + ] + }, + "execution_count": 34, + "metadata": {}, + "output_type": "execute_result" + } + ], "source": [ - "# this is the number of iterations performed to achieve the constraint values\n", - "_" + "h_pums" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ - "In the block group 1 of the census tract 400 we used to have 91 households with no cars, 46 with one car and 39 with two or more:" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "bg_targets = hh_marginals_tract_400_block_gp_1[:3]" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "bg_targets" + "##### Using `PUMS` files" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ - "At the same time, our `joint_distributions` table gave us the total of that variable for the entire PUMA (in this case, `843 households with no children + 919 households with one children = 1762 households with no cars`:" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "jd_households[:2]" + "Joint distributions represents a total value for acs queried table. Since this last dataset contains aggregated data for tracts and block groups levels, and given that the PUMS are a representative sample of individual records - each serial number corresponds to a unique answer -, both tables will be used to build target values to be joined during the synthesis. " ] }, { "cell_type": "code", - "execution_count": null, + "execution_count": 35, "metadata": {}, - "outputs": [], + "outputs": [ + { + "data": { + "text/plain": [ + "" + ] + }, + "execution_count": 35, + "metadata": {}, + "output_type": "execute_result" + } + ], "source": [ - "jd_households['frequency'][:2].sum()" + "cat.joint_distribution" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ - "With this information, we will build new target values for each category total within the block group by multiplying the target and the proportion that this target represents in the entire PUMA - remember here, that our joint distribution table gets this total by combining different variables. In our case number of cars and children -\n", - "\n", - "`new_target = current_target * (proportion of the current target)`" + "First, we build a dataframe with the target categories that we obtained querying the `acs subject table`. This, by calling the `category_combinations` method from the categorizer that will return all possible combinations: " ] }, { "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "sub_category_idx_0 = 0\n", - "sub_category_idx_1 = 2\n", - "\n", - "for target in range(len(bg_targets)):\n", - " total_cat = jd_households.frequency[sub_category_idx_0:sub_category_idx_1].sum()\n", - " sub_category_idx_0 += 2\n", - " sub_category_idx_1 += 2\n", - " print('Block group target value: %s'%str(bg_targets[target]))\n", - " print('Total value of the combined category is: %s'%str(total_cat))\n", - " print('Proportion of each category in the combined variables total: %s'%str(bg_targets[target] / total_cat))\n", - " print('New Block group target value: %s'%str(bg_targets[target]*(bg_targets[target] / total_cat)))\n", - " print('*********************************************************************************')" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "These new values that were built based on the proportion that each block group represents in the PUMA (for every category value) will be used to update our joint distributions table: " - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "jd_households.values" - ] - }, - { - "cell_type": "markdown", + "execution_count": 36, "metadata": {}, + "outputs": [ + { + "data": { + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
cat_id
hh_carshh_children
noneno0
yes1
oneno2
yes3
two or moreno4
yes5
\n", + "
" + ], + "text/plain": [ + " cat_id\n", + "hh_cars hh_children \n", + "none no 0\n", + " yes 1\n", + "one no 2\n", + " yes 3\n", + "two or more no 4\n", + " yes 5" + ] + }, + "execution_count": 36, + "metadata": {}, + "output_type": "execute_result" + } + ], "source": [ - "...the totals we had for each `category combination`: 0; 1; 2; 3; 4 & 5 will be replaced using that proportion. This way we will define a new maximum value for every category within the block group." + "cat.category_combinations(h_acs_cat.columns)" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ - "Imagine we have a constraints table with categories from 0 to 2 (has no car and no children, has no car but children and has one car and no children) as we shown in the example above. We will need to:" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "current_constraints = jd_households[:3].values.copy()" + "With this, the `joint_distribution` method will return a sample and categories dataframes. This by following next steps:" ] }, { "cell_type": "code", - "execution_count": null, + "execution_count": 37, "metadata": {}, "outputs": [], "source": [ - "# 1. define new targets based on proportions\n", - "new_targets = [4.69977298524404, 2.1075697211155378, 1.7706635622817228]\n", + "# mapping functions to return values depending on slices dataframes\n", + "def cars_cat(r):\n", + " if r.VEH == 0:\n", + " return \"none\"\n", + " elif r.VEH == 1:\n", + " return \"one\"\n", + " return \"two or more\"\n", "\n", - "next_constraints = current_constraints.astype(float)" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "# 2. update the constraints table\n", - "fre = -1\n", - "for t in new_targets:\n", - " fre +=1 \n", - " next_constraints[fre][1] = t" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "next_constraints" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "These values will be updated inside an iterative process, where previous and next constraints will be evaluated under a maximum of `tolerance`: " - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "tolerance = 1e-3" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "np.abs(current_constraints, next_constraints).sum()" + "def children_cat(r):\n", + " if r.R18 == 1:\n", + " return \"yes\"\n", + " return \"no\"" ] }, { "cell_type": "code", - "execution_count": null, + "execution_count": 38, "metadata": {}, "outputs": [], "source": [ - "if np.abs(current_constraints, next_constraints).sum()>tolerance:\n", - " print('Recalculate constraints table')" + "h_pums, jd_households = cat.joint_distribution(h_pums,\n", + " cat.category_combinations(h_acs_cat.columns),\n", + " {\"hh_cars\": cars_cat,\n", + " \"hh_children\": children_cat,\n", + " })" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ - "This way, the iteration will continue until reaching a group of new target values under the tolerance defined before. The man result of this process will be..." - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "# ... our marginals table with total values for the block group\n", - "hh_marginals_tract_400_block_gp_1" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "# updated with new totals.\n", - "h_constraint" + "- **1. Categories dataframe**\n", + "\n", + "This cointains the amount of cases (or frequencies) for each category combination within the PUMA geography. It is important to remark that this totals corresponds to all the tracts we passed to the `tract to puma` function. Given that PUMS returns individual answers for a bunch of variables, we can combine them based on acs subject table columns and get the totals of each combination for a group tracts. This will return a total value for a group of tracts (or PUMA). PUMAs are unique whitin a state." ] }, { "cell_type": "code", - "execution_count": null, + "execution_count": 39, "metadata": {}, - "outputs": [], + "outputs": [ + { + "data": { + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
cat_idfrequency
hh_carshh_children
noneno0843
yes1919
oneno2573
yes3431
two or moreno4491
yes5368
\n", + "
" + ], + "text/plain": [ + " cat_id frequency\n", + "hh_cars hh_children \n", + "none no 0 843\n", + " yes 1 919\n", + "one no 2 573\n", + " yes 3 431\n", + "two or more no 4 491\n", + " yes 5 368" + ] + }, + "execution_count": 39, + "metadata": {}, + "output_type": "execute_result" + } + ], "source": [ - "# that will be used to update joint distributions frequencies\n", "jd_households" ] }, @@ -1421,547 +2741,3423 @@ "cell_type": "markdown", "metadata": {}, "source": [ - "Consuming the Census API we built a dataset with, for example, 91 aggregated cases for households with no car. Building the constraints table, we used PUMA geographies to determine the proportion that these 91 cases represents in a more dissaggregated level. With this estimation we determine that instead of 91, the block group 1 of the census tract 00400 from the Yukon-Koyukuk county in the Alaska state has 46.5 households with no car. " - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "### 3.2. The iterative proportional updating (IPU) procedure " - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "![ipu](img/ipf_ipu.png)" + "- **2. Sample dataframe**\n", + "\n", + "This is the microdata puma df we downloaded for tracts 100 to 400, with a new `cat_id` combination column:" ] }, { "cell_type": "code", - "execution_count": null, + "execution_count": 40, "metadata": {}, - "outputs": [], - "source": [ - "from synthpop.ipu.ipu import household_weights" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], + "outputs": [ + { + "data": { + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
serialnoRTpuma00puma10NPTYPEBLDTENVEHHINCPMVR18R65hh_carshh_childrencat_id
02012000000643H-9400212.03.00.033360.01.00.00.0noneno0
22012000001889H-9400412.02.00.089420.07.00.02.0noneno0
32012000003083H-9400112.02.00.020800.07.00.00.0noneno0
152012000017291H-9400412.02.00.054800.05.00.00.0noneno0
312012000041419H-9400212.02.00.033300.07.00.01.0noneno0
...................................................
57622011001370081H400-9112.02.06.0300300.05.00.00.0two or moreno4
57712011001380825H400-9212.04.02.0173600.01.00.00.0two or moreno4
58352011001457691H400-9211.03.02.097400.03.00.00.0two or moreno4
58432011001464975H400-9412.01.02.08650.05.00.00.0two or moreno4
58622011001495660H400-9312.02.02.029500.07.00.02.0two or moreno4
\n", + "

3625 rows × 16 columns

\n", + "
" + ], + "text/plain": [ + " serialno RT puma00 puma10 NP TYPE BLD TEN VEH HINCP MV \\\n", + "0 2012000000643 H -9 400 2 1 2.0 3.0 0.0 33360.0 1.0 \n", + "2 2012000001889 H -9 400 4 1 2.0 2.0 0.0 89420.0 7.0 \n", + "3 2012000003083 H -9 400 1 1 2.0 2.0 0.0 20800.0 7.0 \n", + "15 2012000017291 H -9 400 4 1 2.0 2.0 0.0 54800.0 5.0 \n", + "31 2012000041419 H -9 400 2 1 2.0 2.0 0.0 33300.0 7.0 \n", + "... ... .. ... ... .. ... ... ... ... ... ... \n", + "5762 2011001370081 H 400 -9 1 1 2.0 2.0 6.0 300300.0 5.0 \n", + "5771 2011001380825 H 400 -9 2 1 2.0 4.0 2.0 173600.0 1.0 \n", + "5835 2011001457691 H 400 -9 2 1 1.0 3.0 2.0 97400.0 3.0 \n", + "5843 2011001464975 H 400 -9 4 1 2.0 1.0 2.0 8650.0 5.0 \n", + "5862 2011001495660 H 400 -9 3 1 2.0 2.0 2.0 29500.0 7.0 \n", + "\n", + " R18 R65 hh_cars hh_children cat_id \n", + "0 0.0 0.0 none no 0 \n", + "2 0.0 2.0 none no 0 \n", + "3 0.0 0.0 none no 0 \n", + "15 0.0 0.0 none no 0 \n", + "31 0.0 1.0 none no 0 \n", + "... ... ... ... ... ... \n", + "5762 0.0 0.0 two or more no 4 \n", + "5771 0.0 0.0 two or more no 4 \n", + "5835 0.0 0.0 two or more no 4 \n", + "5843 0.0 0.0 two or more no 4 \n", + "5862 0.0 2.0 two or more no 4 \n", + "\n", + "[3625 rows x 16 columns]" + ] + }, + "execution_count": 40, + "metadata": {}, + "output_type": "execute_result" + } + ], "source": [ - "h_constraint.index = jd_households.cat_id" + "h_pums" ] }, { - "cell_type": "code", - "execution_count": null, + "cell_type": "markdown", "metadata": {}, - "outputs": [], "source": [ - "h_constraint" + "**Until this point, we...**:\n", + "1. Queried the `acs5` subject table to get persons and households variables at block group and tracts levels\n", + "2. Downloaded sample files (PUMS) containing the same variables with different standard names.\n", + "3. Matched the tracts of the `county`, `state` pair with a puma id.\n", + "4. Built combination of variables at puma level - or all the tracts that are cointained inside a puma id (e.g. households with no children and no cars)" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ - "Besides the `categories dataframe` - our joint distributions table- we mentioned in `2.3`, there is a `sample dataframe` which is the public user microdata sample - our pums." + "### 2.4. Marginals, the block group level" ] }, { - "cell_type": "code", - "execution_count": null, + "cell_type": "markdown", "metadata": {}, - "outputs": [], "source": [ - "households_sample_df = h_pums.copy()" + "Marginals are called from the `starter` object inside the `synthesize_all` function and returns..." ] }, { "cell_type": "code", - "execution_count": null, + "execution_count": 41, "metadata": {}, - "outputs": [], + "outputs": [ + { + "data": { + "text/plain": [ + "state 02\n", + "county 290\n", + "tract 000100\n", + "block group 1\n", + "dtype: object" + ] + }, + "execution_count": 41, + "metadata": {}, + "output_type": "execute_result" + } + ], "source": [ - "households_sample_df.head()" + "#...for a group of available geographies inside a county/state pair\n", + "list(starter.get_available_geography_ids())[0]" ] }, { "cell_type": "code", - "execution_count": null, + "execution_count": 42, "metadata": {}, "outputs": [], "source": [ - "households_sample_df.index.name = \"hh_id\"" + "indexes = starter.get_available_geography_ids()" ] }, { "cell_type": "code", - "execution_count": null, + "execution_count": 43, "metadata": {}, "outputs": [], "source": [ - "households_sample_df = households_sample_df.reset_index().set_index(\"serialno\")" + "for geog_id in indexes:\n", + " h_marg = starter.get_household_marginal_for_geography(geog_id)" ] }, { - "cell_type": "code", - "execution_count": null, + "cell_type": "markdown", "metadata": {}, - "outputs": [], "source": [ - "h_freq_table = cat._frequency_table(households_sample_df, jd_households.cat_id)" + "***...the total value of a given variable for the block group geography levels:***" ] }, { "cell_type": "code", - "execution_count": null, + "execution_count": 44, "metadata": {}, - "outputs": [], - "source": [ - "h_freq_table = h_freq_table.sort_index(axis=1)" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], + "outputs": [ + { + "data": { + "text/plain": [ + "cat_name cat_value \n", + "hh_age_of_head gt35-lt65 108\n", + " gt65 46\n", + " lt35 38\n", + "hh_cars none 98\n", + " one 50\n", + " two or more 43\n", + "hh_children no 123\n", + " yes 69\n", + "hh_income gt100-lt150 16\n", + " gt150 6\n", + " gt30-lt60 32\n", + " gt60-lt100 81\n", + " lt30 57\n", + "hh_race_of_head asian 0\n", + " black 2\n", + " other 108\n", + " white 82\n", + "hh_size four or more 36\n", + " one 61\n", + " three 44\n", + " two 51\n", + "hh_workers none 69\n", + " one 77\n", + " two or more 45\n", + "hispanic_head no 188\n", + " yes 4\n", + "seniors no 144\n", + " yes 48\n", + "sf_detached no 7\n", + " yes 185\n", + "tenure_mover own not recent 97\n", + " own recent 8\n", + " rent not recent 69\n", + " rent recent 18\n", + "Name: (02, 290, 000400, 2), dtype: int64" + ] + }, + "execution_count": 44, + "metadata": {}, + "output_type": "execute_result" + } + ], "source": [ - "h_freq_table.head()" + "# This is the marginal table we stored for the last census tract (400)\n", + "h_marg" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ - "Togheter, frequency tables and constraints will be used for weights matrix and fit quality:" + "For example, in our acs5 categorized table, in the block group 1 of the 000400 tract (we only stored the last geography since we ran a for loop) there are 82 households with childs and 46 with one car." ] }, { "cell_type": "code", - "execution_count": null, + "execution_count": 45, "metadata": {}, - "outputs": [], + "outputs": [ + { + "data": { + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
cat_namehh_carshh_children
cat_valuenoneonetwo or morenoyes
statecountytractblock group
022900001001119402612958
2214714825482
000200151738215157
28312013524398
00030011275847126108
2103473811476
3102463812167
00040019146399882
298504312369
\n", + "
" + ], + "text/plain": [ + "cat_name hh_cars hh_children \n", + "cat_value none one two or more no yes\n", + "state county tract block group \n", + "02 290 000100 1 119 40 26 129 58\n", + " 2 214 71 48 254 82\n", + " 000200 1 51 73 82 151 57\n", + " 2 83 120 135 243 98\n", + " 000300 1 127 58 47 126 108\n", + " 2 103 47 38 114 76\n", + " 3 102 46 38 121 67\n", + " 000400 1 91 46 39 98 82\n", + " 2 98 50 43 123 69" + ] + }, + "execution_count": 45, + "metadata": {}, + "output_type": "execute_result" + } + ], "source": [ - "best_weights, fit_quality, iterations = household_weights(h_freq_table,\n", - " None,\n", - " h_constraint,\n", - " None)" + "# can check these totals in our acs subject table...\n", + "h_acs_cat" ] }, { "cell_type": "code", - "execution_count": null, + "execution_count": 46, "metadata": {}, "outputs": [], "source": [ - "from synthpop.ipu.ipu import _FrequencyAndConstraints" + "# store the marginal values from the block group 1 of the 000400 census tract\n", + "hh_marginals_tract_400_block_gp_1 = h_acs_cat.loc[tuple(list(starter.get_available_geography_ids())[7])]" ] }, { "cell_type": "code", - "execution_count": null, + "execution_count": 47, "metadata": {}, - "outputs": [], + "outputs": [ + { + "data": { + "text/plain": [ + "cat_name cat_value \n", + "hh_cars none 91\n", + " one 46\n", + " two or more 39\n", + "hh_children no 98\n", + " yes 82\n", + "Name: (02, 290, 000400, 1), dtype: int64" + ] + }, + "execution_count": 47, + "metadata": {}, + "output_type": "execute_result" + } + ], "source": [ - "freq_wrap = _FrequencyAndConstraints(h_freq_table, h_constraint)" + "hh_marginals_tract_400_block_gp_1" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ - "This wrapper returns every `cat_id` column with non zero values:\n", - "\n", - "* `0` represents households with no car and no children \n", - "* `1` represents households with no car and children \n", - "* `2` represents households with one car and no children\n", - "* `3` represents households with one car and children\n", - "* `4` represents households with two or more cars and no children\n", - "* `5` represents households with two or more cars and children\n", - "\n", - "The wrapper returns each variables combination (from 0 to 5) inside a tuple with:\n", - "\n", - "1) `cat_id` value\n", - "\n", - "2) a `weights matrix`\n", - "\n", - "3) the new target value (the `constraint` or maximum value that the combination can reach within the block group) we built in the `ipf`.\n", - "\n", - "4) the index of non zero column values " - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "# here the information for all the combinations\n", - "freq_wrap.iter_columns()" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "# and here, we can see for category \"0\" that non zero values are...\n", - "freq_wrap.get_column(0)" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "#... in the index 0, 1, 2, 7, (...)\n", - "h_freq_table[0][0:8]" + "## 3. Sinthesize all: using `Starter` outputs " ] }, { "cell_type": "markdown", "metadata": {}, "source": [ - "With this wrapper, we build the fit quality od each `cat_id`" + "### 3.1 The iterative proportional fitting (IPF) procedure " ] }, { - "cell_type": "code", - "execution_count": null, + "cell_type": "markdown", "metadata": {}, - "outputs": [], "source": [ - "from synthpop.ipu.ipu import _average_fit_quality\n", - "from synthpop.ipu.ipu import _fit_quality" + "![constraints](img/constraints.png)" ] }, { "cell_type": "code", - "execution_count": null, + "execution_count": 48, "metadata": {}, "outputs": [], "source": [ - "# weights matrix\n", - "weights = np.ones(len(h_freq_table), dtype='float')\n", - "\n", - "# column (this is the non-zero elements of the \"0\" cat_id column of the frequency table)\n", - "cat_id_0_column = [e for e in freq_wrap.get_column(0)][1]\n", - "\n", - "# nz (this is the idx of the frequency table where non zero values are stored)\n", - "cat_id_0_nz = [e for e in freq_wrap.get_column(0)][3]" + "from synthpop.ipf.ipf import calculate_constraints" ] }, { "cell_type": "code", - "execution_count": null, + "execution_count": 49, "metadata": {}, "outputs": [], "source": [ - "# the non zero values should have the same length when filtering the weights matrix\n", - "len(cat_id_0_column) == len(weights[cat_id_0_nz])" + "h_constraint, _ = calculate_constraints(hh_marginals_tract_400_block_gp_1, jd_households.frequency)" ] }, { "cell_type": "code", - "execution_count": null, + "execution_count": 50, "metadata": {}, - "outputs": [], + "outputs": [ + { + "data": { + "text/plain": [ + "hh_cars hh_children\n", + "none no 46.529753\n", + " yes 46.538428\n", + "one no 27.835805\n", + " yes 19.209650\n", + "two or more no 23.634442\n", + " yes 16.251921\n", + "dtype: float64" + ] + }, + "execution_count": 50, + "metadata": {}, + "output_type": "execute_result" + } + ], "source": [ - "# the new target value for the block group\n", - "constraint = [e for e in freq_wrap.get_column(0)][2]" + "# this is our contraints table\n", + "h_constraint" ] }, { "cell_type": "code", - "execution_count": null, + "execution_count": 51, "metadata": {}, - "outputs": [], + "outputs": [ + { + "data": { + "text/plain": [ + "4" + ] + }, + "execution_count": 51, + "metadata": {}, + "output_type": "execute_result" + } + ], "source": [ - "# this is the \"fit quality\" value for cat_id \"0\"\n", - "_fit_quality(cat_id_0_column, weights[cat_id_0_nz], constraint)" + "# this is the number of iterations performed to achieve the constraint values\n", + "_" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ - "This value is the result of multiplying the non-zero values by the weights matrix, which returns the total frequency of the category within the PUMA: " + "In the block group 1 of the census tract 400 we used to have 91 households with no cars, 46 with one car and 39 with two or more:" ] }, { "cell_type": "code", - "execution_count": null, + "execution_count": 52, "metadata": {}, "outputs": [], "source": [ - "(cat_id_0_column * weights[cat_id_0_nz]).sum()" + "bg_targets = hh_marginals_tract_400_block_gp_1[:3]" ] }, { "cell_type": "code", - "execution_count": null, + "execution_count": 53, "metadata": {}, - "outputs": [], + "outputs": [ + { + "data": { + "text/plain": [ + "cat_name cat_value \n", + "hh_cars none 91\n", + " one 46\n", + " two or more 39\n", + "Name: (02, 290, 000400, 1), dtype: int64" + ] + }, + "execution_count": 53, + "metadata": {}, + "output_type": "execute_result" + } + ], "source": [ - "# Here the total households with no car and no children(843)\n", - "jd_households" + "bg_targets" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ - "The sum of non zero values is weighted and then reduced by substracting the constraints..." + "At the same time, our `joint_distributions` table gave us the total of that variable for the entire PUMA (in this case, `843 households with no children + 919 households with one children = 1762 households with no cars`:" ] }, { "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "(cat_id_0_column * weights[cat_id_0_nz]).sum() - constraint" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "... and finally is expressed in terms of the constraint for that category - the 𝛿 parameter described in the IPU paper -. The absolute value of the relative difference between the weighted sum and the corresponding constraint may be used as a goodness of fit measure and is defined as: " - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "((cat_id_0_column * weights[cat_id_0_nz]).sum() - constraint) / constraint" - ] - }, - { - "cell_type": "markdown", + "execution_count": 54, "metadata": {}, + "outputs": [ + { + "data": { + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
cat_idfrequency
hh_carshh_children
noneno0843
yes1919
\n", + "
" + ], + "text/plain": [ + " cat_id frequency\n", + "hh_cars hh_children \n", + "none no 0 843\n", + " yes 1 919" + ] + }, + "execution_count": 54, + "metadata": {}, + "output_type": "execute_result" + } + ], "source": [ - "This is basically showing the proportion of the `cat_id` within the entire population of the PUMA. In the example of this first iteration, we have that for each household that `cat_id` == `0`, there are 17 households that could be out of that category (`cat_id` != `0`). " + "jd_households[:2]" ] }, { - "cell_type": "markdown", + "cell_type": "code", + "execution_count": 55, "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "1762" + ] + }, + "execution_count": 55, + "metadata": {}, + "output_type": "execute_result" + } + ], "source": [ - "![constraints](img/ipu.png)" + "jd_households['frequency'][:2].sum()" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ - "This process will be repeated for all household types (or `cat_id`) and build a unique average value (which is the sum of each `_fit_quality` result divided by the number of `cat_id` columns):" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "fit_qual_0_to_5 = _average_fit_quality(freq_wrap, weights)" + "With this information, we will build new target values for each category total within the block group by multiplying the target and the proportion that this target represents in the entire PUMA - remember here, that our joint distribution table gets this total by combining different variables. In our case number of cars and children -\n", + "\n", + "`new_target = current_target * (proportion of the current target)`" ] }, { "cell_type": "code", - "execution_count": null, + "execution_count": 56, "metadata": {}, - "outputs": [], + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Block group target value: 91\n", + "Total value of the combined category is: 1762\n", + "Proportion of each category in the combined variables total: 0.051645856980703744\n", + "New Block group target value: 4.69977298524404\n", + "*********************************************************************************\n", + "Block group target value: 46\n", + "Total value of the combined category is: 1004\n", + "Proportion of each category in the combined variables total: 0.045816733067729085\n", + "New Block group target value: 2.1075697211155378\n", + "*********************************************************************************\n", + "Block group target value: 39\n", + "Total value of the combined category is: 859\n", + "Proportion of each category in the combined variables total: 0.04540162980209546\n", + "New Block group target value: 1.7706635622817228\n", + "*********************************************************************************\n" + ] + } + ], "source": [ - "fit_qual_0_to_5" + "sub_category_idx_0 = 0\n", + "sub_category_idx_1 = 2\n", + "\n", + "for target in range(len(bg_targets)):\n", + " total_cat = jd_households.frequency[sub_category_idx_0:sub_category_idx_1].sum()\n", + " sub_category_idx_0 += 2\n", + " sub_category_idx_1 += 2\n", + " print('Block group target value: %s'%str(bg_targets[target]))\n", + " print('Total value of the combined category is: %s'%str(total_cat))\n", + " print('Proportion of each category in the combined variables total: %s'%str(bg_targets[target] / total_cat))\n", + " print('New Block group target value: %s'%str(bg_targets[target]*(bg_targets[target] / total_cat)))\n", + " print('*********************************************************************************')" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ - "As it is explained on [A METHODOLOGY TO MATCH DISTRIBUTIONS OF BOTH HOUSEHOLD AND\n", - "PERSON ATTRIBUTES IN THE GENERATION OF SYNTHETIC POPULATIONS ](http://www.scag.ca.gov/Documents/PopulationSynthesizerPaper_TRB.pdf), the IPU algorithm starts by assuming equal weights for all households in the sample. The algorithm then proceeds by adjusting weights for each household/person constraint in an iterative process until the constraints are matched as closely as possible for both household and person attributes. " - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "fit_change = np.inf\n", - "convergence= 1e-4" + "These new values that were built based on the proportion that each block group represents in the PUMA (for every category value) will be used to update our joint distributions table: " ] }, { "cell_type": "code", - "execution_count": null, + "execution_count": 57, "metadata": {}, - "outputs": [], + "outputs": [ + { + "data": { + "text/plain": [ + "array([[ 0, 843],\n", + " [ 1, 919],\n", + " [ 2, 573],\n", + " [ 3, 431],\n", + " [ 4, 491],\n", + " [ 5, 368]])" + ] + }, + "execution_count": 57, + "metadata": {}, + "output_type": "execute_result" + } + ], "source": [ - "if fit_change > convergence:\n", - " print(\"Updating weights matrix until reaching a fit quality value under the convergence!\")" + "jd_households.values" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ - "The weights for the each household level constraint are adjusted by dividing the number of households in that category (i.e., the constraint value) by the weighted sum of the first household type column: \n", - "\n", - "The `_update_weights` creates the following adjustment `adj = constraint / float((column * weights).sum())` and use it to update weights (`weights * adj`). The weights for all households of each household type will be multiplied by this ratio to satisfy the constraint. " + "...the totals we had for each `category combination`: 0; 1; 2; 3; 4 & 5 will be replaced using that proportion. This way we will define a new maximum value for every category within the block group." ] }, { "cell_type": "markdown", "metadata": {}, "source": [ - "In this sense, the `households_weights` function will finally return a:" + "Imagine we have a constraints table with categories from 0 to 2 (has no car and no children, has no car but children and has one car and no children) as we shown in the example above. We will need to:" ] }, { "cell_type": "code", - "execution_count": null, + "execution_count": 58, "metadata": {}, "outputs": [], "source": [ - "# 1. An array of corrected weights that best matches each household type \n", - "best_weights" + "current_constraints = jd_households[:3].values.copy()" ] }, { "cell_type": "code", - "execution_count": null, + "execution_count": 59, "metadata": {}, "outputs": [], "source": [ - "# 2. And a fit quality based on the proportion of each hh type that reduces the fit changes under the convergence \n", - "fit_quality" + "# 1. define new targets based on proportions\n", + "new_targets = [4.69977298524404, 2.1075697211155378, 1.7706635622817228]\n", + "\n", + "next_constraints = current_constraints.astype(float)" ] }, { "cell_type": "code", - "execution_count": null, + "execution_count": 60, "metadata": {}, "outputs": [], "source": [ - "# 3. Built in ...\n", - "iterations" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "# 4. Drawing synthetic population " - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "![draw](img/draw.png)" + "# 2. update the constraints table\n", + "fre = -1\n", + "for t in new_targets:\n", + " fre +=1 \n", + " next_constraints[fre][1] = t" ] }, { "cell_type": "code", - "execution_count": null, + "execution_count": 61, "metadata": {}, - "outputs": [], + "outputs": [ + { + "data": { + "text/plain": [ + "array([[0. , 4.69977299],\n", + " [1. , 2.10756972],\n", + " [2. , 1.77066356]])" + ] + }, + "execution_count": 61, + "metadata": {}, + "output_type": "execute_result" + } + ], "source": [ - "from synthpop import draw" + "next_constraints" ] }, { - "cell_type": "code", - "execution_count": null, + "cell_type": "markdown", "metadata": {}, - "outputs": [], "source": [ - "num_households = int(hh_marginals_tract_400_block_gp_1.groupby(level=0).sum().mean())" + "These values will be updated inside an iterative process, where previous and next constraints will be evaluated under a maximum of `tolerance`: " ] }, { "cell_type": "code", - "execution_count": null, + "execution_count": 62, "metadata": {}, "outputs": [], "source": [ - "num_households" + "tolerance = 1e-3" ] }, { "cell_type": "code", - "execution_count": null, + "execution_count": 63, "metadata": {}, - "outputs": [], + "outputs": [ + { + "data": { + "text/plain": [ + "2338.0" + ] + }, + "execution_count": 63, + "metadata": {}, + "output_type": "execute_result" + } + ], "source": [ - "fac = _FrequencyAndConstraints(h_freq_table, h_constraint)" + "np.abs(current_constraints, next_constraints).sum()" ] }, { "cell_type": "code", - "execution_count": null, + "execution_count": 64, "metadata": {}, - "outputs": [], + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Recalculate constraints table\n" + ] + } + ], "source": [ - "indexes = draw._draw_indexes(num_households, fac, best_weights)" + "if np.abs(current_constraints, next_constraints).sum()>tolerance:\n", + " print('Recalculate constraints table')" ] }, { - "cell_type": "code", - "execution_count": null, + "cell_type": "markdown", "metadata": {}, - "outputs": [], "source": [ - "indexes" + "This way, the iteration will continue until reaching a group of new target values under the tolerance defined before. The man result of this process will be..." ] }, { "cell_type": "code", - "execution_count": null, + "execution_count": 65, "metadata": {}, - "outputs": [], + "outputs": [ + { + "data": { + "text/plain": [ + "cat_name cat_value \n", + "hh_cars none 91\n", + " one 46\n", + " two or more 39\n", + "hh_children no 98\n", + " yes 82\n", + "Name: (02, 290, 000400, 1), dtype: int64" + ] + }, + "execution_count": 65, + "metadata": {}, + "output_type": "execute_result" + } + ], "source": [ - "synth_hh = h_pums.loc[indexes].reset_index(drop=True)" + "# ... our marginals table with total values for the block group\n", + "hh_marginals_tract_400_block_gp_1" ] }, { "cell_type": "code", - "execution_count": null, + "execution_count": 66, "metadata": {}, - "outputs": [], + "outputs": [ + { + "data": { + "text/plain": [ + "hh_cars hh_children\n", + "none no 46.529753\n", + " yes 46.538428\n", + "one no 27.835805\n", + " yes 19.209650\n", + "two or more no 23.634442\n", + " yes 16.251921\n", + "dtype: float64" + ] + }, + "execution_count": 66, + "metadata": {}, + "output_type": "execute_result" + } + ], "source": [ - "synth_hh" + "# updated with new totals.\n", + "h_constraint" ] }, { "cell_type": "code", - "execution_count": null, + "execution_count": 67, "metadata": {}, - "outputs": [], - "source": [ - "mrg_tbl = pd.DataFrame(\n", - " {'serialno': synth_hh.serialno.values,\n", - " 'hh_id': synth_hh.index.values})" - ] - }, + "outputs": [ + { + "data": { + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
cat_idfrequency
hh_carshh_children
noneno0843
yes1919
oneno2573
yes3431
two or moreno4491
yes5368
\n", + "
" + ], + "text/plain": [ + " cat_id frequency\n", + "hh_cars hh_children \n", + "none no 0 843\n", + " yes 1 919\n", + "one no 2 573\n", + " yes 3 431\n", + "two or more no 4 491\n", + " yes 5 368" + ] + }, + "execution_count": 67, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "# that will be used to update joint distributions frequencies\n", + "jd_households" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "Consuming the Census API we built a dataset with, for example, 91 aggregated cases for households with no car. Building the constraints table, we used PUMA geographies to determine the proportion that these 91 cases represents in a more dissaggregated level. With this estimation we determine that instead of 91, the block group 1 of the census tract 00400 from the Yukon-Koyukuk county in the Alaska state has 46.5 households with no car. " + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "### 3.2. The iterative proportional updating (IPU) procedure " + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "![ipu](img/ipf_ipu.png)" + ] + }, + { + "cell_type": "code", + "execution_count": 68, + "metadata": {}, + "outputs": [], + "source": [ + "from synthpop.ipu.ipu import household_weights" + ] + }, + { + "cell_type": "code", + "execution_count": 69, + "metadata": {}, + "outputs": [], + "source": [ + "h_constraint.index = jd_households.cat_id" + ] + }, + { + "cell_type": "code", + "execution_count": 70, + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "cat_id\n", + "0 46.529753\n", + "1 46.538428\n", + "2 27.835805\n", + "3 19.209650\n", + "4 23.634442\n", + "5 16.251921\n", + "dtype: float64" + ] + }, + "execution_count": 70, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "h_constraint" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "Besides the `categories dataframe` - our joint distributions table- we mentioned in `2.3`, there is a `sample dataframe` which is the public user microdata sample - our pums." + ] + }, + { + "cell_type": "code", + "execution_count": 71, + "metadata": {}, + "outputs": [], + "source": [ + "households_sample_df = h_pums.copy()" + ] + }, + { + "cell_type": "code", + "execution_count": 72, + "metadata": {}, + "outputs": [ + { + "data": { + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
serialnoRTpuma00puma10NPTYPEBLDTENVEHHINCPMVR18R65hh_carshh_childrencat_id
02012000000643H-9400212.03.00.033360.01.00.00.0noneno0
22012000001889H-9400412.02.00.089420.07.00.02.0noneno0
32012000003083H-9400112.02.00.020800.07.00.00.0noneno0
152012000017291H-9400412.02.00.054800.05.00.00.0noneno0
312012000041419H-9400212.02.00.033300.07.00.01.0noneno0
\n", + "
" + ], + "text/plain": [ + " serialno RT puma00 puma10 NP TYPE BLD TEN VEH HINCP MV \\\n", + "0 2012000000643 H -9 400 2 1 2.0 3.0 0.0 33360.0 1.0 \n", + "2 2012000001889 H -9 400 4 1 2.0 2.0 0.0 89420.0 7.0 \n", + "3 2012000003083 H -9 400 1 1 2.0 2.0 0.0 20800.0 7.0 \n", + "15 2012000017291 H -9 400 4 1 2.0 2.0 0.0 54800.0 5.0 \n", + "31 2012000041419 H -9 400 2 1 2.0 2.0 0.0 33300.0 7.0 \n", + "\n", + " R18 R65 hh_cars hh_children cat_id \n", + "0 0.0 0.0 none no 0 \n", + "2 0.0 2.0 none no 0 \n", + "3 0.0 0.0 none no 0 \n", + "15 0.0 0.0 none no 0 \n", + "31 0.0 1.0 none no 0 " + ] + }, + "execution_count": 72, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "households_sample_df.head()" + ] + }, + { + "cell_type": "code", + "execution_count": 73, + "metadata": {}, + "outputs": [], + "source": [ + "households_sample_df.index.name = \"hh_id\"" + ] + }, + { + "cell_type": "code", + "execution_count": 74, + "metadata": {}, + "outputs": [], + "source": [ + "households_sample_df = households_sample_df.reset_index().set_index(\"serialno\")" + ] + }, + { + "cell_type": "code", + "execution_count": 75, + "metadata": {}, + "outputs": [], + "source": [ + "h_freq_table = cat._frequency_table(households_sample_df, jd_households.cat_id)" + ] + }, + { + "cell_type": "code", + "execution_count": 76, + "metadata": {}, + "outputs": [], + "source": [ + "h_freq_table = h_freq_table.sort_index(axis=1)" + ] + }, + { + "cell_type": "code", + "execution_count": 77, + "metadata": {}, + "outputs": [ + { + "data": { + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
cat_id012345
hh_id
01.00.00.00.00.00.0
21.00.00.00.00.00.0
31.00.00.00.00.00.0
40.01.00.00.00.00.0
50.00.00.01.00.00.0
\n", + "
" + ], + "text/plain": [ + "cat_id 0 1 2 3 4 5\n", + "hh_id \n", + "0 1.0 0.0 0.0 0.0 0.0 0.0\n", + "2 1.0 0.0 0.0 0.0 0.0 0.0\n", + "3 1.0 0.0 0.0 0.0 0.0 0.0\n", + "4 0.0 1.0 0.0 0.0 0.0 0.0\n", + "5 0.0 0.0 0.0 1.0 0.0 0.0" + ] + }, + "execution_count": 77, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "h_freq_table.head()" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "Togheter, frequency tables and constraints will be used for weights matrix and fit quality:" + ] + }, + { + "cell_type": "code", + "execution_count": 78, + "metadata": {}, + "outputs": [], + "source": [ + "best_weights, fit_quality, iterations = household_weights(h_freq_table,\n", + " None,\n", + " h_constraint,\n", + " None)" + ] + }, + { + "cell_type": "code", + "execution_count": 79, + "metadata": {}, + "outputs": [], + "source": [ + "from synthpop.ipu.ipu import _FrequencyAndConstraints" + ] + }, + { + "cell_type": "code", + "execution_count": 80, + "metadata": {}, + "outputs": [], + "source": [ + "freq_wrap = _FrequencyAndConstraints(h_freq_table, h_constraint)" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "This wrapper returns every `cat_id` column with non zero values:\n", + "\n", + "* `0` represents households with no car and no children \n", + "* `1` represents households with no car and children \n", + "* `2` represents households with one car and no children\n", + "* `3` represents households with one car and children\n", + "* `4` represents households with two or more cars and no children\n", + "* `5` represents households with two or more cars and children\n", + "\n", + "The wrapper returns each variables combination (from 0 to 5) inside a tuple with:\n", + "\n", + "1) `cat_id` value\n", + "\n", + "2) a `weights matrix`\n", + "\n", + "3) the new target value (the `constraint` or maximum value that the combination can reach within the block group) we built in the `ipf`.\n", + "\n", + "4) the index of non zero column values " + ] + }, + { + "cell_type": "code", + "execution_count": 81, + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "[(0,\n", + " array([1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1.,\n", + " 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1.,\n", + " 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1.,\n", + " 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1.,\n", + " 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1.,\n", + " 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1.,\n", + " 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1.,\n", + " 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1.,\n", + " 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1.,\n", + " 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1.,\n", + " 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1.,\n", + " 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1.,\n", + " 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1.,\n", + " 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1.,\n", + " 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1.,\n", + " 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1.,\n", + " 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1.,\n", + " 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1.,\n", + " 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1.,\n", + " 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1.,\n", + " 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1.,\n", + " 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1.,\n", + " 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1.,\n", + " 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1.,\n", + " 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1.,\n", + " 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1.,\n", + " 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1.,\n", + " 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1.,\n", + " 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1.,\n", + " 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1.,\n", + " 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1.,\n", + " 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1.,\n", + " 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1.,\n", + " 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1.,\n", + " 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1.,\n", + " 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1.,\n", + " 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1.,\n", + " 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1.,\n", + " 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1.,\n", + " 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1.,\n", + " 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1.,\n", + " 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1.,\n", + " 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1.,\n", + " 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1.,\n", + " 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1.,\n", + " 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1.,\n", + " 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1.,\n", + " 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1.,\n", + " 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1.,\n", + " 1., 1., 1., 1., 1., 1., 1., 1., 1., 1.]),\n", + " 46.529753144154704,\n", + " array([ 0, 1, 2, 7, 16, 18, 20, 27, 32, 34, 38,\n", + " 41, 47, 58, 60, 74, 84, 86, 87, 88, 90, 95,\n", + " 97, 99, 106, 107, 108, 115, 116, 118, 121, 122, 130,\n", + " 131, 135, 138, 140, 159, 163, 165, 169, 175, 182, 190,\n", + " 192, 205, 207, 211, 212, 214, 215, 221, 223, 226, 227,\n", + " 228, 229, 232, 235, 240, 242, 256, 261, 268, 270, 279,\n", + " 280, 281, 282, 283, 285, 287, 291, 295, 297, 298, 299,\n", + " 300, 308, 309, 310, 311, 315, 320, 324, 331, 332, 336,\n", + " 338, 345, 356, 361, 363, 366, 367, 368, 372, 380, 383,\n", + " 388, 393, 400, 403, 406, 408, 418, 419, 430, 432, 433,\n", + " 434, 449, 450, 465, 466, 471, 472, 473, 483, 486, 499,\n", + " 504, 513, 514, 519, 537, 538, 540, 545, 546, 548, 552,\n", + " 553, 559, 565, 573, 579, 580, 583, 585, 589, 590, 597,\n", + " 598, 609, 613, 622, 624, 629, 632, 635, 636, 637, 638,\n", + " 641, 642, 646, 649, 650, 660, 661, 669, 672, 673, 684,\n", + " 687, 692, 697, 698, 706, 713, 716, 725, 727, 737, 738,\n", + " 739, 741, 747, 751, 754, 761, 762, 768, 797, 804, 805,\n", + " 809, 814, 815, 820, 822, 824, 825, 826, 837, 839, 840,\n", + " 842, 847, 855, 863, 866, 868, 872, 876, 887, 891, 901,\n", + " 905, 911, 916, 917, 923, 937, 939, 942, 945, 946, 952,\n", + " 960, 967, 969, 970, 973, 974, 975, 978, 984, 989, 990,\n", + " 993, 997, 999, 1000, 1005, 1006, 1009, 1011, 1015, 1021, 1022,\n", + " 1024, 1031, 1037, 1040, 1043, 1044, 1057, 1059, 1062, 1063, 1071,\n", + " 1072, 1073, 1079, 1083, 1090, 1093, 1094, 1096, 1097, 1101, 1106,\n", + " 1107, 1108, 1114, 1118, 1124, 1130, 1137, 1138, 1140, 1143, 1144,\n", + " 1161, 1171, 1175, 1178, 1180, 1186, 1187, 1188, 1192, 1196, 1197,\n", + " 1204, 1226, 1247, 1252, 1253, 1254, 1256, 1259, 1261, 1263, 1267,\n", + " 1271, 1279, 1285, 1292, 1295, 1302, 1305, 1308, 1317, 1319, 1321,\n", + " 1324, 1331, 1333, 1335, 1354, 1355, 1357, 1358, 1361, 1362, 1366,\n", + " 1367, 1373, 1382, 1395, 1396, 1397, 1406, 1407, 1408, 1409, 1415,\n", + " 1421, 1425, 1426, 1427, 1429, 1434, 1444, 1445, 1452, 1456, 1460,\n", + " 1461, 1466, 1474, 1480, 1481, 1482, 1490, 1491, 1493, 1501, 1509,\n", + " 1521, 1528, 1545, 1552, 1566, 1567, 1570, 1576, 1577, 1580, 1582,\n", + " 1594, 1603, 1605, 1618, 1619, 1627, 1628, 1643, 1645, 1650, 1653,\n", + " 1654, 1655, 1657, 1660, 1661, 1662, 1664, 1674, 1679, 1688, 1691,\n", + " 1692, 1701, 1704, 1705, 1709, 1710, 1711, 1712, 1713, 1715, 1729,\n", + " 1730, 1731, 1738, 1739, 1741, 1746, 1751, 1766, 1770, 1779, 1783,\n", + " 1785, 1787, 1800, 1802, 1811, 1816, 1820, 1821, 1824, 1829, 1833,\n", + " 1843, 1848, 1850, 1851, 1856, 1861, 1865, 1866, 1867, 1870, 1874,\n", + " 1875, 1878, 1883, 1887, 1890, 1891, 1893, 1897, 1903, 1904, 1905,\n", + " 1907, 1917, 1918, 1920, 1923, 1924, 1932, 1934, 1937, 1947, 1953,\n", + " 1957, 1958, 1962, 1965, 1971, 1972, 1975, 1976, 1979, 1980, 1998,\n", + " 2000, 2003, 2008, 2013, 2021, 2025, 2028, 2032, 2034, 2036, 2041,\n", + " 2048, 2049, 2056, 2061, 2064, 2068, 2070, 2076, 2080, 2086, 2090,\n", + " 2093, 2096, 2102, 2112, 2120, 2124, 2131, 2132, 2133, 2138, 2144,\n", + " 2146, 2159, 2165, 2167, 2173, 2174, 2183, 2184, 2189, 2198, 2200,\n", + " 2203, 2204, 2210, 2213, 2219, 2220, 2228, 2233, 2234, 2240, 2241,\n", + " 2251, 2252, 2255, 2260, 2264, 2265, 2267, 2273, 2280, 2283, 2284,\n", + " 2285, 2286, 2289, 2291, 2293, 2294, 2295, 2297, 2309, 2313, 2314,\n", + " 2317, 2321, 2331, 2335, 2344, 2350, 2352, 2364, 2369, 2370, 2371,\n", + " 2380, 2393, 2396, 2402, 2403, 2404, 2411, 2412, 2415, 2423, 2435,\n", + " 2445, 2447, 2450, 2453, 2466, 2469, 2476, 2481, 2491, 2495, 2507,\n", + " 2513, 2514, 2518, 2519, 2527, 2530, 2532, 2535, 2536, 2539, 2542,\n", + " 2545, 2546, 2547, 2559, 2562, 2565, 2577, 2579, 2584, 2588, 2593,\n", + " 2594, 2596, 2598, 2603, 2604, 2610, 2612, 2615, 2621, 2622, 2623,\n", + " 2631, 2638, 2642, 2646, 2647, 2654, 2659, 2660, 2661, 2662, 2667,\n", + " 2671, 2677, 2684, 2686, 2688, 2689, 2692, 2694, 2701, 2702, 2703,\n", + " 2705, 2706, 2712, 2714, 2715, 2717, 2719, 2724, 2728, 2729, 2735,\n", + " 2739, 2742, 2744, 2748, 2750, 2755, 2757, 2768, 2770, 2773, 2774,\n", + " 2776, 2777, 2778, 2779, 2780, 2785, 2790, 2791, 2792, 2802, 2803,\n", + " 2806, 2812, 2818, 2824, 2828, 2829, 2831, 2848, 2849, 2857, 2858,\n", + " 2861, 2864, 2872, 2874, 2877, 2882, 2884, 2885, 2890, 2898, 2900,\n", + " 2902, 2906, 2911, 2913, 2922, 2926, 2934, 2935, 2943, 2953, 2955,\n", + " 2964, 2967, 2969, 2972, 2973, 2981, 2983, 2984, 2985, 2988, 2990,\n", + " 2996, 2997, 3002, 3009, 3010, 3012, 3024, 3025, 3030, 3035, 3036,\n", + " 3043, 3045, 3047, 3052, 3053, 3055, 3062, 3069, 3072, 3077, 3079,\n", + " 3080, 3084, 3089, 3095, 3098, 3116, 3118, 3124, 3128, 3138, 3145,\n", + " 3148, 3166, 3168, 3173, 3174, 3178, 3185, 3193, 3205, 3208, 3214,\n", + " 3218, 3232, 3237, 3240, 3247, 3256, 3259, 3262, 3263, 3266, 3267,\n", + " 3270, 3273, 3287, 3294, 3296, 3301, 3302, 3311, 3316, 3320, 3336,\n", + " 3341, 3345, 3353, 3356, 3357, 3363, 3365, 3366, 3367, 3369, 3371,\n", + " 3376, 3377, 3380, 3390, 3391, 3393, 3397, 3399, 3401, 3404, 3406,\n", + " 3410, 3415, 3419, 3421, 3424, 3429, 3431, 3432, 3443, 3444, 3457,\n", + " 3458, 3459, 3462, 3466, 3468, 3470, 3471, 3472, 3473, 3478, 3488,\n", + " 3490, 3494, 3497, 3499, 3503, 3505, 3520, 3532, 3544, 3546, 3547,\n", + " 3548, 3550, 3551, 3556, 3557, 3562, 3567, 3570, 3576, 3578, 3581,\n", + " 3582, 3596, 3611, 3616, 3617, 3618, 3623])),\n", + " (1,\n", + " array([1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1.,\n", + " 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1.,\n", + " 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1.,\n", + " 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1.,\n", + " 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1.,\n", + " 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1.,\n", + " 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1.,\n", + " 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1.,\n", + " 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1.,\n", + " 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1.,\n", + " 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1.,\n", + " 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1.,\n", + " 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1.,\n", + " 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1.,\n", + " 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1.,\n", + " 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1.,\n", + " 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1.,\n", + " 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1.,\n", + " 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1.,\n", + " 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1.,\n", + " 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1.,\n", + " 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1.,\n", + " 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1.,\n", + " 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1.,\n", + " 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1.,\n", + " 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1.,\n", + " 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1.,\n", + " 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1.,\n", + " 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1.,\n", + " 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1.,\n", + " 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1.,\n", + " 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1.,\n", + " 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1.,\n", + " 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1.,\n", + " 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1.,\n", + " 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1.,\n", + " 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1.,\n", + " 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1.,\n", + " 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1.,\n", + " 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1.,\n", + " 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1.,\n", + " 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1.,\n", + " 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1.,\n", + " 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1.,\n", + " 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1.,\n", + " 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1.,\n", + " 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1.,\n", + " 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1.,\n", + " 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1.,\n", + " 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1.,\n", + " 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1.,\n", + " 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1.,\n", + " 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1.,\n", + " 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1.,\n", + " 1.]),\n", + " 46.538428452649654,\n", + " array([ 3, 6, 8, 9, 21, 28, 29, 31, 33, 43, 48,\n", + " 52, 53, 55, 56, 57, 64, 69, 70, 73, 80, 83,\n", + " 93, 96, 101, 102, 103, 111, 114, 117, 120, 126, 127,\n", + " 128, 133, 134, 136, 137, 145, 148, 157, 160, 161, 164,\n", + " 170, 172, 173, 177, 183, 188, 189, 193, 198, 199, 201,\n", + " 202, 204, 206, 216, 230, 241, 247, 249, 255, 260, 265,\n", + " 267, 275, 276, 286, 289, 292, 296, 302, 304, 313, 318,\n", + " 323, 327, 334, 335, 340, 343, 351, 370, 373, 374, 381,\n", + " 386, 392, 398, 399, 404, 405, 407, 410, 412, 415, 420,\n", + " 424, 435, 437, 442, 443, 446, 451, 452, 454, 455, 460,\n", + " 462, 470, 474, 476, 479, 484, 485, 489, 490, 497, 502,\n", + " 503, 505, 508, 512, 516, 522, 530, 531, 532, 534, 539,\n", + " 542, 554, 563, 564, 581, 584, 586, 592, 595, 599, 600,\n", + " 604, 614, 619, 621, 630, 631, 633, 647, 648, 651, 653,\n", + " 654, 657, 659, 675, 676, 678, 685, 690, 695, 701, 702,\n", + " 704, 705, 707, 708, 709, 712, 714, 715, 720, 721, 728,\n", + " 731, 735, 736, 742, 743, 744, 746, 750, 756, 757, 758,\n", + " 759, 765, 773, 778, 779, 783, 786, 787, 791, 802, 806,\n", + " 808, 813, 817, 819, 821, 823, 829, 831, 833, 834, 838,\n", + " 846, 851, 853, 857, 859, 861, 867, 873, 878, 883, 888,\n", + " 890, 893, 894, 899, 907, 908, 910, 918, 922, 924, 925,\n", + " 927, 929, 930, 931, 932, 934, 941, 947, 950, 955, 965,\n", + " 968, 971, 976, 982, 995, 998, 1001, 1016, 1017, 1018, 1026,\n", + " 1030, 1032, 1035, 1036, 1046, 1060, 1066, 1067, 1070, 1075, 1077,\n", + " 1082, 1100, 1104, 1110, 1112, 1115, 1117, 1120, 1129, 1131, 1132,\n", + " 1133, 1135, 1139, 1147, 1148, 1151, 1152, 1157, 1160, 1165, 1169,\n", + " 1183, 1189, 1194, 1202, 1210, 1213, 1214, 1220, 1224, 1227, 1233,\n", + " 1238, 1243, 1244, 1249, 1258, 1275, 1277, 1280, 1282, 1283, 1284,\n", + " 1287, 1289, 1293, 1301, 1307, 1310, 1313, 1315, 1320, 1329, 1330,\n", + " 1337, 1338, 1340, 1344, 1346, 1352, 1356, 1364, 1365, 1372, 1379,\n", + " 1384, 1394, 1402, 1405, 1412, 1414, 1418, 1419, 1424, 1428, 1431,\n", + " 1436, 1440, 1446, 1453, 1454, 1464, 1468, 1469, 1471, 1473, 1475,\n", + " 1478, 1486, 1487, 1496, 1497, 1498, 1500, 1503, 1507, 1511, 1512,\n", + " 1513, 1518, 1520, 1524, 1531, 1532, 1535, 1539, 1540, 1542, 1543,\n", + " 1553, 1557, 1560, 1563, 1568, 1578, 1581, 1584, 1585, 1587, 1589,\n", + " 1590, 1591, 1595, 1599, 1601, 1604, 1606, 1610, 1614, 1615, 1616,\n", + " 1620, 1631, 1640, 1648, 1656, 1658, 1663, 1665, 1668, 1671, 1680,\n", + " 1685, 1687, 1690, 1702, 1706, 1707, 1725, 1726, 1728, 1733, 1734,\n", + " 1736, 1740, 1750, 1754, 1755, 1760, 1762, 1769, 1773, 1775, 1778,\n", + " 1780, 1781, 1782, 1784, 1788, 1789, 1791, 1794, 1797, 1798, 1806,\n", + " 1810, 1815, 1818, 1822, 1827, 1831, 1832, 1834, 1836, 1838, 1855,\n", + " 1857, 1858, 1859, 1860, 1862, 1863, 1864, 1869, 1876, 1880, 1886,\n", + " 1888, 1894, 1899, 1901, 1909, 1913, 1915, 1919, 1921, 1925, 1926,\n", + " 1929, 1936, 1938, 1940, 1941, 1943, 1952, 1955, 1956, 1960, 1963,\n", + " 1966, 1970, 1973, 1981, 1984, 1991, 1999, 2001, 2002, 2004, 2006,\n", + " 2009, 2012, 2015, 2016, 2017, 2022, 2027, 2031, 2035, 2038, 2039,\n", + " 2047, 2051, 2055, 2063, 2065, 2067, 2071, 2077, 2079, 2083, 2084,\n", + " 2088, 2092, 2104, 2114, 2115, 2117, 2118, 2126, 2134, 2139, 2140,\n", + " 2142, 2145, 2147, 2148, 2155, 2156, 2160, 2161, 2168, 2169, 2175,\n", + " 2177, 2178, 2186, 2188, 2190, 2191, 2199, 2201, 2221, 2224, 2226,\n", + " 2229, 2230, 2232, 2239, 2243, 2246, 2247, 2248, 2249, 2250, 2254,\n", + " 2259, 2266, 2269, 2270, 2274, 2275, 2282, 2292, 2300, 2301, 2302,\n", + " 2304, 2305, 2306, 2311, 2312, 2320, 2323, 2327, 2332, 2342, 2346,\n", + " 2354, 2357, 2358, 2360, 2362, 2363, 2366, 2367, 2368, 2373, 2374,\n", + " 2375, 2376, 2377, 2379, 2382, 2385, 2388, 2392, 2397, 2399, 2407,\n", + " 2409, 2417, 2418, 2419, 2420, 2421, 2424, 2430, 2432, 2439, 2441,\n", + " 2443, 2449, 2451, 2454, 2465, 2474, 2478, 2498, 2499, 2501, 2502,\n", + " 2503, 2506, 2521, 2528, 2529, 2533, 2537, 2543, 2551, 2553, 2554,\n", + " 2556, 2564, 2569, 2572, 2573, 2576, 2578, 2580, 2585, 2587, 2600,\n", + " 2602, 2607, 2608, 2609, 2619, 2620, 2624, 2625, 2626, 2628, 2639,\n", + " 2640, 2648, 2651, 2658, 2664, 2673, 2679, 2680, 2697, 2698, 2708,\n", + " 2710, 2713, 2718, 2723, 2730, 2731, 2732, 2733, 2736, 2738, 2740,\n", + " 2741, 2743, 2746, 2749, 2751, 2752, 2756, 2761, 2766, 2767, 2782,\n", + " 2786, 2788, 2795, 2798, 2799, 2804, 2807, 2809, 2813, 2816, 2817,\n", + " 2821, 2822, 2826, 2836, 2841, 2850, 2854, 2859, 2865, 2867, 2869,\n", + " 2871, 2873, 2879, 2889, 2892, 2895, 2896, 2903, 2907, 2908, 2914,\n", + " 2917, 2923, 2927, 2929, 2932, 2936, 2941, 2946, 2948, 2949, 2957,\n", + " 2959, 2962, 2965, 2970, 2975, 2976, 2978, 2979, 2989, 2995, 2998,\n", + " 3001, 3003, 3005, 3006, 3008, 3014, 3017, 3018, 3021, 3023, 3028,\n", + " 3029, 3031, 3032, 3033, 3034, 3037, 3042, 3046, 3048, 3051, 3057,\n", + " 3063, 3066, 3074, 3081, 3083, 3085, 3103, 3106, 3113, 3114, 3121,\n", + " 3122, 3125, 3129, 3132, 3134, 3137, 3139, 3146, 3147, 3153, 3158,\n", + " 3160, 3169, 3171, 3172, 3177, 3179, 3188, 3189, 3190, 3192, 3199,\n", + " 3201, 3209, 3220, 3226, 3233, 3238, 3239, 3241, 3242, 3245, 3246,\n", + " 3249, 3250, 3251, 3252, 3253, 3254, 3260, 3261, 3264, 3265, 3268,\n", + " 3275, 3282, 3288, 3290, 3292, 3298, 3303, 3312, 3315, 3319, 3326,\n", + " 3328, 3330, 3339, 3347, 3351, 3360, 3361, 3373, 3375, 3379, 3381,\n", + " 3382, 3386, 3392, 3396, 3398, 3412, 3413, 3418, 3423, 3425, 3426,\n", + " 3430, 3433, 3434, 3436, 3442, 3447, 3452, 3456, 3461, 3465, 3474,\n", + " 3477, 3480, 3481, 3482, 3483, 3489, 3496, 3501, 3502, 3504, 3506,\n", + " 3510, 3514, 3517, 3518, 3519, 3524, 3527, 3530, 3533, 3534, 3536,\n", + " 3537, 3538, 3540, 3541, 3545, 3549, 3552, 3553, 3559, 3563, 3565,\n", + " 3566, 3572, 3575, 3577, 3579, 3580, 3583, 3586, 3587, 3597, 3598,\n", + " 3599, 3600, 3602, 3605, 3609, 3621])),\n", + " (2,\n", + " array([1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1.,\n", + " 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1.,\n", + " 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1.,\n", + " 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1.,\n", + " 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1.,\n", + " 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1.,\n", + " 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1.,\n", + " 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1.,\n", + " 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1.,\n", + " 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1.,\n", + " 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1.,\n", + " 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1.,\n", + " 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1.,\n", + " 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1.,\n", + " 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1.,\n", + " 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1.,\n", + " 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1.,\n", + " 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1.,\n", + " 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1.,\n", + " 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1.,\n", + " 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1.,\n", + " 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1.,\n", + " 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1.,\n", + " 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1.,\n", + " 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1.,\n", + " 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1.,\n", + " 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1.,\n", + " 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1.,\n", + " 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1.,\n", + " 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1.,\n", + " 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1.,\n", + " 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1.,\n", + " 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1.,\n", + " 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1.]),\n", + " 27.835804541797003,\n", + " array([ 10, 11, 19, 22, 25, 26, 42, 44, 46, 71, 72,\n", + " 77, 85, 89, 94, 109, 110, 113, 123, 124, 144, 146,\n", + " 149, 151, 171, 174, 179, 184, 187, 194, 196, 197, 213,\n", + " 218, 224, 234, 243, 246, 248, 252, 253, 266, 271, 278,\n", + " 303, 307, 317, 321, 325, 333, 339, 341, 342, 357, 359,\n", + " 369, 376, 382, 395, 397, 409, 414, 416, 417, 421, 423,\n", + " 426, 436, 438, 445, 456, 457, 458, 467, 469, 480, 487,\n", + " 507, 510, 524, 527, 529, 535, 536, 547, 550, 551, 560,\n", + " 562, 566, 568, 569, 570, 571, 572, 575, 576, 582, 591,\n", + " 596, 602, 603, 605, 610, 617, 626, 644, 652, 658, 665,\n", + " 668, 670, 677, 680, 686, 689, 693, 700, 717, 726, 730,\n", + " 732, 740, 749, 753, 763, 766, 767, 772, 774, 776, 780,\n", + " 781, 785, 789, 792, 795, 810, 844, 848, 854, 856, 865,\n", + " 871, 874, 877, 881, 895, 896, 902, 903, 904, 906, 912,\n", + " 914, 915, 928, 935, 948, 957, 959, 963, 981, 986, 994,\n", + " 1014, 1020, 1025, 1045, 1055, 1065, 1099, 1123, 1134, 1136, 1141,\n", + " 1142, 1154, 1158, 1159, 1168, 1172, 1173, 1174, 1177, 1184, 1185,\n", + " 1191, 1193, 1199, 1200, 1201, 1206, 1222, 1223, 1229, 1230, 1234,\n", + " 1242, 1246, 1255, 1260, 1272, 1273, 1278, 1288, 1290, 1291, 1296,\n", + " 1316, 1325, 1326, 1334, 1336, 1343, 1345, 1348, 1350, 1353, 1359,\n", + " 1368, 1369, 1374, 1375, 1381, 1385, 1391, 1393, 1410, 1411, 1432,\n", + " 1451, 1462, 1483, 1516, 1517, 1519, 1525, 1527, 1529, 1530, 1541,\n", + " 1546, 1548, 1556, 1559, 1565, 1573, 1583, 1597, 1600, 1609, 1617,\n", + " 1621, 1624, 1625, 1629, 1633, 1634, 1642, 1646, 1647, 1649, 1651,\n", + " 1659, 1667, 1672, 1677, 1689, 1699, 1716, 1717, 1718, 1720, 1742,\n", + " 1748, 1757, 1764, 1767, 1772, 1777, 1786, 1792, 1804, 1809, 1812,\n", + " 1813, 1825, 1826, 1828, 1842, 1854, 1873, 1882, 1884, 1900, 1902,\n", + " 1906, 1911, 1935, 1942, 1948, 1951, 1954, 1964, 1968, 1988, 1993,\n", + " 1994, 1995, 1997, 2010, 2024, 2026, 2029, 2037, 2046, 2059, 2060,\n", + " 2072, 2075, 2078, 2082, 2098, 2099, 2107, 2109, 2111, 2119, 2121,\n", + " 2128, 2136, 2152, 2157, 2170, 2171, 2172, 2176, 2192, 2194, 2205,\n", + " 2215, 2217, 2218, 2222, 2223, 2225, 2236, 2242, 2261, 2276, 2277,\n", + " 2279, 2281, 2303, 2307, 2308, 2315, 2322, 2328, 2348, 2353, 2361,\n", + " 2365, 2378, 2383, 2386, 2389, 2390, 2391, 2398, 2406, 2408, 2410,\n", + " 2434, 2436, 2437, 2442, 2444, 2457, 2458, 2460, 2461, 2463, 2467,\n", + " 2480, 2486, 2493, 2497, 2505, 2508, 2510, 2522, 2524, 2526, 2534,\n", + " 2544, 2548, 2549, 2552, 2557, 2561, 2582, 2586, 2590, 2592, 2595,\n", + " 2601, 2606, 2632, 2650, 2652, 2655, 2663, 2665, 2666, 2668, 2669,\n", + " 2672, 2674, 2681, 2687, 2690, 2691, 2693, 2699, 2711, 2720, 2722,\n", + " 2727, 2734, 2745, 2747, 2758, 2759, 2769, 2771, 2772, 2775, 2781,\n", + " 2783, 2787, 2789, 2796, 2810, 2811, 2820, 2827, 2833, 2839, 2840,\n", + " 2842, 2843, 2847, 2853, 2860, 2875, 2876, 2881, 2888, 2894, 2901,\n", + " 2904, 2916, 2919, 2928, 2950, 2954, 2960, 2963, 2966, 2986, 2987,\n", + " 2992, 2994, 2999, 3000, 3007, 3039, 3059, 3061, 3064, 3067, 3073,\n", + " 3078, 3082, 3087, 3088, 3090, 3099, 3104, 3112, 3126, 3127, 3131,\n", + " 3135, 3144, 3149, 3150, 3154, 3155, 3157, 3162, 3170, 3175, 3180,\n", + " 3181, 3182, 3183, 3184, 3187, 3197, 3204, 3210, 3216, 3221, 3225,\n", + " 3227, 3230, 3235, 3236, 3243, 3244, 3271, 3272, 3276, 3278, 3283,\n", + " 3289, 3293, 3304, 3305, 3306, 3307, 3318, 3324, 3329, 3346, 3348,\n", + " 3350, 3352, 3358, 3362, 3383, 3385, 3388, 3405, 3408, 3422, 3437,\n", + " 3439, 3446, 3448, 3450, 3486, 3507, 3512, 3513, 3516, 3522, 3523,\n", + " 3554, 3560, 3569, 3573, 3574, 3584, 3593, 3595, 3601, 3606, 3613,\n", + " 3614])),\n", + " (3,\n", + " array([1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1.,\n", + " 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1.,\n", + " 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1.,\n", + " 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1.,\n", + " 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1.,\n", + " 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1.,\n", + " 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1.,\n", + " 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1.,\n", + " 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1.,\n", + " 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1.,\n", + " 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1.,\n", + " 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1.,\n", + " 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1.,\n", + " 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1.,\n", + " 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1.,\n", + " 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1.,\n", + " 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1.,\n", + " 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1.,\n", + " 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1.,\n", + " 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1.,\n", + " 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1.,\n", + " 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1.,\n", + " 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1.,\n", + " 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1.,\n", + " 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1.,\n", + " 1., 1., 1., 1., 1., 1.]),\n", + " 19.20965012246301,\n", + " array([ 4, 5, 13, 30, 54, 59, 61, 76, 78, 82, 98,\n", + " 104, 112, 129, 132, 142, 143, 152, 155, 156, 167, 168,\n", + " 180, 181, 185, 203, 209, 222, 225, 239, 245, 251, 258,\n", + " 262, 269, 272, 284, 290, 305, 312, 316, 337, 347, 349,\n", + " 350, 352, 354, 362, 377, 385, 394, 413, 422, 425, 428,\n", + " 429, 439, 440, 441, 447, 448, 453, 459, 463, 464, 488,\n", + " 492, 496, 500, 509, 520, 521, 523, 525, 526, 533, 549,\n", + " 557, 558, 567, 574, 587, 593, 606, 607, 625, 627, 634,\n", + " 655, 663, 667, 674, 679, 683, 696, 718, 719, 729, 748,\n", + " 760, 769, 796, 800, 801, 807, 816, 818, 832, 835, 850,\n", + " 858, 862, 870, 875, 880, 884, 886, 889, 892, 897, 920,\n", + " 940, 944, 954, 958, 979, 985, 988, 991, 1002, 1010, 1027,\n", + " 1028, 1033, 1039, 1041, 1050, 1052, 1053, 1084, 1085, 1098, 1103,\n", + " 1109, 1116, 1122, 1146, 1156, 1163, 1166, 1170, 1190, 1195, 1203,\n", + " 1209, 1215, 1216, 1221, 1237, 1248, 1251, 1265, 1266, 1268, 1274,\n", + " 1298, 1304, 1318, 1322, 1347, 1349, 1377, 1390, 1392, 1404, 1417,\n", + " 1420, 1430, 1433, 1438, 1441, 1443, 1448, 1449, 1457, 1458, 1459,\n", + " 1465, 1476, 1485, 1488, 1502, 1510, 1515, 1523, 1533, 1536, 1538,\n", + " 1558, 1571, 1574, 1579, 1586, 1592, 1593, 1602, 1607, 1623, 1632,\n", + " 1636, 1637, 1638, 1639, 1652, 1670, 1673, 1682, 1683, 1684, 1698,\n", + " 1719, 1735, 1744, 1745, 1752, 1756, 1763, 1768, 1776, 1790, 1795,\n", + " 1796, 1807, 1823, 1830, 1837, 1839, 1840, 1841, 1852, 1877, 1879,\n", + " 1889, 1930, 1939, 1944, 1945, 1959, 1969, 1978, 1985, 1987, 1996,\n", + " 2007, 2014, 2018, 2030, 2043, 2044, 2066, 2081, 2095, 2101, 2105,\n", + " 2108, 2125, 2129, 2137, 2143, 2150, 2153, 2185, 2196, 2206, 2212,\n", + " 2214, 2216, 2231, 2235, 2262, 2272, 2278, 2298, 2310, 2318, 2324,\n", + " 2333, 2343, 2359, 2372, 2381, 2394, 2395, 2400, 2425, 2429, 2446,\n", + " 2462, 2470, 2471, 2472, 2475, 2477, 2484, 2485, 2487, 2488, 2489,\n", + " 2511, 2523, 2538, 2541, 2563, 2568, 2617, 2629, 2644, 2649, 2670,\n", + " 2676, 2683, 2685, 2695, 2696, 2819, 2823, 2825, 2830, 2834, 2838,\n", + " 2846, 2863, 2870, 2883, 2893, 2910, 2912, 2921, 2930, 2931, 2933,\n", + " 2939, 2940, 2942, 2945, 2958, 2971, 2980, 3013, 3015, 3016, 3019,\n", + " 3026, 3027, 3049, 3050, 3060, 3076, 3086, 3092, 3093, 3094, 3096,\n", + " 3100, 3110, 3115, 3120, 3130, 3133, 3136, 3141, 3143, 3159, 3165,\n", + " 3186, 3191, 3195, 3198, 3202, 3211, 3215, 3222, 3223, 3228, 3229,\n", + " 3234, 3248, 3257, 3258, 3280, 3281, 3286, 3291, 3314, 3317, 3325,\n", + " 3327, 3340, 3355, 3368, 3370, 3387, 3395, 3402, 3407, 3414, 3417,\n", + " 3420, 3428, 3435, 3451, 3454, 3455, 3464, 3479, 3487, 3491, 3493,\n", + " 3495, 3535, 3558, 3564, 3590, 3592, 3607, 3610, 3612, 3615, 3619,\n", + " 3620, 3624])),\n", + " (4,\n", + " array([1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1.,\n", + " 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1.,\n", + " 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1.,\n", + " 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1.,\n", + " 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1.,\n", + " 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1.,\n", + " 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1.,\n", + " 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1.,\n", + " 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1.,\n", + " 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1.,\n", + " 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1.,\n", + " 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1.,\n", + " 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1.,\n", + " 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1.,\n", + " 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1.,\n", + " 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1.,\n", + " 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1.,\n", + " 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1.,\n", + " 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1.,\n", + " 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1.,\n", + " 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1.,\n", + " 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1.,\n", + " 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1.,\n", + " 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1.,\n", + " 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1.,\n", + " 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1.,\n", + " 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1.,\n", + " 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1.,\n", + " 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1.]),\n", + " 23.6344423140483,\n", + " array([ 17, 23, 24, 36, 37, 39, 40, 50, 62, 66, 67,\n", + " 75, 81, 91, 92, 100, 119, 125, 158, 162, 166, 176,\n", + " 186, 191, 195, 200, 208, 210, 220, 233, 237, 244, 250,\n", + " 257, 259, 263, 273, 293, 294, 314, 319, 322, 330, 344,\n", + " 348, 353, 360, 387, 401, 402, 461, 468, 475, 478, 482,\n", + " 493, 494, 495, 498, 501, 506, 511, 515, 518, 528, 541,\n", + " 556, 561, 577, 578, 601, 611, 612, 615, 618, 620, 623,\n", + " 640, 645, 656, 664, 666, 681, 688, 691, 694, 699, 710,\n", + " 711, 722, 723, 724, 733, 734, 745, 755, 764, 770, 771,\n", + " 777, 790, 794, 799, 803, 812, 830, 836, 841, 843, 849,\n", + " 852, 860, 879, 882, 885, 900, 909, 919, 926, 933, 936,\n", + " 943, 951, 956, 961, 964, 980, 983, 992, 1007, 1012, 1013,\n", + " 1029, 1034, 1047, 1048, 1051, 1054, 1056, 1058, 1061, 1064, 1080,\n", + " 1081, 1086, 1087, 1088, 1091, 1092, 1095, 1111, 1113, 1119, 1121,\n", + " 1125, 1126, 1127, 1145, 1149, 1150, 1164, 1167, 1176, 1181, 1205,\n", + " 1207, 1208, 1219, 1228, 1235, 1236, 1239, 1240, 1250, 1264, 1276,\n", + " 1299, 1300, 1303, 1306, 1311, 1314, 1328, 1332, 1342, 1351, 1363,\n", + " 1370, 1371, 1376, 1380, 1386, 1387, 1398, 1399, 1403, 1422, 1423,\n", + " 1435, 1437, 1439, 1442, 1447, 1467, 1470, 1472, 1479, 1489, 1492,\n", + " 1499, 1504, 1508, 1514, 1526, 1537, 1547, 1551, 1555, 1561, 1562,\n", + " 1564, 1575, 1596, 1608, 1611, 1612, 1635, 1641, 1675, 1676, 1678,\n", + " 1681, 1693, 1694, 1695, 1696, 1697, 1700, 1703, 1714, 1721, 1722,\n", + " 1723, 1732, 1747, 1753, 1758, 1759, 1761, 1765, 1774, 1805, 1819,\n", + " 1844, 1845, 1846, 1847, 1849, 1871, 1881, 1885, 1892, 1898, 1908,\n", + " 1910, 1927, 1928, 1949, 1950, 1961, 1967, 1977, 1982, 1983, 1986,\n", + " 1989, 1990, 2011, 2019, 2020, 2023, 2040, 2042, 2045, 2050, 2054,\n", + " 2057, 2058, 2085, 2087, 2097, 2100, 2106, 2110, 2123, 2130, 2141,\n", + " 2151, 2158, 2163, 2179, 2187, 2193, 2195, 2197, 2202, 2207, 2208,\n", + " 2227, 2237, 2238, 2244, 2256, 2258, 2263, 2287, 2299, 2319, 2325,\n", + " 2326, 2329, 2330, 2336, 2340, 2345, 2384, 2387, 2401, 2414, 2416,\n", + " 2426, 2427, 2431, 2433, 2452, 2456, 2459, 2468, 2483, 2492, 2494,\n", + " 2496, 2500, 2504, 2512, 2515, 2516, 2520, 2525, 2540, 2560, 2566,\n", + " 2567, 2570, 2571, 2581, 2599, 2605, 2613, 2616, 2630, 2634, 2635,\n", + " 2637, 2641, 2645, 2653, 2656, 2657, 2675, 2678, 2709, 2725, 2737,\n", + " 2753, 2760, 2762, 2763, 2784, 2797, 2800, 2801, 2808, 2815, 2832,\n", + " 2837, 2844, 2845, 2851, 2868, 2880, 2886, 2887, 2897, 2899, 2905,\n", + " 2915, 2925, 2937, 2938, 2944, 2947, 2951, 2952, 2956, 2961, 2968,\n", + " 2977, 2991, 3004, 3038, 3058, 3065, 3068, 3070, 3071, 3075, 3097,\n", + " 3102, 3105, 3109, 3117, 3119, 3123, 3140, 3152, 3161, 3163, 3164,\n", + " 3167, 3176, 3194, 3196, 3200, 3207, 3212, 3219, 3224, 3255, 3274,\n", + " 3277, 3279, 3284, 3285, 3295, 3297, 3299, 3300, 3308, 3310, 3321,\n", + " 3322, 3323, 3333, 3334, 3337, 3338, 3343, 3344, 3354, 3364, 3372,\n", + " 3378, 3384, 3394, 3409, 3411, 3416, 3427, 3438, 3440, 3441, 3449,\n", + " 3463, 3469, 3475, 3476, 3484, 3485, 3508, 3509, 3511, 3528, 3531,\n", + " 3539, 3542, 3555, 3561, 3603, 3608, 3622])),\n", + " (5,\n", + " array([1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1.,\n", + " 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1.,\n", + " 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1.,\n", + " 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1.,\n", + " 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1.,\n", + " 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1.,\n", + " 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1.,\n", + " 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1.,\n", + " 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1.,\n", + " 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1.,\n", + " 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1.,\n", + " 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1.,\n", + " 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1.,\n", + " 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1.,\n", + " 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1.,\n", + " 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1.,\n", + " 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1.,\n", + " 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1.,\n", + " 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1.,\n", + " 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1.,\n", + " 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1.,\n", + " 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1.]),\n", + " 16.251921424887335,\n", + " array([ 12, 14, 15, 35, 45, 49, 51, 63, 65, 68, 79,\n", + " 105, 139, 141, 147, 150, 153, 154, 178, 217, 219, 231,\n", + " 236, 238, 254, 264, 274, 277, 288, 301, 306, 326, 328,\n", + " 329, 346, 355, 358, 364, 365, 371, 375, 378, 379, 384,\n", + " 389, 390, 391, 396, 411, 427, 431, 444, 477, 481, 491,\n", + " 517, 543, 544, 555, 588, 594, 608, 616, 628, 639, 643,\n", + " 662, 671, 682, 703, 752, 775, 782, 784, 788, 793, 798,\n", + " 811, 827, 828, 845, 864, 869, 898, 913, 921, 938, 949,\n", + " 953, 962, 966, 972, 977, 987, 996, 1003, 1004, 1008, 1019,\n", + " 1023, 1038, 1042, 1049, 1068, 1069, 1074, 1076, 1078, 1089, 1102,\n", + " 1105, 1128, 1153, 1155, 1162, 1179, 1182, 1198, 1211, 1212, 1217,\n", + " 1218, 1225, 1231, 1232, 1241, 1245, 1257, 1262, 1269, 1270, 1281,\n", + " 1286, 1294, 1297, 1309, 1312, 1323, 1327, 1339, 1341, 1360, 1378,\n", + " 1383, 1388, 1389, 1400, 1401, 1413, 1416, 1450, 1455, 1463, 1477,\n", + " 1484, 1494, 1495, 1505, 1506, 1522, 1534, 1544, 1549, 1550, 1554,\n", + " 1569, 1572, 1588, 1598, 1613, 1622, 1626, 1630, 1644, 1666, 1669,\n", + " 1686, 1708, 1724, 1727, 1737, 1743, 1749, 1771, 1793, 1799, 1801,\n", + " 1803, 1808, 1814, 1817, 1835, 1853, 1868, 1872, 1895, 1896, 1912,\n", + " 1914, 1916, 1922, 1931, 1933, 1946, 1974, 1992, 2005, 2033, 2052,\n", + " 2053, 2062, 2069, 2073, 2074, 2089, 2091, 2094, 2103, 2113, 2116,\n", + " 2122, 2127, 2135, 2149, 2154, 2162, 2164, 2166, 2180, 2181, 2182,\n", + " 2209, 2211, 2245, 2253, 2257, 2268, 2271, 2288, 2290, 2296, 2316,\n", + " 2334, 2337, 2338, 2339, 2341, 2347, 2349, 2351, 2355, 2356, 2405,\n", + " 2413, 2422, 2428, 2438, 2440, 2448, 2455, 2464, 2473, 2479, 2482,\n", + " 2490, 2509, 2517, 2531, 2550, 2555, 2558, 2574, 2575, 2583, 2589,\n", + " 2591, 2597, 2611, 2614, 2618, 2627, 2633, 2636, 2643, 2682, 2700,\n", + " 2704, 2707, 2716, 2721, 2726, 2754, 2764, 2765, 2793, 2794, 2805,\n", + " 2814, 2835, 2852, 2855, 2856, 2862, 2866, 2878, 2891, 2909, 2918,\n", + " 2920, 2924, 2974, 2982, 2993, 3011, 3020, 3022, 3040, 3041, 3044,\n", + " 3054, 3056, 3091, 3101, 3107, 3108, 3111, 3142, 3151, 3156, 3203,\n", + " 3206, 3213, 3217, 3231, 3269, 3309, 3313, 3331, 3332, 3335, 3342,\n", + " 3349, 3359, 3374, 3389, 3400, 3403, 3445, 3453, 3460, 3467, 3492,\n", + " 3498, 3500, 3515, 3521, 3525, 3526, 3529, 3543, 3568, 3571, 3585,\n", + " 3588, 3589, 3591, 3594, 3604]))]" + ] + }, + "execution_count": 81, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "# here the information for all the combinations\n", + "freq_wrap.iter_columns()" + ] + }, + { + "cell_type": "code", + "execution_count": 82, + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "(0,\n", + " array([1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1.,\n", + " 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1.,\n", + " 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1.,\n", + " 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1.,\n", + " 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1.,\n", + " 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1.,\n", + " 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1.,\n", + " 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1.,\n", + " 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1.,\n", + " 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1.,\n", + " 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1.,\n", + " 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1.,\n", + " 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1.,\n", + " 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1.,\n", + " 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1.,\n", + " 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1.,\n", + " 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1.,\n", + " 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1.,\n", + " 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1.,\n", + " 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1.,\n", + " 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1.,\n", + " 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1.,\n", + " 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1.,\n", + " 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1.,\n", + " 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1.,\n", + " 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1.,\n", + " 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1.,\n", + " 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1.,\n", + " 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1.,\n", + " 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1.,\n", + " 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1.,\n", + " 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1.,\n", + " 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1.,\n", + " 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1.,\n", + " 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1.,\n", + " 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1.,\n", + " 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1.,\n", + " 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1.,\n", + " 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1.,\n", + " 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1.,\n", + " 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1.,\n", + " 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1.,\n", + " 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1.,\n", + " 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1.,\n", + " 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1.,\n", + " 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1.,\n", + " 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1.,\n", + " 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1.,\n", + " 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1.,\n", + " 1., 1., 1., 1., 1., 1., 1., 1., 1., 1.]),\n", + " 46.529753144154704,\n", + " array([ 0, 1, 2, 7, 16, 18, 20, 27, 32, 34, 38,\n", + " 41, 47, 58, 60, 74, 84, 86, 87, 88, 90, 95,\n", + " 97, 99, 106, 107, 108, 115, 116, 118, 121, 122, 130,\n", + " 131, 135, 138, 140, 159, 163, 165, 169, 175, 182, 190,\n", + " 192, 205, 207, 211, 212, 214, 215, 221, 223, 226, 227,\n", + " 228, 229, 232, 235, 240, 242, 256, 261, 268, 270, 279,\n", + " 280, 281, 282, 283, 285, 287, 291, 295, 297, 298, 299,\n", + " 300, 308, 309, 310, 311, 315, 320, 324, 331, 332, 336,\n", + " 338, 345, 356, 361, 363, 366, 367, 368, 372, 380, 383,\n", + " 388, 393, 400, 403, 406, 408, 418, 419, 430, 432, 433,\n", + " 434, 449, 450, 465, 466, 471, 472, 473, 483, 486, 499,\n", + " 504, 513, 514, 519, 537, 538, 540, 545, 546, 548, 552,\n", + " 553, 559, 565, 573, 579, 580, 583, 585, 589, 590, 597,\n", + " 598, 609, 613, 622, 624, 629, 632, 635, 636, 637, 638,\n", + " 641, 642, 646, 649, 650, 660, 661, 669, 672, 673, 684,\n", + " 687, 692, 697, 698, 706, 713, 716, 725, 727, 737, 738,\n", + " 739, 741, 747, 751, 754, 761, 762, 768, 797, 804, 805,\n", + " 809, 814, 815, 820, 822, 824, 825, 826, 837, 839, 840,\n", + " 842, 847, 855, 863, 866, 868, 872, 876, 887, 891, 901,\n", + " 905, 911, 916, 917, 923, 937, 939, 942, 945, 946, 952,\n", + " 960, 967, 969, 970, 973, 974, 975, 978, 984, 989, 990,\n", + " 993, 997, 999, 1000, 1005, 1006, 1009, 1011, 1015, 1021, 1022,\n", + " 1024, 1031, 1037, 1040, 1043, 1044, 1057, 1059, 1062, 1063, 1071,\n", + " 1072, 1073, 1079, 1083, 1090, 1093, 1094, 1096, 1097, 1101, 1106,\n", + " 1107, 1108, 1114, 1118, 1124, 1130, 1137, 1138, 1140, 1143, 1144,\n", + " 1161, 1171, 1175, 1178, 1180, 1186, 1187, 1188, 1192, 1196, 1197,\n", + " 1204, 1226, 1247, 1252, 1253, 1254, 1256, 1259, 1261, 1263, 1267,\n", + " 1271, 1279, 1285, 1292, 1295, 1302, 1305, 1308, 1317, 1319, 1321,\n", + " 1324, 1331, 1333, 1335, 1354, 1355, 1357, 1358, 1361, 1362, 1366,\n", + " 1367, 1373, 1382, 1395, 1396, 1397, 1406, 1407, 1408, 1409, 1415,\n", + " 1421, 1425, 1426, 1427, 1429, 1434, 1444, 1445, 1452, 1456, 1460,\n", + " 1461, 1466, 1474, 1480, 1481, 1482, 1490, 1491, 1493, 1501, 1509,\n", + " 1521, 1528, 1545, 1552, 1566, 1567, 1570, 1576, 1577, 1580, 1582,\n", + " 1594, 1603, 1605, 1618, 1619, 1627, 1628, 1643, 1645, 1650, 1653,\n", + " 1654, 1655, 1657, 1660, 1661, 1662, 1664, 1674, 1679, 1688, 1691,\n", + " 1692, 1701, 1704, 1705, 1709, 1710, 1711, 1712, 1713, 1715, 1729,\n", + " 1730, 1731, 1738, 1739, 1741, 1746, 1751, 1766, 1770, 1779, 1783,\n", + " 1785, 1787, 1800, 1802, 1811, 1816, 1820, 1821, 1824, 1829, 1833,\n", + " 1843, 1848, 1850, 1851, 1856, 1861, 1865, 1866, 1867, 1870, 1874,\n", + " 1875, 1878, 1883, 1887, 1890, 1891, 1893, 1897, 1903, 1904, 1905,\n", + " 1907, 1917, 1918, 1920, 1923, 1924, 1932, 1934, 1937, 1947, 1953,\n", + " 1957, 1958, 1962, 1965, 1971, 1972, 1975, 1976, 1979, 1980, 1998,\n", + " 2000, 2003, 2008, 2013, 2021, 2025, 2028, 2032, 2034, 2036, 2041,\n", + " 2048, 2049, 2056, 2061, 2064, 2068, 2070, 2076, 2080, 2086, 2090,\n", + " 2093, 2096, 2102, 2112, 2120, 2124, 2131, 2132, 2133, 2138, 2144,\n", + " 2146, 2159, 2165, 2167, 2173, 2174, 2183, 2184, 2189, 2198, 2200,\n", + " 2203, 2204, 2210, 2213, 2219, 2220, 2228, 2233, 2234, 2240, 2241,\n", + " 2251, 2252, 2255, 2260, 2264, 2265, 2267, 2273, 2280, 2283, 2284,\n", + " 2285, 2286, 2289, 2291, 2293, 2294, 2295, 2297, 2309, 2313, 2314,\n", + " 2317, 2321, 2331, 2335, 2344, 2350, 2352, 2364, 2369, 2370, 2371,\n", + " 2380, 2393, 2396, 2402, 2403, 2404, 2411, 2412, 2415, 2423, 2435,\n", + " 2445, 2447, 2450, 2453, 2466, 2469, 2476, 2481, 2491, 2495, 2507,\n", + " 2513, 2514, 2518, 2519, 2527, 2530, 2532, 2535, 2536, 2539, 2542,\n", + " 2545, 2546, 2547, 2559, 2562, 2565, 2577, 2579, 2584, 2588, 2593,\n", + " 2594, 2596, 2598, 2603, 2604, 2610, 2612, 2615, 2621, 2622, 2623,\n", + " 2631, 2638, 2642, 2646, 2647, 2654, 2659, 2660, 2661, 2662, 2667,\n", + " 2671, 2677, 2684, 2686, 2688, 2689, 2692, 2694, 2701, 2702, 2703,\n", + " 2705, 2706, 2712, 2714, 2715, 2717, 2719, 2724, 2728, 2729, 2735,\n", + " 2739, 2742, 2744, 2748, 2750, 2755, 2757, 2768, 2770, 2773, 2774,\n", + " 2776, 2777, 2778, 2779, 2780, 2785, 2790, 2791, 2792, 2802, 2803,\n", + " 2806, 2812, 2818, 2824, 2828, 2829, 2831, 2848, 2849, 2857, 2858,\n", + " 2861, 2864, 2872, 2874, 2877, 2882, 2884, 2885, 2890, 2898, 2900,\n", + " 2902, 2906, 2911, 2913, 2922, 2926, 2934, 2935, 2943, 2953, 2955,\n", + " 2964, 2967, 2969, 2972, 2973, 2981, 2983, 2984, 2985, 2988, 2990,\n", + " 2996, 2997, 3002, 3009, 3010, 3012, 3024, 3025, 3030, 3035, 3036,\n", + " 3043, 3045, 3047, 3052, 3053, 3055, 3062, 3069, 3072, 3077, 3079,\n", + " 3080, 3084, 3089, 3095, 3098, 3116, 3118, 3124, 3128, 3138, 3145,\n", + " 3148, 3166, 3168, 3173, 3174, 3178, 3185, 3193, 3205, 3208, 3214,\n", + " 3218, 3232, 3237, 3240, 3247, 3256, 3259, 3262, 3263, 3266, 3267,\n", + " 3270, 3273, 3287, 3294, 3296, 3301, 3302, 3311, 3316, 3320, 3336,\n", + " 3341, 3345, 3353, 3356, 3357, 3363, 3365, 3366, 3367, 3369, 3371,\n", + " 3376, 3377, 3380, 3390, 3391, 3393, 3397, 3399, 3401, 3404, 3406,\n", + " 3410, 3415, 3419, 3421, 3424, 3429, 3431, 3432, 3443, 3444, 3457,\n", + " 3458, 3459, 3462, 3466, 3468, 3470, 3471, 3472, 3473, 3478, 3488,\n", + " 3490, 3494, 3497, 3499, 3503, 3505, 3520, 3532, 3544, 3546, 3547,\n", + " 3548, 3550, 3551, 3556, 3557, 3562, 3567, 3570, 3576, 3578, 3581,\n", + " 3582, 3596, 3611, 3616, 3617, 3618, 3623]))" + ] + }, + "execution_count": 82, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "# and here, we can see for category \"0\" that non zero values are...\n", + "freq_wrap.get_column(0)" + ] + }, + { + "cell_type": "code", + "execution_count": 83, + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "hh_id\n", + "0 1.0\n", + "2 1.0\n", + "3 1.0\n", + "4 0.0\n", + "5 0.0\n", + "13 0.0\n", + "14 0.0\n", + "15 1.0\n", + "Name: 0, dtype: float64" + ] + }, + "execution_count": 83, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "#... in the index 0, 1, 2, 7, (...)\n", + "h_freq_table[0][0:8]" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "With this wrapper, we build the fit quality od each `cat_id`" + ] + }, + { + "cell_type": "code", + "execution_count": 84, + "metadata": {}, + "outputs": [], + "source": [ + "from synthpop.ipu.ipu import _average_fit_quality\n", + "from synthpop.ipu.ipu import _fit_quality" + ] + }, + { + "cell_type": "code", + "execution_count": 85, + "metadata": {}, + "outputs": [], + "source": [ + "# weights matrix\n", + "weights = np.ones(len(h_freq_table), dtype='float')\n", + "\n", + "# column (this is the non-zero elements of the \"0\" cat_id column of the frequency table)\n", + "cat_id_0_column = [e for e in freq_wrap.get_column(0)][1]\n", + "\n", + "# nz (this is the idx of the frequency table where non zero values are stored)\n", + "cat_id_0_nz = [e for e in freq_wrap.get_column(0)][3]" + ] + }, + { + "cell_type": "code", + "execution_count": 86, + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "True" + ] + }, + "execution_count": 86, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "# the non zero values should have the same length when filtering the weights matrix\n", + "len(cat_id_0_column) == len(weights[cat_id_0_nz])" + ] + }, + { + "cell_type": "code", + "execution_count": 87, + "metadata": {}, + "outputs": [], + "source": [ + "# the new target value for the block group\n", + "constraint = [e for e in freq_wrap.get_column(0)][2]" + ] + }, + { + "cell_type": "code", + "execution_count": 88, + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "17.11743976780375" + ] + }, + "execution_count": 88, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "# this is the \"fit quality\" value for cat_id \"0\"\n", + "_fit_quality(cat_id_0_column, weights[cat_id_0_nz], constraint)" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "This value is the result of multiplying the non-zero values by the weights matrix, which returns the total frequency of the category within the PUMA: " + ] + }, + { + "cell_type": "code", + "execution_count": 89, + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "843.0" + ] + }, + "execution_count": 89, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "(cat_id_0_column * weights[cat_id_0_nz]).sum()" + ] + }, + { + "cell_type": "code", + "execution_count": 90, + "metadata": {}, + "outputs": [ + { + "data": { + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
cat_idfrequency
hh_carshh_children
noneno0843
yes1919
oneno2573
yes3431
two or moreno4491
yes5368
\n", + "
" + ], + "text/plain": [ + " cat_id frequency\n", + "hh_cars hh_children \n", + "none no 0 843\n", + " yes 1 919\n", + "one no 2 573\n", + " yes 3 431\n", + "two or more no 4 491\n", + " yes 5 368" + ] + }, + "execution_count": 90, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "# Here the total households with no car and no children(843)\n", + "jd_households" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "The sum of non zero values is weighted and then reduced by substracting the constraints..." + ] + }, + { + "cell_type": "code", + "execution_count": 91, + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "796.4702468558453" + ] + }, + "execution_count": 91, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "(cat_id_0_column * weights[cat_id_0_nz]).sum() - constraint" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "... and finally is expressed in terms of the constraint for that category - the 𝛿 parameter described in the IPU paper -. The absolute value of the relative difference between the weighted sum and the corresponding constraint may be used as a goodness of fit measure and is defined as: " + ] + }, + { + "cell_type": "code", + "execution_count": 92, + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "17.11743976780375" + ] + }, + "execution_count": 92, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "((cat_id_0_column * weights[cat_id_0_nz]).sum() - constraint) / constraint" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "This is basically showing the proportion of the `cat_id` within the entire population of the PUMA. In the example of this first iteration, we have that for each household that `cat_id` == `0`, there are 17 households that could be out of that category (`cat_id` != `0`). " + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "![constraints](img/ipu.png)" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "This process will be repeated for all household types (or `cat_id`) and build a unique average value (which is the sum of each `_fit_quality` result divided by the number of `cat_id` columns):" + ] + }, + { + "cell_type": "code", + "execution_count": 93, + "metadata": {}, + "outputs": [], + "source": [ + "fit_qual_0_to_5 = _average_fit_quality(freq_wrap, weights)" + ] + }, + { + "cell_type": "code", + "execution_count": 94, + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "19.71740696516147" + ] + }, + "execution_count": 94, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "fit_qual_0_to_5" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "As it is explained on [A METHODOLOGY TO MATCH DISTRIBUTIONS OF BOTH HOUSEHOLD AND\n", + "PERSON ATTRIBUTES IN THE GENERATION OF SYNTHETIC POPULATIONS ](http://www.scag.ca.gov/Documents/PopulationSynthesizerPaper_TRB.pdf), the IPU algorithm starts by assuming equal weights for all households in the sample. The algorithm then proceeds by adjusting weights for each household/person constraint in an iterative process until the constraints are matched as closely as possible for both household and person attributes. " + ] + }, + { + "cell_type": "code", + "execution_count": 95, + "metadata": {}, + "outputs": [], + "source": [ + "fit_change = np.inf\n", + "convergence= 1e-4" + ] + }, + { + "cell_type": "code", + "execution_count": 96, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Updating weights matrix until reaching a fit quality value under the convergence!\n" + ] + } + ], + "source": [ + "if fit_change > convergence:\n", + " print(\"Updating weights matrix until reaching a fit quality value under the convergence!\")" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "The weights for the each household level constraint are adjusted by dividing the number of households in that category (i.e., the constraint value) by the weighted sum of the first household type column: \n", + "\n", + "The `_update_weights` creates the following adjustment `adj = constraint / float((column * weights).sum())` and use it to update weights (`weights * adj`). The weights for all households of each household type will be multiplied by this ratio to satisfy the constraint. " + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "In this sense, the `households_weights` function will finally return a:" + ] + }, + { + "cell_type": "code", + "execution_count": 97, + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "hh_id\n", + "0 0.055195\n", + "2 0.055195\n", + "3 0.055195\n", + "4 0.050640\n", + "5 0.044570\n", + " ... \n", + "5857 0.044570\n", + "5858 0.050640\n", + "5862 0.048135\n", + "5865 0.055195\n", + "5868 0.044570\n", + "Length: 3625, dtype: float64" + ] + }, + "execution_count": 97, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "# 1. An array of corrected weights that best matches each household type \n", + "best_weights" + ] + }, + { + "cell_type": "code", + "execution_count": 98, + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "1.636802750002612e-16" + ] + }, + "execution_count": 98, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "# 2. And a fit quality based on the proportion of each hh type that reduces the fit changes under the convergence \n", + "fit_quality" + ] + }, + { + "cell_type": "code", + "execution_count": 99, + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "2" + ] + }, + "execution_count": 99, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "# 3. Built in ...\n", + "iterations" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "# 4. Drawing synthetic population " + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "![draw](img/draw.png)" + ] + }, + { + "cell_type": "code", + "execution_count": 100, + "metadata": {}, + "outputs": [], + "source": [ + "from synthpop import draw" + ] + }, + { + "cell_type": "code", + "execution_count": 101, + "metadata": {}, + "outputs": [], + "source": [ + "num_households = int(hh_marginals_tract_400_block_gp_1.groupby(level=0).sum().mean())" + ] + }, + { + "cell_type": "code", + "execution_count": 102, + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "178" + ] + }, + "execution_count": 102, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "num_households" + ] + }, + { + "cell_type": "code", + "execution_count": 103, + "metadata": {}, + "outputs": [], + "source": [ + "fac = _FrequencyAndConstraints(h_freq_table, h_constraint)" + ] + }, + { + "cell_type": "code", + "execution_count": 104, + "metadata": {}, + "outputs": [], + "source": [ + "indexes = draw._draw_indexes(num_households, fac, best_weights)" + ] + }, + { + "cell_type": "code", + "execution_count": 105, + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "Int64Index([1093, 3079, 5129, 2804, 987, 1928, 5717, 5465, 5675, 4820,\n", + " ...\n", + " 3483, 329, 1902, 3312, 4611, 4939, 1862, 590, 4364, 5444],\n", + " dtype='int64', length=178)" + ] + }, + "execution_count": 105, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "indexes" + ] + }, + { + "cell_type": "code", + "execution_count": 106, + "metadata": {}, + "outputs": [], + "source": [ + "synth_hh = h_pums.loc[indexes].reset_index(drop=True)" + ] + }, + { + "cell_type": "code", + "execution_count": 107, + "metadata": {}, + "outputs": [ + { + "data": { + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
serialnoRTpuma00puma10NPTYPEBLDTENVEHHINCPMVR18R65hh_carshh_childrencat_id
02012001216090H-9400112.04.00.06500.01.00.00.0noneno0
12009000613417H400-9112.02.00.012850.07.00.00.0noneno0
22011000710483H400-9112.03.00.060000.02.00.00.0noneno0
32009000187197H400-9112.02.00.019100.06.00.01.0noneno0
42012001104421H-9400212.04.00.046430.06.00.00.0noneno0
...................................................
1732011000489471H400-9712.03.02.036600.03.01.00.0two or moreyes5
1742013000550452H-9400412.02.02.054100.05.01.00.0two or moreyes5
1752012000679081H-9400612.01.04.0116160.02.01.00.0two or moreyes5
1762010001169426H400-9812.01.03.0103000.06.01.00.0two or moreyes5
1772011001034440H400-9215.03.01.051000.03.00.00.0oneno2
\n", + "

178 rows × 16 columns

\n", + "
" + ], + "text/plain": [ + " serialno RT puma00 puma10 NP TYPE BLD TEN VEH HINCP MV \\\n", + "0 2012001216090 H -9 400 1 1 2.0 4.0 0.0 6500.0 1.0 \n", + "1 2009000613417 H 400 -9 1 1 2.0 2.0 0.0 12850.0 7.0 \n", + "2 2011000710483 H 400 -9 1 1 2.0 3.0 0.0 60000.0 2.0 \n", + "3 2009000187197 H 400 -9 1 1 2.0 2.0 0.0 19100.0 6.0 \n", + "4 2012001104421 H -9 400 2 1 2.0 4.0 0.0 46430.0 6.0 \n", + ".. ... .. ... ... .. ... ... ... ... ... ... \n", + "173 2011000489471 H 400 -9 7 1 2.0 3.0 2.0 36600.0 3.0 \n", + "174 2013000550452 H -9 400 4 1 2.0 2.0 2.0 54100.0 5.0 \n", + "175 2012000679081 H -9 400 6 1 2.0 1.0 4.0 116160.0 2.0 \n", + "176 2010001169426 H 400 -9 8 1 2.0 1.0 3.0 103000.0 6.0 \n", + "177 2011001034440 H 400 -9 2 1 5.0 3.0 1.0 51000.0 3.0 \n", + "\n", + " R18 R65 hh_cars hh_children cat_id \n", + "0 0.0 0.0 none no 0 \n", + "1 0.0 0.0 none no 0 \n", + "2 0.0 0.0 none no 0 \n", + "3 0.0 1.0 none no 0 \n", + "4 0.0 0.0 none no 0 \n", + ".. ... ... ... ... ... \n", + "173 1.0 0.0 two or more yes 5 \n", + "174 1.0 0.0 two or more yes 5 \n", + "175 1.0 0.0 two or more yes 5 \n", + "176 1.0 0.0 two or more yes 5 \n", + "177 0.0 0.0 one no 2 \n", + "\n", + "[178 rows x 16 columns]" + ] + }, + "execution_count": 107, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "synth_hh" + ] + }, { "cell_type": "code", - "execution_count": null, + "execution_count": 108, "metadata": {}, "outputs": [], + "source": [ + "mrg_tbl = pd.DataFrame(\n", + " {'serialno': synth_hh.serialno.values,\n", + " 'hh_id': synth_hh.index.values})" + ] + }, + { + "cell_type": "code", + "execution_count": 109, + "metadata": {}, + "outputs": [ + { + "data": { + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
serialnohh_id
020120012160900
120090006134171
220110007104832
320090001871973
420120011044214
.........
1732011000489471173
1742013000550452174
1752012000679081175
1762010001169426176
1772011001034440177
\n", + "

178 rows × 2 columns

\n", + "
" + ], + "text/plain": [ + " serialno hh_id\n", + "0 2012001216090 0\n", + "1 2009000613417 1\n", + "2 2011000710483 2\n", + "3 2009000187197 3\n", + "4 2012001104421 4\n", + ".. ... ...\n", + "173 2011000489471 173\n", + "174 2013000550452 174\n", + "175 2012000679081 175\n", + "176 2010001169426 176\n", + "177 2011001034440 177\n", + "\n", + "[178 rows x 2 columns]" + ] + }, + "execution_count": 109, + "metadata": {}, + "output_type": "execute_result" + } + ], "source": [ "mrg_tbl" ]