# Extract population size of the spatial zones

This applies to Sweden, The Netherlands, and Sao Paulo.

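For Sweden, the zone populations are prepared outside this notebook from the available DeSO zone population data and are read here from `dbs/sweden/survey_deso/population.csv`.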

The Netherlands and Sao Paulo derive their zone populations from gridded population data (`dbs/gpw_2015.tif`).

In [34]:
# Load IPython's autoreload extension and set it to mode 2 so that edits to
# the imported project modules are picked up without restarting the kernel.
%load_ext autoreload
%autoreload 2
# Render matplotlib figures inside the notebook output.
%matplotlib inline

The autoreload extension is already loaded. To reload it, use:
  %reload_ext autoreload


In [35]:
import os
import subprocess
import sys
import pandas as pd
import numpy as np
import geopandas as gpd
from rasterstats import zonal_stats
import netherlands
import saopaulo
import sweden_sv

def get_repo_root():
    """Return the top-level directory of the enclosing git repository.

    The current working directory is used as the starting point and
    ``git rev-parse --show-toplevel`` is asked for the repository root.
    Trailing whitespace (including the newline printed by git) is stripped.

    Raises:
        subprocess.CalledProcessError: if the git command exits with a
            non-zero status (e.g. when run outside a git work tree).
    """
    # Same directory as dirname(abspath('__file__')): the quoted name is a
    # literal relative path, so only the working directory matters.
    start_dir = os.path.abspath(os.curdir)
    git_cmd = ['git', 'rev-parse', '--show-toplevel']
    toplevel = subprocess.check_output(git_cmd,
                                       cwd=start_dir,
                                       universal_newlines=True)
    return toplevel.rstrip()
# Resolve the repository root once: every get_repo_root() call spawns a git
# subprocess, so the result is reused for the import path and the data paths.
ROOT_dir = get_repo_root()
# Guard the append so that re-running this cell does not pile up duplicates.
# NOTE(review): the project modules imported at the top of this cell are
# resolved before this line runs; confirm they do not rely on this entry.
if ROOT_dir not in sys.path:
    sys.path.append(ROOT_dir)
# Population raster passed to zonal_stats in the sections below.
pop_file = ROOT_dir + '/dbs/gpw_2015.tif'

## 1. The Netherlands

In [36]:
# Create the Netherlands ground-truth loader and load its spatial zones.
# The following cells read the zones from `rg_n.zones`
# (presumably populated by load_zones() — confirm in the netherlands module).
rg_n = netherlands.GroundTruthLoader()
rg_n.load_zones()

In [37]:
# Sum the population raster inside every zone polygon. The zones are
# reprojected to EPSG:4326 first (assumes the raster uses that CRS — confirm).
zones_wgs84_n = rg_n.zones.to_crs(4326)
results = zonal_stats(zones_wgs84_n, pop_file, stats=['sum'], geojson_out=True)
# geojson_out=True yields features, so the zone attributes and the computed
# 'sum' end up as columns of a single frame.
zs_gdf_pz = gpd.GeoDataFrame.from_features(results)
# Lookup table zone id -> summed population. Built column-wise instead of with
# iterrows(), which creates a Series object for every row.
zs_dict = dict(zip(zs_gdf_pz['zone'], zs_gdf_pz['sum']))

In [38]:
# Read the home locations and turn every row into an EPSG:4326 point.
home_csv_n = ROOT_dir + '/dbs/netherlands/homelocations.csv'
df_home_n = pd.read_csv(home_csv_n)
home_points_n = gpd.points_from_xy(df_home_n['longitude'], df_home_n['latitude'])
gdf_home_n = gpd.GeoDataFrame(df_home_n, geometry=home_points_n, crs='EPSG:4326')
# Spatially join each point with the zone polygons (sjoin defaults), which
# adds the 'zone' and 'index_right' columns shown below.
gdf_home_n = gpd.sjoin(gdf_home_n, rg_n.zones.to_crs(4326))
gdf_home_n.head()

Unnamed: 0,userid,latitude,longitude,geometry,index_right,zone
0,1013,51.824402,4.927636,POINT (4.92764 51.82440),1395,4254
1,11675,52.092615,5.112631,POINT (5.11263 52.09261),1080,3511
34,3049901,52.09235,5.11322,POINT (5.11322 52.09235),1080,3511
48,5431382,52.092608,5.112933,POINT (5.11293 52.09261),1080,3511
56,5857552,52.092441,5.11317,POINT (5.11317 52.09244),1080,3511


In [39]:
# Keep only the tabular columns: the point geometry and the join's
# right-hand index are not needed for the weighting.
df_home_n = gdf_home_n.drop(columns=['geometry', 'index_right'])
# Attach the zone population to every home location. The lookup is strict:
# a zone missing from zs_dict raises KeyError.
df_home_n['pop'] = df_home_n['zone'].map(lambda zone_id: zs_dict[zone_id])
df_home_n.head()

Unnamed: 0,userid,latitude,longitude,zone,pop
0,1013,51.824402,4.927636,4254,5445.078613
1,11675,52.092615,5.112631,3511,13629.207031
34,3049901,52.09235,5.11322,3511,13629.207031
48,5431382,52.092608,5.112933,3511,13629.207031
56,5857552,52.092441,5.11317,3511,13629.207031


In [40]:
# Zones without a population value are treated as population 0; their users
# receive a fallback weight further down.
df_home_n.loc[:, 'pop'] = df_home_n.loc[:, 'pop'].fillna(0)

# Number of home locations (users) found in each zone.
df_home_n_size = df_home_n.groupby('zone').size().to_frame(name='tw_pop')
df_home_n_proc = df_home_n.join(df_home_n_size, on='zone')

# Weight = zone population shared equally among the users located in the zone.
df_home_n_proc.loc[:, 'weight'] = df_home_n_proc['pop'] / df_home_n_proc['tw_pop']

# Users in zero-population zones would end up with weight 0; give them the
# median weight instead. A vectorised mask replaces the row-wise apply/lambda.
# NOTE(review): the median is taken over all rows, zero weights included, so
# it is pulled down when many zones lack population — confirm this is intended.
weight_median = df_home_n_proc['weight'].median()
df_home_n_proc.loc[:, 'weight'] = df_home_n_proc['weight'].mask(
    df_home_n_proc['weight'] == 0, weight_median)
df_home_n_proc

Unnamed: 0,userid,latitude,longitude,zone,pop,tw_pop,weight
0,1013,51.824402,4.927636,4254,5445.078613,1,5445.078613
1,11675,52.092615,5.112631,3511,13629.207031,42,324.504929
34,3049901,52.092350,5.113220,3511,13629.207031,42,324.504929
48,5431382,52.092608,5.112933,3511,13629.207031,42,324.504929
56,5857552,52.092441,5.113170,3511,13629.207031,42,324.504929
...,...,...,...,...,...,...,...
5316,4212410595,51.433186,4.313162,4631,9343.757812,1,9343.757812
5325,4764532456,52.450000,4.833330,1503,7179.984375,1,7179.984375
5329,4841286081,51.417730,5.816390,5757,10113.166992,1,10113.166992
5339,710698803171618816,52.843310,5.027540,1771,10078.034180,1,10078.034180


In [41]:
# Export only the identifying columns plus the computed weight.
weighted_cols_n = ['userid', 'latitude', 'longitude', 'zone', 'weight']
weighted_csv_n = ROOT_dir + '/dbs/netherlands/homelocations_wt.csv'
df_home_n_proc[weighted_cols_n].to_csv(weighted_csv_n, index=False)


## 2. Sao Paulo

In [42]:
# Create the Sao Paulo ground-truth loader and load its spatial zones.
# The following cells read the zones from `rg_s.zones`
# (presumably populated by load_zones() — confirm in the saopaulo module).
rg_s = saopaulo.GroundTruthLoader()
rg_s.load_zones()

In [43]:
# Sum the population raster inside every zone polygon. The zones are
# reprojected to EPSG:4326 first (assumes the raster uses that CRS — confirm).
zones_wgs84_s = rg_s.zones.to_crs(4326)
results = zonal_stats(zones_wgs84_s, pop_file, stats=['sum'], geojson_out=True)
# geojson_out=True yields features, so the zone attributes and the computed
# 'sum' end up as columns of a single frame.
zs_gdf_pz = gpd.GeoDataFrame.from_features(results)
# Lookup table zone id -> summed population. Built column-wise instead of with
# iterrows(), which creates a Series object for every row.
zs_dict = dict(zip(zs_gdf_pz['zone'], zs_gdf_pz['sum']))

In [44]:
# Read the home locations and turn every row into an EPSG:4326 point.
home_csv_s = ROOT_dir + '/dbs/saopaulo/homelocations.csv'
df_home_s = pd.read_csv(home_csv_s)
home_points_s = gpd.points_from_xy(df_home_s['longitude'], df_home_s['latitude'])
gdf_home_s = gpd.GeoDataFrame(df_home_s, geometry=home_points_s, crs='EPSG:4326')
# Spatially join each point with the zone polygons (sjoin defaults), which
# adds the 'zone' and 'index_right' columns shown below.
gdf_home_s = gpd.sjoin(gdf_home_s, rg_s.zones.to_crs(4326))
gdf_home_s.head()

Unnamed: 0,userid,latitude,longitude,geometry,index_right,zone
0,2542,-23.562703,-46.697775,POINT (-46.69777 -23.56270),81,82
31,8067402,-23.572454,-46.695811,POINT (-46.69581 -23.57245),81,82
40,8411992,-23.56127,-46.69422,POINT (-46.69422 -23.56127),81,82
67,9207302,-23.570027,-46.691704,POINT (-46.69170 -23.57003),81,82
81,9502352,-23.564909,-46.702545,POINT (-46.70254 -23.56491),81,82


In [45]:
# Keep only the tabular columns: the point geometry and the join's
# right-hand index are not needed for the weighting.
df_home_s = gdf_home_s.drop(columns=['geometry', 'index_right'])
# Attach the zone population to every home location. The lookup is strict:
# a zone missing from zs_dict raises KeyError.
df_home_s['pop'] = df_home_s['zone'].map(lambda zone_id: zs_dict[zone_id])
df_home_s.head()

Unnamed: 0,userid,latitude,longitude,zone,pop
0,2542,-23.562703,-46.697775,82,12405.144531
31,8067402,-23.572454,-46.695811,82,12405.144531
40,8411992,-23.56127,-46.69422,82,12405.144531
67,9207302,-23.570027,-46.691704,82,12405.144531
81,9502352,-23.564909,-46.702545,82,12405.144531


In [46]:
# Zones without a population value are treated as population 0; their users
# receive a fallback weight further down.
df_home_s.loc[:, 'pop'] = df_home_s.loc[:, 'pop'].fillna(0)

# Number of home locations (users) found in each zone.
df_home_s_size = df_home_s.groupby('zone').size().to_frame(name='tw_pop')
df_home_s_proc = df_home_s.join(df_home_s_size, on='zone')

# Weight = zone population shared equally among the users located in the zone.
df_home_s_proc.loc[:, 'weight'] = df_home_s_proc['pop'] / df_home_s_proc['tw_pop']

# Users in zero-population zones would end up with weight 0; give them the
# median weight instead. A vectorised mask replaces the row-wise apply/lambda.
# NOTE(review): the median is taken over all rows, zero weights included, so
# it is pulled down when many zones lack population — confirm this is intended.
weight_median = df_home_s_proc['weight'].median()
df_home_s_proc.loc[:, 'weight'] = df_home_s_proc['weight'].mask(
    df_home_s_proc['weight'] == 0, weight_median)
df_home_s_proc

Unnamed: 0,userid,latitude,longitude,zone,pop,tw_pop,weight
0,2542,-23.562703,-46.697775,82,12405.144531,136,91.214298
31,8067402,-23.572454,-46.695811,82,12405.144531,136,91.214298
40,8411992,-23.561270,-46.694220,82,12405.144531,136,91.214298
67,9207302,-23.570027,-46.691704,82,12405.144531,136,91.214298
81,9502352,-23.564909,-46.702545,82,12405.144531,136,91.214298
...,...,...,...,...,...,...,...
9129,423421443,-23.760865,-46.840051,469,6337.958008,1,6337.958008
9437,589304402,-23.744114,-46.900317,470,14868.516602,1,14868.516602
9943,1576464876,-23.927200,-47.070000,475,39264.402344,1,39264.402344
10192,2436079743,-23.469805,-46.319954,399,77719.625000,1,77719.625000


In [47]:
# Export only the identifying columns plus the computed weight.
weighted_cols_s = ['userid', 'latitude', 'longitude', 'zone', 'weight']
weighted_csv_s = ROOT_dir + '/dbs/saopaulo/homelocations_wt.csv'
df_home_s_proc[weighted_cols_s].to_csv(weighted_csv_s, index=False)


## 3. Sweden

In [48]:
# Create the Sweden ground-truth loader and load its spatial zones.
# The following cells read the zones from `rg_sw.zones`
# (presumably populated by load_zones() — confirm in the sweden_sv module).
rg_sw = sweden_sv.GroundTruthLoader()
rg_sw.load_zones()


In [49]:
# Read the home locations and turn every row into an EPSG:4326 point.
home_csv_sw = ROOT_dir + '/dbs/sweden/homelocations.csv'
df_home_sw = pd.read_csv(home_csv_sw)
home_points_sw = gpd.points_from_xy(df_home_sw['longitude'], df_home_sw['latitude'])
gdf_home_sw = gpd.GeoDataFrame(df_home_sw, geometry=home_points_sw, crs='EPSG:4326')
# Spatially join each point with the zone polygons (sjoin defaults), which
# adds the 'zone' and 'index_right' columns shown below.
gdf_home_sw = gpd.sjoin(gdf_home_sw, rg_sw.zones.to_crs(4326))
gdf_home_sw.head()

Unnamed: 0,userid,latitude,longitude,geometry,index_right,zone
0,5616,59.426889,17.954336,POINT (17.95434 59.42689),432,0163C1150
30,5040331,59.428098,17.949386,POINT (17.94939 59.42810),432,0163C1150
1,5976,56.05394,12.68457,POINT (12.68457 56.05394),3162,1283C1590
2582,407770195,56.057689,12.683189,POINT (12.68319 56.05769),3162,1283C1590
3747,2582504754,56.054448,12.684705,POINT (12.68471 56.05445),3162,1283C1590


In [50]:
# Keep only the tabular columns: the point geometry and the join's
# right-hand index are not needed for the weighting.
df_home_sw = gdf_home_sw.drop(columns=['geometry', 'index_right'])
# Sweden reads zone populations from a prepared table instead of running
# zonal statistics on the raster.
df_pop_sw = pd.read_csv(ROOT_dir + '/dbs/sweden/survey_deso/population.csv')
# Lookup table zone id -> population. Built column-wise instead of with
# iterrows(), which creates a Series object for every row.
zs_dict = dict(zip(df_pop_sw['zone'], df_pop_sw['pop']))

# Strict lookup: a zone that is absent from the population table raises
# KeyError rather than silently producing a missing value.
df_home_sw.loc[:, 'pop'] = df_home_sw.loc[:, 'zone'].apply(lambda x: zs_dict[x])
df_home_sw.head()

Unnamed: 0,userid,latitude,longitude,zone,pop
0,5616,59.426889,17.954336,0163C1150,1840
30,5040331,59.428098,17.949386,0163C1150,1840
1,5976,56.05394,12.68457,1283C1590,1948
2582,407770195,56.057689,12.683189,1283C1590,1948
3747,2582504754,56.054448,12.684705,1283C1590,1948


In [51]:
# Zones without a population value are treated as population 0; their users
# receive a fallback weight further down.
df_home_sw.loc[:, 'pop'] = df_home_sw.loc[:, 'pop'].fillna(0)

# Number of home locations (users) found in each zone.
df_home_sw_size = df_home_sw.groupby('zone').size().to_frame(name='tw_pop')
df_home_sw_proc = df_home_sw.join(df_home_sw_size, on='zone')

# Weight = zone population shared equally among the users located in the zone.
df_home_sw_proc.loc[:, 'weight'] = df_home_sw_proc['pop'] / df_home_sw_proc['tw_pop']

# Users in zero-population zones would end up with weight 0; give them the
# median weight instead. A vectorised mask replaces the row-wise apply/lambda.
# NOTE(review): the median is taken over all rows, zero weights included, so
# it is pulled down when many zones lack population — confirm this is intended.
weight_median = df_home_sw_proc['weight'].median()
df_home_sw_proc.loc[:, 'weight'] = df_home_sw_proc['weight'].mask(
    df_home_sw_proc['weight'] == 0, weight_median)
df_home_sw_proc

Unnamed: 0,userid,latitude,longitude,zone,pop,tw_pop,weight
0,5616,59.426889,17.954336,0163C1150,1840,2,920.000000
30,5040331,59.428098,17.949386,0163C1150,1840,2,920.000000
1,5976,56.053940,12.684570,1283C1590,1948,3,649.333333
2582,407770195,56.057689,12.683189,1283C1590,1948,3,649.333333
3747,2582504754,56.054448,12.684705,1283C1590,1948,3,649.333333
...,...,...,...,...,...,...,...
3941,4834800161,55.597618,12.998970,1280C2540,1771,1,1771.000000
3954,750012578802364416,59.616700,17.850000,0191C1020,2584,1,2584.000000
3955,751647827651092481,63.826419,20.295268,2480C1270,1225,1,1225.000000
3956,753867220246269952,60.869916,14.522473,2062A0010,1677,1,1677.000000


In [52]:
# Export only the identifying columns plus the computed weight.
weighted_cols_sw = ['userid', 'latitude', 'longitude', 'zone', 'weight']
weighted_csv_sw = ROOT_dir + '/dbs/sweden/homelocations_wt.csv'
df_home_sw_proc[weighted_cols_sw].to_csv(weighted_csv_sw, index=False)