In [9]:
db_dir = db
import random
import numpy as np
import pandas as pd
import json
import zipfile
import gzip
import pickle
import torch
import string
import os
from ast import literal_eval
import geopandas
from skmob.tessellation import tilers

In [10]:
print('Generating the processed files - it may take a while....')
print('Reading tessellation....')
try: 
    tessellation = geopandas.read_file(db_dir+'/tessellation.shp', dtype={tile_id_column:str})
except:
    tessellation = geopandas.read_file(db_dir+'/tessellation.geojson', dtype={tile_id_column:str})
tessellation = tessellation[[tile_id_column, tile_geometry]]
print('Reading output areas....')
try: 
    output_areas = geopandas.read_file(db_dir+'/output_areas.shp', dtype={oa_id_column:str})
except:
    output_areas = geopandas.read_file(db_dir+'/output_areas.geojson', dtype={oa_id_column:str})
output_areas = output_areas[[oa_id_column, oa_geometry]]
print('Reading features....')
try:
    features = pd.read_csv(db_dir+'/features.csv')
    if not oa_id_column in list(features.columns):
        raise ValueError('Features must be associated with an output area. Please add a column '++' to features.csv')
except:
    features = None
    print('Running without features. features.csv not found....')
    
print('Mapping output areas with tessellation....')    
output_areas['centroid'] = output_areas[oa_geometry].centroid
# prepare and write  oa_gdf.csv.gz
output_areas["area_km2"] = output_areas[oa_geometry].area/ 10**6
output_areas['x'] = output_areas['centroid'].x
output_areas['y'] = output_areas['centroid'].y
output_areas['ctrs'] = '[' + output_areas['x'].astype(str) + ',' + output_areas['y'].astype(str) + ']' 

temp_out = output_areas[[oa_id_column, 'ctrs','area_km2']]
temp_out.rename(columns={oa_id_column:'geo_code', 'ctrs':'centroid'},inplace=True)

temp_out.to_csv(db_dir+'/processed/oa_gdf.csv.gz')

oa2centroid = {}
for i,row in temp_out.iterrows():
    row['centroid'] = literal_eval(row['centroid'])
    oa2centroid[str(row['geo_code'])] = row['centroid']
    
with open(db_dir+'/processed/oa2centroid.pkl', 'wb') as handle:
    pickle.dump(oa2centroid, handle)

output_areas.drop(columns=[oa_geometry], inplace=True)
output_areas.rename(columns={'centroid':oa_geometry},inplace=True)

mapping = geopandas.sjoin(output_areas, tessellation, how="inner", op="within")
try:
    mapping.drop(columns=['index_right'],inplace=True)
except:
    pass

flows = pd.read_csv(db_dir+'/flows.csv', dtype={flow_origin_column:str, flow_destination_column:str, flow_flows_column:int})
flows = flows[[flow_origin_column, flow_destination_column, flow_flows_column]]

flows.rename(columns={flow_origin_column:'residence', flow_destination_column:'workplace', flow_flows_column:'commuters'},inplace=True)
flows.to_csv(db_dir+'/processed/flows_oa.csv.zip')

od2flow = {}
for i,row in flows.iterrows():
    od2flow[(row['residence'],row['workplace'])] = row['commuters']
    
with open(db_dir+'/processed/od2flow.pkl', 'wb') as handle:
    pickle.dump(od2flow, handle)

features = pd.read_csv(db_dir+'/features.csv', dtype={oa_id_column:str})

oa2features = {}
for i,row in features.iterrows():
    oa2features[row[1]]=list(row[2:].values)
with open(db_dir+'/processed/oa2features.pkl', 'wb') as handle:
    pickle.dump(oa2features, handle)

Generating the processed files - it may take a while....
Reading tessellation....
Reading output areas....
Reading features....
Mapping output areas with tessellation....



  output_areas['centroid'] = output_areas[oa_geometry].centroid

  output_areas["area_km2"] = output_areas[oa_geometry].area/ 10**6
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  temp_out.rename(columns={oa_id_column:'geo_code', 'ctrs':'centroid'},inplace=True)
  if await self.run_code(code, result, async_=asy):


In [27]:
tileid2oa2handmade_features = dict()
for i,row in mapping.iterrows():
    if row[tile_id_column] not in tileid2oa2handmade_features:
        tileid2oa2handmade_features[row[tile_id_column]] = dict()
        tileid2oa2handmade_features[row[tile_id_column]][row[oa_id_column]]=dict()
    else:
        tileid2oa2handmade_features[row[tile_id_column]][row[oa_id_column]]=dict()

In [28]:
tileid2oa2handmade_features

{'169': {0: {},
  9: {},
  20: {},
  36: {},
  52: {},
  82: {},
  124: {},
  128: {},
  135: {},
  154: {},
  188: {},
  190: {},
  201: {},
  240: {},
  249: {},
  250: {},
  253: {},
  264: {},
  266: {},
  289: {},
  290: {},
  302: {},
  325: {},
  343: {},
  380: {},
  385: {},
  397: {},
  406: {},
  413: {},
  415: {},
  421: {},
  438: {},
  449: {},
  454: {},
  462: {},
  474: {},
  479: {},
  513: {},
  519: {},
  526: {},
  529: {},
  530: {},
  539: {},
  567: {},
  571: {},
  585: {},
  613: {},
  622: {},
  625: {},
  631: {},
  660: {},
  662: {},
  683: {},
  732: {},
  735: {}},
 '157': {1: {},
  8: {},
  10: {},
  13: {},
  14: {},
  28: {},
  29: {},
  42: {},
  43: {},
  59: {},
  70: {},
  78: {},
  85: {},
  88: {},
  97: {},
  101: {},
  119: {},
  121: {},
  127: {},
  145: {},
  146: {},
  150: {},
  160: {},
  166: {},
  183: {},
  185: {},
  194: {},
  211: {},
  214: {},
  224: {},
  225: {},
  226: {},
  235: {},
  237: {},
  242: {},
  260: {},
  297: {}

In [None]:
for i,row in features.iterrows():
    for item in zip(list(row.keys()),row.values):
        tileid2oa2handmade_features[row[tile_id_column]][item[0]]=[item[1]]

with open('tileid2oa2handmade_features_retry.json', 'w') as f:
    json.dump(tileid2oa2handmade_features, f)

In [36]:
features.keys()

Index(['Unnamed: 0', 'GEOID', '0', '1', '2', '3', '4', '5'], dtype='object')

In [37]:
features = features[['GEOID', '0', '1', '2', '3', '4', '5']]

In [38]:
features

Unnamed: 0,GEOID,0,1,2,3,4,5
3,3,1486,2140,16.507081,34.560000,264.960000,9.213950
4,4,2319,3339,5.091071,4.320000,7.200000,13.233970
5,5,904,1302,2.176420,1.440000,1.440000,11.088848
6,6,7961,11464,9.481780,27.360000,427.680000,16.939439
7,7,7960,1566,19.119324,71.016915,271.083956,25.035740
...,...,...,...,...,...,...,...
746,746,602,867,9.124699,31.680000,18.720000,11.796066
747,747,6537,9413,8.847477,30.240000,25.920000,25.567458
748,748,569,819,7.061171,28.800000,21.600000,12.002166
749,749,2730,3931,7.732232,21.600000,48.960000,10.534842


In [53]:
for i,row in features.iterrows():
    for item in zip(list(row.keys()),row.values):
        if row[oa_id_column] in tileid2oa2handmade_features.keys():
            tileid2oa2handmade_features[row[oa_id_column]][item[0]]=[item[1]]

In [None]:

with open('tileid2oa2handmade_features_retry.json', 'w') as f:
    json.dump(tileid2oa2handmade_features, f)

In [54]:
tileid2oa2handmade_features

{'169': {0: {},
  9: {},
  20: {},
  36: {},
  52: {},
  82: {},
  124: {},
  128: {},
  135: {},
  154: {},
  188: {},
  190: {},
  201: {},
  240: {},
  249: {},
  250: {},
  253: {},
  264: {},
  266: {},
  289: {},
  290: {},
  302: {},
  325: {},
  343: {},
  380: {},
  385: {},
  397: {},
  406: {},
  413: {},
  415: {},
  421: {},
  438: {},
  449: {},
  454: {},
  462: {},
  474: {},
  479: {},
  513: {},
  519: {},
  526: {},
  529: {},
  530: {},
  539: {},
  567: {},
  571: {},
  585: {},
  613: {},
  622: {},
  625: {},
  631: {},
  660: {},
  662: {},
  683: {},
  732: {},
  735: {},
  'GEOID': ['169'],
  '0': [780],
  '1': [1123],
  '2': [4.501573275],
  '3': [7.2],
  '4': [7.2],
  '5': [14.22810209]},
 '157': {1: {},
  8: {},
  10: {},
  13: {},
  14: {},
  28: {},
  29: {},
  42: {},
  43: {},
  59: {},
  70: {},
  78: {},
  85: {},
  88: {},
  97: {},
  101: {},
  119: {},
  121: {},
  127: {},
  145: {},
  146: {},
  150: {},
  160: {},
  166: {},
  183: {},
  185: {}