# Prototype workflow generating input for deployment of Demoland for a custom area in England

Requires:

- Area of interest defined in a GDAL-readable file. All geometries present in the file are considered a part of the AOI.
- GTFS data file

1. Get the extent of AoI

In [1]:
import zipfile
from itertools import product
import datetime as dt

import geopandas as gpd
import h3
import pandas as pd
import requests
import shapely
import xarray as xr
import numpy as np
import joblib
import tracc
from libpysal import graph

import demoland_engine
from demoland_engine.indicators import Accessibility, Model

from r5py import TransportNetwork, TravelTimeMatrixComputer, TransportMode

https://scikit-learn.org/stable/model_persistence.html#security-maintainability-limitations
https://scikit-learn.org/stable/model_persistence.html#security-maintainability-limitations


In [2]:
aoi = gpd.read_file("https://github.com/Urban-Analytics-Technology-Platform/demoland-web/raw/main/web/src/data/geography.json")

In [3]:
aoi_poly = aoi.to_crs(27700).unary_union

2. Get H3 grid with the data for the AoI

In [4]:
data_folder = "../../../demoland_data"

Read the full grid

In [5]:
grid = gpd.read_parquet(f"{data_folder}/h3/grid_complete.parquet")

Get a portion of the grid covering AoI.

In [6]:
grid_aoi = grid.iloc[grid.sindex.query(aoi_poly, predicate="intersects")]
grid_aoi[["lat", "lon"]] = pd.DataFrame(grid_aoi.index.to_series().apply(h3.h3_to_geo).tolist(), columns=["lat", "lon"], index=grid_aoi.index)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  super().__setitem__(key, value)
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  super().__setitem__(key, value)


3. Make predictive models ready

Read the full matrix and subset it for AOI

In [7]:
matrix = graph.read_parquet(f"{data_folder}/h3/grid_adjacency_binary.parquet").transform("r")

In [8]:
matrix_aoi = matrix.subgraph(grid_aoi.index)

In [13]:
matrix_aoi.to_parquet("matrix.parquet")

5. Make accessibility ready
    6. Get GTFS
  
Go to https://data.bus-data.dft.gov.uk/downloads/, register and download timetable data for your region in GTFS data format.

In [7]:
gtfs_data_file = f"{data_folder}/raw/accessibility/itm_north_east_gtfs.zip"

7. Get network from OSM

Download a fresh OSM snapshot for England.

In [14]:
r = requests.get('http://download.geofabrik.de/europe/united-kingdom/england-latest.osm.pbf')
with open("england-latest.osm.pbf", "wb") as f:
    f.write(r.content)

Extract the AoI. We need a GeoJSON of the area.

In [8]:
aoi.dissolve().to_file("aoi.geojson")

And then can use osmium to get an extract.

In [16]:
!osmium extract -p aoi.geojson england-latest.osm.pbf -o aoi.osm.pbf



8. Get OS Greenspace

In [30]:
r = requests.get('https://api.os.uk/downloads/v1/products/OpenGreenspace/downloads?area=GB&format=GeoPackage&redirect')
with open("opgrsp_gpkg_gb.zip", "wb") as f:
    f.write(r.content)

Read the file.

In [9]:
with zipfile.ZipFile('opgrsp_gpkg_gb.zip', 'r') as zip_ref:
    with zip_ref.open("Data/opgrsp_gb.gpkg") as gsp:
        f = gsp.read()
        greenspace_sites = gpd.read_file(f, engine="pyogrio", layer="greenspace_site")
        greenspace_access = gpd.read_file(f, engine="pyogrio", layer="access_point")

  result = ogr_read(
  result = ogr_read(


Extract the AoI

In [10]:
greenspace_sites_aoi = greenspace_sites.iloc[greenspace_sites.sindex.query(aoi_poly, predicate="intersects")]
greenspace_access_aoi = greenspace_access.iloc[greenspace_access.sindex.query(aoi_poly, predicate="intersects")]

9. Process OS Greenspace

In [11]:
greenspace_sites_select = greenspace_sites_aoi.query(
    "function!='Allotments Or Community Growing Spaces' & function!='Golf Course' & function!='Bowling Green'"
)
publicpark = greenspace_sites_select.query("function=='Public Park Or Garden'")
playingfield = greenspace_sites_select.query("function=='Playing Field'")
othersport = greenspace_sites_select.query("function=='Other Sports Facility'")
therest = greenspace_sites_select.query(
    "function!='Playing Field' & function!='Public Park Or Garden' & function!='Other Sports Facility'"
)

In [12]:
# find 'therest' not included in the upper categories
# we use sjoin to performe a spatial filter of 'therest' polygons contained in upper categories
join11 = gpd.sjoin(therest, othersport, predicate="within", how="inner")
join12 = gpd.sjoin(therest, playingfield, predicate="within", how="inner")
join13 = gpd.sjoin(therest, publicpark, predicate="within", how="inner")

# generate list of the IDs of 'therest' contained in upper categories, in order to eliminate the corresponding polygons from the layer
list_for_diff11 = join11["id_left"].drop_duplicates().to_list()

diff11 = therest[
    ~therest.id.isin(list_for_diff11)
]  # 1st difference layer # note the negation character ~ to take the polygons NOT included

list_for_diff12 = join12["id_left"].drop_duplicates().to_list()
diff12 = diff11[~diff11.id.isin(list_for_diff12)]  # 2nd difference layer

list_for_diff13 = join13["id_left"].drop_duplicates().to_list()
diff13 = diff12[
    ~diff12.id.isin(list_for_diff13)
]  # 3rd difference layer, this is for 'therest' categories

In [13]:
# we repeat the same operation for subsequent categories:
# find 'othersport' not included in the upper categories
join21 = gpd.sjoin(othersport, playingfield, predicate="within", how="inner")
join22 = gpd.sjoin(othersport, publicpark, predicate="within", how="inner")

list_for_diff21 = join21["id_left"].drop_duplicates().to_list()
diff21 = othersport[~othersport.id.isin(list_for_diff21)]

list_for_diff22 = join22["id_left"].drop_duplicates().to_list()
diff22 = diff21[~diff21.id.isin(list_for_diff22)]  # 'othersport' difference

In [14]:
# find 'playing fields' not included in the upper categories (and viceversa?)
join31 = gpd.sjoin(playingfield, publicpark, predicate="within", how="inner")
join32 = gpd.sjoin(
    publicpark, playingfield, predicate="within", how="inner"
)  ## check it is not empty ... it is empty, we do not use this join

list_for_diff31 = join31["id_left"].drop_duplicates().to_list()
diff31 = playingfield[
    ~playingfield.id.isin(list_for_diff31)
]  # 'playingfield' difference

In [15]:
# put together all the differences layers: (and should bring out to the desired output)
together1 = pd.concat([diff13, diff22]).pipe(
    gpd.GeoDataFrame
)  # 'therest' + 'othersport' differences
together1.head()
together2 = pd.concat([together1, diff31]).pipe(
    gpd.GeoDataFrame
)  # last gdf + 'playingfield' difference
together_again = gpd.GeoDataFrame(pd.concat([together2, publicpark]), crs=27700)  # last gdf + all the public parks)

In [16]:
list_gs_id = together_again["id"].to_list()
accesspoints_edge = greenspace_access_aoi[greenspace_access_aoi.ref_to_greenspace_site.isin(list_gs_id)]
accesspoints_edge = accesspoints_edge.to_crs(27700)

together_again["area_m2"] = together_again["geometry"].area

together_again.to_file("greenspace.gpkg", layer="sites")
accesspoints_edge.to_file("greenspace.gpkg", layer="access_points")

10. Create traveltime matrix (origins are cells, destinations are cells plus greenspace entrances)

In [34]:
origins = grid_aoi.set_geometry(grid_aoi.centroid).to_crs(4326)
origins["id"] = origins.index

In [24]:
destinations = pd.concat(
    [
        origins[["id", "geometry"]],
        accesspoints_edge[["id", "geometry", "ref_to_greenspace_site"]].to_crs(4326),
    ],
    ignore_index=True
)

In [26]:
transport_network = TransportNetwork("aoi.osm.pbf", [gtfs_data_file])

generate dataframe with all from_id and all to_id pairs

In [62]:
prod = product(origins["id"].unique(), destinations["id"].unique())
empty_ttm = pd.DataFrame(prod)
empty_ttm.columns = ["from_id", "to_id"]

In [63]:
# defining variables
date_time = "2023,11,23,9,30"  # CHOOSE BEST DATE/TIME
max_time = dt.timedelta(seconds=900) # SET TO 15 MIN
walking_speed = 4.8
cycling_speed = 16

dataframe to match legmode and transitmode objects (to be inputted in the ttm computer)

In [64]:
modes_lut = pd.DataFrame(
    [
        ["transit", TransportMode.CAR, TransportMode.WALK],
        ["car", "", TransportMode.CAR],
        ["bicycle", "", TransportMode.BICYCLE],
        ["walk", "", TransportMode.WALK],
    ],
    columns=("Mode", "Transit_mode", "Leg_mode"),
)

In [65]:
# function to generate custom list of transit+transport mode for the parameter transport_modes in TravelTimeMatrixComputer
def list_making(s, z):
    if s:
        return [s] + [z]
    return [z]

In [66]:
ttm_complete = empty_ttm.copy()

# loop to compute a ttm for all the modes and generate one single ttm table in output
for row in modes_lut.itertuples():
    start_time = dt.datetime.now()
    mode = row.Mode
    transit_mode = row.Transit_mode
    leg_mode = row.Leg_mode
    transport_mode = list_making(
        transit_mode, leg_mode
    )  # creating list of objects for transport_modes parameter

    print(
        "The current mode is:",
        mode,
        ", transit is:",
        transit_mode,
        ", transport var is:",
        transport_mode,
    )
    ttm_computer = TravelTimeMatrixComputer(
        transport_network,
        origins=origins,
        destinations=destinations,
        departure=dt.datetime.strptime(date_time, "%Y,%m,%d,%H,%M"),
        max_time = max_time,
        speed_walking=walking_speed,
        speed_cycling=cycling_speed,
        transport_modes=transport_mode,
    )

    ttm = ttm_computer.compute_travel_times()
    ttm = ttm.rename(
        columns={"travel_time": f"time_{mode}"}
    )  # renaming 'travel_time' column (automatically generated) to 'time_{mode of transport}'
    #  merging the empty table generated before (with all possible origins and destinations) with the ttm, per each mode adding a travel time column
    ttm_complete = ttm_complete.merge(
        ttm, how="outer", left_on=["from_id", "to_id"], right_on=["from_id", "to_id"]
    )

    print("finished calculating ttm for mode", mode)
    end_time = dt.datetime.now()
    print("Duration for", mode, ": {}".format(end_time - start_time))

The current mode is: transit , transit is: TransportMode.CAR , transport var is: [<TransportMode.CAR: 'CAR'>, <TransportMode.WALK: 'WALK'>]
finished calculating ttm for mode transit
Duration for transit : 0:09:07.266543
The current mode is: car , transit is:  , transport var is: [<TransportMode.CAR: 'CAR'>]
finished calculating ttm for mode car
Duration for car : 0:08:44.851404
The current mode is: bicycle , transit is:  , transport var is: [<TransportMode.BICYCLE: 'BICYCLE'>]
finished calculating ttm for mode bicycle
Duration for bicycle : 0:09:46.095740
The current mode is: walk , transit is:  , transport var is: [<TransportMode.WALK: 'WALK'>]
finished calculating ttm for mode walk
Duration for walk : 0:10:16.975145


In [67]:
ttm_complete.to_parquet(f"ttm_complete.parquet")

Wrap to a demoland_engine accessibility

In [2]:
ttm_complete = pd.read_parquet("ttm_complete.parquet")

In [3]:
ttm = ttm_complete.set_index(["from_id", "to_id"])
ttm.columns = ["transit", "car", "bike", "walk"]
ttm.columns.name = "mode"
ttm_arr = xr.DataArray.from_series(ttm.stack())
ttm_15 = ttm_arr <= 15
ttm_15.name = "ttm_15"

In [13]:
grid_aoi.columns

Index(['geometry', 'air_quality_index', 'house_price_index', 'population',
       'A, B, D, E. Agriculture, energy and water', 'C. Manufacturing',
       'F. Construction', 'G, I. Distribution, hotels and restaurants',
       'H, J. Transport and communication',
       'K, L, M, N. Financial, real estate, professional and administrative activities',
       'O,P,Q. Public administration, education and health',
       'R, S, T, U. Other', 'Land cover [Discontinuous urban fabric]',
       'Land cover [Continuous urban fabric]',
       'Land cover [Non-irrigated arable land]',
       'Land cover [Industrial or commercial units]',
       'Land cover [Green urban areas]', 'Land cover [Pastures]',
       'Land cover [Sport and leisure facilities]', 'sdbAre', 'sdbCoA',
       'ssbCor', 'ssbCCM', 'ssbCCD', 'sdcAre', 'sscCCo', 'sscERI', 'sicCAR',
       'stbCeA', 'mtbAli', 'mtbNDi', 'mtcWNe', 'ltbIBD', 'sdsSPW', 'sdsSWD',
       'sdsSPO', 'sdsLen', 'sssLin', 'ldsMSL', 'mtdDeg', 'linP3W', 'linP4W

In [15]:
wpz_population = grid_aoi[[
    'A, B, D, E. Agriculture, energy and water', 'C. Manufacturing',
    'F. Construction', 'G, I. Distribution, hotels and restaurants',
    'H, J. Transport and communication',
    'K, L, M, N. Financial, real estate, professional and administrative activities',
    'O,P,Q. Public administration, education and health',
    'R, S, T, U. Other'
]].sum(axis=1)
wpz_population.index.name = "to_id"

In [17]:
da = xr.DataArray.from_series(wpz_population)
da.name = "wpz_population"
baseline = xr.merge([ttm_15, da])
baseline["wpz_population"] = baseline["wpz_population"].fillna(0)

In [18]:
baseline

Load greenspace data.

In [28]:
gs_sites = gpd.read_file("greenspace.gpkg", layer="sites").rename(columns={"id": "id_site"})
gs_entrances = gpd.read_file("greenspace.gpkg", layer="access_points").rename(columns={"id": "id_entrance"})

In [32]:
# associate park area to entrances
gs_entrances_with_parkarea = pd.merge(
    gs_entrances[["id_entrance", "ref_to_greenspace_site"]],
    gs_sites[["id_site", "function", "area_m2"]],
    left_on="ref_to_greenspace_site",
    right_on="id_site",
    how="right",
)

In [36]:
ttm_greenspace = (
    ttm_complete.copy()
)  # saving a copy of the matrix (the following operations will add columns to it, but we want to keep the original one also)

ttm_gs_with_area = pd.merge(
    ttm_greenspace,
    gs_entrances_with_parkarea[["id_entrance", "ref_to_greenspace_site", "area_m2"]],
    left_on="to_id",
    right_on="id_entrance",
    how="left",
)
# generate tracc cost object
ttm_gs_tracc = tracc.costs(ttm_gs_with_area)

modes_list = ["transit", "car", "bicycle", "walk"]

# empty dataframes to be filled up in the next for loop
acc_pot_gs = origins[["id"]]
gs_acc = []

for m in modes_list:
    # generate variable names to be used in the tracc function below
    cost_name = "time_" + m
    travel_costs_ids = ["from_id", "to_id"]
    impedence_param = 15  # value for impedence function, to be changed as needed
    impedence_param_string = str(impedence_param)
    # name of the column
    cost_output = (
        "cum_" + impedence_param_string + "_" + m
    )  # naming depends on impedence function threshold
    area_column_name = "area_" + impedence_param_string + "_" + m
    acc_column_name = (
        "pot_cum_acc_" + impedence_param_string + "_" + m
    )  # naming depends on impedence function threshold
    opportunity = "pop"
    # Computing impedence function based on a 15 minute travel time threshold.
    ttm_gs_tracc.impedence_calc(
        cost_column=cost_name,
        impedence_func="cumulative",
        impedence_func_params=impedence_param,  # to calculate opportunities in X min threshold
        output_col_name=cost_output,
        prune_output=False,
    )
    ttm_gs_df = ttm_gs_tracc.data
    # Setting up the accessibility object. This includes joining the destination data to the travel time data
    # this needed to be done differently for greenspace, as opportunity is sites's area cumulative sum
    # A. Filtering only rows with time travel within the threshold
    print("cost output is", cost_output)
    print("area column name is", area_column_name)
    # tracc_15min = ttm_gs_tracc.data[ttm_gs_tracc.data.loc[:,cost_output]==1] # this doesn't work because of the different lenghts of the columns generated per mode
    ttm_gs_tracc.data[area_column_name] = (
        ttm_gs_tracc.data["area_m2"] * ttm_gs_tracc.data[cost_output]
    )
    ttm_gs_df = ttm_gs_tracc.data

    # B. Filter entrances (only one per park)
    oneaccess_perpark = ttm_gs_df.sort_values(cost_name).drop_duplicates(
        ["from_id", "ref_to_greenspace_site"]
    )
    # C. Assign metric as sum[parks' area]
    # generate df with one row per OA centroid ('from_id') and sum of sites' areas - per each mode
    gs_metric_per_mode = oneaccess_perpark.groupby(["from_id"])[
        area_column_name
    ].sum()  # .reset_index()
    gs_acc.append(gs_metric_per_mode)
gs_acc = pd.concat(gs_acc, axis=1)

cost output is cum_15_transit
area column name is area_15_transit
cost output is cum_15_car
area column name is area_15_car
cost output is cum_15_bicycle
area column name is area_15_bicycle
cost output is cum_15_walk
area column name is area_15_walk


In [37]:
gs_acc.to_parquet(
    f"acc_greenspace_allmodes_15min.parquet"
)

In [38]:
gs_acc.columns = ["transit", "car", "bike", "walk"]
greenspace = xr.DataArray.from_series(gs_acc.stack()).rename(
    {"level_1": "mode"}
)
greenspace.name = "green_accessibility"

In [39]:
baseline = xr.merge([baseline, greenspace])
baseline["green_accessibility"] = baseline["green_accessibility"].fillna(0)

Create demoland class

In [40]:
acc = Accessibility(baseline)

In [69]:
with open(f"accessibility.joblib", "wb") as f:
    joblib.dump(acc, f, compress=True)

12. Generate files for the app

Geography

In [15]:
grid_aoi.geometry.to_crs(4326).to_file("geography.json")

Baseline

In [9]:
with open(f"{data_folder}/h3/house_price_model.joblib", "rb") as f:
    hp_model = joblib.load(f)
with open(f"{data_folder}/h3/air_quality_model.joblib", "rb") as f:
    aq_model = joblib.load(f)

hp = Model(matrix_aoi, hp_model)
aq = Model(matrix_aoi, aq_model)

In [11]:
baseline_hp = hp.predict(grid_aoi.drop(columns=[
    "geometry",
    "air_quality_index",
    "house_price_index",
    "signature_type"
]))
baseline_aq = aq.predict(grid_aoi.drop(columns=[
    "geometry",
    "air_quality_index",
    "house_price_index",
    "signature_type"
]))

In [12]:
with open(f"accessibility.joblib", "rb") as f:
    acc = joblib.load(f)

In [27]:
oa = pd.Series(0, index=grid_aoi.index, name="oa")
oa.index.name = "to_id"

baseline_ja = acc.job_accessibility(oa, "walk")
baseline_ga = acc.greenspace_accessibility(oa, "walk")

In [38]:
mapping = {
        "Wild countryside": 0,
        "Countryside agriculture": 1,
        "Urban buffer": 2,
        "Warehouse/Park land": 3,
        "Open sprawl": 4,
        "Disconnected suburbia": 5,
        "Accessible suburbia": 6,
        "Connected residential neighbourhoods": 7,
        "Dense residential neighbourhoods": 8,
        "Gridded residential quarters": 9,
        "Dense urban neighbourhoods": 10,
        "Local urbanity": 11,
        "Regional urbanity": 12,
        "Metropolitan urbanity": 13,
        "Concentrated urbanity": 14,
        "Hyper concentrated urbanity": 15,
    }
baseline = pd.DataFrame(
    np.array([baseline_aq, baseline_hp, baseline_ja, baseline_ga]).T, 
    index=grid_aoi.index, 
    columns=[
        "air_quality",
        "house_price",
        "job_accessibility",
        "greenspace_accessibility",
    ])
baseline['signature_type'] = grid_aoi.signature_type.map(mapping)


In [40]:
baseline.to_json()

'{"air_quality":{"8919465366bffff":11.9983424398,"89194653477ffff":12.2891333322,"8919465343bffff":12.3517235049,"8919465340fffff":12.596727333,"89194653433ffff":12.155927087,"89194653677ffff":12.0905082676,"8919465363bffff":11.9131534864,"8919465346fffff":12.1389019829,"89194653467ffff":11.9340837186,"8919465346bffff":12.0935197114,"89194653463ffff":12.3582377303,"8919465347bffff":12.2059014529,"89194653473ffff":12.6540659949,"8919465344fffff":12.370791278,"89194653447ffff":12.7650721748,"891946536afffff":12.4432377639,"891946536a7ffff":12.2721504818,"891946536abffff":12.4897438107,"891946536a3ffff":12.4630446084,"891946536b7ffff":12.4379196493,"891946536bbffff":12.2064544528,"891946536b3ffff":11.959657578,"89194653687ffff":12.0698132524,"8919465344bffff":12.0806472093,"89194653633ffff":12.2104309573,"8919465360fffff":12.2974669257,"89194653607ffff":12.5356529282,"89194653603ffff":12.248261564,"89194653617ffff":12.474814224,"8919465361bffff":12.2843621878,"89194653613ffff":12.49590166