# Test spatial units in the modelling

This tests performance of different core spatial units (grids, OA, LSOA) in the modelling to figure out the best unit to be used internally. 

Units to compare:

- 2011 output areas (the data we have are linked to 2011 census, not 2021)
- square grid 100m (mimicking OSGB but not being aligned for simplicity of this exercise)
- H3 grid at resolution 9
- Enclosed tessellation cells

Targets to compare:

- Air pollution
- House price

Models to compare:

- distance band weights using fuzzy
- contiguity of order 5

Geographic locations to compare:

- Leeds (chunk 40)
- Newcastle (chunk 26)

The final models shall be trained on the England-wide data. This trains only on each AOI.

In [1]:
import pandas as pd
import geopandas as gpd
import xarray as xr
import tobler
import libpysal
import numpy as np
from itertools import product
import esda

from sklearn.ensemble import HistGradientBoostingRegressor
from sklearn import metrics
from tobler.util import h3fy

In [2]:
data_folder = "/Users/martin/Library/CloudStorage/OneDrive-SharedLibraries-TheAlanTuringInstitute/Daniel Arribas-Bel - demoland_data"

In [3]:
chunks = gpd.read_parquet(f"{data_folder}/raw/urban_morpho/local_auth_chunks.pq")

- get air
- for chunk
    - for geom
        - generate/load geoms
        - interpolate air
        - interpolate price
        - interpolate explanatory
        - save table
        - for weights
            - create weights
            - get lags
            - fit the model
            - save sklearn model
            - save demoland model
            - save some performance data

## Get Air Pollution data

In [4]:
pm10_21 = (
    pd.read_csv(
        "https://uk-air.defra.gov.uk/datastore/pcm/mappm102021g.csv",
        header=5,
        na_values=["MISSING"],
    )
    .set_index(["x", "y"])
    .drop(columns="gridcode")
    .to_xarray()
)
pm25_21 = (
    pd.read_csv(
        "https://uk-air.defra.gov.uk/datastore/pcm/mappm252021g.csv",
        header=5,
        na_values=["MISSING"],
    )
    .set_index(["x", "y"])
    .drop(columns="gridcode")
    .to_xarray()
)
no2_21 = (
    pd.read_csv(
        "https://uk-air.defra.gov.uk/datastore/pcm/mapno22021.csv",
        header=5,
        na_values=["MISSING"],
    )
    .set_index(["x", "y"])
    .drop(columns="gridcode")
    .to_xarray()
)
so2_21 = (
    pd.read_csv(
        "https://uk-air.defra.gov.uk/datastore/pcm/mapso22021.csv",
        header=5,
        na_values=["MISSING"],
    )
    .set_index(["x", "y"])
    .drop(columns="gridcode")
    .to_xarray()
)
pollutants_2021 = xr.merge([pm10_21, pm25_21, no2_21, so2_21])
aqi = (
    pollutants_2021.pm252021g
    + pollutants_2021.pm102021g / 2
    + pollutants_2021.no22021 / 4
    + pollutants_2021.so22021 / 10
)
pollutants_2021 = pollutants_2021.assign(aqi=aqi)

## Get House Price data

In [5]:
house_prices = gpd.read_parquet(f"{data_folder}/processed/house_prices/price_per_sqm_england.parquet")

## Get population data

In [6]:
pop = gpd.read_parquet(f"{data_folder}/processed/gb_population_estimates.pq")

## Get workplace data

In [7]:
wp = gpd.read_parquet(
    f"{data_folder}/raw/workplace_population/workplace_by_industry_gb.pq"
)

## Get CORINE data

In [8]:
corine = gpd.read_parquet(f"{data_folder}/raw/land_cover/corine_gb.pq")

In [9]:
corine_names = {
    "Code_18_124": "Land cover [Airports]",
    "Code_18_211": "Land cover [Non-irrigated arable land]",
    "Code_18_121": "Land cover [Industrial or commercial units]",
    "Code_18_421": "Land cover [Salt marshes]",
    "Code_18_522": "Land cover [Estuaries]",
    "Code_18_142": "Land cover [Sport and leisure facilities]",
    "Code_18_141": "Land cover [Green urban areas]",
    "Code_18_112": "Land cover [Discontinuous urban fabric]",
    "Code_18_231": "Land cover [Pastures]",
    "Code_18_311": "Land cover [Broad-leaved forest]",
    "Code_18_131": "Land cover [Mineral extraction sites]",
    "Code_18_123": "Land cover [Port areas]",
    "Code_18_122": "Land cover [Road and rail networks and associated land]",
    "Code_18_512": "Land cover [Water bodies]",
    "Code_18_243": "Land cover [Land principally occupied by agriculture, with significant areas of natural vegetation]",
    "Code_18_313": "Land cover [Mixed forest]",
    "Code_18_412": "Land cover [Peat bogs]",
    "Code_18_321": "Land cover [Natural grasslands]",
    "Code_18_322": "Land cover [Moors and heathland]",
    "Code_18_324": "Land cover [Transitional woodland-shrub]",
    "Code_18_111": "Land cover [Continuous urban fabric]",
    "Code_18_423": "Land cover [Intertidal flats]",
    "Code_18_523": "Land cover [Sea and ocean]",
    "Code_18_312": "Land cover [Coniferous forest]",
    "Code_18_133": "Land cover [Construction sites]",
    "Code_18_333": "Land cover [Sparsely vegetated areas]",
    "Code_18_332": "Land cover [Bare rocks]",
    "Code_18_411": "Land cover [Inland marshes]",
    "Code_18_132": "Land cover [Dump sites]",
    "Code_18_222": "Land cover [Fruit trees and berry plantations]",
    "Code_18_242": "Land cover [Complex cultivation patterns]",
    "Code_18_331": "Land cover [Beaches, dunes, sands]",
    "Code_18_511": "Land cover [Water courses]",
    "Code_18_334": "Land cover [Burnt areas]",
    "Code_18_244": "Land cover [Agro-forestry areas]",
    "Code_18_521": "Land cover [Coastal lagoons]",
}

Evaluation script

In [10]:
meta = {}

In [11]:
def fit_and_eval(geoms, place, geom_type):
    """Fit the model and evaluate each fold

    Parameters
    ----------
    geoms : GeoDataFrame
        gdf with everything
    place : str
        name of a place
    geom_type : str
        name of a geom type
    """
    meta[place][geom_type]["air"] = {}
    meta[place][geom_type]["hp"] = {}
    for loop in range(4):
        meta[place][geom_type]["air"][f"loop_{loop}"] = {}
        meta[place][geom_type]["hp"][f"loop_{loop}"] = {}

        # avoid special treatment for HP and AQ
        geoms["house_price_index"] = geoms["house_price_index"].replace(0, np.nan)
        geoms = geoms.dropna(subset="house_price_index")

        mask = geoms["split"]==loop
        train = geoms[~mask]
        test = geoms[mask]
        if geom_type == "oa":
            W_train = libpysal.weights.fuzzy_contiguity(train.reset_index(), buffering=True, buffer=2000)
            W_test = libpysal.weights.fuzzy_contiguity(test.reset_index(), buffering=True, buffer=2000)
        else:
            W_train = libpysal.weights.DistanceBand.from_dataframe(train.centroid.reset_index(), 2000)
            W_test = libpysal.weights.DistanceBand.from_dataframe(test.centroid.reset_index(), 2000)

        no_exvars =[
                "geometry",
                "air_quality_index",
                "house_price_index",
            ]
        exvars_train = train.drop(columns=no_exvars)
        exvars_test = test.drop(columns=no_exvars)

        W_train.transform = "r"
        W_test.transform = "r"
        for col in exvars_train.columns.copy():
            exvars_train[f"{col}_lag"] = libpysal.weights.spatial_lag.lag_spatial(W_train, exvars_train[col])
            exvars_test[f"{col}_lag"] = libpysal.weights.spatial_lag.lag_spatial(W_test, exvars_test[col])

        # Air pollution
        regressor_air = HistGradientBoostingRegressor(
            random_state=0, max_bins=64, max_iter=1000
        )

        regressor_air.fit(exvars_train, train.air_quality_index)
        pred = regressor_air.predict(exvars_test)
        residuals = test.air_quality_index - pred

        meta[place][geom_type]["air"][f"loop_{loop}"]["mse"] = metrics.mean_squared_error(test.air_quality_index, pred)
        meta[place][geom_type]["air"][f"loop_{loop}"]["me"] = residuals.abs().mean()
        meta[place][geom_type]["air"][f"loop_{loop}"]["r2"] = metrics.r2_score(test.air_quality_index, pred)
        moran_obs = esda.Moran(test.air_quality_index, W_test)
        moran_pred = esda.Moran(pred, W_test)
        meta[place][geom_type]["air"][f"loop_{loop}"]["moran_obs"] = moran_obs.I
        meta[place][geom_type]["air"][f"loop_{loop}"]["moran_pred"] = moran_pred.I


        # House price

        regressor_hp = HistGradientBoostingRegressor(
            random_state=0, max_bins=64, max_iter=1000
        )

        regressor_hp.fit(exvars_train, np.log(train.house_price_index))
        pred = regressor_air.predict(exvars_test)
        residuals = np.log(test.house_price_index) - pred

        meta[place][geom_type]["hp"][f"loop_{loop}"]["mse"] = metrics.mean_squared_error(np.log(test.house_price_index), pred)
        meta[place][geom_type]["hp"][f"loop_{loop}"]["me"] = residuals.abs().mean()
        meta[place][geom_type]["hp"][f"loop_{loop}"]["r2"] = metrics.r2_score(np.log(test.house_price_index), pred)
        moran_obs = esda.Moran(np.log(test.house_price_index), W_test)
        moran_pred = esda.Moran(pred, W_test)
        meta[place][geom_type]["hp"][f"loop_{loop}"]["moran_obs"] = moran_obs.I
        meta[place][geom_type]["hp"][f"loop_{loop}"]["moran_pred"] = moran_pred.I

## Leeds

#### Prep Leeds data

In [12]:
opt_id = 40
opt = "leeds"

chunk = chunks.loc[[opt_id]]
meta[opt] = {}

Prepare Air pollution data

In [13]:
bds = chunk.buffer(1000).total_bounds
pollutants_aoi = pollutants_2021.sel(
    x=slice(bds[0], bds[2]), y=slice(bds[1], bds[3])
)
pollutants_aoi_df = pollutants_aoi.to_dataframe().reset_index()
pollutants_aoi_df = gpd.GeoDataFrame(
    pollutants_aoi_df,
    geometry=gpd.points_from_xy(
        pollutants_aoi_df.x, pollutants_aoi_df.y, crs=27700
    ).buffer(500, cap_style=3),
)

Get CV splits

In [14]:
cv_ids = np.tile(np.arange(5), pollutants_aoi_df.shape[0]//5 + 1)[:pollutants_aoi_df.shape[0]]
rng = np.random.default_rng()
rng.shuffle(cv_ids)
pollutants_aoi_df["split"] = cv_ids

Prepare house price data

In [15]:
house_prices_aoi = house_prices.iloc[house_prices.sindex.query(chunk.geometry.item())]

Prepare population data

In [16]:
pop_aoi = pop[pop.code.isin(house_prices_aoi.code)]

Link population and house price (both are on OA).

In [17]:
pop_hp = house_prices_aoi.merge(pop_aoi[["code", "population"]], on="code")

 Prepare workplace pop data

In [18]:
wp_aoi = wp.iloc[wp.sindex.query(chunk.geometry.item())]

Prepare CORINE data

In [19]:
corine_aoi = corine.iloc[corine.sindex.query(chunk.geometry.item())]

Get morphometric data

In [20]:
data = gpd.read_parquet(f"{data_folder}/raw/urban_morpho/cells_{opt_id}.pq")

### H3
Create geometries

In [21]:
meta[opt]["h3"] = {}

In [22]:
geoms = h3fy(chunk, resolution=9, buffer=False)

  proj = self._crs.to_proj4(version=version)


Interpolate Air Quality

In [23]:
interp = tobler.area_weighted.area_interpolate(pollutants_aoi_df, geoms, intensive_variables=["aqi"])
geoms["air_quality_index"] = interp.aqi.values

 Interpolate OA data

In [24]:
interp_oa = tobler.area_weighted.area_interpolate(
    pop_hp,
    geoms,
    intensive_variables=["priceper"],
    extensive_variables=["population"],
)
geoms["house_price_index"] = interp_oa.priceper.values
geoms["population"] = interp_oa.population.values

Interpolate workplace population

In [25]:
wp_interpolated = tobler.area_weighted.area_interpolate(
    wp_aoi,
    geoms,
    extensive_variables=[
        "A, B, D, E. Agriculture, energy and water",
        "C. Manufacturing",
        "F. Construction",
        "G, I. Distribution, hotels and restaurants",
        "H, J. Transport and communication",
        "K, L, M, N. Financial, real estate, professional and administrative activities",
        "O,P,Q. Public administration, education and health",
        "R, S, T, U. Other",
    ],
)

geoms[
    [
        "A, B, D, E. Agriculture, energy and water",
        "C. Manufacturing",
        "F. Construction",
        "G, I. Distribution, hotels and restaurants",
        "H, J. Transport and communication",
        "K, L, M, N. Financial, real estate, professional and administrative activities",
        "O,P,Q. Public administration, education and health",
        "R, S, T, U. Other",
    ]
] = wp_interpolated.drop(columns="geometry").values

Interpolate CORINE

In [26]:
corine_interpolated = tobler.area_weighted.area_interpolate(
    corine_aoi, geoms, categorical_variables=["Code_18"]
)
corine_interpolated.columns = corine_interpolated.columns.map(corine_names)
interesting = [
    "Land cover [Discontinuous urban fabric]",
    "Land cover [Continuous urban fabric]",
    "Land cover [Non-irrigated arable land]",
    "Land cover [Industrial or commercial units]",
    "Land cover [Green urban areas]",
    "Land cover [Pastures]",
    "Land cover [Sport and leisure facilities]",
]
geoms[interesting] = corine_interpolated[interesting].values

Interpolate morphometrics

In [27]:
chars = data.columns.drop(
    [
        "hindex",
        "tessellation",
        "buildings",
        "nodeID",
        "edgeID_keys",
        "edgeID_values",
        "edgeID_primary",
        "sdbPer",
        "ssbElo",
        "stcOri",
        "sdcLAL",
        "mdcAre",
        "ltcAre",
        "ltcWRE",
        "mtdMDi",
        "lcdMes",
        "lddNDe",
        "sddAre",
        "mdsAre",
        "ldsAre",
        "lisCel",
        "ldePer",
        "lseCWA",
    ]
)
morhp_interpolated = tobler.area_weighted.area_interpolate(
    data, geoms, intensive_variables=chars.tolist()
)

geoms[morhp_interpolated.columns.drop("geometry")] = morhp_interpolated.drop(
    columns="geometry"
).values

  warn(f"nan values in variable: {column}, replacing with 0")
  warn(f"nan values in variable: {column}, replacing with 0")
  warn(f"nan values in variable: {column}, replacing with 0")
  warn(f"nan values in variable: {column}, replacing with 0")
  warn(f"nan values in variable: {column}, replacing with 0")
  warn(f"nan values in variable: {column}, replacing with 0")
  warn(f"nan values in variable: {column}, replacing with 0")
  warn(f"nan values in variable: {column}, replacing with 0")
  warn(f"nan values in variable: {column}, replacing with 0")
  warn(f"nan values in variable: {column}, replacing with 0")
  warn(f"nan values in variable: {column}, replacing with 0")
  warn(f"nan values in variable: {column}, replacing with 0")
  warn(f"nan values in variable: {column}, replacing with 0")
  warn(f"nan values in variable: {column}, replacing with 0")


Get split IDs

In [28]:
geoms_ix, poll_ix = pollutants_aoi_df.sindex.query(geoms.centroid, predicate="within", sort=True)

In [29]:
geoms["split"] = pollutants_aoi_df["split"].values[poll_ix]

Save the table

In [30]:
geoms.to_parquet(f"{data_folder}/unit_test/tables/h3_{opt}.pq")

Loop over splits, get W, train, eval, save results.

In [31]:
fit_and_eval(geoms, opt, "h3")

 There are 16 disconnected components.
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  super().__setitem__(key, value)
 There are 15 disconnected components.
 There are 11 disconnected components.
 There are 10 disconnected components.


In [32]:
meta

{'leeds': {'h3': {'air': {'loop_0': {'mse': 1.0215962668214376,
     'me': 0.7268292438504929,
     'r2': 0.6471124647540352,
     'moran_obs': 0.9139658896608336,
     'moran_pred': 0.8911782193252566},
    'loop_1': {'mse': 0.9052719580939292,
     'me': 0.7173481454198483,
     'r2': 0.7214704719652031,
     'moran_obs': 0.9382819221632059,
     'moran_pred': 0.9108587337675578},
    'loop_2': {'mse': 1.189713977353201,
     'me': 0.7906349848032027,
     'r2': 0.6480546399551688,
     'moran_obs': 0.9270422555656327,
     'moran_pred': 0.9186198786505408},
    'loop_3': {'mse': 0.9411979683024067,
     'me': 0.7241649062582691,
     'r2': 0.7489344824312717,
     'moran_obs': 0.9386609332461856,
     'moran_pred': 0.9026754574770531}},
   'hp': {'loop_0': {'mse': 45.97914682149453,
     'me': 6.607405179022031,
     'r2': -818.245515704516,
     'moran_obs': 0.7485176481450501,
     'moran_pred': 0.8911782193252566},
    'loop_1': {'mse': 48.6391071221501,
     'me': 6.802131639333

### square grid
Create geometries

In [33]:
meta["leeds"]["square"] = {}

In [34]:
coords = np.array(
    list(product(
        np.arange(bds[0], bds[2], 250),
        np.arange(bds[1], bds[3], 250),
    ))
)
points = gpd.GeoSeries.from_xy(coords[:, 0], coords[:, 1], crs=27700)
geoms = points.iloc[points.sindex.query(chunk.geometry.item(), predicate="contains")].buffer(100, cap_style=3).to_frame("geometry")

Interpolate Air Quality

In [35]:
interp = tobler.area_weighted.area_interpolate(pollutants_aoi_df, geoms, intensive_variables=["aqi"])
geoms["air_quality_index"] = interp.aqi.values

 Interpolate OA data

In [36]:
interp_oa = tobler.area_weighted.area_interpolate(
    pop_hp,
    geoms,
    intensive_variables=["priceper"],
    extensive_variables=["population"],
)
geoms["house_price_index"] = interp_oa.priceper.values
geoms["population"] = interp_oa.population.values

Interpolate workplace population

In [37]:
wp_interpolated = tobler.area_weighted.area_interpolate(
    wp_aoi,
    geoms,
    extensive_variables=[
        "A, B, D, E. Agriculture, energy and water",
        "C. Manufacturing",
        "F. Construction",
        "G, I. Distribution, hotels and restaurants",
        "H, J. Transport and communication",
        "K, L, M, N. Financial, real estate, professional and administrative activities",
        "O,P,Q. Public administration, education and health",
        "R, S, T, U. Other",
    ],
)

geoms[
    [
        "A, B, D, E. Agriculture, energy and water",
        "C. Manufacturing",
        "F. Construction",
        "G, I. Distribution, hotels and restaurants",
        "H, J. Transport and communication",
        "K, L, M, N. Financial, real estate, professional and administrative activities",
        "O,P,Q. Public administration, education and health",
        "R, S, T, U. Other",
    ]
] = wp_interpolated.drop(columns="geometry").values

Interpolate CORINE

In [38]:
corine_interpolated = tobler.area_weighted.area_interpolate(
    corine_aoi, geoms, categorical_variables=["Code_18"]
)
corine_interpolated.columns = corine_interpolated.columns.map(corine_names)
interesting = [
    "Land cover [Discontinuous urban fabric]",
    "Land cover [Continuous urban fabric]",
    "Land cover [Non-irrigated arable land]",
    "Land cover [Industrial or commercial units]",
    "Land cover [Green urban areas]",
    "Land cover [Pastures]",
    "Land cover [Sport and leisure facilities]",
]
geoms[interesting] = corine_interpolated[interesting].values

Interpolate morphometrics

In [39]:
chars = data.columns.drop(
    [
        "hindex",
        "tessellation",
        "buildings",
        "nodeID",
        "edgeID_keys",
        "edgeID_values",
        "edgeID_primary",
        "sdbPer",
        "ssbElo",
        "stcOri",
        "sdcLAL",
        "mdcAre",
        "ltcAre",
        "ltcWRE",
        "mtdMDi",
        "lcdMes",
        "lddNDe",
        "sddAre",
        "mdsAre",
        "ldsAre",
        "lisCel",
        "ldePer",
        "lseCWA",
    ]
)
morhp_interpolated = tobler.area_weighted.area_interpolate(
    data, geoms, intensive_variables=chars.tolist()
)

geoms[morhp_interpolated.columns.drop("geometry")] = morhp_interpolated.drop(
    columns="geometry"
).values

  warn(f"nan values in variable: {column}, replacing with 0")
  warn(f"nan values in variable: {column}, replacing with 0")
  warn(f"nan values in variable: {column}, replacing with 0")
  warn(f"nan values in variable: {column}, replacing with 0")
  warn(f"nan values in variable: {column}, replacing with 0")
  warn(f"nan values in variable: {column}, replacing with 0")
  warn(f"nan values in variable: {column}, replacing with 0")
  warn(f"nan values in variable: {column}, replacing with 0")
  warn(f"nan values in variable: {column}, replacing with 0")
  warn(f"nan values in variable: {column}, replacing with 0")
  warn(f"nan values in variable: {column}, replacing with 0")
  warn(f"nan values in variable: {column}, replacing with 0")
  warn(f"nan values in variable: {column}, replacing with 0")
  warn(f"nan values in variable: {column}, replacing with 0")


Get split IDs

In [40]:
geoms_ix, poll_ix = pollutants_aoi_df.sindex.query(geoms.centroid, predicate="within", sort=True)

In [41]:
geoms["split"] = pollutants_aoi_df["split"].values[poll_ix]

Save the table

In [42]:
geoms.to_parquet(f"{data_folder}/unit_test/tables/square_{opt}.pq")

Loop over splits, get W, train, eval, save results.

In [43]:
fit_and_eval(geoms, opt, "square")

 There are 14 disconnected components.
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  super().__setitem__(key, value)
 There are 14 disconnected components.
 There are 12 disconnected components.
 There are 10 disconnected components.


In [44]:
meta

{'leeds': {'h3': {'air': {'loop_0': {'mse': 1.0215962668214376,
     'me': 0.7268292438504929,
     'r2': 0.6471124647540352,
     'moran_obs': 0.9139658896608336,
     'moran_pred': 0.8911782193252566},
    'loop_1': {'mse': 0.9052719580939292,
     'me': 0.7173481454198483,
     'r2': 0.7214704719652031,
     'moran_obs': 0.9382819221632059,
     'moran_pred': 0.9108587337675578},
    'loop_2': {'mse': 1.189713977353201,
     'me': 0.7906349848032027,
     'r2': 0.6480546399551688,
     'moran_obs': 0.9270422555656327,
     'moran_pred': 0.9186198786505408},
    'loop_3': {'mse': 0.9411979683024067,
     'me': 0.7241649062582691,
     'r2': 0.7489344824312717,
     'moran_obs': 0.9386609332461856,
     'moran_pred': 0.9026754574770531}},
   'hp': {'loop_0': {'mse': 45.97914682149453,
     'me': 6.607405179022031,
     'r2': -818.245515704516,
     'moran_obs': 0.7485176481450501,
     'moran_pred': 0.8911782193252566},
    'loop_1': {'mse': 48.6391071221501,
     'me': 6.802131639333

### output area
Create geometries

In [45]:
meta["leeds"]["oa"] = {}

In [46]:
geoms = pop_hp.set_index("code")[["geometry", "priceper", "population"]].set_crs(27700, allow_override=True)

Interpolate Air Quality

In [47]:
interp = tobler.area_weighted.area_interpolate(pollutants_aoi_df, geoms, intensive_variables=["aqi"])
geoms["air_quality_index"] = interp.aqi.values

  return lib.intersection(a, b, **kwargs)


 Rename OA data

In [48]:
geoms = geoms.rename(columns={"priceper": "house_price_index"})

Interpolate workplace population

In [49]:
wp_interpolated = tobler.area_weighted.area_interpolate(
    wp_aoi,
    geoms,
    extensive_variables=[
        "A, B, D, E. Agriculture, energy and water",
        "C. Manufacturing",
        "F. Construction",
        "G, I. Distribution, hotels and restaurants",
        "H, J. Transport and communication",
        "K, L, M, N. Financial, real estate, professional and administrative activities",
        "O,P,Q. Public administration, education and health",
        "R, S, T, U. Other",
    ],
)

geoms[
    [
        "A, B, D, E. Agriculture, energy and water",
        "C. Manufacturing",
        "F. Construction",
        "G, I. Distribution, hotels and restaurants",
        "H, J. Transport and communication",
        "K, L, M, N. Financial, real estate, professional and administrative activities",
        "O,P,Q. Public administration, education and health",
        "R, S, T, U. Other",
    ]
] = wp_interpolated.drop(columns="geometry").values

  return lib.intersection(a, b, **kwargs)


Interpolate CORINE

In [50]:
corine_interpolated = tobler.area_weighted.area_interpolate(
    corine_aoi, geoms, categorical_variables=["Code_18"]
)
corine_interpolated.columns = corine_interpolated.columns.map(corine_names)
interesting = [
    "Land cover [Discontinuous urban fabric]",
    "Land cover [Continuous urban fabric]",
    "Land cover [Non-irrigated arable land]",
    "Land cover [Industrial or commercial units]",
    "Land cover [Green urban areas]",
    "Land cover [Pastures]",
    "Land cover [Sport and leisure facilities]",
]
geoms[interesting] = corine_interpolated[interesting].values

Interpolate morphometrics

In [51]:
chars = data.columns.drop(
    [
        "hindex",
        "tessellation",
        "buildings",
        "nodeID",
        "edgeID_keys",
        "edgeID_values",
        "edgeID_primary",
        "sdbPer",
        "ssbElo",
        "stcOri",
        "sdcLAL",
        "mdcAre",
        "ltcAre",
        "ltcWRE",
        "mtdMDi",
        "lcdMes",
        "lddNDe",
        "sddAre",
        "mdsAre",
        "ldsAre",
        "lisCel",
        "ldePer",
        "lseCWA",
    ]
)
morhp_interpolated = tobler.area_weighted.area_interpolate(
    data, geoms, intensive_variables=chars.tolist()
)

geoms[morhp_interpolated.columns.drop("geometry")] = morhp_interpolated.drop(
    columns="geometry"
).values

  return lib.intersection(a, b, **kwargs)
  warn(f"nan values in variable: {column}, replacing with 0")
  warn(f"nan values in variable: {column}, replacing with 0")
  warn(f"nan values in variable: {column}, replacing with 0")
  warn(f"nan values in variable: {column}, replacing with 0")
  warn(f"nan values in variable: {column}, replacing with 0")
  warn(f"nan values in variable: {column}, replacing with 0")
  warn(f"nan values in variable: {column}, replacing with 0")
  warn(f"nan values in variable: {column}, replacing with 0")
  warn(f"nan values in variable: {column}, replacing with 0")
  warn(f"nan values in variable: {column}, replacing with 0")
  warn(f"nan values in variable: {column}, replacing with 0")
  warn(f"nan values in variable: {column}, replacing with 0")
  warn(f"nan values in variable: {column}, replacing with 0")
  warn(f"nan values in variable: {column}, replacing with 0")


Get split IDs

In [52]:
geoms_ix, poll_ix = pollutants_aoi_df.sindex.query(geoms.centroid, predicate="within", sort=True)

In [53]:
geoms.loc[geoms.index[geoms_ix], "split"] = pollutants_aoi_df["split"].values[poll_ix]

Save the table

In [54]:
geoms.to_parquet(f"{data_folder}/unit_test/tables/oa_{opt}.pq")

Loop over splits, get W, train, eval, save results.

In [55]:
fit_and_eval(geoms, opt, "oa")

  inp, res = gdf.sindex.query_bulk(gdf.geometry, predicate=predicate)
  inp, res = gdf.sindex.query_bulk(gdf.geometry, predicate=predicate)
 There are 3 disconnected components.
 There is 1 island with id: 723.




  inp, res = gdf.sindex.query_bulk(gdf.geometry, predicate=predicate)
  inp, res = gdf.sindex.query_bulk(gdf.geometry, predicate=predicate)
 There are 2 disconnected components.
  inp, res = gdf.sindex.query_bulk(gdf.geometry, predicate=predicate)
  inp, res = gdf.sindex.query_bulk(gdf.geometry, predicate=predicate)
  inp, res = gdf.sindex.query_bulk(gdf.geometry, predicate=predicate)
  inp, res = gdf.sindex.query_bulk(gdf.geometry, predicate=predicate)
 There are 2 disconnected components.
 There is 1 island with id: 642.




### ET cells
Create geometries

In [56]:
meta["leeds"]["et"] = {}

In [57]:
geoms = data.set_index("hindex").drop(columns=
    [
        "buildings",
        "nodeID",
        "edgeID_keys",
        "edgeID_values",
        "edgeID_primary",
        "sdbPer",
        "ssbElo",
        "stcOri",
        "sdcLAL",
        "mdcAre",
        "ltcAre",
        "ltcWRE",
        "mtdMDi",
        "lcdMes",
        "lddNDe",
        "sddAre",
        "mdsAre",
        "ldsAre",
        "lisCel",
        "ldePer",
        "lseCWA",
    ]
)

Interpolate Air Quality

In [58]:
interp = tobler.area_weighted.area_interpolate(pollutants_aoi_df, geoms, intensive_variables=["aqi"])
geoms["air_quality_index"] = interp.aqi.values

  return lib.intersection(a, b, **kwargs)


 Interpolate OA data

In [59]:
interp_oa = tobler.area_weighted.area_interpolate(
    pop_hp,
    geoms,
    intensive_variables=["priceper"],
    extensive_variables=["population"],
)
geoms["house_price_index"] = interp_oa.priceper.values
geoms["population"] = interp_oa.population.values

  return lib.intersection(a, b, **kwargs)


Interpolate workplace population

In [60]:
wp_interpolated = tobler.area_weighted.area_interpolate(
    wp_aoi,
    geoms,
    extensive_variables=[
        "A, B, D, E. Agriculture, energy and water",
        "C. Manufacturing",
        "F. Construction",
        "G, I. Distribution, hotels and restaurants",
        "H, J. Transport and communication",
        "K, L, M, N. Financial, real estate, professional and administrative activities",
        "O,P,Q. Public administration, education and health",
        "R, S, T, U. Other",
    ],
)

geoms[
    [
        "A, B, D, E. Agriculture, energy and water",
        "C. Manufacturing",
        "F. Construction",
        "G, I. Distribution, hotels and restaurants",
        "H, J. Transport and communication",
        "K, L, M, N. Financial, real estate, professional and administrative activities",
        "O,P,Q. Public administration, education and health",
        "R, S, T, U. Other",
    ]
] = wp_interpolated.drop(columns="geometry").values

  return lib.intersection(a, b, **kwargs)


Interpolate CORINE

In [61]:
corine_interpolated = tobler.area_weighted.area_interpolate(
    corine_aoi, geoms, categorical_variables=["Code_18"]
)
corine_interpolated.columns = corine_interpolated.columns.map(corine_names)
interesting = [
    "Land cover [Discontinuous urban fabric]",
    "Land cover [Continuous urban fabric]",
    "Land cover [Non-irrigated arable land]",
    "Land cover [Industrial or commercial units]",
    "Land cover [Green urban areas]",
    "Land cover [Pastures]",
    "Land cover [Sport and leisure facilities]",
]
geoms[interesting] = corine_interpolated[interesting].values

Get split IDs

In [62]:
geoms_ix, poll_ix = pollutants_aoi_df.sindex.query(geoms.centroid, predicate="within", sort=True)

In [63]:
geoms.loc[geoms.index[geoms_ix], "split"] = pollutants_aoi_df["split"].values[poll_ix]

Save the table

In [64]:
geoms.to_parquet(f"{data_folder}/unit_test/tables/et_{opt}.pq")

Loop over splits, get W, train, eval, save results.

In [66]:
fit_and_eval(geoms, opt, "et")

In [65]:
meta

{'leeds': {'h3': {'air': {'loop_0': {'mse': 1.0215962668214376,
     'me': 0.7268292438504929,
     'r2': 0.6471124647540352,
     'moran_obs': 0.9139658896608336,
     'moran_pred': 0.8911782193252566},
    'loop_1': {'mse': 0.9052719580939292,
     'me': 0.7173481454198483,
     'r2': 0.7214704719652031,
     'moran_obs': 0.9382819221632059,
     'moran_pred': 0.9108587337675578},
    'loop_2': {'mse': 1.189713977353201,
     'me': 0.7906349848032027,
     'r2': 0.6480546399551688,
     'moran_obs': 0.9270422555656327,
     'moran_pred': 0.9186198786505408},
    'loop_3': {'mse': 0.9411979683024067,
     'me': 0.7241649062582691,
     'r2': 0.7489344824312717,
     'moran_obs': 0.9386609332461856,
     'moran_pred': 0.9026754574770531}},
   'hp': {'loop_0': {'mse': 45.97914682149453,
     'me': 6.607405179022031,
     'r2': -818.245515704516,
     'moran_obs': 0.7485176481450501,
     'moran_pred': 0.8911782193252566},
    'loop_1': {'mse': 48.6391071221501,
     'me': 6.802131639333