# Run predictions

In [2]:
import json

import pandas as pd
import geopandas as gpd
import xarray as xr
import tobler
import libpysal
import numpy as np
from itertools import product
import esda

from sklearn.ensemble import HistGradientBoostingRegressor
from sklearn import metrics
from tobler.util import h3fy

In [3]:
def fit_and_eval(geoms, place, geom_type):
    """Fit the model and evaluate each fold

    Parameters
    ----------
    geoms : GeoDataFrame
        gdf with everything
    place : str
        name of a place
    geom_type : str
        name of a geom type
    """
    meta[place][geom_type]["air"] = {}
    meta[place][geom_type]["hp"] = {}
    for loop in range(4):
        meta[place][geom_type]["air"][f"loop_{loop}"] = {}
        meta[place][geom_type]["hp"][f"loop_{loop}"] = {}

        # avoid special treatment for HP and AQ
        geoms["house_price_index"] = geoms["house_price_index"].replace(0, np.nan)
        geoms = geoms.dropna(subset="house_price_index")
        
        # full df leads to memory issues
        if geom_type == "et":
            geoms = geoms.sample(50_000)

        mask = geoms["split"]==loop
        train = geoms[~mask]
        test = geoms[mask]
        if geom_type == "oa":
            W_train = libpysal.weights.fuzzy_contiguity(train.reset_index(), buffering=True, buffer=2000)
            W_test = libpysal.weights.fuzzy_contiguity(test.reset_index(), buffering=True, buffer=2000)
        else:
            W_train = libpysal.weights.DistanceBand.from_dataframe(train.centroid.reset_index(), 2000)
            W_test = libpysal.weights.DistanceBand.from_dataframe(test.centroid.reset_index(), 2000)

        no_exvars =[
                geoms.geometry.name,
                "air_quality_index",
                "house_price_index",
            ]
        exvars_train = train.drop(columns=no_exvars)
        exvars_test = test.drop(columns=no_exvars)

        W_train.transform = "r"
        W_test.transform = "r"
        for col in exvars_train.columns.copy():
            exvars_train[f"{col}_lag"] = libpysal.weights.spatial_lag.lag_spatial(W_train, exvars_train[col])
            exvars_test[f"{col}_lag"] = libpysal.weights.spatial_lag.lag_spatial(W_test, exvars_test[col])

        # Air pollution
        regressor_air = HistGradientBoostingRegressor(
            random_state=0, max_bins=64, max_iter=1000
        )

        regressor_air.fit(exvars_train, train.air_quality_index)
        pred = regressor_air.predict(exvars_test)
        residuals = test.air_quality_index - pred

        meta[place][geom_type]["air"][f"loop_{loop}"]["mse"] = metrics.mean_squared_error(test.air_quality_index, pred)
        meta[place][geom_type]["air"][f"loop_{loop}"]["me"] = residuals.abs().mean()
        meta[place][geom_type]["air"][f"loop_{loop}"]["r2"] = metrics.r2_score(test.air_quality_index, pred)
        moran_obs = esda.Moran(test.air_quality_index, W_test)
        moran_pred = esda.Moran(pred, W_test)
        meta[place][geom_type]["air"][f"loop_{loop}"]["moran_obs"] = moran_obs.I
        meta[place][geom_type]["air"][f"loop_{loop}"]["moran_pred"] = moran_pred.I


        # House price

        regressor_hp = HistGradientBoostingRegressor(
            random_state=0, max_bins=64, max_iter=1000
        )

        regressor_hp.fit(exvars_train, np.log(train.house_price_index))
        pred = regressor_air.predict(exvars_test)
        residuals = np.log(test.house_price_index) - pred

        meta[place][geom_type]["hp"][f"loop_{loop}"]["mse"] = metrics.mean_squared_error(np.log(test.house_price_index), pred)
        meta[place][geom_type]["hp"][f"loop_{loop}"]["me"] = residuals.abs().mean()
        meta[place][geom_type]["hp"][f"loop_{loop}"]["r2"] = metrics.r2_score(np.log(test.house_price_index), pred)
        moran_obs = esda.Moran(np.log(test.house_price_index), W_test)
        moran_pred = esda.Moran(pred, W_test)
        meta[place][geom_type]["hp"][f"loop_{loop}"]["moran_obs"] = moran_obs.I
        meta[place][geom_type]["hp"][f"loop_{loop}"]["moran_pred"] = moran_pred.I

In [4]:
data_folder = "../../../demoland_data"

In [5]:
meta = {}

In [6]:
opt = "leeds"
meta[opt] = {}

In [7]:
meta[opt]["h3"] = {}

geoms = gpd.read_parquet(f"{data_folder}/spatial_units_test/tables/h3_{opt}.pq")

fit_and_eval(geoms, opt, "h3")

meta

 There are 18 disconnected components.
 There is 1 island with id: 2149.




A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  super().__setitem__(key, value)
 There are 12 disconnected components.
 There is 1 island with id: 2314.




 There are 10 disconnected components.
 There are 12 disconnected components.
 There is 1 island with id: 747.




{'leeds': {'h3': {'air': {'loop_0': {'mse': 0.84460401056791,
     'me': 0.6436981864136232,
     'r2': 0.7623934197709308,
     'moran_obs': 0.9363669447076342,
     'moran_pred': 0.9166598268930258},
    'loop_1': {'mse': 1.02636924959567,
     'me': 0.7373477412766667,
     'r2': 0.6849255600114099,
     'moran_obs': 0.9307507718954314,
     'moran_pred': 0.9114954425947884},
    'loop_2': {'mse': 1.0085343332097125,
     'me': 0.7599851805753213,
     'r2': 0.7150641735303911,
     'moran_obs': 0.931455022237344,
     'moran_pred': 0.9245464529450895},
    'loop_3': {'mse': 0.7496983508460958,
     'me': 0.6380400171488202,
     'r2': 0.7694484449836938,
     'moran_obs': 0.9284248201526181,
     'moran_pred': 0.9114773620763844}},
   'hp': {'loop_0': {'mse': 47.5022926019654,
     'me': 6.658872240473533,
     'r2': -800.8773227040432,
     'moran_obs': 0.7723645051619082,
     'moran_pred': 0.9166598268930258},
    'loop_1': {'mse': 48.79762468644221,
     'me': 6.793274843280011

In [8]:
with open(f"{data_folder}/spatial_units_test/meta.json", "w") as f:
    json.dump(meta, f)

In [9]:
meta["leeds"]["square"] = {}

geoms = gpd.read_parquet(f"{data_folder}/spatial_units_test/tables/square_{opt}.pq")

fit_and_eval(geoms, opt, "square")

meta

 There are 17 disconnected components.
 There is 1 island with id: 2185.




A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  super().__setitem__(key, value)
 There are 10 disconnected components.
 There are 8 disconnected components.
 There are 12 disconnected components.
 There is 1 island with id: 4221.




{'leeds': {'h3': {'air': {'loop_0': {'mse': 0.84460401056791,
     'me': 0.6436981864136232,
     'r2': 0.7623934197709308,
     'moran_obs': 0.9363669447076342,
     'moran_pred': 0.9166598268930258},
    'loop_1': {'mse': 1.02636924959567,
     'me': 0.7373477412766667,
     'r2': 0.6849255600114099,
     'moran_obs': 0.9307507718954314,
     'moran_pred': 0.9114954425947884},
    'loop_2': {'mse': 1.0085343332097125,
     'me': 0.7599851805753213,
     'r2': 0.7150641735303911,
     'moran_obs': 0.931455022237344,
     'moran_pred': 0.9245464529450895},
    'loop_3': {'mse': 0.7496983508460958,
     'me': 0.6380400171488202,
     'r2': 0.7694484449836938,
     'moran_obs': 0.9284248201526181,
     'moran_pred': 0.9114773620763844}},
   'hp': {'loop_0': {'mse': 47.5022926019654,
     'me': 6.658872240473533,
     'r2': -800.8773227040432,
     'moran_obs': 0.7723645051619082,
     'moran_pred': 0.9166598268930258},
    'loop_1': {'mse': 48.79762468644221,
     'me': 6.793274843280011

In [10]:
with open(f"{data_folder}/spatial_units_test/meta.json", "w") as f:
    json.dump(meta, f)

In [11]:
meta["leeds"]["oa"] = {}

geoms = gpd.read_parquet(f"{data_folder}/spatial_units_test/tables/oa_{opt}.pq")

fit_and_eval(geoms, opt, "oa")

meta

  inp, res = gdf.sindex.query_bulk(gdf.geometry, predicate=predicate)
  inp, res = gdf.sindex.query_bulk(gdf.geometry, predicate=predicate)
 There are 2 disconnected components.
  inp, res = gdf.sindex.query_bulk(gdf.geometry, predicate=predicate)
  inp, res = gdf.sindex.query_bulk(gdf.geometry, predicate=predicate)
 There are 2 disconnected components.
  inp, res = gdf.sindex.query_bulk(gdf.geometry, predicate=predicate)
  inp, res = gdf.sindex.query_bulk(gdf.geometry, predicate=predicate)
  inp, res = gdf.sindex.query_bulk(gdf.geometry, predicate=predicate)
  inp, res = gdf.sindex.query_bulk(gdf.geometry, predicate=predicate)
 There are 3 disconnected components.
 There is 1 island with id: 469.




{'leeds': {'h3': {'air': {'loop_0': {'mse': 0.84460401056791,
     'me': 0.6436981864136232,
     'r2': 0.7623934197709308,
     'moran_obs': 0.9363669447076342,
     'moran_pred': 0.9166598268930258},
    'loop_1': {'mse': 1.02636924959567,
     'me': 0.7373477412766667,
     'r2': 0.6849255600114099,
     'moran_obs': 0.9307507718954314,
     'moran_pred': 0.9114954425947884},
    'loop_2': {'mse': 1.0085343332097125,
     'me': 0.7599851805753213,
     'r2': 0.7150641735303911,
     'moran_obs': 0.931455022237344,
     'moran_pred': 0.9245464529450895},
    'loop_3': {'mse': 0.7496983508460958,
     'me': 0.6380400171488202,
     'r2': 0.7694484449836938,
     'moran_obs': 0.9284248201526181,
     'moran_pred': 0.9114773620763844}},
   'hp': {'loop_0': {'mse': 47.5022926019654,
     'me': 6.658872240473533,
     'r2': -800.8773227040432,
     'moran_obs': 0.7723645051619082,
     'moran_pred': 0.9166598268930258},
    'loop_1': {'mse': 48.79762468644221,
     'me': 6.793274843280011

In [12]:
with open(f"{data_folder}/spatial_units_test/meta.json", "w") as f:
    json.dump(meta, f)

In [13]:
meta["leeds"]["et"] = {}

geoms = gpd.read_parquet(f"{data_folder}/spatial_units_test/tables/et_{opt}.pq")

fit_and_eval(geoms, opt, "et")

meta

 There are 27 disconnected components.
 There are 3 islands with ids: 7962, 8100, 8339.




 There are 34 disconnected components.
 There are 4 islands with ids: 736, 1623, 2756, 3710.




 There are 3 disconnected components.
 There are 2 islands with ids: 2367, 7395.
 There are 36 disconnected components.
 There are 3 islands with ids: 3902, 8527, 9275.




 There are 30 disconnected components.
 There are 2 islands with ids: 2863, 8296.




{'leeds': {'h3': {'air': {'loop_0': {'mse': 0.84460401056791,
     'me': 0.6436981864136232,
     'r2': 0.7623934197709308,
     'moran_obs': 0.9363669447076342,
     'moran_pred': 0.9166598268930258},
    'loop_1': {'mse': 1.02636924959567,
     'me': 0.7373477412766667,
     'r2': 0.6849255600114099,
     'moran_obs': 0.9307507718954314,
     'moran_pred': 0.9114954425947884},
    'loop_2': {'mse': 1.0085343332097125,
     'me': 0.7599851805753213,
     'r2': 0.7150641735303911,
     'moran_obs': 0.931455022237344,
     'moran_pred': 0.9245464529450895},
    'loop_3': {'mse': 0.7496983508460958,
     'me': 0.6380400171488202,
     'r2': 0.7694484449836938,
     'moran_obs': 0.9284248201526181,
     'moran_pred': 0.9114773620763844}},
   'hp': {'loop_0': {'mse': 47.5022926019654,
     'me': 6.658872240473533,
     'r2': -800.8773227040432,
     'moran_obs': 0.7723645051619082,
     'moran_pred': 0.9166598268930258},
    'loop_1': {'mse': 48.79762468644221,
     'me': 6.793274843280011

In [14]:
with open(f"{data_folder}/spatial_units_test/meta.json", "w") as f:
    json.dump(meta, f)

## Newcastle

In [15]:
with open(f"{data_folder}/spatial_units_test/meta.json", "r") as f:
    meta = json.load(f)

In [16]:
opt = "newcastle"
meta[opt] = {}

In [17]:
meta[opt]["h3"] = {}

geoms = gpd.read_parquet(f"{data_folder}/spatial_units_test/tables/h3_{opt}.pq")

fit_and_eval(geoms, opt, "h3")

meta

 There are 6 disconnected components.
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  super().__setitem__(key, value)
 There are 11 disconnected components.
 There is 1 island with id: 935.




 There are 7 disconnected components.
 There are 3 islands with ids: 24, 470, 602.




 There are 10 disconnected components.


{'leeds': {'h3': {'air': {'loop_0': {'mse': 0.84460401056791,
     'me': 0.6436981864136232,
     'r2': 0.7623934197709308,
     'moran_obs': 0.9363669447076342,
     'moran_pred': 0.9166598268930258},
    'loop_1': {'mse': 1.02636924959567,
     'me': 0.7373477412766667,
     'r2': 0.6849255600114099,
     'moran_obs': 0.9307507718954314,
     'moran_pred': 0.9114954425947884},
    'loop_2': {'mse': 1.0085343332097125,
     'me': 0.7599851805753213,
     'r2': 0.7150641735303911,
     'moran_obs': 0.931455022237344,
     'moran_pred': 0.9245464529450895},
    'loop_3': {'mse': 0.7496983508460958,
     'me': 0.6380400171488202,
     'r2': 0.7694484449836938,
     'moran_obs': 0.9284248201526181,
     'moran_pred': 0.9114773620763844}},
   'hp': {'loop_0': {'mse': 47.5022926019654,
     'me': 6.658872240473533,
     'r2': -800.8773227040432,
     'moran_obs': 0.7723645051619082,
     'moran_pred': 0.9166598268930258},
    'loop_1': {'mse': 48.79762468644221,
     'me': 6.793274843280011

In [18]:
with open(f"{data_folder}/spatial_units_test/meta.json", "w") as f:
    json.dump(meta, f)

In [19]:
meta[opt]["square"] = {}

geoms = gpd.read_parquet(f"{data_folder}/spatial_units_test/tables/square_{opt}.pq")

fit_and_eval(geoms, opt, "square")

meta

 There are 7 disconnected components.
 There is 1 island with id: 1086.




A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  super().__setitem__(key, value)
 There are 10 disconnected components.
 There are 4 disconnected components.
 There are 10 disconnected components.
 There is 1 island with id: 815.




{'leeds': {'h3': {'air': {'loop_0': {'mse': 0.84460401056791,
     'me': 0.6436981864136232,
     'r2': 0.7623934197709308,
     'moran_obs': 0.9363669447076342,
     'moran_pred': 0.9166598268930258},
    'loop_1': {'mse': 1.02636924959567,
     'me': 0.7373477412766667,
     'r2': 0.6849255600114099,
     'moran_obs': 0.9307507718954314,
     'moran_pred': 0.9114954425947884},
    'loop_2': {'mse': 1.0085343332097125,
     'me': 0.7599851805753213,
     'r2': 0.7150641735303911,
     'moran_obs': 0.931455022237344,
     'moran_pred': 0.9245464529450895},
    'loop_3': {'mse': 0.7496983508460958,
     'me': 0.6380400171488202,
     'r2': 0.7694484449836938,
     'moran_obs': 0.9284248201526181,
     'moran_pred': 0.9114773620763844}},
   'hp': {'loop_0': {'mse': 47.5022926019654,
     'me': 6.658872240473533,
     'r2': -800.8773227040432,
     'moran_obs': 0.7723645051619082,
     'moran_pred': 0.9166598268930258},
    'loop_1': {'mse': 48.79762468644221,
     'me': 6.793274843280011

In [20]:
with open(f"{data_folder}/spatial_units_test/meta.json", "w") as f:
    json.dump(meta, f)

In [21]:
meta[opt]["oa"] = {}

geoms = gpd.read_parquet(f"{data_folder}/spatial_units_test/tables/oa_{opt}.pq")

fit_and_eval(geoms, opt, "oa")

meta

  inp, res = gdf.sindex.query_bulk(gdf.geometry, predicate=predicate)
  inp, res = gdf.sindex.query_bulk(gdf.geometry, predicate=predicate)
  inp, res = gdf.sindex.query_bulk(gdf.geometry, predicate=predicate)
  inp, res = gdf.sindex.query_bulk(gdf.geometry, predicate=predicate)
  inp, res = gdf.sindex.query_bulk(gdf.geometry, predicate=predicate)
  inp, res = gdf.sindex.query_bulk(gdf.geometry, predicate=predicate)
  inp, res = gdf.sindex.query_bulk(gdf.geometry, predicate=predicate)
  inp, res = gdf.sindex.query_bulk(gdf.geometry, predicate=predicate)


{'leeds': {'h3': {'air': {'loop_0': {'mse': 0.84460401056791,
     'me': 0.6436981864136232,
     'r2': 0.7623934197709308,
     'moran_obs': 0.9363669447076342,
     'moran_pred': 0.9166598268930258},
    'loop_1': {'mse': 1.02636924959567,
     'me': 0.7373477412766667,
     'r2': 0.6849255600114099,
     'moran_obs': 0.9307507718954314,
     'moran_pred': 0.9114954425947884},
    'loop_2': {'mse': 1.0085343332097125,
     'me': 0.7599851805753213,
     'r2': 0.7150641735303911,
     'moran_obs': 0.931455022237344,
     'moran_pred': 0.9245464529450895},
    'loop_3': {'mse': 0.7496983508460958,
     'me': 0.6380400171488202,
     'r2': 0.7694484449836938,
     'moran_obs': 0.9284248201526181,
     'moran_pred': 0.9114773620763844}},
   'hp': {'loop_0': {'mse': 47.5022926019654,
     'me': 6.658872240473533,
     'r2': -800.8773227040432,
     'moran_obs': 0.7723645051619082,
     'moran_pred': 0.9166598268930258},
    'loop_1': {'mse': 48.79762468644221,
     'me': 6.793274843280011

In [22]:
with open(f"{data_folder}/spatial_units_test/meta.json", "w") as f:
    json.dump(meta, f)

In [23]:
meta[opt]["et"] = {}

geoms = gpd.read_parquet(f"{data_folder}/spatial_units_test/tables/et_{opt}.pq")

fit_and_eval(geoms, opt, "et")

meta

 There are 11 disconnected components.
 There is 1 island with id: 6449.




 There are 10 disconnected components.
 There is 1 island with id: 1338.




 There are 10 disconnected components.
 There are 2 islands with ids: 6436, 10276.




 There are 10 disconnected components.
 There is 1 island with id: 9459.




{'leeds': {'h3': {'air': {'loop_0': {'mse': 0.84460401056791,
     'me': 0.6436981864136232,
     'r2': 0.7623934197709308,
     'moran_obs': 0.9363669447076342,
     'moran_pred': 0.9166598268930258},
    'loop_1': {'mse': 1.02636924959567,
     'me': 0.7373477412766667,
     'r2': 0.6849255600114099,
     'moran_obs': 0.9307507718954314,
     'moran_pred': 0.9114954425947884},
    'loop_2': {'mse': 1.0085343332097125,
     'me': 0.7599851805753213,
     'r2': 0.7150641735303911,
     'moran_obs': 0.931455022237344,
     'moran_pred': 0.9245464529450895},
    'loop_3': {'mse': 0.7496983508460958,
     'me': 0.6380400171488202,
     'r2': 0.7694484449836938,
     'moran_obs': 0.9284248201526181,
     'moran_pred': 0.9114773620763844}},
   'hp': {'loop_0': {'mse': 47.5022926019654,
     'me': 6.658872240473533,
     'r2': -800.8773227040432,
     'moran_obs': 0.7723645051619082,
     'moran_pred': 0.9166598268930258},
    'loop_1': {'mse': 48.79762468644221,
     'me': 6.793274843280011

In [24]:
with open(f"{data_folder}/spatial_units_test/meta.json", "w") as f:
    json.dump(meta, f)