# Run predictions

In [14]:
import json

import geopandas as gpd
import libpysal
import numpy as np
import esda
import tobler
import matplotlib.pyplot as plt

from sklearn.ensemble import HistGradientBoostingRegressor
from sklearn import metrics


In [97]:
def fit_and_eval(geoms, place, geom_type, et):
    """Fit the model and evaluate each fold

    Parameters
    ----------
    geoms : GeoDataFrame
        gdf with everything
    place : str
        name of a place
    geom_type : str
        name of a geom type
    """
    meta[place][geom_type]["air"] = {}
    meta[place][geom_type]["hp"] = {}
    for loop in range(5):
        meta[place][geom_type]["air"][f"loop_{loop}"] = {}
        meta[place][geom_type]["hp"][f"loop_{loop}"] = {}

        # avoid special treatment for HP and AQ
        geoms["house_price_index"] = geoms["house_price_index"].replace(0, np.nan)
        geoms = geoms.dropna(subset="house_price_index")

        # full df leads to memory issues
        if geom_type == "et":
            geoms = geoms.sample(50_000)

        mask = geoms["split"] == loop
        train = geoms[~mask]
        test = geoms[mask]
        if geom_type == "oa":
            W_train = libpysal.weights.fuzzy_contiguity(
                train.reset_index(), buffering=True, buffer=2000
            )
            W_test = libpysal.weights.fuzzy_contiguity(
                test.reset_index(), buffering=True, buffer=2000
            )
        else:
            W_train = libpysal.weights.DistanceBand.from_dataframe(
                train.centroid.reset_index(), 2000
            )
            W_test = libpysal.weights.DistanceBand.from_dataframe(
                test.centroid.reset_index(), 2000
            )

        no_exvars = [
            geoms.geometry.name,
            "air_quality_index",
            "house_price_index",
        ]
        exvars_train = train.drop(columns=no_exvars)
        exvars_test = test.drop(columns=no_exvars)

        W_train.transform = "r"
        W_test.transform = "r"
        for col in exvars_train.columns.copy():
            exvars_train[f"{col}_lag"] = libpysal.weights.spatial_lag.lag_spatial(
                W_train, exvars_train[col]
            )
            exvars_test[f"{col}_lag"] = libpysal.weights.spatial_lag.lag_spatial(
                W_test, exvars_test[col]
            )

        # Air pollution
        regressor_air = HistGradientBoostingRegressor(
            random_state=0, max_bins=64, max_iter=1000
        )

        regressor_air.fit(exvars_train, train.air_quality_index)
        pred = regressor_air.predict(exvars_test)
        residuals = test.air_quality_index - pred

        meta[place][geom_type]["air"][f"loop_{loop}"][
            "mse"
        ] = metrics.mean_squared_error(test.air_quality_index, pred)
        meta[place][geom_type]["air"][f"loop_{loop}"]["me"] = residuals.abs().mean()
        meta[place][geom_type]["air"][f"loop_{loop}"]["r2"] = metrics.r2_score(
            test.air_quality_index, pred
        )
        moran_obs = esda.Moran(test.air_quality_index, W_test)
        moran_pred = esda.Moran(pred, W_test)
        meta[place][geom_type]["air"][f"loop_{loop}"]["moran_obs"] = moran_obs.I
        meta[place][geom_type]["air"][f"loop_{loop}"]["moran_pred"] = moran_pred.I
        
        # plot residuals
        max_resid = np.abs(residuals).max()
        test.plot(residuals, cmap="RdBu", vmin=-max_resid, vmax=max_resid, legend=True, figsize=(16, 16))
        plt.savefig(f"../../images/residuals/{place}_{geom_type}_air_{loop}.png")
        plt.close()
        
        if geom_type != "et":
            # interpolate prediction to ET and get ET-level errors
            geom_hits, et_hits = et.sindex.query(test.geometry, predicate="intersects")
            affected_et = et.iloc[et_hits][[et.geometry.name]]
            interpolated = tobler.area_weighted.area_interpolate(
                test.assign(pred=pred), 
                affected_et, 
                intensive_variables=["air_quality_index", "pred"]
            )
            meta[place][geom_type]["air"][f"loop_{loop}"][
                "mse_ET"
            ] = metrics.mean_squared_error(interpolated.air_quality_index, interpolated.pred)
            meta[place][geom_type]["air"][f"loop_{loop}"]["me_ET"] = (interpolated.air_quality_index - interpolated.pred).abs().mean()
            meta[place][geom_type]["air"][f"loop_{loop}"]["r2_ET"] = metrics.r2_score(
                interpolated.air_quality_index, interpolated.pred
            )
        else:
            meta[place][geom_type]["air"][f"loop_{loop}"][
                "mse_ET"
            ] = meta[place][geom_type]["air"][f"loop_{loop}"]["mse"]
            meta[place][geom_type]["air"][f"loop_{loop}"]["me_ET"] = meta[place][geom_type]["air"][f"loop_{loop}"]["me"]
            meta[place][geom_type]["air"][f"loop_{loop}"]["r2_ET"] = meta[place][geom_type]["air"][f"loop_{loop}"]["r2"]

        # House price

        regressor_hp = HistGradientBoostingRegressor(
            random_state=0, max_bins=64, max_iter=1000
        )

        regressor_hp.fit(exvars_train, np.log(train.house_price_index))
        pred = regressor_hp.predict(exvars_test)
        residuals = np.log(test.house_price_index) - pred

        meta[place][geom_type]["hp"][f"loop_{loop}"][
            "mse"
        ] = metrics.mean_squared_error(np.log(test.house_price_index), pred)
        meta[place][geom_type]["hp"][f"loop_{loop}"]["me"] = residuals.abs().mean()
        meta[place][geom_type]["hp"][f"loop_{loop}"]["r2"] = metrics.r2_score(
            np.log(test.house_price_index), pred
        )
        moran_obs = esda.Moran(np.log(test.house_price_index), W_test)
        moran_pred = esda.Moran(pred, W_test)
        meta[place][geom_type]["hp"][f"loop_{loop}"]["moran_obs"] = moran_obs.I
        meta[place][geom_type]["hp"][f"loop_{loop}"]["moran_pred"] = moran_pred.I
        
        # plot residuals
        max_resid = np.abs(residuals).max()
        test.plot(residuals, cmap="RdBu", vmin=-max_resid, vmax=max_resid, legend=True, figsize=(16, 16))
        plt.savefig(f"../../images/residuals/{place}_{geom_type}_hp_{loop}.png")
        plt.close()
        
        if geom_type != "et":
            # interpolate prediction to OA and get OA-level errors
            interpolated = tobler.area_weighted.area_interpolate(
                test.assign(pred=pred), 
                affected_et, 
                intensive_variables=["house_price_index", "pred"]
            )
            meta[place][geom_type]["hp"][f"loop_{loop}"][
                "mse_ET"
            ] = metrics.mean_squared_error(np.log(interpolated.house_price_index + 0.000001), interpolated.pred)
            meta[place][geom_type]["hp"][f"loop_{loop}"]["me_ET"] = (np.log(interpolated.house_price_index + 0.000001) - interpolated.pred).abs().mean()
            meta[place][geom_type]["hp"][f"loop_{loop}"]["r2_ET"] = metrics.r2_score(
                np.log(interpolated.house_price_index + 0.000001), interpolated.pred
            )
        else:
            meta[place][geom_type]["hp"][f"loop_{loop}"][
                "mse_ET"
            ] = meta[place][geom_type]["hp"][f"loop_{loop}"]["mse"]
            meta[place][geom_type]["hp"][f"loop_{loop}"]["me_ET"] = meta[place][geom_type]["hp"][f"loop_{loop}"]["me"]
            meta[place][geom_type]["hp"][f"loop_{loop}"]["r2_ET"] = meta[place][geom_type]["hp"][f"loop_{loop}"]["r2"]


In [98]:
data_folder = "../../../demoland_data"

In [99]:
meta = {}

In [100]:
opt = "leeds"
meta[opt] = {}

In [101]:
et = gpd.read_parquet(f"{data_folder}/spatial_units_test/tables/et_{opt}.pq")

In [102]:
meta[opt]["h3"] = {}

geoms = gpd.read_parquet(f"{data_folder}/spatial_units_test/tables/h3_{opt}.pq")

fit_and_eval(geoms, opt, "h3", et)

meta

 There are 18 disconnected components.
 There is 1 island with id: 2149.




A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  super().__setitem__(key, value)
 There are 12 disconnected components.
 There is 1 island with id: 2314.




 There are 10 disconnected components.
 There are 12 disconnected components.
 There is 1 island with id: 747.




 There are 9 disconnected components.


{'leeds': {'h3': {'air': {'loop_0': {'mse': 0.84460401056791,
     'me': 0.6436981864136232,
     'r2': 0.7623934197709308,
     'moran_obs': 0.9363669447076342,
     'moran_pred': 0.9166598268930258,
     'mse_ET': 1.518356666930176,
     'me_ET': 0.8968138773118687,
     'r2_ET': 0.6504683895155177},
    'loop_1': {'mse': 1.02636924959567,
     'me': 0.7373477412766667,
     'r2': 0.6849255600114099,
     'moran_obs': 0.9307507718954314,
     'moran_pred': 0.9114954425947884,
     'mse_ET': 1.2620869853561523,
     'me_ET': 0.8902205069616883,
     'r2_ET': 0.7003543053499883},
    'loop_2': {'mse': 1.0085343332097125,
     'me': 0.7599851805753213,
     'r2': 0.7150641735303911,
     'moran_obs': 0.931455022237344,
     'moran_pred': 0.9245464529450895,
     'mse_ET': 1.615757883882144,
     'me_ET': 1.010752776208472,
     'r2_ET': 0.6818719322134003},
    'loop_3': {'mse': 0.7496983508460958,
     'me': 0.6380400171488202,
     'r2': 0.7694484449836938,
     'moran_obs': 0.9284248

In [103]:
with open(f"{data_folder}/spatial_units_test/meta.json", "w") as f:
    json.dump(meta, f)

In [104]:
meta["leeds"]["square"] = {}

geoms = gpd.read_parquet(f"{data_folder}/spatial_units_test/tables/square_{opt}.pq")

fit_and_eval(geoms, opt, "square", et)

meta

 There are 17 disconnected components.
 There is 1 island with id: 2185.




A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  super().__setitem__(key, value)
 There are 10 disconnected components.
 There are 8 disconnected components.
 There are 12 disconnected components.
 There is 1 island with id: 4221.




 There are 8 disconnected components.


{'leeds': {'h3': {'air': {'loop_0': {'mse': 0.84460401056791,
     'me': 0.6436981864136232,
     'r2': 0.7623934197709308,
     'moran_obs': 0.9363669447076342,
     'moran_pred': 0.9166598268930258,
     'mse_ET': 1.518356666930176,
     'me_ET': 0.8968138773118687,
     'r2_ET': 0.6504683895155177},
    'loop_1': {'mse': 1.02636924959567,
     'me': 0.7373477412766667,
     'r2': 0.6849255600114099,
     'moran_obs': 0.9307507718954314,
     'moran_pred': 0.9114954425947884,
     'mse_ET': 1.2620869853561523,
     'me_ET': 0.8902205069616883,
     'r2_ET': 0.7003543053499883},
    'loop_2': {'mse': 1.0085343332097125,
     'me': 0.7599851805753213,
     'r2': 0.7150641735303911,
     'moran_obs': 0.931455022237344,
     'moran_pred': 0.9245464529450895,
     'mse_ET': 1.615757883882144,
     'me_ET': 1.010752776208472,
     'r2_ET': 0.6818719322134003},
    'loop_3': {'mse': 0.7496983508460958,
     'me': 0.6380400171488202,
     'r2': 0.7694484449836938,
     'moran_obs': 0.9284248

In [105]:
with open(f"{data_folder}/spatial_units_test/meta.json", "w") as f:
    json.dump(meta, f)

In [106]:
meta["leeds"]["oa"] = {}

geoms = gpd.read_parquet(f"{data_folder}/spatial_units_test/tables/oa_{opt}.pq")

fit_and_eval(geoms, opt, "oa", et)

meta

  inp, res = gdf.sindex.query_bulk(gdf.geometry, predicate=predicate)
  inp, res = gdf.sindex.query_bulk(gdf.geometry, predicate=predicate)
 There are 2 disconnected components.
  inp, res = gdf.sindex.query_bulk(gdf.geometry, predicate=predicate)
  inp, res = gdf.sindex.query_bulk(gdf.geometry, predicate=predicate)
 There are 2 disconnected components.
  inp, res = gdf.sindex.query_bulk(gdf.geometry, predicate=predicate)
  inp, res = gdf.sindex.query_bulk(gdf.geometry, predicate=predicate)
  inp, res = gdf.sindex.query_bulk(gdf.geometry, predicate=predicate)
  inp, res = gdf.sindex.query_bulk(gdf.geometry, predicate=predicate)
 There are 3 disconnected components.
 There is 1 island with id: 469.




  inp, res = gdf.sindex.query_bulk(gdf.geometry, predicate=predicate)
  inp, res = gdf.sindex.query_bulk(gdf.geometry, predicate=predicate)
 There are 4 disconnected components.
 There is 1 island with id: 538.




{'leeds': {'h3': {'air': {'loop_0': {'mse': 0.84460401056791,
     'me': 0.6436981864136232,
     'r2': 0.7623934197709308,
     'moran_obs': 0.9363669447076342,
     'moran_pred': 0.9166598268930258,
     'mse_ET': 1.518356666930176,
     'me_ET': 0.8968138773118687,
     'r2_ET': 0.6504683895155177},
    'loop_1': {'mse': 1.02636924959567,
     'me': 0.7373477412766667,
     'r2': 0.6849255600114099,
     'moran_obs': 0.9307507718954314,
     'moran_pred': 0.9114954425947884,
     'mse_ET': 1.2620869853561523,
     'me_ET': 0.8902205069616883,
     'r2_ET': 0.7003543053499883},
    'loop_2': {'mse': 1.0085343332097125,
     'me': 0.7599851805753213,
     'r2': 0.7150641735303911,
     'moran_obs': 0.931455022237344,
     'moran_pred': 0.9245464529450895,
     'mse_ET': 1.615757883882144,
     'me_ET': 1.010752776208472,
     'r2_ET': 0.6818719322134003},
    'loop_3': {'mse': 0.7496983508460958,
     'me': 0.6380400171488202,
     'r2': 0.7694484449836938,
     'moran_obs': 0.9284248

In [107]:
with open(f"{data_folder}/spatial_units_test/meta.json", "w") as f:
    json.dump(meta, f)

In [108]:
meta["leeds"]["et"] = {}

geoms = gpd.read_parquet(f"{data_folder}/spatial_units_test/tables/et_{opt}.pq")

fit_and_eval(geoms, opt, "et", et)

meta

 There are 31 disconnected components.
 There are 5 islands with ids: 3292, 3756, 3779, 3974, 9182.




 There are 36 disconnected components.
 There are 7 islands with ids: 1747, 2434, 3492, 3991, 5602, 6705, 8291.




 There are 2 disconnected components.
 There is 1 island with id: 3513.
 There are 33 disconnected components.
 There are 4 islands with ids: 3049, 6101, 6513, 6621.




 There are 31 disconnected components.
 There are 3 islands with ids: 1492, 3167, 5032.




 There are 32 disconnected components.
 There are 5 islands with ids: 1274, 3279, 3400, 7057, 9805.




{'leeds': {'h3': {'air': {'loop_0': {'mse': 0.84460401056791,
     'me': 0.6436981864136232,
     'r2': 0.7623934197709308,
     'moran_obs': 0.9363669447076342,
     'moran_pred': 0.9166598268930258,
     'mse_ET': 1.518356666930176,
     'me_ET': 0.8968138773118687,
     'r2_ET': 0.6504683895155177},
    'loop_1': {'mse': 1.02636924959567,
     'me': 0.7373477412766667,
     'r2': 0.6849255600114099,
     'moran_obs': 0.9307507718954314,
     'moran_pred': 0.9114954425947884,
     'mse_ET': 1.2620869853561523,
     'me_ET': 0.8902205069616883,
     'r2_ET': 0.7003543053499883},
    'loop_2': {'mse': 1.0085343332097125,
     'me': 0.7599851805753213,
     'r2': 0.7150641735303911,
     'moran_obs': 0.931455022237344,
     'moran_pred': 0.9245464529450895,
     'mse_ET': 1.615757883882144,
     'me_ET': 1.010752776208472,
     'r2_ET': 0.6818719322134003},
    'loop_3': {'mse': 0.7496983508460958,
     'me': 0.6380400171488202,
     'r2': 0.7694484449836938,
     'moran_obs': 0.9284248

In [109]:
with open(f"{data_folder}/spatial_units_test/meta.json", "w") as f:
    json.dump(meta, f)

## Newcastle

In [110]:
with open(f"{data_folder}/spatial_units_test/meta.json", "r") as f:
    meta = json.load(f)

In [111]:
opt = "newcastle"
meta[opt] = {}

In [112]:
et = gpd.read_parquet(f"{data_folder}/spatial_units_test/tables/et_{opt}.pq")

In [113]:
meta[opt]["h3"] = {}

geoms = gpd.read_parquet(f"{data_folder}/spatial_units_test/tables/h3_{opt}.pq")

fit_and_eval(geoms, opt, "h3", et)

meta

 There are 6 disconnected components.
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  super().__setitem__(key, value)
 There are 11 disconnected components.
 There is 1 island with id: 935.




 There are 7 disconnected components.
 There are 3 islands with ids: 24, 470, 602.




 There are 10 disconnected components.
 There are 7 disconnected components.


{'leeds': {'h3': {'air': {'loop_0': {'mse': 0.84460401056791,
     'me': 0.6436981864136232,
     'r2': 0.7623934197709308,
     'moran_obs': 0.9363669447076342,
     'moran_pred': 0.9166598268930258,
     'mse_ET': 1.518356666930176,
     'me_ET': 0.8968138773118687,
     'r2_ET': 0.6504683895155177},
    'loop_1': {'mse': 1.02636924959567,
     'me': 0.7373477412766667,
     'r2': 0.6849255600114099,
     'moran_obs': 0.9307507718954314,
     'moran_pred': 0.9114954425947884,
     'mse_ET': 1.2620869853561523,
     'me_ET': 0.8902205069616883,
     'r2_ET': 0.7003543053499883},
    'loop_2': {'mse': 1.0085343332097125,
     'me': 0.7599851805753213,
     'r2': 0.7150641735303911,
     'moran_obs': 0.931455022237344,
     'moran_pred': 0.9245464529450895,
     'mse_ET': 1.615757883882144,
     'me_ET': 1.010752776208472,
     'r2_ET': 0.6818719322134003},
    'loop_3': {'mse': 0.7496983508460958,
     'me': 0.6380400171488202,
     'r2': 0.7694484449836938,
     'moran_obs': 0.9284248

In [114]:
with open(f"{data_folder}/spatial_units_test/meta.json", "w") as f:
    json.dump(meta, f)

In [115]:
meta[opt]["square"] = {}

geoms = gpd.read_parquet(f"{data_folder}/spatial_units_test/tables/square_{opt}.pq")

fit_and_eval(geoms, opt, "square", et)

meta

 There are 7 disconnected components.
 There is 1 island with id: 1086.




A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  super().__setitem__(key, value)
 There are 10 disconnected components.
 There are 4 disconnected components.
 There are 10 disconnected components.
 There is 1 island with id: 815.




 There are 7 disconnected components.


{'leeds': {'h3': {'air': {'loop_0': {'mse': 0.84460401056791,
     'me': 0.6436981864136232,
     'r2': 0.7623934197709308,
     'moran_obs': 0.9363669447076342,
     'moran_pred': 0.9166598268930258,
     'mse_ET': 1.518356666930176,
     'me_ET': 0.8968138773118687,
     'r2_ET': 0.6504683895155177},
    'loop_1': {'mse': 1.02636924959567,
     'me': 0.7373477412766667,
     'r2': 0.6849255600114099,
     'moran_obs': 0.9307507718954314,
     'moran_pred': 0.9114954425947884,
     'mse_ET': 1.2620869853561523,
     'me_ET': 0.8902205069616883,
     'r2_ET': 0.7003543053499883},
    'loop_2': {'mse': 1.0085343332097125,
     'me': 0.7599851805753213,
     'r2': 0.7150641735303911,
     'moran_obs': 0.931455022237344,
     'moran_pred': 0.9245464529450895,
     'mse_ET': 1.615757883882144,
     'me_ET': 1.010752776208472,
     'r2_ET': 0.6818719322134003},
    'loop_3': {'mse': 0.7496983508460958,
     'me': 0.6380400171488202,
     'r2': 0.7694484449836938,
     'moran_obs': 0.9284248

In [116]:
with open(f"{data_folder}/spatial_units_test/meta.json", "w") as f:
    json.dump(meta, f)

In [117]:
meta[opt]["oa"] = {}

geoms = gpd.read_parquet(f"{data_folder}/spatial_units_test/tables/oa_{opt}.pq")

fit_and_eval(geoms, opt, "oa", et)

meta

  inp, res = gdf.sindex.query_bulk(gdf.geometry, predicate=predicate)
  inp, res = gdf.sindex.query_bulk(gdf.geometry, predicate=predicate)
  inp, res = gdf.sindex.query_bulk(gdf.geometry, predicate=predicate)
  inp, res = gdf.sindex.query_bulk(gdf.geometry, predicate=predicate)
  inp, res = gdf.sindex.query_bulk(gdf.geometry, predicate=predicate)
  inp, res = gdf.sindex.query_bulk(gdf.geometry, predicate=predicate)
  inp, res = gdf.sindex.query_bulk(gdf.geometry, predicate=predicate)
  inp, res = gdf.sindex.query_bulk(gdf.geometry, predicate=predicate)
  inp, res = gdf.sindex.query_bulk(gdf.geometry, predicate=predicate)
  inp, res = gdf.sindex.query_bulk(gdf.geometry, predicate=predicate)


{'leeds': {'h3': {'air': {'loop_0': {'mse': 0.84460401056791,
     'me': 0.6436981864136232,
     'r2': 0.7623934197709308,
     'moran_obs': 0.9363669447076342,
     'moran_pred': 0.9166598268930258,
     'mse_ET': 1.518356666930176,
     'me_ET': 0.8968138773118687,
     'r2_ET': 0.6504683895155177},
    'loop_1': {'mse': 1.02636924959567,
     'me': 0.7373477412766667,
     'r2': 0.6849255600114099,
     'moran_obs': 0.9307507718954314,
     'moran_pred': 0.9114954425947884,
     'mse_ET': 1.2620869853561523,
     'me_ET': 0.8902205069616883,
     'r2_ET': 0.7003543053499883},
    'loop_2': {'mse': 1.0085343332097125,
     'me': 0.7599851805753213,
     'r2': 0.7150641735303911,
     'moran_obs': 0.931455022237344,
     'moran_pred': 0.9245464529450895,
     'mse_ET': 1.615757883882144,
     'me_ET': 1.010752776208472,
     'r2_ET': 0.6818719322134003},
    'loop_3': {'mse': 0.7496983508460958,
     'me': 0.6380400171488202,
     'r2': 0.7694484449836938,
     'moran_obs': 0.9284248

In [118]:
with open(f"{data_folder}/spatial_units_test/meta.json", "w") as f:
    json.dump(meta, f)

In [None]:
meta[opt]["et"] = {}

geoms = gpd.read_parquet(f"{data_folder}/spatial_units_test/tables/et_{opt}.pq")

fit_and_eval(geoms, opt, "et", et)

meta

 There are 13 disconnected components.
 There are 9 disconnected components.
 There are 2 islands with ids: 4544, 6347.




In [None]:
with open(f"{data_folder}/spatial_units_test/meta.json", "w") as f:
    json.dump(meta, f)