# House price model training

Training of the final house price model based on England-wide training data.

In [1]:
import os
import pickle

import geopandas as gpd
import libpysal
import numpy as np
import pandas as pd
from sklearn.ensemble import HistGradientBoostingRegressor
from sklearn.metrics import mean_squared_error
from sklearn.model_selection import cross_val_predict


import os
os.environ['USE_PYGEOS'] = '0'
import geopandas

In a future release, GeoPandas will switch to using Shapely by default. If you are using PyGEOS directly (calling PyGEOS functions on geometries from GeoPandas), this will then stop working and you are encouraged to migrate from PyGEOS to Shapely 2.0 (https://shapely.readthedocs.io/en/latest/migration_pygeos.html).
  import geopandas as gpd


In [2]:
# Root folder of the project data. Set the DEMOLAND_DATA_FOLDER environment variable to
# run the notebook on another machine; when it is unset, the original local path is used.
data_folder = os.environ.get(
    "DEMOLAND_DATA_FOLDER",
    "/Users/martin/Library/CloudStorage/OneDrive-SharedLibraries-TheAlanTuringInstitute/Daniel Arribas-Bel - demoland_data",
)

Load the data.

In [3]:
# Read the processed England dataset (attributes plus geometry) from the data folder.
england_parquet = f"{data_folder}/processed/oa_data_england.parquet"
data = gpd.read_parquet(england_parquet)

Keep only the explanatory variables by dropping the geometry and the `air_quality` and `house_price` columns.

In [4]:
# Drop the geometry, air quality and house price columns; the remaining columns are the
# explanatory variables.
exvars = data.drop(["geometry", "air_quality", "house_price"], axis="columns")

Create Queen contiguity weights of order 5 (including all lower orders), the order identified as optimal.

In [9]:
# First-order Queen contiguity built from the geometries in `data`.
queen = libpysal.weights.Queen.from_dataframe(data)

# Extend the neighbourhood to order 5; lower_order=True keeps orders 1-4 as neighbours too.
queen5 = libpysal.weights.higher_order(
    queen,
    k=5,
    lower_order=True,
)

 There are 51 disconnected components.
 There are 19 islands with ids: 1676, 2132, 3036, 21306, 33133, 34428, 42635, 42654, 68863, 72800, 74393, 105153, 108399, 134057, 140649, 141143, 141475, 144653, 149708.


Compute spatial lag.

In [10]:
# Row-standardise the weights so that each lag is an average over the neighbours.
queen5.transform = "r"

# Lag only the original variables. A column named "<x>_lag" whose base column "<x>" is
# also present is treated as the output of a previous run of this cell and is skipped, so
# re-running the cell refreshes the lags instead of stacking "<x>_lag_lag" columns.
lag_suffix = "_lag"
present_columns = set(exvars.columns)
base_columns = [
    col
    for col in exvars.columns
    if not (
        str(col).endswith(lag_suffix)
        and str(col)[: -len(lag_suffix)] in present_columns
    )
]

# base_columns is a snapshot, so adding columns inside the loop does not affect iteration.
for col in base_columns:
    exvars[f"{col}_lag"] = libpysal.weights.spatial_lag.lag_spatial(queen5, exvars[col])



Create a mask that excludes rows with a missing house price from training.

In [11]:
# True for rows where a house price is available.
mask = ~data["house_price"].isna()

Initialise the model.

In [12]:
# Histogram-based gradient boosting regressor with a fixed seed so that training is repeatable.
regressor = HistGradientBoostingRegressor(
    max_iter=1000,
    max_bins=128,
    random_state=0,
)


Train the model.

In [14]:
# Fit on the masked rows only; the target is the natural logarithm of the house price.
regressor.fit(
    exvars.loc[mask],
    np.log(data.loc[mask, "house_price"]),
)

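Test the prediction on the first ten rows. The predicted values are on the log scale, matching the log-transformed target used in training.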

In [15]:
# Sanity check: predict for the first ten rows of the feature table.
regressor.predict(exvars.head(10))

array([7.43681357, 7.52412204, 7.34451913, 7.64584292, 7.45234704,
       7.7307261 , 7.74621834, 7.5703991 , 7.53491617, 7.4739106 ])

Save to file.

In [None]:
# Serialise the fitted regressor into the models directory of the data folder.
model_path = f"{data_folder}/models/house_price_predictor_england_no_london.pickle"
with open(model_path, "wb") as model_file:
    pickle.dump(regressor, model_file)