# Air quality model training

Training of the final air quality model based on England-wide training data. We use all the urbanities and the complete Tyne and Wear data.

In [5]:
import os

import geopandas as gpd
import joblib
import libpysal
import numpy as np
import pandas as pd
from sklearn.ensemble import HistGradientBoostingRegressor
from sklearn.metrics import mean_squared_error
from sklearn.model_selection import cross_val_predict

In [3]:
# Root of the shared project data folder. Set the DEMOLAND_DATA_FOLDER
# environment variable to point at a local copy; when it is unset, the
# original OneDrive location is used so existing setups keep working.
data_folder = os.environ.get(
    "DEMOLAND_DATA_FOLDER",
    "/Users/martin/Library/CloudStorage/OneDrive-SharedLibraries-TheAlanTuringInstitute/Daniel Arribas-Bel - demoland_data",
)

Load the data

In [4]:
# Read the processed output-area table and turn its index into a regular column.
oa_data_path = f"{data_folder}/processed/oa_data_england.parquet"
data = gpd.read_parquet(oa_data_path).reset_index()

In [16]:
# Preview the loaded table to check its columns and values.
data

Unnamed: 0,OA11CD,geometry,air_quality,house_price,population,"A, B, D, E. Agriculture, energy and water",C. Manufacturing,F. Construction,"G, I. Distribution, hotels and restaurants","H, J. Transport and communication",...,sdsAre,sisBpM,misCel,ltcRea,ldeAre,lseCCo,lseERI,lteOri,lteWNB,lieWCe
0,E00024141,"POLYGON ((371069.995 412699.817, 371071.042 41...",15.934102,1922.197510,332,0.990638,6.726873,11.046278,6.136546,2.492399,...,13975.666597,0.056219,19.954050,35.246575,1.256267e+06,0.319418,0.816373,20.132149,0.015252,0.000419
1,E00024142,"POLYGON ((372065.268 412451.708, 372209.845 41...",15.448224,1906.095472,248,0.001346,3.223105,5.636509,3.545102,1.128334,...,15410.342389,0.100076,28.093151,44.830189,3.925836e+05,0.449345,0.710619,2.959019,0.014953,0.001140
2,E00024143,"POLYGON ((371660.000 411501.000, 371713.600 41...",16.867974,1637.266845,344,0.012007,16.303660,5.735694,25.856348,10.764267,...,29083.154721,0.091624,32.059264,41.927536,2.570456e+05,0.376142,0.710968,0.725296,0.015635,0.000501
3,E00024144,"POLYGON ((372060.628 413005.554, 372135.500 41...",15.428515,2309.758910,267,0.835283,5.659664,9.330154,5.139762,2.095734,...,16525.316453,0.136274,41.319858,36.449275,5.520947e+05,0.343661,0.540198,2.378640,0.009774,0.000410
4,E00024145,"POLYGON ((371044.000 412456.000, 371054.856 41...",15.934103,1892.330673,342,1.115215,10.030827,14.307288,10.770798,3.715121,...,20519.501061,0.090342,34.058382,35.644444,3.432917e+04,0.365621,0.832452,13.366394,0.018709,0.000634
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
150957,W00010260,"POLYGON ((267910.000 198012.000, 267999.940 19...",14.179153,1632.955752,388,110.687501,300.948318,27.233474,287.566966,110.478280,...,46212.801285,0.090101,25.266093,46.838235,2.605957e+05,0.390833,0.828420,22.939585,0.007081,0.000552
150958,W00010261,"POLYGON ((262156.208 196600.223, 262074.703 19...",13.828992,1834.401969,350,0.159798,1.538887,3.336814,4.093612,2.150986,...,26908.056789,0.086915,38.950354,37.342857,1.152950e+05,0.481360,0.734035,38.749182,0.010361,0.000873
150959,W00010262,"POLYGON ((263241.217 197440.210, 263271.904 19...",13.670047,1511.677538,255,0.461842,4.608673,10.359996,12.534188,6.675875,...,132838.218299,0.047589,25.111880,48.400000,5.917093e+05,0.282517,0.679996,15.017141,0.004100,0.001458
150960,W00010263,"POLYGON ((262156.208 196600.223, 262205.269 19...",14.125627,1679.802956,266,0.143086,1.394383,3.084563,3.854233,1.985713,...,32281.163838,0.081168,34.711174,44.390625,1.653941e+05,0.432894,0.656936,42.673309,0.005929,0.000321


Keep only the explanatory variables.

In [6]:
# The identifier, the geometry and the air_quality / house_price columns are
# excluded; every remaining column is treated as an explanatory variable.
non_explanatory_columns = ["OA11CD", "geometry", "air_quality", "house_price"]
exvars = data.drop(columns=non_explanatory_columns)

Create the spatial weights identified as optimal: the union of Queen contiguity and a distance band with a threshold of 2000.

In [7]:
# Contiguity graph built from the polygons in `data`.
# `use_index=False` pins positional ids (0..n-1) explicitly instead of relying
# on a default that libpysal has announced will change; `data` carries a
# RangeIndex after `reset_index()`, so the ids match the distance graph below.
queen = libpysal.weights.Queen.from_dataframe(data, use_index=False)
# Distance graph with a threshold of 2000 (in the units of the data's CRS).
dist2000 = libpysal.weights.DistanceBand.from_dataframe(data, 2000)
# Final weights: the union of the two graphs.
W = libpysal.weights.w_union(queen, dist2000)

  queen = libpysal.weights.Queen.from_dataframe(data)
 There are 51 disconnected components.
 There are 19 islands with ids: 1676, 2132, 3036, 21306, 33133, 34428, 42635, 42654, 68863, 72800, 74393, 105153, 108399, 134057, 140649, 141143, 141475, 144653, 149708.
 There are 5289 disconnected components.
 There are 3592 islands with ids: 2922, 2953, 3591, 12424, 12425, 12427, 12742, 12744, 13181, 13182, 13278, 13409, 13960, 14168, 14924, 15712, 15729, 15731, 15733, 28156, 28170, 28272, 28288, 28309, 28480, 28638, 28648, 29700, 31495, 31496, 33251, 33357, 34067, 34099, 34140, 34428, 34748, 35053, 35055, 35056, 35057, 35217, 35218, 36381, 36383, 37762, 37767, 37769, 37786, 37804, 37809, 38033, 38074, 38075, 38076, 38077, 38086, 38087, 38088, 38089, 38091, 38094, 38099, 38104, 38115, 38116, 38117, 38118, 38119, 38121, 38236, 38237, 38251, 38252, 38260, 38261, 38277, 38279, 38283, 38294, 38295, 38296, 38318, 38320, 38322, 38323, 38324, 38335, 38339, 38340, 38392, 38393, 38422, 38469, 38547, 

Compute spatial lag.

In [8]:
# Row-standardise the weights before computing the lags.
W.transform = "r"

# Only lag the original variables. Skipping columns that already carry the
# `_lag` suffix keeps the cell idempotent: re-running it does not create
# `*_lag_lag` columns.
base_columns = [col for col in exvars.columns if not str(col).endswith("_lag")]

# Build all lagged columns first and attach them in a single concat; inserting
# them one by one fragments the DataFrame and triggers pandas'
# PerformanceWarning on wide tables.
lagged = pd.DataFrame(
    {
        f"{col}_lag": libpysal.weights.spatial_lag.lag_spatial(W, exvars[col])
        for col in base_columns
    },
    index=exvars.index,
)

# Column order matches the original loop: base variables, then their lags.
exvars = pd.concat([exvars[base_columns], lagged], axis=1)



Create a filter to use only urbanities and Tyne and Wear.

In [9]:
# Output areas that make up Tyne and Wear.
tyne_wear = gpd.read_file(f"{data_folder}/processed/OA_TyneWear.gpkg")
# Lookup table providing a `primary_type` label per OA11CD.
signature_key = pd.read_csv("https://figshare.com/ndownloader/files/30904894")

# `validate` guarantees the lookup has at most one row per OA11CD, so the
# merged frame keeps exactly the rows (and positional index) of `data` and the
# two masks below stay aligned.
data_w_type = data.merge(
    signature_key[["OA11CD", "primary_type"]],
    on="OA11CD",
    how="left",
    validate="many_to_one",
)

# `na=False`: output areas without a match in the lookup have a missing
# `primary_type`; treat them as "not urbanity" instead of letting NaN leak
# into the boolean mask.
mask_urbanity = data_w_type.primary_type.str.contains("urbanity", na=False)
mask_tw = data.OA11CD.isin(tyne_wear.geo_code)

# Keep a row if it is an urbanity or lies in Tyne and Wear.
mask = mask_urbanity | mask_tw

Initialise the model.

In [10]:
# Model settings gathered in one place; the random seed is fixed.
regressor_params = {
    "random_state": 0,
    "max_bins": 64,
    "max_iter": 1000,
}
regressor = HistGradientBoostingRegressor(**regressor_params)


Train the model.

In [11]:
# Fit on the masked subset only: features from `exvars`, target `air_quality`.
X_train = exvars.loc[mask]
y_train = data.loc[mask, "air_quality"]
regressor.fit(X_train, y_train)

Test the prediction.

In [None]:
# Smoke test: predictions for the first ten rows of the feature table.
sample_rows = exvars.head(10)
regressor.predict(sample_rows)

array([16.08597591, 16.18028682, 16.75840614, 16.83340318, 16.94072672,
       16.30293912, 16.42741979, 16.588092  , 16.49096282, 16.6718441 ])

Save to file.

In [13]:
model_dir = f"{data_folder}/models"
# Create the output directory up front so a finished training run is not lost
# to a FileNotFoundError when the folder does not exist yet.
os.makedirs(model_dir, exist_ok=True)
with open(f"{model_dir}/air_quality_model.joblib", "wb") as f:
    joblib.dump(regressor, f)