# Training data sampled from the whole of England

In [13]:
import datetime
import os

import geopandas as gpd
import numpy as np
import pandas as pd
import pooch
import tobler
import xarray as xr

Specify a path to the data folder.

In [2]:
# Root of the shared data folder. Every cell below builds its paths from this value.
# Set the DEMOLAND_DATA_FOLDER environment variable to point the notebook at another
# copy of the data; without it the original location is used.
data_folder = os.environ.get(
    "DEMOLAND_DATA_FOLDER",
    "/Users/martin/Library/CloudStorage/OneDrive-SharedLibraries-TheAlanTuringInstitute/Daniel Arribas-Bel - demoland_data",
)

Load 2011 Output Area geometry (generalised).

In [6]:
# Output Area boundaries, read straight from the zipped file in the raw data folder.
oa = gpd.read_file(
    f"{data_folder}/raw/Output_Areas_Dec_2011_Boundaries_EW_BGC_2022.zip"
)

## Air Quality

In [14]:
def _read_pcm_grid(url):
    """Read one pollutant grid CSV from the uk-air.defra.gov.uk datastore.

    Column names are taken from row 5 of the file (``header=5``), values written
    as ``MISSING`` become NaN, the table is indexed by the ``x``/``y`` columns,
    the ``gridcode`` column is discarded and the result is returned as an
    xarray Dataset.
    """
    table = pd.read_csv(url, header=5, na_values=["MISSING"])
    return table.set_index(["x", "y"]).drop(columns="gridcode").to_xarray()


pm10_21 = _read_pcm_grid("https://uk-air.defra.gov.uk/datastore/pcm/mappm102021g.csv")
pm25_21 = _read_pcm_grid("https://uk-air.defra.gov.uk/datastore/pcm/mappm252021g.csv")
no2_21 = _read_pcm_grid("https://uk-air.defra.gov.uk/datastore/pcm/mapno22021.csv")
so2_21 = _read_pcm_grid("https://uk-air.defra.gov.uk/datastore/pcm/mapso22021.csv")

# Combine the four single-pollutant grids into one Dataset and display it.
pollutants_2021 = xr.merge([pm10_21, pm25_21, no2_21, so2_21])
pollutants_2021

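We can compute the air quality index as a weighted sum of the four pollutants: $\mathrm{AQI} = \mathrm{PM}_{2.5} + \frac{\mathrm{PM}_{10}}{2} + \frac{\mathrm{NO}_2}{4} + \frac{\mathrm{SO}_2}{10}$.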

In [15]:
# Weighted sum of the four pollutant variables; the terms are added in the
# order PM2.5, PM10 / 2, NO2 / 4, SO2 / 10.
aqi = (
    pollutants_2021["pm252021g"]
    + pollutants_2021["pm102021g"] / 2
    + pollutants_2021["no22021"] / 4
    + pollutants_2021["so22021"] / 10
)
# Store the index alongside the pollutants it was derived from.
pollutants_2021 = pollutants_2021.assign(aqi=aqi)

We convert the array to a GeoDataFrame with square polygons representing the grid cells. This is needed for the areal interpolation to Output Areas below.

In [16]:
# Flatten the Dataset to one row per (x, y) pair, with x and y as ordinary columns.
pollutants_2021 = pollutants_2021.to_dataframe().reset_index()

# Turn every (x, y) point into a square polygon: a 500-unit buffer with square caps
# (cap_style=3) in EPSG:27700.
cell_points = gpd.points_from_xy(pollutants_2021.x, pollutants_2021.y, crs=27700)
cell_polygons = cell_points.buffer(500, cap_style=3)
pollutants_2021 = gpd.GeoDataFrame(pollutants_2021, geometry=cell_polygons)

In [17]:
# Save the polygon grid with all pollutants and the index for later reuse.
pollutants_2021.to_parquet(
    f"{data_folder}/processed/air_quality/air_quality_grid_2021_england.parquet"
)

In [19]:
# Display the gridded table (pollutants, index and cell geometry).
pollutants_2021

Unnamed: 0,x,y,pm102021g,pm252021g,no22021,so22021,aqi,geometry
0,-500,5500,,,,,,"POLYGON ((0.000 6000.000, 0.000 5000.000, -100..."
1,-500,6500,,,,,,"POLYGON ((0.000 7000.000, 0.000 6000.000, -100..."
2,-500,7500,,,,,,"POLYGON ((0.000 8000.000, 0.000 7000.000, -100..."
3,-500,8500,,,,,,"POLYGON ((0.000 9000.000, 0.000 8000.000, -100..."
4,-500,9500,,,,,,"POLYGON ((0.000 10000.000, 0.000 9000.000, -10..."
...,...,...,...,...,...,...,...,...
768685,655500,1215500,,,,,,"POLYGON ((656000.000 1216000.000, 656000.000 1..."
768686,655500,1216500,,,,,,"POLYGON ((656000.000 1217000.000, 656000.000 1..."
768687,655500,1217500,,,,,,"POLYGON ((656000.000 1218000.000, 656000.000 1..."
768688,655500,1218500,,,,,,"POLYGON ((656000.000 1219000.000, 656000.000 1..."


In [20]:
%%time
interp = tobler.area_weighted.area_interpolate(pollutants_2021, oa, intensive_variables=["aqi"])

  return lib.intersection(a, b, **kwargs)
  warn(f"nan values in variable: {column}, replacing with 0")


CPU times: user 17.2 s, sys: 215 ms, total: 17.4 s
Wall time: 17.4 s


In [24]:
# Attach the interpolated index to the OA table (the assignment aligns on the row index).
oa["air_quality"] = interp["aqi"]

## House price

In [None]:
# It is 6.3GB...
linked_epc_path = "https://reshare.ukdataservice.ac.uk/854942/1/tranall2011_19.csv"

# Only the transfer date, the OA code and the price per unit are used below, so parse
# just those three columns instead of holding the whole file in memory.
epc = pd.read_csv(linked_epc_path, usecols=["dateoftransfer", "oa11", "priceper"])
epc["dateoftransfer"] = pd.to_datetime(epc.dateoftransfer)

# Keep transfers dated after 1 January 2018.
last2years = epc[epc.dateoftransfer > datetime.datetime(2018, 1, 1)]

# Mean price per OA, cached so the large download does not have to be repeated.
price_per_oa = last2years[["oa11", "priceper"]].groupby("oa11").mean().reset_index()
price_per_oa.to_parquet(f"{data_folder}/processed/house_prices/price_per_oa.parquet")

In [26]:
# Reload the cached per-OA mean prices.
price_per_oa = pd.read_parquet(
    f"{data_folder}/processed/house_prices/price_per_oa.parquet"
)

In [27]:
# Display the per-OA mean price table.
price_per_oa

Unnamed: 0,oa11,priceper
0,E00000003,15722.758094
1,E00000005,12587.314452
2,E00000007,12640.262294
3,E00000010,9535.653363
4,E00000012,10341.463415
...,...,...
173553,W00010260,1632.955752
173554,W00010261,1834.401969
173555,W00010262,1511.677538
173556,W00010263,1679.802956


In [29]:
# Display the OA table as it stands before the house prices are joined.
oa

Unnamed: 0,OA11CD,LAD11CD,GlobalID,geometry,air_quality
0,E00000001,E09000001,f8512cce-6727-42cf-9840-6866cdbb2deb,"POLYGON ((532303.492 181814.110, 532213.378 18...",27.501372
1,E00000003,E09000001,9eeeb3aa-ce92-4cea-bd70-3a0e680cddc1,"POLYGON ((532180.131 181763.020, 532155.909 18...",27.501372
2,E00000005,E09000001,012372dd-5e03-43c3-a915-b697d02f88e2,"POLYGON ((532124.321 181682.675, 532127.299 18...",27.501372
3,E00000007,E09000001,b61eb464-9c5b-4f0e-8f78-000ea938ee78,"POLYGON ((532124.321 181682.675, 532201.292 18...",27.545142
4,E00000010,E09000001,efc37450-7064-4f5e-bedd-433bf4de1167,"POLYGON ((532071.302 182159.586, 532127.958 18...",27.587650
...,...,...,...,...,...
181403,W00010261,W06000011,c1051d13-4fa6-48d8-8f32-5020938ce7d6,"POLYGON ((262156.208 196600.223, 262074.703 19...",13.828992
181404,W00010262,W06000011,ea1fef0a-138d-4035-a401-48c459abbca0,"POLYGON ((263241.217 197440.210, 263271.904 19...",13.670047
181405,W00010263,W06000011,38819942-d3db-47e1-a85d-eec4d7a091e4,"POLYGON ((262156.208 196600.223, 262205.269 19...",14.125627
181406,W00010264,W06000011,61fbb0fb-10bf-4000-a669-7114865776ff,"POLYGON ((268829.001 198038.000, 268708.179 19...",14.219191


In [30]:
# Left join: every OA is kept, and those without a price record get NaN.
oa = oa.merge(
    price_per_oa,
    how="left",
    left_on="OA11CD",
    right_on="oa11",
)

In [32]:
# Give the price column its final name and discard the duplicate OA code left by the join.
oa = oa.rename(columns={"priceper": "house_price"}).drop(columns="oa11")

In [36]:
# Checkpoint: OA code, geometry and the two variables computed so far.
target_columns = ["OA11CD", "geometry", "air_quality", "house_price"]
oa[target_columns].to_parquet(f"{data_folder}/processed/oa_data_england.parquet")

## Explanatory variables

We also want to have all our explanatory variables linked to the OA geometries.

### Population estimates

ONS population estimates are reported on the OA level and can be merged.

Read the file processed in the Urban Grammar project.

In [41]:
# Population table from the processed data folder.
pop = pd.read_parquet(
    f"{data_folder}/processed/population.parquet"
)

Attribute join.

In [43]:
# Left join on the OA code, so OAs missing from `pop` are kept.
oa = oa.merge(
    pop,
    how="left",
    left_on="OA11CD",
    right_on="code",
)

In [47]:
# `code` duplicates OA11CD after the join.
oa = oa.drop(columns="code")

### Workplace population

Workplace population is reported on Workplace Zones and needs to be interpolated to OA. We use the preprocessed data from the Urban Grammar project. 

In [49]:
# Workplace population table with geometry, from the raw data folder.
wp = gpd.read_parquet(
    f"{data_folder}/raw/workplace_population/workplace_by_industry_gb.pq"
)

In [50]:
%%time
wp_interpolated = tobler.area_weighted.area_interpolate(wp, oa, extensive_variables=[
    'A, B, D, E. Agriculture, energy and water',
    'C. Manufacturing', 'F. Construction',
    'G, I. Distribution, hotels and restaurants',
    'H, J. Transport and communication',
    'K, L, M, N. Financial, real estate, professional and administrative activities',
    'O,P,Q. Public administration, education and health',
    'R, S, T, U. Other'
])

  return lib.intersection(a, b, **kwargs)


CPU times: user 31.5 s, sys: 351 ms, total: 31.9 s
Wall time: 32 s


In [52]:
# Industry columns to copy from the interpolated workplace table onto `oa`.
industry_columns = [
    'A, B, D, E. Agriculture, energy and water',
    'C. Manufacturing',
    'F. Construction',
    'G, I. Distribution, hotels and restaurants',
    'H, J. Transport and communication',
    'K, L, M, N. Financial, real estate, professional and administrative activities',
    'O,P,Q. Public administration, education and health',
    'R, S, T, U. Other',
]
# The values are assigned positionally (row i of the interpolation result goes to row i
# of `oa`). Select the source columns by name so each value lands under its own label
# regardless of the column order of `wp_interpolated`.
oa[industry_columns] = wp_interpolated[industry_columns].values

In [55]:
# These two identifier columns are not carried into the saved table.
unused_identifiers = ["LAD11CD", "GlobalID"]
oa = oa.drop(columns=unused_identifiers)

In [56]:
# Checkpoint: overwrite the OA table with the population and workplace columns added.
oa.to_parquet(
    f"{data_folder}/processed/oa_data_england.parquet"
)

### Land cover (CORINE)

CORINE is shipped as custom polygons. We use the data downloaded for the Urban Grammar project.

In [57]:
# CORINE land cover polygons from the raw data folder.
corine = gpd.read_parquet(
    f"{data_folder}/raw/land_cover/corine_gb.pq"
)

In [59]:
%%time
corine_interpolated = tobler.area_weighted.area_interpolate(corine, oa, categorical_variables=["Code_18"])

  return lib.intersection(a, b, **kwargs)


CPU times: user 22min 56s, sys: 5.49 s, total: 23min 1s
Wall time: 23min 4s


In [60]:
# Readable labels for the per-class columns of the interpolated CORINE table.
corine_names = {
    'Code_18_124': 'Land cover [Airports]',
    'Code_18_211': 'Land cover [Non-irrigated arable land]',
    'Code_18_121': 'Land cover [Industrial or commercial units]',
    'Code_18_421': 'Land cover [Salt marshes]',
    'Code_18_522': 'Land cover [Estuaries]',
    'Code_18_142': 'Land cover [Sport and leisure facilities]',
    'Code_18_141': 'Land cover [Green urban areas]',
    'Code_18_112': 'Land cover [Discontinuous urban fabric]',
    'Code_18_231': 'Land cover [Pastures]',
    'Code_18_311': 'Land cover [Broad-leaved forest]',
    'Code_18_131': 'Land cover [Mineral extraction sites]',
    'Code_18_123': 'Land cover [Port areas]',
    'Code_18_122': 'Land cover [Road and rail networks and associated land]',
    'Code_18_512': 'Land cover [Water bodies]',
    'Code_18_243': 'Land cover [Land principally occupied by agriculture, with significant areas of natural vegetation]',
    'Code_18_313': 'Land cover [Mixed forest]',
    'Code_18_412': 'Land cover [Peat bogs]',
    'Code_18_321': 'Land cover [Natural grasslands]',
    'Code_18_322': 'Land cover [Moors and heathland]',
    'Code_18_324': 'Land cover [Transitional woodland-shrub]',
    'Code_18_111': 'Land cover [Continuous urban fabric]',
    'Code_18_423': 'Land cover [Intertidal flats]',
    'Code_18_523': 'Land cover [Sea and ocean]',
    'Code_18_312': 'Land cover [Coniferous forest]',
    'Code_18_133': 'Land cover [Construction sites]',
    'Code_18_333': 'Land cover [Sparsely vegetated areas]',
    'Code_18_332': 'Land cover [Bare rocks]',
    'Code_18_411': 'Land cover [Inland marshes]',
    'Code_18_132': 'Land cover [Dump sites]',
    'Code_18_222': 'Land cover [Fruit trees and berry plantations]',
    'Code_18_242': 'Land cover [Complex cultivation patterns]',
    'Code_18_331': 'Land cover [Beaches, dunes, sands]',
    'Code_18_511': 'Land cover [Water courses]',
    'Code_18_334': 'Land cover [Burnt areas]',
    'Code_18_244': 'Land cover [Agro-forestry areas]',
    'Code_18_521': 'Land cover [Coastal lagoons]',
}

# Rename only the columns listed above. Mapping the column index through the dictionary
# would turn every column without an entry (such as the geometry column) into NaN;
# `rename` leaves those columns untouched.
corine_interpolated = corine_interpolated.rename(columns=corine_names)

Assign the subset of classes that may be interesting for our purposes to OA dataframe.

In [61]:
# Land cover classes kept as explanatory variables.
interesting = [
    "Land cover [Discontinuous urban fabric]", "Land cover [Continuous urban fabric]",
    "Land cover [Non-irrigated arable land]", "Land cover [Industrial or commercial units]",
    "Land cover [Green urban areas]", "Land cover [Pastures]",
    "Land cover [Sport and leisure facilities]",
]
# Positional copy: row i of the interpolation result is written to row i of `oa`.
selected_land_cover = corine_interpolated[interesting]
oa[interesting] = selected_land_cover.values

In [63]:
# Checkpoint: overwrite the OA table with the land cover columns added.
oa.to_parquet(
    f"{data_folder}/processed/oa_data_england.parquet"
)

## Morphometrics

Load the morphometric data pre-processed in the Urban Grammar project and join it to the OAs.

In [None]:
# Morphometric table; it is joined through its index, which is matched against OA11CD.
oa_morpho = pd.read_parquet(
    f"{data_folder}/processed/morphometrics_oa.parquet"
)
# No `how` is passed, so this is pandas' default inner join: OAs that have no row
# in `oa_morpho` are dropped from `oa` here.
oa = oa.merge(
    oa_morpho,
    left_on="OA11CD",
    right_index=True,
)

In [63]:
# Checkpoint: overwrite the OA table with the morphometric columns added.
oa.to_parquet(
    f"{data_folder}/processed/oa_data_england.parquet"
)

## Cleanup

Drop unusable rows

In [65]:
# Start the cleanup from the last saved checkpoint rather than from in-memory state.
oa = gpd.read_parquet(
    f"{data_folder}/processed/oa_data_england.parquet"
)

In [68]:
# Columns excluded from the final table.
# NOTE(review): these look like morphometric character codes from the merged table — confirm.
to_drop = [
    "sdbPer", "ssbElo", "stcOri", "sdcLAL",
    "mdcAre", "ltcAre", "ltcWRE", "mtdMDi",
    "lcdMes", "lddNDe", "sddAre", "mdsAre",
    "ldsAre", "lisCel", "ldePer", "lseCWA",
]

oa = oa.drop(columns=to_drop)

In [70]:
# Keep only the OAs that have a value in every remaining column.
oa_clean = oa.dropna(axis=0, how="any")

Remove London data

In [74]:
# London Output Areas, used only for their OA11CD codes.
london = gpd.read_file(
    f"{data_folder}/raw/london/OA_2011_London_gen_MHW.shp"
)

In [80]:
# Exclude every OA whose code appears in the London file.
in_london = oa_clean["OA11CD"].isin(london["OA11CD"])
oa_clean = oa_clean[~in_london]

In [81]:
# Save the cleaned table indexed by OA code. This overwrites the checkpoint file
# that the cleanup section read its input from.
oa_clean.set_index("OA11CD").to_parquet(
    f"{data_folder}/processed/oa_data_england.parquet"
)