# Nearest living postcode
UK postcodes are sometimes retired (terminated). This notebook finds the closest 'living' postcode to each retired one, so that it can be used instead.
***

In [2]:
import pandas as pd
import os
import warnings
warnings.filterwarnings("ignore")
from AZURE_VARS import *

In [4]:
# Read only the four ONSPD columns this notebook uses: the postcode string,
# `doterm` (missing for live postcodes, per the split in the next cell) and
# the two grid-coordinate columns.
usecols = ["pcds", "doterm", "oseast1m", "osnrth1m"]
onspd = pd.read_csv(
    "ONSPD_FEB_2022_UK.csv",
    usecols=usecols,
)
onspd.head()

Unnamed: 0,pcds,doterm,oseast1m,osnrth1m
0,AB1 0AA,199606.0,385386.0,801193.0
1,AB1 0AB,199606.0,385177.0,801314.0
2,AB1 0AD,199606.0,385053.0,801092.0
3,AB1 0AE,199606.0,384600.0,799300.0
4,AB1 0AF,199207.0,384460.0,800660.0


In [5]:
# A postcode counts as 'live' when `doterm` is missing, and as terminated otherwise.
live = onspd[onspd.doterm.isna()]
# Take an explicit copy: later cells add columns to `terminated`, and assigning
# into a filtered slice of `onspd` triggers pandas' SettingWithCopyWarning.
terminated = onspd[onspd.doterm.notna()].copy()

In [6]:
# Row counts of the two subsets, live first, one per line.
print(len(live), len(terminated), sep="\n")

1785448
887570


In [7]:
# kdtree will make this fast
from scipy.spatial import cKDTree
import numpy as np

In [8]:
# reference points = grid coordinates of live postcodes (the KD-tree is built on these)
# other points = grid coordinates of terminated postcodes (each gets matched to a live one)
# Selecting the two columns and calling to_numpy() yields the same (n, 2) float
# array as zipping the columns row by row, without a Python-level loop over
# millions of rows. Rows with missing coordinates stay as NaN.
ref_points = live[['oseast1m', 'osnrth1m']].to_numpy(dtype=float)
other_points = terminated[['oseast1m', 'osnrth1m']].to_numpy(dtype=float)

In [9]:
# Sanity check: the point arrays should have as many rows as their source frames.
for points in (ref_points, other_points):
    print(len(points))

1785448
887570


In [10]:
# Build the KD-tree over the live-postcode coordinates.
# NOTE(review): live rows without coordinates would become NaN points here;
# recent SciPy releases may reject non-finite data — confirm against the
# installed version.
kdtree = cKDTree(data=ref_points)

In [11]:
# Find the single nearest live point for every terminated point.
# Terminated rows without a grid reference are NaN query points, which recent
# SciPy releases reject with a ValueError. Query only the finite rows and fill
# the rest with SciPy's "no neighbour" sentinels (distance inf, index
# kdtree.n), which is what the later postcode lookup expects for such rows.
query_points = np.asarray(other_points, dtype=float).reshape(-1, 2)
has_coords = np.isfinite(query_points).all(axis=1)
distances = np.full(len(query_points), np.inf)
index = np.full(len(query_points), kdtree.n, dtype=np.intp)
distances[has_coords], index[has_coords] = kdtree.query(query_points[has_coords], k=1)

In [12]:
# Expect one distance per terminated postcode.
np.shape(distances)

(887570,)

In [13]:
# Expect one nearest-neighbour index per terminated postcode.
np.shape(index)

(887570,)

In [15]:
import matplotlib.pyplot as plt

In [16]:
# Store, for each terminated postcode, the KD-tree position of its nearest
# live postcode. assign() binds a new frame with the extra column to the same name.
terminated = terminated.assign(live_index=index)

In [17]:
# Store the distance from each terminated postcode to its nearest live one
# (same units as the coordinate columns). assign() binds a new frame to the same name.
terminated = terminated.assign(live_distance=distances)

In [18]:
# Inspect the terminated postcodes with the new live_index / live_distance columns.
terminated

Unnamed: 0,pcds,doterm,oseast1m,osnrth1m,live_index,live_distance
0,AB1 0AA,199606.0,385386.0,801193.0,2123,51.478151
1,AB1 0AB,199606.0,385177.0,801314.0,2109,116.965807
2,AB1 0AD,199606.0,385053.0,801092.0,2110,154.207652
3,AB1 0AE,199606.0,384600.0,799300.0,1846,34.000000
4,AB1 0AF,199207.0,384460.0,800660.0,2243,36.687873
...,...,...,...,...,...,...
2673001,ZE2 9YQ,200601.0,447759.0,1141280.0,1784858,0.000000
2673002,ZE2 9YR,200507.0,436936.0,1136065.0,1785435,21.540659
2673003,ZE2 9YZ,201502.0,417320.0,1159962.0,1785324,540.707869
2673004,ZE2 9ZG,201111.0,462222.0,1208843.0,1785173,62.801274


In [19]:
%%time
live_postcodes = list(live.pcds.values)
# last index is for 'inf' distance corresponding to no coords
live_postcodes.append('')
nearest_live_pcus = [live_postcodes[i] for i in index]

CPU times: user 157 ms, sys: 2.86 ms, total: 160 ms
Wall time: 158 ms


In [20]:
# Attach the nearest live postcode ('' where none was found) as a new column.
# assign() binds a new frame with the extra column to the same name.
terminated = terminated.assign(live_pcu=nearest_live_pcus)

In [21]:
# How many terminated postcodes ended up with the '' placeholder?
terminated['live_pcu'].eq('').sum()

10657

In [22]:
# Inspect the terminated postcodes again, now including the live_pcu column.
terminated

Unnamed: 0,pcds,doterm,oseast1m,osnrth1m,live_index,live_distance,live_pcu
0,AB1 0AA,199606.0,385386.0,801193.0,2123,51.478151,AB13 0DL
1,AB1 0AB,199606.0,385177.0,801314.0,2109,116.965807,AB13 0AB
2,AB1 0AD,199606.0,385053.0,801092.0,2110,154.207652,AB13 0AD
3,AB1 0AE,199606.0,384600.0,799300.0,1846,34.000000,AB12 5FF
4,AB1 0AF,199207.0,384460.0,800660.0,2243,36.687873,AB14 0QL
...,...,...,...,...,...,...,...
2673001,ZE2 9YQ,200601.0,447759.0,1141280.0,1784858,0.000000,ZE1 0EH
2673002,ZE2 9YR,200507.0,436936.0,1136065.0,1785435,21.540659,ZE2 9XW
2673003,ZE2 9YZ,201502.0,417320.0,1159962.0,1785324,540.707869,ZE2 9PW
2673004,ZE2 9ZG,201111.0,462222.0,1208843.0,1785173,62.801274,ZE2 9DP


In [23]:
# Show the rows that got the '' placeholder, to check that they have no
# coordinates and an infinite distance.
unmatched = terminated['live_pcu'].eq('')
terminated.loc[unmatched]

Unnamed: 0,pcds,doterm,oseast1m,osnrth1m,live_index,live_distance,live_pcu
3551,AB11 3AG,199707.0,,,1785448,inf,
12745,AB23 9AA,201007.0,,,1785448,inf,
12747,AB23 9AD,200912.0,,,1785448,inf,
12748,AB23 9AE,200912.0,,,1785448,inf,
12750,AB23 9AG,201102.0,,,1785448,inf,
...,...,...,...,...,...,...,...
2664119,YO42 9AB,200911.0,,,1785448,inf,
2669970,YO7 9AE,200910.0,,,1785448,inf,
2669971,YO7 9AF,201005.0,,,1785448,inf,
2669972,YO7 9AG,201105.0,,,1785448,inf,


In [24]:
# Attach the nearest-live results to the full table; rows for live postcodes
# have no match in `terminated` and therefore get NaN in both new columns.
# Any earlier copies of the result columns are dropped first so the cell can be
# re-run safely; otherwise a second run produces live_distance_x /
# live_distance_y and breaks the cells that read `live_distance`.
result_cols = ['live_distance', 'live_pcu']
onspd = (
    onspd
    .drop(columns=result_cols, errors='ignore')
    .merge(terminated[['pcds'] + result_cols], on='pcds', how='left')
)

In [25]:
# Inspect the merged table: every postcode, with live_distance / live_pcu filled for terminated ones.
onspd

Unnamed: 0,pcds,doterm,oseast1m,osnrth1m,live_distance,live_pcu
0,AB1 0AA,199606.0,385386.0,801193.0,51.478151,AB13 0DL
1,AB1 0AB,199606.0,385177.0,801314.0,116.965807,AB13 0AB
2,AB1 0AD,199606.0,385053.0,801092.0,154.207652,AB13 0AD
3,AB1 0AE,199606.0,384600.0,799300.0,34.000000,AB12 5FF
4,AB1 0AF,199207.0,384460.0,800660.0,36.687873,AB14 0QL
...,...,...,...,...,...,...
2673013,ZE3 9JW,,438975.0,1110038.0,,
2673014,ZE3 9JX,,438872.0,1110219.0,,
2673015,ZE3 9JY,,438498.0,1112029.0,,
2673016,ZE3 9JZ,,438662.0,1112122.0,,


In [26]:
# Terminated postcodes without coordinates came back with an infinite
# distance; blank those out as NaN.
no_match = np.isinf(onspd['live_distance'])
onspd['live_distance'] = onspd['live_distance'].mask(no_match)

In [27]:
# Spot-check postcodes starting with one of these area prefixes
# (presumably Guernsey, Jersey, Isle of Man and Northern Ireland — confirm).
islands = ('GY', 'JE', 'IM', 'BT')
in_islands = onspd['pcds'].str.startswith(islands)
onspd.loc[in_islands]

Unnamed: 0,pcds,doterm,oseast1m,osnrth1m,live_distance,live_pcu
303207,BT1 1AA,,334316.0,374675.0,,
303208,BT1 1AE,200204.0,333720.0,374670.0,5.385165,BT1 1DL
303209,BT1 1AF,200111.0,333720.0,374670.0,5.385165,BT1 1DL
303210,BT1 1AG,199706.0,334302.0,374705.0,0.000000,BT1 1AR
303211,BT1 1AH,200705.0,,,,
...,...,...,...,...,...,...
1133503,JE5 0LG,201009.0,,,,
1133504,JE5 0LH,201009.0,,,,
1133505,JE5 0LJ,201009.0,,,,
1133506,JE5 0LL,201009.0,,,,


In [28]:
# Save the full table (live and terminated postcodes) to CSV; the DataFrame
# index is written as the first column.
onspd.to_csv(path_or_buf='nearest_living_relation_feb22.csv')

CPU times: user 7.63 s, sys: 32.1 ms, total: 7.66 s
Wall time: 14.8 s
