In [None]:
import numpy as np
import pandas as pd

# Render up to 50 columns when a DataFrame is displayed.
pd.options.display.max_columns = 50

## National Statistics Postcode Lookup

In [None]:
# Read the National Statistics Postcode Lookup (a .csv.gz file) into a
# DataFrame, then preview its first five rows.
postcodes = pd.read_csv(
    'input/National_Statistics_Postcode_Lookup_UK.csv.gz'
)
postcodes.head(n=5)

### Pick One Postcode

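The three postcode fields differ in their spacing. It looks like `Postcode 3` matches the My EU definition of a 'clean' postcode. The cell below checks this by applying the cleaning rules to `Postcode 1` and asserting that the result equals `Postcode 3` on every row.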

In [None]:
# Rebuild a normalized postcode from `Postcode 1`: upper-case it, trim it,
# drop every character that is not A-Z or 0-9, then insert a single space
# before the trailing digit + two letters (e.g. 'BT11AA' -> 'BT1 1AA').
#
# regex=True is passed explicitly because pandas 2.0 changed the default of
# Series.str.replace to literal matching; without it neither pattern below
# would be applied.
postcodes['postcode'] = (
    postcodes['Postcode 1']
    .str.upper()
    .str.strip()
    .str.replace(r'[^A-Z0-9]', '', regex=True)
    .str.replace(r'^(\S+)([0-9][A-Z]{2})$', r'\1 \2', regex=True)
)

# The normalized value must equal `Postcode 3` on every row.
matches_postcode_3 = postcodes['postcode'] == postcodes['Postcode 3']
assert matches_postcode_3.all(), \
    'normalized `Postcode 1` differs from `Postcode 3` on at least one row'

### Save Useful Fields

In [None]:
# (source column, output column) pairs: the fields kept from the lookup and
# the snake_case names they receive. The list order is the output column order.
column_pairs = [
    ('Postcode 3', 'postcode'),
    ('Parliamentary Constituency Code', 'parliamentary_constituency_code'),
    ('Parliamentary Constituency Name', 'parliamentary_constituency_name'),
    ('Latitude', 'latitude'),
    ('Longitude', 'longitude'),
]

# Select the source columns first, then rename them.
output_postcodes = (
    postcodes[[source for source, target in column_pairs]]
    .rename(columns=dict(column_pairs))
)
output_postcodes.head()

In [None]:
# Order the rows by postcode and confirm that no postcode appears twice.
output_postcodes = output_postcodes.sort_values(by='postcode')
assert output_postcodes['postcode'].is_unique

In [None]:
# Non-null value count for each column of the output table.
output_postcodes.count()

## Postcode to NUTS

In [None]:
# Read the postcode -> NUTS lookup. The file is semicolon-separated and its
# fields are quoted with single quotes. Then preview the first five rows.
postcode_to_nuts = pd.read_csv(
    'input/pc2018_uk_NUTS-2016_v3.0.csv.gz',
    sep=';',
    quotechar="'",
)
postcode_to_nuts.head(n=5)

In [None]:
# Non-null value count for each column of the NUTS lookup.
postcode_to_nuts.count()

The spacing rules for the postcodes are not quite the same, e.g. `BT1  1AA` here vs `BT1 1AA` in the other dataset.

In [None]:
# Normalize CODE into a `postcode` column: upper-case, trim, drop every
# character that is not A-Z or 0-9, then insert a single space before the
# trailing digit + two letters (e.g. 'BT1  1AA' -> 'BT1 1AA').
#
# regex=True is passed explicitly because pandas 2.0 changed the default of
# Series.str.replace to literal matching; without it the patterns would be
# silently ignored and the original spacing would be kept.
postcode_to_nuts['postcode'] = (
    postcode_to_nuts['CODE']
    .str.upper()
    .str.strip()
    .str.replace(r'[^A-Z0-9]', '', regex=True)
    .str.replace(r'^(\S+)([0-9][A-Z]{2})$', r'\1 \2', regex=True)
)

# Number of rows whose normalized postcode differs from the raw CODE.
(postcode_to_nuts['postcode'] != postcode_to_nuts['CODE']).sum()

In [None]:
# Preview the NUTS lookup again, now including the normalized `postcode` column.
postcode_to_nuts.head()

In [None]:
# Right-hand side of the join: just the normalized postcode and its NUTS3
# code, with the latter renamed to match the output's snake_case columns.
nuts3_by_postcode = (
    postcode_to_nuts[['postcode', 'NUTS3']]
    .rename(columns={'NUTS3': 'nuts3'})
)

# Left join on `postcode` (the only column the two frames share), so every
# row of output_postcodes is kept; postcodes with no match get a null nuts3.
output_postcodes_with_nuts = output_postcodes.merge(
    nuts3_by_postcode,
    how='left',
    on='postcode',
)
output_postcodes_with_nuts.head()

In [None]:
# Non-null value count for each column after the join; a lower count for
# `nuts3` than for `postcode` means some postcodes had no NUTS match.
output_postcodes_with_nuts.count()

## Save Output 

In [None]:
# Save the lookup *with* the NUTS3 column. The original cell pickled
# `output_postcodes`, which discarded the result of the NUTS merge above.
# A left merge can duplicate rows if the right-hand side repeats a key, so
# re-check that each postcode still appears exactly once before writing.
assert output_postcodes_with_nuts['postcode'].is_unique, \
    'NUTS merge produced duplicate postcodes'
output_postcodes_with_nuts.to_pickle('output/postcode_lookup.pkl.gz')