In [None]:
from collections import OrderedDict
import copy
import json
import os
import re

from ipyleaflet import Map, GeoJSON
from ipywidgets import Label, Layout, VBox
import numpy as np
import pandas as pd

## Outcodes of Interest

Which outcodes occur in our farm data?

In [None]:
# CSV files (inside the `data` directory) that together make up the farm funding data.
FARM_DATA_FILES = [
    'DAERA-Table 1.csv',
    'RPA-Table 1.csv',
    'RPA2-Table 1.csv',
    'SGRPID-Table 1.csv',
    'WG-Table 1.csv'
]
# ignore_index=True gives the combined frame a single unique 0..n-1 index.
# Without it each file keeps the row labels read_csv assigned to it, so the
# same labels repeat across files and label-based lookups become ambiguous.
farm_funding = pd.concat(
    [
        pd.read_csv(os.path.join('data', file_name))
        for file_name in FARM_DATA_FILES
    ],
    ignore_index=True,
)
farm_funding.shape

In [None]:
# Show the column labels of the combined funding frame.
farm_funding.columns

In [None]:
# The raw prefixes contain stray surrounding whitespace and lower-case codes,
# so store a stripped, upper-cased copy in its own column.
farm_funding['CleanPostcodePrefix'] = (
    farm_funding['PostcodePrefix_F202B']
    .str.strip()
    .str.upper()
)

In [None]:
# Distinct cleaned prefixes that occur in the funding data.
outcodes = set(farm_funding['CleanPostcodePrefix'])
len(outcodes)

They don't look very clean. Let's see how many are valid.

The pattern below is adapted from the validation regex at https://en.wikipedia.org/wiki/Postcodes_in_the_United_Kingdom#Validation. It accepts the outward code, optionally followed by a space and the first digit of the inward code, because some rows in the data include that digit.

In [None]:
# Pattern for an outward code, optionally followed by a space and one digit.
# The whole alternation is wrapped in a non-capturing group so that `^` and `$`
# apply to BOTH alternatives. Previously `^` bound only to the "GIR 0" branch
# and `$` only to the general branch, so any string that merely started with
# "GIR 0" was accepted. Capturing-group numbering is unchanged.
# NOTE(review): the last alternative makes its digit optional (`[0-9]?`), so a
# three-letter string such as "CRO" is accepted - confirm whether that is intended.
VALID_OUTCODE_RE = re.compile(
    r'^(?:'
    r'([Gg][Ii][Rr] 0)'
    r'|((([A-Za-z][0-9]{1,2})|'
    r'(([A-Za-z][A-Ha-hJ-Yj-y][0-9]{1,2})|(([A-Za-z][0-9][A-Za-z])|([A-Za-z][A-Ha-hJ-Yj-y][0-9]?[A-Za-z])))'
    r')( [0-9])?)'
    r')$'
)
# Non-string entries (e.g. NaN from a missing prefix) are never valid outcodes;
# checking the type first stops Pattern.match() raising TypeError on them.
valid_outcodes = {
    outcode for outcode in outcodes
    if isinstance(outcode, str) and VALID_OUTCODE_RE.match(outcode)
}
len(valid_outcodes)

In [None]:
# Prefixes present in the data that did not match the validity pattern.
outcodes - valid_outcodes

In [None]:
# Rows whose raw prefix is exactly 'CRO ' (including the trailing space).
farm_funding.loc[farm_funding['PostcodePrefix_F202B'].eq('CRO ')]

In [None]:
# Rows whose raw prefix is exactly 'WA 6' (with the embedded space).
farm_funding.loc[farm_funding['PostcodePrefix_F202B'].eq('WA 6')]

### Check Against Authoritative List

In [None]:
# Load the reference outcode table and discard its `id` column.
outcode_to_location = pd.read_csv('data/postcode-outcodes.csv').drop(columns='id')
outcode_to_location.shape

In [None]:
# Join the funding rows to the reference outcodes (default inner join) and report
# the resulting shape; validate='m:1' makes pandas raise if an outcode is repeated
# on the reference side.
farm_funding.merge(
    outcode_to_location,
    left_on='CleanPostcodePrefix',
    right_on='postcode',
    validate='m:1',
).shape

So, it's not too bad.

## The miDrive Data

The miDrive outcode geometry is in decent shape (after a fix for the PL area: https://github.com/miDrive/uk-outcode-geometry/pull/1), but it does not cover Northern Ireland.

In [None]:
# Merge every per-area GeoJSON file under JSON_ROOT into one feature collection.
JSON_ROOT = 'data/uk-outcode-geometry-master/json/'
all_outcodes = None
# sorted(): os.listdir() returns names in arbitrary order, which would make the
# order of the merged features vary between machines.
for area_file in sorted(os.listdir(JSON_ROOT)):
    area_path = os.path.join(JSON_ROOT, area_file)
    # Skip hidden entries (e.g. .DS_Store) and sub-directories: they are not
    # area files and would make open()/json.load() fail.
    if area_file.startswith('.') or not os.path.isfile(area_path):
        continue
    # JSON text is UTF-8; do not depend on the platform's default encoding.
    with open(area_path, encoding='utf-8') as f:
        data = json.load(f)
    if all_outcodes is None:
        # The first file supplies the top-level collection object; later files
        # only contribute their features to it.
        all_outcodes = data
    else:
        all_outcodes['features'].extend(data['features'])

if all_outcodes is None:
    # Fail with a clear message instead of a TypeError on the subscript below.
    raise FileNotFoundError('No outcode geometry files found in ' + JSON_ROOT)
len(all_outcodes['features'])

In [None]:
# Look at the first merged feature to see what a single entry contains.
all_outcodes['features'][0]

In [None]:
# Centre is (latitude, longitude). The longitude must be negative: the UK lies
# west of the prime meridian, and +3.4360 would centre the map over the North Sea.
m = Map(center=(54.3781, -3.4360), zoom=5)
# Draw every merged outcode shape as one GeoJSON layer.
geo_json = GeoJSON(data=all_outcodes)
m.add_layer(geo_json)
m

## The Google Fusion Tables Data

From [Fusion Tables](https://fusiontables.google.com/data?docid=1jgWYtlqGSPzlIa-is8wl1cZkVIWEm_89rWUwqFU).

License: http://www.nationalarchives.gov.uk/doc/open-government-licence/version/3/

In [None]:
# Load the postcode district table exported from Fusion Tables.
fusion_districts_raw = pd.read_csv('data/uk_postcode_districts.csv')
fusion_districts_raw.shape

In [None]:
# We have some duplicates. We don't want that for merging.
fusion_districts_raw_prefixes = list(fusion_districts_raw['Postcode district'])
# Series.duplicated(keep=False) flags every occurrence of a repeated value in a
# single pass; calling list.count() once per element was quadratic in the number
# of rows.
district_labels = fusion_districts_raw['Postcode district']
fusion_districts_duplicate_prefixes = set(
    district_labels[district_labels.duplicated(keep=False)]
)
fusion_districts_duplicate_prefixes

In [None]:
# For a district that appears more than once we want to keep its largest shape,
# assuming that one is the most detailed. Record the length of each 'Area data'
# value first, then look at one duplicated district as an example.
fusion_districts_raw['Area len'] = fusion_districts_raw['Area data'].map(len)
fusion_districts_raw.loc[fusion_districts_raw['Postcode district'].eq('BT21 0')]

In [None]:
# Sort so that, within each district, the longest shape comes last, then keep
# only that last row per district.
fusion_districts_raw = (
    fusion_districts_raw
    .sort_values(['Postcode district', 'Area len'])
    .drop_duplicates('Postcode district', keep='last')
)
fusion_districts_raw.loc[fusion_districts_raw['Postcode district'].eq('BT21 0')]

In [None]:
# The helper length column has served its purpose; remove it again.
fusion_districts_raw = fusion_districts_raw.drop(columns='Area len')
fusion_districts_raw.loc[fusion_districts_raw['Postcode district'].eq('BT21 0')]

In [None]:
# Shape of the district table after deduplication.
fusion_districts_raw.shape

In [None]:
# Save the deduplicated districts for conversion to GeoJSON.
# NOTE(review): to_csv writes the frame's index as an unnamed first column by
# default - confirm the downstream conversion tool expects that column.
fusion_districts_raw.to_csv('data/uk_postcode_districts_deduplicated.csv')

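The deduplicated CSV was then converted to GeoJSON outside this notebook with the `togeojson` utility, producing `data/uk_postcode_districts_deduplicated.json`, which the next cell loads. This step must be repeated by hand whenever the CSV changes.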

In [None]:
# Read the district GeoJSON file and count its features.
with open('data/uk_postcode_districts_deduplicated.json') as districts_file:
    fusion_districts = json.load(districts_file)
len(fusion_districts['features'])

In [None]:
# Collect the `name` property of every district feature, in file order.
fusion_districts_prefixes = []
for district_feature in fusion_districts['features']:
    fusion_districts_prefixes.append(district_feature['properties']['name'])
len(fusion_districts_prefixes)

In [None]:
# Should not have any duplicates any more.
# Single pass with a "seen" set: the first sighting of a prefix goes into
# `seen_prefixes`, any later sighting into `repeated_prefixes`. This replaces
# calling list.count() once per element, which was quadratic.
seen_prefixes = set()
repeated_prefixes = set()
for prefix in fusion_districts_prefixes:
    if prefix in seen_prefixes:
        repeated_prefixes.add(prefix)
    else:
        seen_prefixes.add(prefix)
repeated_prefixes

In [None]:
def make_district_map(data, center=(54.3781, -3.4360), zoom=5):
    """Build a map widget of district shapes with a hover label underneath.

    Parameters:
        data: GeoJSON mapping handed to the ipyleaflet ``GeoJSON`` layer. The
            hover label shows each feature's ``name`` property.
        center: ``(latitude, longitude)`` the map opens on. The default
            longitude is negative because the UK lies west of the prime
            meridian; the previous value of +3.4360 centred the map over the
            North Sea.
        zoom: initial zoom level.

    Returns:
        A ``VBox`` holding the map with the label below it.
    """
    district_map = Map(center=center, zoom=zoom)
    label = Label(layout=Layout(width='100%'))

    layer = GeoJSON(data=data, hover_style={'fillColor': 'red'})

    def hover_handler(event=None, id=None, properties=None, **kwargs):
        # Tolerate features without properties or without a name instead of
        # raising inside the widget callback. **kwargs absorbs any additional
        # keyword arguments the widget may pass along.
        name = (properties or {}).get('name')
        label.value = '' if name is None else str(name)

    layer.on_hover(hover_handler)
    district_map.add_layer(layer)

    return VBox([district_map, label])
make_district_map(fusion_districts)

In [None]:
# Valid outcodes from the funding data that have no shape in the district GeoJSON.
missing = valid_outcodes.difference(fusion_districts_prefixes)
missing

In [None]:
# Districts in the raw table whose label begins with 'B1'.
fusion_districts_raw.loc[fusion_districts_raw['Postcode district'].str.startswith('B1')]

So, it looks like we have to aggregate some prefixes to fit the map data, and vice versa. However, it's not too bad. Let's see what it looks like if we just use the data for which we have map shapes.

## Plots

In [None]:
# Keep only the funding rows whose cleaned prefix has a shape in the district GeoJSON.
mapped_prefixes = set(fusion_districts_prefixes)
has_map_shape = farm_funding['CleanPostcodePrefix'].isin(mapped_prefixes)
farm_funding_map = farm_funding.loc[has_map_shape]
farm_funding_map.shape

In [None]:
# Percentage of funding rows that can be placed on the map.
100 * len(farm_funding_map) / len(farm_funding)

In [None]:
# Per-district sum of each funding column, plus the number of rows per district.
# Aggregations are named as strings ('sum', 'size') so pandas uses its own group
# reductions instead of being handed the Python builtins `sum` and `len`.
farm_funding_by_district = farm_funding_map.groupby('CleanPostcodePrefix').aggregate(OrderedDict([
    ('OtherEAGFTotal', 'sum'),
    ('DirectEAGFTotal', 'sum'),
    ('RuralDevelopmentTotal', 'sum'),
    ('Total', ['sum', 'size'])
]))
farm_funding_by_district = farm_funding_by_district.reset_index()
# Replace the two-level column labels produced by the aggregation with flat
# names. The order here must match the aggregation spec above.
farm_funding_by_district.columns = [
    'CleanPostcodePrefix',
    'otherEAGF',
    'directEAGF',
    'ruralDevelopment',
    'total',
    'count'
]
# Every column except the district key is exported as a feature property later.
PROPERTY_COLUMNS = set(farm_funding_by_district.columns) - set(['CleanPostcodePrefix'])
for column in PROPERTY_COLUMNS:
    # Round to whole numbers. int64 rather than int32 so that a large district
    # total cannot exceed the integer range when it is cast.
    farm_funding_by_district[column] = farm_funding_by_district[column].round().astype('int64')
farm_funding_by_district

### GeoJSON

In [None]:
# One row per district feature: its name and its geometry object.
district_features = fusion_districts['features']
fusion_districts_features = pd.DataFrame({
    'name': [item['properties']['name'] for item in district_features],
    'geometry': [item['geometry'] for item in district_features],
})
fusion_districts_features.head()

In [None]:
# Attach the per-district funding figures to the district shapes (default inner
# join); validate='m:1' makes pandas raise if a district is repeated in the
# funding table.
fusion_districts_data = fusion_districts_features.merge(
    farm_funding_by_district,
    left_on='name',
    right_on='CleanPostcodePrefix',
    validate='m:1',
)
fusion_districts_data.head()

In [None]:
# Summary statistics of the numeric columns in the merged table.
fusion_districts_data.describe()

In [None]:
# Summary statistics of each district's total divided by its row count.
fusion_districts_data['total'].div(fusion_districts_data['count']).describe()

In [None]:
def make_funding_data_geo_json(fusion_districts_data, property_columns=None):
    """Build a GeoJSON FeatureCollection from per-district funding rows.

    Parameters:
        fusion_districts_data: DataFrame with one row per feature. It must have
            a 'name' column, a 'geometry' column and one column for each
            exported property.
        property_columns: iterable of column names copied into each feature's
            ``properties``. Defaults to the module-level ``PROPERTY_COLUMNS``.

    Returns:
        A dict ``{'type': 'FeatureCollection', 'features': [...]}`` in which
        every feature carries the row's geometry, the selected property columns
        and the row's ``name``.
    """
    if property_columns is None:
        property_columns = PROPERTY_COLUMNS
    # Sort so the order of property keys in the output does not depend on the
    # iteration order of a set, which can change from one run to the next.
    property_columns = sorted(property_columns)

    def to_native(value):
        # numpy scalars expose .item(); converting them keeps the result
        # serialisable by the json module.
        return value.item() if hasattr(value, 'item') else value

    def make_feature(row):
        properties = {
            column: to_native(row[column]) for column in property_columns
        }
        properties['name'] = row['name']
        return {
            'type': 'Feature',
            'geometry': row['geometry'],
            'properties': properties
        }

    # to_dict('records') yields one plain dict per row and an empty list for an
    # empty frame. DataFrame.apply(axis=1) on an empty frame can, depending on
    # the pandas version, return an empty DataFrame, and list() of that is its
    # column labels rather than features.
    features = [
        make_feature(row) for row in fusion_districts_data.to_dict('records')
    ]
    return { 'type': 'FeatureCollection', 'features': features }
# Build the whole payload before opening the output: mode 'w' empties the file
# as soon as it is opened, so a failure while generating the data would
# otherwise leave an empty file behind.
funding_geo_json = make_funding_data_geo_json(fusion_districts_data)
with open('data/farm_funding_data.geo.json', 'w') as geo_json_file:
    json.dump(funding_geo_json, geo_json_file)