# Build Data for the Map

In [None]:
%matplotlib inline

import glob
import json
import os

import pandas as pd

## Define Validity Checks

In [None]:
# Reference table of UK postcodes. validate_postcodes checks membership in
# its `postcode` column, and pack_geocoded_postcodes merges it in to obtain
# longitude/latitude.
ukpostcodes = pd.read_csv('../postcodes/input/ukpostcodes.csv')
# Display (rows, columns) as a quick check that the load worked.
ukpostcodes.shape

In [None]:
def validate_postcodes(df):
    """Assert that ``df`` has a ``postcode`` column of known postcodes only.

    A postcode counts as known when it appears in ``ukpostcodes.postcode``.
    Raises AssertionError if the column is missing or any value is unknown.
    """
    assert 'postcode' in df.columns
    is_known = df['postcode'].isin(ukpostcodes.postcode)
    assert is_known.all()
    
def validate_date_range(df):
    """Assert that ``df`` has well-formed ``start_date``/``end_date`` columns.

    Both columns must exist, both must have dtype ``datetime64[ns]``, and no
    row may start after it ends. Raises AssertionError otherwise.
    """
    date_columns = ('start_date', 'end_date')
    assert all(column in df.columns for column in date_columns)
    assert all(df[column].dtype == 'datetime64[ns]' for column in date_columns)
    starts_after_end = df['start_date'] > df['end_date']
    assert not starts_after_end.any()

## Load Cleaned Data

### CAP

In [None]:
# Load the cleaned CAP data produced by the ../cap step and preview it.
cap_by_area = pd.read_pickle('../cap/output/cap_by_area.pkl.gz')
cap_by_area.head()

### CORDIS

In [None]:
# Load the cleaned FP7 organisations, give each row a namespaced id built
# from organizationId, and check that every postcode is a known UK postcode.
fp7_organizations = pd.read_pickle('../cordis/output/fp7_organizations.pkl.gz')
fp7_organizations['my_eu_id'] = 'fp7_organization_' + fp7_organizations.organizationId.astype('str')
validate_postcodes(fp7_organizations)
fp7_organizations.head()

In [None]:
# Load the cleaned FP7 projects and give each row a namespaced id built from
# its rcn.
fp7_projects = pd.read_pickle('../cordis/output/fp7_projects.pkl.gz')
fp7_projects['my_eu_id'] = 'fp7_project_' + fp7_projects.rcn.astype('str')
# Rename the date columns to the names validate_date_range expects.
fp7_projects.rename({
    'startDate': 'start_date',
    'endDate': 'end_date',
}, axis=1, inplace=True)
validate_date_range(fp7_projects)
fp7_projects.head()

In [None]:
# Join each project to its participating organisations (default inner join).
# validate='1:m' makes pandas raise if rcn is not unique on the project side.
fp7 = pd.merge(
    fp7_projects, fp7_organizations,
    left_on='rcn', right_on='projectRcn', validate='1:m'
)
fp7.head()

In [None]:
# Summary statistics of the merged table, as a sanity check.
fp7.describe()

In [None]:
# Count rows where the EC contribution exceeds the total cost.
(fp7.ecContribution > fp7.totalCost).sum()

In [None]:
# Count rows with a missing project title.
fp7.title.isna().sum()

### ESIF (ESF/ERDF)

In [None]:
# Load the cleaned ESIF data for England, then check its postcodes and
# start/end dates before using it.
esif_england = pd.read_pickle('../esif/output/esif_england_2014_2020.pkl.gz')
validate_postcodes(esif_england)
validate_date_range(esif_england)
esif_england.head()

In [None]:
# Load the cleaned ESIF data for Scotland, then check its postcodes and
# start/end dates before using it.
esif_scotland = pd.read_pickle('../esif/output/esif_scotland.pkl.gz')
validate_postcodes(esif_scotland)
validate_date_range(esif_scotland)
esif_scotland.head()

In [None]:
# Load the cleaned ESIF data for Wales, then check its postcodes and
# start/end dates before using it.
esif_wales = pd.read_pickle('../esif/output/esif_wales.pkl.gz')
validate_postcodes(esif_wales)
validate_date_range(esif_wales)
esif_wales.head()

## Idea 1: All Points on Map, Data by District

This should make the map look fairly similar to how it looks now, so it seems like a good starting point.

In [None]:
# Every dataset with a `postcode` column whose locations should appear as
# points on the map; passed to pack_geocoded_postcodes.
ALL_PLACES = [
    fp7_organizations,
    esif_england,
    esif_scotland,
    esif_wales
]

GeoJSON is very inefficient for representing a bunch of points, so let's use a relatively simple packed format.
```
min_longitude min_latitude
outcode incode delta_longitude delta_latitude incode delta_longitude delta_latitude
```
We need [about 4 decimal places](https://gis.stackexchange.com/questions/8650/measuring-accuracy-of-latitude-and-longitude) of precision (roughly 11 m), so the deltas are written with 4 decimal places.

In [None]:
def add_outward_and_inward_codes(df):
    """Add ``outward_code`` and ``inward_code`` columns to ``df`` in place.

    The postcode is split on single spaces; the first piece becomes the
    outward code and the second the inward code (missing if there is no
    second piece). The same, modified ``df`` is returned for convenience.
    """
    pieces = df.postcode.str.split(' ')
    for position, column in enumerate(('outward_code', 'inward_code')):
        df[column] = pieces.str[position]
    return df

def pack_geocoded_postcodes(dfs):
    """Geocode the distinct postcodes of ``dfs`` relative to a common origin.

    Args:
        dfs: iterable of DataFrames, each with a ``postcode`` column.

    Returns:
        dict with ``min_longitude`` and ``min_latitude`` (the smallest
        coordinates found) and ``geocoded_postcodes``, a DataFrame with one
        row per distinct postcode, its outward/inward codes, the columns of
        ``ukpostcodes`` and the ``delta_longitude``/``delta_latitude``
        offsets from those minima.
    """
    distinct_postcodes = pd.concat([df['postcode'] for df in dfs]).unique()
    postcodes = add_outward_and_inward_codes(
        pd.DataFrame({'postcode': distinct_postcodes})
    )

    # validate='1:1' makes pandas raise if a postcode is duplicated on
    # either side of the join.
    geocoded = pd.merge(postcodes, ukpostcodes, validate='1:1')

    origin = {
        'min_longitude': geocoded.longitude.min(),
        'min_latitude': geocoded.latitude.min(),
    }
    geocoded['delta_longitude'] = geocoded.longitude - origin['min_longitude']
    geocoded['delta_latitude'] = geocoded.latitude - origin['min_latitude']

    return dict(origin, geocoded_postcodes=geocoded)

# Pack the postcodes of every mapped dataset, then display the origin
# coordinates and the number of geocoded postcodes as a sanity check.
packed_postcodes = pack_geocoded_postcodes(ALL_PLACES)
[
    packed_postcodes['min_longitude'],
    packed_postcodes['min_latitude'],
    packed_postcodes['geocoded_postcodes'].shape[0]
]

In [None]:
# Preview the geocoded postcode table.
packed_postcodes['geocoded_postcodes'].head()

In [None]:
def write_packed_postcodes(packed_postcodes, file):
    """Write the packed postcodes to ``file`` in a compact text format.

    The first line holds the minimum longitude and latitude with 6 decimal
    places. Every following line holds one outward code followed by
    ``inward_code delta_longitude delta_latitude`` triples (deltas with 4
    decimal places), all separated by single spaces. Outward codes, and the
    inward codes within each line, are written in sorted order.

    Args:
        packed_postcodes: dict with ``min_longitude``, ``min_latitude`` and
            ``geocoded_postcodes`` (a DataFrame with ``outward_code``,
            ``inward_code``, ``delta_longitude`` and ``delta_latitude``
            columns), as returned by pack_geocoded_postcodes.
        file: writable text file object.
    """
    datum = [packed_postcodes['min_longitude'], packed_postcodes['min_latitude']]
    file.write(' '.join('{0:.6f}'.format(coordinate) for coordinate in datum))
    file.write('\n')

    coordinate = '{0:.4f}'
    # groupby already yields the groups in sorted key order.
    grouped_postcodes = packed_postcodes['geocoded_postcodes'].groupby('outward_code')

    for outward_code, group in grouped_postcodes:
        group = group.sort_values('inward_code')
        fields = [outward_code]
        # Iterate explicitly instead of DataFrame.apply: apply is meant for
        # functions without side effects, and older pandas versions could
        # call the function twice for the first row, duplicating output.
        for inward_code, delta_longitude, delta_latitude in zip(
                group['inward_code'],
                group['delta_longitude'],
                group['delta_latitude']):
            fields.extend([
                inward_code,
                coordinate.format(delta_longitude),
                coordinate.format(delta_latitude)
            ])
        file.write(' '.join(fields))
        file.write('\n')

# with open('output/packed_postcodes.txt', 'w') as file:
#     write_packed_postcodes(packed_postcodes, file)

Here's the same sort of format in JSON for comparison. It compresses down to about the same size as the packed text format, and it will probably parse faster, so we might as well use the JSON rather than take on the harder parsing job.

In [None]:
def make_packed_postcode_json(packed_postcodes):
    """Build a JSON-serialisable version of the packed postcodes.

    Args:
        packed_postcodes: dict with ``min_longitude``, ``min_latitude`` and
            ``geocoded_postcodes`` (a DataFrame with ``outward_code``,
            ``inward_code``, ``delta_longitude`` and ``delta_latitude``).

    Returns:
        dict with ``min_longitude``/``min_latitude`` rounded to 6 decimal
        places and ``postcodes``, mapping each outward code to a flat list
        ``[inward_code, delta_longitude, delta_latitude, ...]`` ordered by
        inward code, with the deltas rounded to 4 decimal places.
    """
    delta_format = '{0:.4f}'
    origin_format = '{0:.6f}'

    def rounded(template, value):
        # Round by formatting to fixed decimals and parsing the text back.
        return float(template.format(value))

    table = packed_postcodes['geocoded_postcodes']
    by_outward_code = table.sort_values('outward_code').groupby('outward_code')

    postcodes = {}
    for outward_code, group in by_outward_code:
        ordered = group.sort_values('inward_code')
        flattened = []
        for inward_code, delta_longitude, delta_latitude in zip(
                ordered['inward_code'],
                ordered['delta_longitude'],
                ordered['delta_latitude']):
            flattened.append(inward_code)
            flattened.append(rounded(delta_format, delta_longitude))
            flattened.append(rounded(delta_format, delta_latitude))
        postcodes[outward_code] = flattened

    return {
        'min_longitude': rounded(origin_format, packed_postcodes['min_longitude']),
        'min_latitude': rounded(origin_format, packed_postcodes['min_latitude']),
        'postcodes': postcodes
    }

# Write the packed postcodes as JSON; sort_keys makes the key order in the
# output file deterministic.
with open('output/packed_postcodes.data.json', 'w') as file:
    json.dump(make_packed_postcode_json(packed_postcodes), file, sort_keys=True)

### Data by District

#### CORDIS

In [None]:
def make_district_data_json(df):
    """Convert ``df`` into one JSON-ready table per outward code.

    Each group is dumped to JSON using pandas, because it puts in nulls
    instead of NaNs for missing values, and the JSON is then loaded back into
    plain dicts so the result can be combined and written with ``json.dump``.

    Args:
        df: DataFrame with an ``outwardCode`` column. It is not modified.

    Returns:
        Series indexed by ``outwardCode`` (sorted) whose values are dicts in
        pandas' 'split' layout, ``{'columns': [...], 'data': [[...], ...]}``,
        without the ``outwardCode`` column.
    """
    def to_json(group):
        # Drop on a copy: mutating the group handed out by groupby is not
        # supported by pandas.
        table = group.drop('outwardCode', axis=1)
        return json.loads(table.to_json(orient='split', index=False))

    # Iterate over the groups instead of using groupby.apply: iteration
    # always includes the grouping column, whereas passing it to the applied
    # function is deprecated in recent pandas versions.
    outward_codes = []
    tables = []
    for outward_code, group in df.groupby('outwardCode'):
        outward_codes.append(outward_code)
        tables.append(to_json(group))

    return pd.Series(
        tables,
        index=pd.Index(outward_codes, name='outwardCode'),
        dtype=object
    )

def make_cordis_district_data(cordis):
    """Build the per-district JSON tables for a CORDIS dataset.

    Works on a copy of ``cordis``: adds outward/inward codes, keeps only the
    columns needed by the map, renames some of them to the camelCase names
    used in the output and groups the rows by outward code.

    Returns:
        Whatever make_district_data_json returns for the prepared table.
    """
    selected_columns = [
        'outward_code',
        'inward_code',
        'title',
        'name', # of organization
        'objective',
        'ecContribution',
        'totalCost',
        'acronym',
        'briefTitle',
        'teaser',
        'article',
        'projectUrl',
        'organizationUrl',
        'imageUri'
    ]
    output_names = {
        'outward_code': 'outwardCode',
        'inward_code': 'inwardCode',
        'title': 'projectTitle',
        'name': 'organisationName',
        'imageUri': 'imageUrl'
    }

    with_codes = add_outward_and_inward_codes(cordis.copy())
    prepared = with_codes[selected_columns].rename(columns=output_names)
    return make_district_data_json(prepared)

# Debugging aid: show the merged FP7 rows for one sample postcode.
print(fp7[fp7.postcode == 'CA4 9QY'])

# Build and preview the per-district CORDIS tables.
fp7_district_data = make_cordis_district_data(fp7)
fp7_district_data.head()

#### ESIF

In [None]:
def make_esif_district_data(esif):
    """Build the per-district JSON tables for an ESIF dataset.

    Works on a copy of ``esif``: adds outward/inward codes, keeps only the
    columns needed by the map, renames some of them to the camelCase names
    used in the output and groups the rows by outward code.

    Returns:
        Whatever make_district_data_json returns for the prepared table.
    """
    selected_columns = [
        'outward_code',
        'inward_code',
        'project',
        'beneficiary',
        'summary',
        'funds',
        'eu_investment',
        'project_cost'
    ]
    output_names = {
        'outward_code': 'outwardCode',
        'inward_code': 'inwardCode',
        'project': 'projectTitle',
        'beneficiary': 'organisationName',
        'eu_investment': 'euInvestment',
        'project_cost': 'projectCost'
    }

    with_codes = add_outward_and_inward_codes(esif.copy())
    prepared = with_codes[selected_columns].rename(columns=output_names)
    return make_district_data_json(prepared)

# Build and preview the per-district ESIF tables for England.
esif_england_district_data = make_esif_district_data(esif_england)
esif_england_district_data.head()

In [None]:
# Build and preview the per-district ESIF tables for Scotland.
esif_scotland_district_data = make_esif_district_data(esif_scotland)
esif_scotland_district_data.head()

In [None]:
# Build and preview the per-district ESIF tables for Wales.
esif_wales_district_data = make_esif_district_data(esif_wales)
esif_wales_district_data.head()

### Save Data

In [None]:
def _is_split_table(value):
    """Return True if ``value`` looks like a pandas 'split'-layout table dict."""
    return isinstance(value, dict) and 'columns' in value and 'data' in value


def _combine_split_tables(first, second):
    """Return a new 'split'-layout table holding the rows of both tables."""
    columns = list(first['columns'])
    if columns == list(second['columns']):
        return dict(first, data=list(first['data']) + list(second['data']))

    # Differing columns: use the union and pad missing cells with None.
    columns += [column for column in second['columns'] if column not in columns]

    def realign(table):
        position = {column: i for i, column in enumerate(table['columns'])}
        return [
            [row[position[column]] if column in position else None
             for column in columns]
            for row in table['data']
        ]

    return dict(first, columns=columns, data=realign(first) + realign(second))


def merge_district_data(datasets):
    """Regroup per-dataset district tables into one dict per outward code.

    Args:
        datasets: dict mapping a dataset name to a list of Series, each
            indexed by outward code with one table per district (as returned
            by make_district_data_json).

    Returns:
        dict mapping each outward code to ``{dataset name: table}``. When
        several parts of the same dataset cover the same outward code, their
        rows are concatenated into a new table; previously the later part
        silently replaced the earlier one. The input tables are not modified.
    """
    merged = {}
    for dataset, parts in datasets.items():
        for part in parts:
            # One pass over each part, addressed by label, instead of a
            # boolean mask per outward code followed by a positional [0].
            for outward_code, table in part.items():
                district = merged.setdefault(outward_code, {})
                previous = district.get(dataset)
                if _is_split_table(previous) and _is_split_table(table):
                    table = _combine_split_tables(previous, table)
                district[dataset] = table
    return merged

# Combine the CORDIS and ESIF district tables into one dict per outward
# code, then display one district as a spot check.
district_data = merge_district_data({
    'cordis': [fp7_district_data],
    'esif': [esif_england_district_data, esif_scotland_district_data, esif_wales_district_data]
})
district_data['CA4']

In [None]:
# Directory that receives one `<outward code>.data.json` file per district.
OUTPUT_DISTRICT_PATH = 'output/district'

def list_district_data(path):
    """Return the pathnames of all ``*.data.json`` files directly in ``path``."""
    pattern = os.path.join(path, '*.data.json')
    return glob.glob(pattern)

def clear_district_data(path):
    """Delete every district data file that list_district_data finds in ``path``."""
    stale_files = list_district_data(path)
    for stale_file in stale_files:
        os.remove(stale_file)

def write_district_data(district_data, path):
    """Write one ``<outward code>.data.json`` file per district into ``path``.

    The directory is created if needed, and district files already in it are
    removed first, so files for districts that no longer exist do not linger.

    Args:
        district_data: dict mapping outward code to its datasets.
        path: output directory.
    """
    os.makedirs(path, exist_ok=True)
    clear_district_data(path)

    for outward_code, datasets in district_data.items():
        payload = {
            'outwardCode': outward_code,
            'datasets': datasets
        }
        target = os.path.join(path, outward_code + '.data.json')
        with open(target, 'w') as file:
            json.dump(payload, file, sort_keys=True)
# Regenerate all district files from the merged data.
write_district_data(district_data, OUTPUT_DISTRICT_PATH)

In [None]:
def find_district_data_stats():
    """Return a DataFrame of the district data files and their sizes.

    Returns:
        DataFrame with one row per file in OUTPUT_DISTRICT_PATH and the
        columns ``file`` (pathname) and ``byte_size`` (size in bytes).
    """
    pathnames = list(list_district_data(OUTPUT_DISTRICT_PATH))
    byte_sizes = [os.stat(pathname).st_size for pathname in pathnames]
    return pd.DataFrame({
        'file': pathnames,
        'byte_size': byte_sizes
    })
# Collect the file sizes and show summary statistics for them.
district_data_stats = find_district_data_stats()
district_data_stats.describe()

In [None]:
# List district files larger than 1 MiB.
district_data_stats[district_data_stats.byte_size > 1024*1024]

In [None]:
# Histogram of the district file sizes. Plot the stats themselves rather
# than their describe() summary, which would only histogram the handful of
# summary numbers (count, mean, std, quartiles, ...).
find_district_data_stats().hist()

#### Data Index

Generate a JS file that webpack can use to make paths for all of the data files.

In [None]:
def write_district_data_js():
    """Generate ``output/district.js``, an index of all district data files.

    The file default-exports an object that maps each outward code to a
    ``require`` of its data file under ``./district/``. Entries are written
    in sorted pathname order so regenerating the file gives stable output.
    """
    def make_require(data_file):
        basename = os.path.basename(data_file)
        outward_code = basename.split('.')[0]
        # Build the module path with forward slashes explicitly:
        # os.path.join would use backslashes on Windows, which are escape
        # characters inside a JavaScript string literal.
        return "  {}: require('./district/{}')".format(outward_code, basename)

    # glob returns files in arbitrary order, so sort them.
    data_files = sorted(list_district_data(OUTPUT_DISTRICT_PATH))
    requires = [make_require(data_file) for data_file in data_files]

    with open('output/district.js', 'w') as file:
        file.write('// NB: This file is generated automatically. Do not edit.\n')
        file.write('export default {\n')
        file.write(',\n'.join(requires))
        file.write('\n}\n')
# Regenerate output/district.js from the files currently on disk.
write_district_data_js()

## Idea 2: Aggregate over each Area

In [None]:
# TODO

In [None]:
# output: an area map with 

# is it worth splitting out the 'projects' from the 'places'?
# we could denormalize to region level --- include all projects in an area in the json blob for that area
# what to do with the ESIF data? It's already mostly denormalized. There is not much to deduplicate anyway.
# Maybe just a special case for CORDIS (or other things with multiple partners per project)

    
# go through by postcode area
# find all the things in that area
# group them by postcode
# for each postcode, write out a list of projects in that postcode
# for CORDIS, maybe just have a separate cordis_projects.json file with the data?
# I guess that might be too large... but we could split it up on rcn, for example.
# or denormalize it and just stuff it in with the rest... maybe that's the place to start.


