# Build Data for the Map

In [None]:
%matplotlib inline

import glob
import json
import os

import pandas as pd

pd.set_option('display.max_columns', 50)

## Define Validity Checks

In [None]:
# Reference table of UK postcodes. validate_postcodes checks membership
# against its `postcode` column and pack_geocoded_postcodes merges with it.
ukpostcodes = pd.read_csv('../postcodes/input/ukpostcodes.csv.gz')
ukpostcodes.shape

In [None]:
def validate_postcodes(df):
    """Assert that `df` has a `postcode` column containing only known postcodes.

    A postcode counts as known when it appears in `ukpostcodes.postcode`.
    Raises AssertionError otherwise.
    """
    assert 'postcode' in df.columns
    is_known = df['postcode'].isin(ukpostcodes.postcode)
    assert is_known.all()
    
def validate_date_range(df):
    """Assert that `df` has well-formed `start_date` / `end_date` columns.

    Both columns must exist, both must have dtype datetime64[ns], and no row
    may start after it ends. Raises AssertionError otherwise.
    """
    date_columns = ('start_date', 'end_date')
    for column in date_columns:
        assert column in df.columns
    for column in date_columns:
        assert df[column].dtype == 'datetime64[ns]'
    starts_after_end = df['start_date'] > df['end_date']
    assert not starts_after_end.any()

## Load Cleaned Data

### CORDIS

In [None]:
# Load the cleaned FP7 organisations and check that every postcode is known.
fp7_organizations = pd.read_pickle('../cordis/output/fp7_organizations.pkl.gz')
validate_postcodes(fp7_organizations)
fp7_organizations.head()

In [None]:
# Load the cleaned FP7 projects and check their start/end dates.
fp7_projects = pd.read_pickle('../cordis/output/fp7_projects.pkl.gz')
validate_date_range(fp7_projects)
fp7_projects.head()

In [None]:
# Join each FP7 project to its organisations; validate='1:m' makes the merge
# fail if a project `rcn` is not unique on the projects side.
fp7 = fp7_projects.merge(
    fp7_organizations,
    left_on='rcn', right_on='project_rcn', validate='1:m'
)

# One identifier per (project, organisation) row.
fp7['my_eu_id'] = (
    'fp7_'
    + fp7['project_rcn'].astype('str')
    + '_'
    + fp7['organization_id'].astype('str')
)

# Convert each euro amount to whole pounds using the row's eur_gbp rate.
fp7 = fp7.assign(**{
    name + '_gbp': (fp7[name + '_eur'] * fp7['eur_gbp']).round()
    for name in ('total_cost', 'max_contribution', 'contribution')
})
fp7.head()

In [None]:
# Summary statistics for the merged FP7 frame.
fp7.describe()

In [None]:
# Number of rows whose contribution exceeds the project's total cost.
(fp7.contribution_eur > fp7.total_cost_eur).sum()

In [None]:
# Load the cleaned H2020 organisations and check that every postcode is known.
h2020_organizations = pd.read_pickle('../cordis/output/h2020_organizations.pkl.gz')
validate_postcodes(h2020_organizations)
h2020_organizations.head()

In [None]:
# Load the cleaned H2020 projects and check their start/end dates.
h2020_projects = pd.read_pickle('../cordis/output/h2020_projects.pkl.gz')
validate_date_range(h2020_projects)
h2020_projects.head()

In [None]:
# Join each H2020 project to its organisations; validate='1:m' makes the merge
# fail if a project `rcn` is not unique on the projects side.
h2020 = h2020_projects.merge(
    h2020_organizations,
    left_on='rcn', right_on='project_rcn', validate='1:m'
)

# One identifier per (project, organisation) row.
h2020['my_eu_id'] = (
    'h2020_'
    + h2020['project_rcn'].astype('str')
    + '_'
    + h2020['organization_id'].astype('str')
)

# Convert each euro amount to whole pounds using the row's eur_gbp rate.
h2020 = h2020.assign(**{
    name + '_gbp': (h2020[name + '_eur'] * h2020['eur_gbp']).round()
    for name in ('total_cost', 'max_contribution', 'contribution')
})

# no briefs available for H2020; the columns are still created (as NaN)
# because make_cordis_district_data selects them by name.
h2020 = h2020.assign(**dict.fromkeys(
    ('brief_title', 'teaser', 'article', 'image_path'), float('nan')
))

h2020.head()

In [None]:
# Number of rows whose contribution exceeds the project's total cost.
(h2020.contribution_eur > h2020.total_cost_eur).sum()

### Creative Europe

In [None]:
# Load the cleaned Creative Europe organisations.
creative_organisations = pd.read_pickle('../creative/output/creative_europe_organisations.pkl.gz')
creative_organisations.shape

In [None]:
# Load the cleaned Creative Europe projects.
creative_projects = pd.read_pickle('../creative/output/creative_europe_projects.pkl.gz')
creative_projects.shape

In [None]:
# One row per (project, organisation); validate='1:m' makes the merge fail if
# `project_number` is not unique on the projects side.
creative = pd.merge(creative_projects, creative_organisations, on='project_number', validate='1:m')
creative.shape

In [None]:
validate_postcodes(creative)
validate_date_range(creative)

# Convert the euro amount to whole pounds using the row's eur_gbp rate.
creative['max_contribution_gbp'] = (
    creative['max_contribution_eur'] * creative['eur_gbp']
).round()

# A missing partner_number formats as 'nan'; those rows are labelled
# 'coordinator' in the identifier instead.
creative['my_eu_id'] = (
    'creative_'
    + creative['project_number']
    + '_'
    + creative['partner_number']
    .apply('{:.0f}'.format)
    .str.replace('nan', 'coordinator', regex=False)
)

# Every row must end up with a distinct identifier.
assert creative['my_eu_id'].is_unique
creative.head()

### ESIF (ESF/ERDF)

In [None]:
# Load the cleaned ESIF data for England and run both validity checks.
esif_england = pd.read_pickle('../esif/output/esif_england_2014_2020.pkl.gz')
validate_postcodes(esif_england)
validate_date_range(esif_england)
esif_england.head()

In [None]:
# Load the cleaned ESIF data for Northern Ireland and run both validity checks.
esif_ni = pd.read_pickle('../esif/output/esif_ni_2014_2020.pkl.gz')
validate_postcodes(esif_ni)
validate_date_range(esif_ni)
esif_ni.head()

In [None]:
# Load the cleaned ESIF data for Scotland and run both validity checks.
esif_scotland = pd.read_pickle('../esif/output/esif_scotland.pkl.gz')
validate_postcodes(esif_scotland)
validate_date_range(esif_scotland)
esif_scotland.head()

In [None]:
# Load the cleaned ESIF data for Wales and run both validity checks.
esif_wales = pd.read_pickle('../esif/output/esif_wales.pkl.gz')
validate_postcodes(esif_wales)
validate_date_range(esif_wales)
esif_wales.head()

### FTS

In [None]:
# Load the cleaned 2016 FTS data and check that every postcode is known.
fts_2016 = pd.read_pickle('../fts/output/fts_2016.pkl.gz')
validate_postcodes(fts_2016)

# Convert both euro amounts to whole pounds using the row's eur_gbp rate.
fts_2016 = fts_2016.assign(
    amount_gbp=(fts_2016['amount'] * fts_2016['eur_gbp']).round(),
    total_amount_gbp=(fts_2016['total_amount_eur'] * fts_2016['eur_gbp']).round(),
)
fts_2016.head()

In [None]:
# Load the cleaned 2017 FTS data and check that every postcode is known.
fts_2017 = pd.read_pickle('../fts/output/fts_2017.pkl.gz')
validate_postcodes(fts_2017)

# Convert both euro amounts to whole pounds using the row's eur_gbp rate.
fts_2017 = fts_2017.assign(
    amount_gbp=(fts_2017['amount'] * fts_2017['eur_gbp']).round(),
    total_amount_gbp=(fts_2017['total_amount_eur'] * fts_2017['eur_gbp']).round(),
)
fts_2017.head()

### Erasmus

In [None]:
# Load the cleaned Erasmus 2017 organisations.
erasmus_organisations = pd.read_pickle('../erasmus/output/erasmus_2017_organisations.pkl.gz')
erasmus_organisations.shape

In [None]:
# Load the cleaned Erasmus 2017 projects.
erasmus_projects = pd.read_pickle('../erasmus/output/erasmus_2017_projects.pkl.gz')
erasmus_projects.shape

In [None]:
# One row per (project, organisation); validate='1:m' makes the merge fail if
# `project_identifier` is not unique on the projects side.
erasmus = pd.merge(erasmus_projects, erasmus_organisations, on='project_identifier', validate='1:m')
erasmus.shape

In [None]:
validate_postcodes(erasmus)

# Convert the euro amount to whole pounds using the row's eur_gbp rate.
erasmus['max_contribution_gbp'] = (
    erasmus['max_contribution_eur'] * erasmus['eur_gbp']
).round()

# A missing partner_number formats as 'nan'; those rows are labelled
# 'coordinator' in the identifier instead.
erasmus['my_eu_id'] = (
    'erasmus_'
    + erasmus['project_identifier']
    + '_'
    + erasmus['partner_number']
    .apply('{:.0f}'.format)
    .str.replace('nan', 'coordinator', regex=False)
)

# Every row must end up with a distinct identifier.
assert erasmus['my_eu_id'].is_unique
erasmus.head()

## Idea 1: All Points on Map, Data by District

This should make the map look fairly similar to how it looks now, so it seems like a good starting point.

In [None]:
# (frame, amount column) pairs that feed the per-postcode totals.
ALL_PLACES = [
    (fp7, 'contribution_gbp'),
    (h2020, 'contribution_gbp'),
    (creative, 'max_contribution_gbp'),  # TODO: split it out
    (esif_england, 'eu_investment'),
    (esif_ni, 'eu_investment'),
    (esif_scotland, 'eu_investment'),
    (esif_wales, 'eu_investment'),
    # The original 'amount' column is dropped so that renaming 'amount_gbp' to
    # 'amount' in pack_geocoded_postcodes does not produce two 'amount' columns.
    (fts_2016.drop(columns='amount'), 'amount_gbp'),
    (fts_2017.drop(columns='amount'), 'amount_gbp'),
    (erasmus, 'max_contribution_gbp'),
]

GeoJSON is very inefficient for representing a bunch of points, so let's use a relatively simple packed format.
```
min_longitude min_latitude
outcode incode delta_longitude delta_latitude amount incode delta_longitude delta_latitude amount
```
We need [about 4 decimal places](https://gis.stackexchange.com/questions/8650/measuring-accuracy-of-latitude-and-longitude).

In [None]:
def add_outward_and_inward_codes(df):
    """Add `outward_code` and `inward_code` columns to `df`, in place.

    The postcode is split on single spaces; the first piece becomes the
    outward code and the second the inward code. Returns the same `df`.
    """
    pieces = df['postcode'].str.split(' ')
    df['outward_code'] = pieces.str[0]
    df['inward_code'] = pieces.str[1]
    return df

def pack_geocoded_postcodes(dfs):
    """Total the amounts per postcode and attach coordinates as offsets.

    Args:
        dfs: iterable of (DataFrame, amount_column) pairs. Each frame must
            have a `postcode` column and the named amount column.

    Returns:
        A dict with
        - 'min_longitude' / 'min_latitude': the smallest coordinates among
          the geocoded postcodes;
        - 'geocoded_postcodes': one row per postcode with its summed `amount`,
          outward/inward codes, the columns of `ukpostcodes`, and
          `delta_longitude` / `delta_latitude` relative to those minima.
    """
    all_postcode_amounts = pd.concat([
        df.rename(columns={amount_column: 'amount'})[['postcode', 'amount']]
        for df, amount_column in dfs
    ])
    # Use the pandas 'sum' reduction by name: passing the builtin `sum` is a
    # deprecated alias, and a plain Python sum would not skip missing values.
    postcode_amounts = all_postcode_amounts.groupby('postcode').aggregate({'amount': 'sum'})
    postcode_amounts.reset_index(inplace=True)
    # int64 rather than int32, so a large per-postcode total cannot overflow.
    postcode_amounts.amount = postcode_amounts.amount.astype('int64')
    add_outward_and_inward_codes(postcode_amounts)

    # Join explicitly on postcode instead of on whichever columns happen to
    # share a name; validate='1:1' requires postcodes unique on both sides.
    geocoded_postcodes = pd.merge(
        postcode_amounts, ukpostcodes, on='postcode', validate='1:1')

    min_longitude = geocoded_postcodes.longitude.min()
    min_latitude = geocoded_postcodes.latitude.min()

    geocoded_postcodes['delta_longitude'] = geocoded_postcodes.longitude - min_longitude
    geocoded_postcodes['delta_latitude'] = geocoded_postcodes.latitude - min_latitude

    return {
        'min_longitude': min_longitude,
        'min_latitude': min_latitude,
        'geocoded_postcodes': geocoded_postcodes
    }

packed_postcodes = pack_geocoded_postcodes(ALL_PLACES)
[
    packed_postcodes['min_longitude'],
    packed_postcodes['min_latitude'],
    packed_postcodes['geocoded_postcodes'].shape[0]
]

In [None]:
# Preview the geocoded per-postcode totals.
packed_postcodes['geocoded_postcodes'].head()

In [None]:
def make_packed_postcode_json(packed_postcodes):
    """Convert the output of pack_geocoded_postcodes to a JSON-ready dict.

    The result has 'min_longitude' and 'min_latitude' (6 decimal places) and
    'postcodes', which maps each outward code to one flat list:
    [inward_code, delta_longitude, delta_latitude, amount, inward_code, ...],
    ordered by inward code, with the deltas at 4 decimal places.
    """
    def rounded(value, template):
        # Round by formatting to fixed decimals and parsing the text back.
        return float(template.format(value))

    delta_template = '{0:.4f}'
    min_template = '{0:.6f}'

    by_outward_code = (
        packed_postcodes['geocoded_postcodes']
        .sort_values('outward_code')
        .groupby('outward_code')
    )

    postcodes = {}
    for outward_code, group in by_outward_code:
        ordered = group.sort_values('inward_code')
        packed = []
        rows = zip(
            ordered['inward_code'],
            ordered['delta_longitude'],
            ordered['delta_latitude'],
            ordered['amount'],
        )
        for inward_code, delta_longitude, delta_latitude, amount in rows:
            packed.append(inward_code)
            packed.append(rounded(delta_longitude, delta_template))
            packed.append(rounded(delta_latitude, delta_template))
            packed.append(amount)
        postcodes[outward_code] = packed

    return {
        'min_longitude': rounded(packed_postcodes['min_longitude'], min_template),
        'min_latitude': rounded(packed_postcodes['min_latitude'], min_template),
        'postcodes': postcodes
    }

# Write the packed postcode data; sort_keys makes json.dump emit dictionary
# keys in sorted order.
with open('output/packed_postcodes.data.json', 'w') as file:
    json.dump(make_packed_postcode_json(packed_postcodes), file, sort_keys=True)

### Data by District

#### CORDIS

In [None]:
# Dump to JSON using pandas, because it puts in nulls instead of NaNs for
# missing values. Then load the JSON back into dicts so that the per-district
# results can be merged before being written out.
def make_district_data_json(df):
    """Return a Series of 'split'-style dicts, one per `outwardCode`.

    Each value is {'columns': [...], 'data': [[...], ...]} for the rows of
    that district, without the `outwardCode` column. The Series index holds
    the outward codes.
    """
    def to_json(group):
        # Drop on a copy: pandas does not support functions that mutate the
        # group passed to apply. errors='ignore' keeps this working if the
        # grouping column is not included in the group at all.
        records = group.drop(columns='outwardCode', errors='ignore')
        return json.loads(records.to_json(orient='split', index=False))
    return df.groupby('outwardCode').apply(to_json)

def make_cordis_district_data(cordis):
    """Build the per-district JSON data for a merged CORDIS frame.

    Works on a copy of `cordis`: adds outward/inward codes, keeps the columns
    listed below under their output names, and groups the rows by district
    with make_district_data_json.
    """
    # (source column, output name), in output order.
    output_columns = [
        ('outward_code', 'outwardCode'),
        ('inward_code', 'inwardCode'),
        ('title', 'projectTitle'),
        ('name', 'organisationName'),  # of organization
        ('objective', 'objective'),
        ('contribution_gbp', 'contribution'),
        ('total_cost_gbp', 'totalCost'),
        ('acronym', 'acronym'),
        ('brief_title', 'briefTitle'),
        ('teaser', 'teaser'),
        ('article', 'article'),
        ('project_url', 'projectUrl'),
        ('organization_url', 'organizationUrl'),
        ('image_path', 'imagePath'),
        ('my_eu_id', 'myEuId'),
    ]

    with_codes = add_outward_and_inward_codes(cordis.copy())
    selected = with_codes[[source for source, _ in output_columns]]
    renamed = selected.rename(columns=dict(output_columns))

    return make_district_data_json(renamed)

fp7_district_data = make_cordis_district_data(fp7)
fp7_district_data.head()

In [None]:
# H2020 uses the same CORDIS layout as FP7.
h2020_district_data = make_cordis_district_data(h2020)
h2020_district_data.head()

#### Creative Europe

In [None]:
def make_creative_district_data(creative):
    """Build the per-district JSON data for the merged Creative Europe frame.

    Works on a copy of `creative`: adds outward/inward codes, attaches the
    name of each project's coordinating organisation, keeps the columns
    listed below under their output names, and groups the rows by district
    with make_district_data_json.
    """
    # (source column, output name), in output order.
    output_columns = [
        ('outward_code', 'outwardCode'),
        ('inward_code', 'inwardCode'),
        ('project', 'project'),
        ('organisation_name', 'organisationName'),
        ('max_contribution_gbp', 'maxContribution'),
        ('summary', 'summary'),
        ('organisation_website', 'organisationWebsite'),
        ('organisation_name_coordinator', 'coordinatorName'),
        ('my_eu_id', 'myEuId'),
    ]

    with_codes = add_outward_and_inward_codes(creative.copy())

    # The left merge gives every row its project's coordinator name; the
    # suffix turns the clashing column into 'organisation_name_coordinator'.
    coordinator_rows = with_codes[with_codes.organisation_coordinator]
    coordinator_names = coordinator_rows[['project_number', 'organisation_name']]
    with_coordinator = pd.merge(
        with_codes, coordinator_names,
        how='left', on='project_number', suffixes=('', '_coordinator'))

    selected = with_coordinator[[source for source, _ in output_columns]]
    renamed = selected.rename(columns=dict(output_columns))

    return make_district_data_json(renamed)

creative_district_data = make_creative_district_data(creative)
creative_district_data.head()

In [None]:
# List the columns available on the merged Creative Europe frame.
creative.columns

#### ESIF

In [None]:
def make_esif_district_data(esif):
    """Build the per-district JSON data for one ESIF frame.

    Works on a copy of `esif`: adds outward/inward codes, keeps the columns
    listed below under their output names, and groups the rows by district
    with make_district_data_json.
    """
    # (source column, output name), in output order.
    output_columns = [
        ('outward_code', 'outwardCode'),
        ('inward_code', 'inwardCode'),
        ('project', 'projectTitle'),
        ('beneficiary', 'organisationName'),
        ('summary', 'summary'),
        ('funds', 'funds'),
        ('eu_investment', 'euInvestment'),
        ('project_cost', 'projectCost'),
        ('my_eu_id', 'myEuId'),
    ]

    with_codes = add_outward_and_inward_codes(esif.copy())
    selected = with_codes[[source for source, _ in output_columns]]
    renamed = selected.rename(columns=dict(output_columns))

    return make_district_data_json(renamed)

esif_england_district_data = make_esif_district_data(esif_england)
esif_england_district_data.head()

In [None]:
# Per-district ESIF data for Northern Ireland.
esif_ni_district_data = make_esif_district_data(esif_ni)
esif_ni_district_data.head()

In [None]:
# Per-district ESIF data for Scotland.
esif_scotland_district_data = make_esif_district_data(esif_scotland)
esif_scotland_district_data.head()

In [None]:
# Per-district ESIF data for Wales.
esif_wales_district_data = make_esif_district_data(esif_wales)
esif_wales_district_data.head()

#### FTS

In [None]:
# List the columns available on the 2016 FTS frame.
fts_2016.columns

In [None]:
def make_fts_district_data(fts, year):
    """Build the per-district JSON data for one year of FTS data.

    Works on a copy of `fts`: adds outward/inward codes, keeps the columns
    listed below under their output names, appends a constant `year` column,
    and groups the rows by district with make_district_data_json.
    """
    # (source column, output name), in output order.
    output_columns = [
        ('outward_code', 'outwardCode'),
        ('inward_code', 'inwardCode'),
        ('beneficiary', 'beneficiary'),
        ('amount_gbp', 'amount'),
        ('budget_line_name_and_number', 'budgetLineNameAndNumber'),
        ('my_eu_id', 'myEuId'),
    ]

    with_codes = add_outward_and_inward_codes(fts.copy())
    selected = with_codes[[source for source, _ in output_columns]]
    renamed = selected.rename(columns=dict(output_columns))
    with_year = renamed.assign(year=year)

    return make_district_data_json(with_year)

fts_2016_district_data = make_fts_district_data(fts_2016, 2016)
fts_2016_district_data.head()

In [None]:
# Per-district FTS data for 2017.
fts_2017_district_data = make_fts_district_data(fts_2017, 2017)
fts_2017_district_data.head()

#### Erasmus

In [None]:
def make_erasmus_district_data(erasmus):
    """Build the per-district JSON data for the merged Erasmus frame.

    Works on a copy of `erasmus`: adds outward/inward codes, attaches the
    name of each project's coordinating organisation, keeps the columns
    listed below under their output names, and groups the rows by district
    with make_district_data_json.
    """
    # (source column, output name), in output order.
    output_columns = [
        ('outward_code', 'outwardCode'),
        ('inward_code', 'inwardCode'),
        ('project', 'project'),
        ('organisation_name', 'organisationName'),
        ('max_contribution_gbp', 'maxContribution'),
        ('summary', 'summary'),
        ('organisation_website', 'organisationWebsite'),
        ('organisation_name_coordinator', 'coordinatorName'),
        ('my_eu_id', 'myEuId'),
    ]

    with_codes = add_outward_and_inward_codes(erasmus.copy())

    # The left merge gives every row its project's coordinator name; the
    # suffix turns the clashing column into 'organisation_name_coordinator'.
    coordinator_rows = with_codes[with_codes.organisation_coordinator]
    coordinator_names = coordinator_rows[['project_identifier', 'organisation_name']]
    with_coordinator = pd.merge(
        with_codes, coordinator_names,
        how='left', on='project_identifier', suffixes=('', '_coordinator'))

    selected = with_coordinator[[source for source, _ in output_columns]]
    renamed = selected.rename(columns=dict(output_columns))

    return make_district_data_json(renamed)

erasmus_district_data = make_erasmus_district_data(erasmus)
erasmus_district_data.head()

### Save Data

In [None]:
def merge_district_data(datasets):
    """Regroup per-dataset district data so that it is keyed by district.

    Args:
        datasets: dict mapping a dataset name to a list of parts. Each part
            is a Series indexed by outward code whose values are
            {'columns': [...], 'data': [[...], ...]} dicts, as produced by
            make_district_data_json. All parts of one dataset are expected
            to share the same columns.

    Returns:
        A dict mapping each outward code to {dataset name: {'columns': ...,
        'data': ...}}. A dataset appears only for districts where at least
        one of its parts has rows, and 'data' concatenates the rows of every
        part that has that district, in part order.
    """
    # dict.fromkeys de-duplicates while keeping first-seen order.
    all_outward_codes = dict.fromkeys(
        code
        for parts in datasets.values()
        for part in parts
        for code in part.index
    )

    def merge_parts_data(parts, outward_code):
        # Take the column names from the first part that has any rows, so an
        # empty leading part does not raise IndexError.
        first_populated = next(part for part in parts if len(part))
        return {
            'columns': first_populated.iloc[0]['columns'],
            'data': [
                datum
                for part in parts
                if outward_code in part.index
                for datum in part[outward_code]['data']
            ]
        }

    return {
        outward_code: {
            dataset: merge_parts_data(parts, outward_code)
            for dataset, parts in datasets.items()
            # Index membership is a hashed lookup, unlike scanning the whole
            # index with `==` for every district.
            if any(outward_code in part.index for part in parts)
        }
        for outward_code in all_outward_codes
    }

# Combine the per-source results under one key per dataset. Sources listed
# together (e.g. FP7 and H2020 under 'cordis') have their rows concatenated
# within each district.
district_data = merge_district_data({
    'cordis': [fp7_district_data, h2020_district_data],
    'creative': [creative_district_data],
    'esif': [
        esif_england_district_data, esif_ni_district_data,
        esif_scotland_district_data, esif_wales_district_data],
    'fts': [fts_2016_district_data, fts_2017_district_data],
    'erasmus': [erasmus_district_data]
})
# Spot-check a single district.
district_data['CA4']

In [None]:
# Directory that receives one '<outward code>.data.json' file per district.
OUTPUT_DISTRICT_PATH = 'output/district'

def list_district_data(path):
    """Return the pathnames directly inside `path` that match '*.data.json'."""
    pattern = os.path.join(path, '*.data.json')
    return glob.glob(pattern)

def clear_district_data(path):
    """Delete every file in `path` that list_district_data reports."""
    stale_files = list_district_data(path)
    for stale_file in stale_files:
        os.remove(stale_file)

def write_district_data(district_data, path):
    """Write one '<outward code>.data.json' file per district into `path`.

    Creates `path` if needed and first removes the district files already
    there, so files left over from an earlier run do not survive. Each file
    holds {'outwardCode': ..., 'datasets': ...} with keys in sorted order.
    """
    os.makedirs(path, exist_ok=True)
    clear_district_data(path)
    for code, code_datasets in district_data.items():
        payload = {
            'outwardCode': code,
            'datasets': code_datasets
        }
        target = os.path.join(path, code + '.data.json')
        with open(target, 'w') as file:
            json.dump(payload, file, sort_keys=True)
write_district_data(district_data, OUTPUT_DISTRICT_PATH)

In [None]:
def find_district_data_stats():
    """Return a frame with the path (`file`) and size in bytes (`byte_size`)
    of every district data file in OUTPUT_DISTRICT_PATH."""
    files = list_district_data(OUTPUT_DISTRICT_PATH)
    sizes = [os.path.getsize(file) for file in files]
    return pd.DataFrame({
        'file': list(files),
        'byte_size': sizes
    })
district_data_stats = find_district_data_stats()
district_data_stats.describe()

In [None]:
# Total size of all district files, in MiB.
district_data_stats.byte_size.sum() / 1024 / 1024

In [None]:
# District files larger than 1 MiB.
district_data_stats[district_data_stats.byte_size > 1024*1024]

In [None]:
# Distribution of district file sizes. Histogram the sizes themselves:
# calling .describe() first would plot the eight summary statistics instead.
find_district_data_stats().byte_size.hist()

#### Data Index

Generate a JS file that webpack can use to make paths for all of the data files.

In [None]:
def write_district_data_js():
    """Write 'output/district.js', an index of the district data files.

    The file exports an object with one `<outward code>: require('<path>')`
    entry for each file that list_district_data finds in
    OUTPUT_DISTRICT_PATH.
    """
    # Sorted so the generated file is identical from run to run; glob does
    # not promise any particular order.
    data_files = sorted(list_district_data(OUTPUT_DISTRICT_PATH))

    def make_require(data_file):
        basename = os.path.basename(data_file)
        # Always use '/': os.path.join would put backslashes into the
        # require() path on Windows.
        pathname = './district/' + basename
        outward_code = basename.split('.')[0]
        return "  {}: require('{}')".format(outward_code, pathname)

    with open('output/district.js', 'w') as file:
        file.write('// NB: This file is generated automatically. Do not edit.\n')
        file.write('export default {\n')
        file.write(',\n'.join(make_require(data_file) for data_file in data_files))
        file.write('\n}\n')
write_district_data_js()