# Build Aggregate Data

In [None]:
%matplotlib inline

import json

import pandas as pd

# Widen the notebook's DataFrame display: show up to 50 columns, and up to
# 100 characters per cell before the text is truncated.
pd.set_option('display.max_columns', 50)
pd.set_option('display.max_colwidth', 100)

## Load Cleaned Data

### CAP

The FAQs say:

> I understand that some CAP payments are jointly paid for by EU and UK funding. How are such payments shown on the website?
> These are known as co-financed payments and mostly concern Rural Development measures. For these payment types, the website shows the total amounts received by the beneficiary, not just the EU financed element. (http://cap-payments.defra.gov.uk/FAQs.aspx)

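According to https://www.instituteforgovernment.org.uk/explainers/common-agricultural-policy, in 2015 the UK contribution to rural development was €250M out of €959M. We use that ratio to adjust the total: rural development payments are scaled by (959 − 250) / 959 to estimate the EU-funded share. Note that this single 2015 ratio is applied to every year loaded below (2014–2017).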

In [None]:
# One pickled frame per year for 2014..2017 (range end is exclusive),
# stacked into a single frame.
raw_cap_by_area = pd.concat(
    [
        pd.read_pickle('../cap/output/cap_by_area_{}.pkl.gz'.format(year))
        for year in range(2014, 2018)
    ]
)
raw_cap_by_area.head()

In [None]:
# Derive the two fund columns from the raw CAP columns, drop the raw columns
# (including the original 'total'), then rebuild 'total' from the derived ones.
#   eagf  = otherEAGF + directEAGF
#   eafrd = ruralDevelopment scaled by (959 - 250) / 959, i.e. with the UK
#           co-financed share removed (figures from the markdown cell above).
# .assign/.drop return new frames, so raw_cap_by_area is left untouched.
cap_by_area = (
    raw_cap_by_area
    .assign(eagf=lambda df: df.otherEAGF + df.directEAGF)
    .assign(eafrd=lambda df: df.ruralDevelopment * ((959 - 250) / 959))
    .drop(columns=['otherEAGF', 'directEAGF', 'ruralDevelopment', 'total'])
    .assign(funds='CAP')
    .assign(total=lambda df: df.eagf + df.eafrd)
)
cap_by_area.head()

### CORDIS

In [None]:
def add_postcode_area(df):
    """Add a ``postcode_area`` column derived from ``df['postcode']``.

    The postcode area is the leading one or two capital letters of the
    postcode (e.g. ``'SW1A 1AA'`` -> ``'SW'``, ``'M1 1AE'`` -> ``'M'``).
    Values that do not match the pattern are copied through unchanged and
    missing postcodes stay missing.

    The frame is modified in place and also returned.
    """
    # regex=True must be explicit: the pattern is a regular expression with a
    # back-reference, and pandas 2.0 changed the default of Series.str.replace
    # to literal matching, which would silently leave every postcode as-is.
    df['postcode_area'] = df['postcode'].str.replace(
        r'^([A-Z]{1,2}).+$', r'\1', regex=True
    )
    return df

In [None]:
# Load FP7 projects and participating organisations, then join them so each
# row is one (project, organisation) pair; validate='1:m' makes pandas check
# that 'rcn' is unique on the projects side.
fp7_organizations = pd.read_pickle('../cordis/output/fp7_organizations.pkl.gz')
fp7_projects = pd.read_pickle('../cordis/output/fp7_projects.pkl.gz')
fp7 = fp7_projects.merge(
    fp7_organizations,
    left_on='rcn',
    right_on='project_rcn',
    validate='1:m',
)
add_postcode_area(fp7)  # adds 'postcode_area' in place

# Row identifier: fp7_<project_rcn>_<organization_id>
fp7['my_eu_id'] = (
    'fp7_'
    + fp7.project_rcn.astype('str')
    + '_'
    + fp7.organization_id.astype('str')
)
# Convert the EUR contribution with the per-row eur_gbp rate.
fp7['contribution_gbp'] = fp7.contribution_eur * fp7.eur_gbp
fp7.head()

### Creative Europe

TODO: need to estimate per-partner contribution by dividing through by number of partners.

In [None]:
# Load Creative Europe projects and organisations and join them on
# 'project_number'; validate='1:m' checks project numbers are unique on the
# projects side.
creative_organisations = pd.read_pickle('../creative/output/creative_europe_organisations.pkl.gz')
creative_projects = pd.read_pickle('../creative/output/creative_europe_projects.pkl.gz')
creative = creative_projects.merge(
    creative_organisations,
    on='project_number',
    validate='1:m',
)
creative.shape

In [None]:
add_postcode_area(creative)  # adds 'postcode_area' in place
creative['max_contribution_gbp'] = creative.max_contribution_eur * creative.eur_gbp

# Row identifier: creative_<project_number>_<partner_number>. The partner
# number is formatted without decimals; a missing partner number formats as
# 'nan' and is relabelled 'coordinator'.
creative['my_eu_id'] = (
    'creative_' + creative.project_number + '_'
    + creative.partner_number
        .apply('{:.0f}'.format)
        .str.replace('nan', 'coordinator', regex=False)
)
# Every row must have a distinct identifier.
assert creative.my_eu_id.is_unique
creative.head()

### ESIF (ESF/ERDF)

In [None]:
# Stack the four regional ESIF extracts. sort=True orders the union of their
# columns alphabetically, since the extracts need not share identical columns.
esif = pd.concat(
    [
        pd.read_pickle('../esif/output/esif_{}.pkl.gz'.format(dataset))
        for dataset in ('england_2014_2020', 'ni_2014_2020', 'scotland', 'wales')
    ],
    sort=True,
)
add_postcode_area(esif)  # adds 'postcode_area' in place
esif.head()

## Idea 2: Aggregate over each Area

### Time Ranges

In [None]:
# (min, max) per data source: the 'year' column for CAP, 'start_date' for the
# others.
# NOTE(review): the FP7 range is read from fp7_projects rather than the merged
# fp7 frame used everywhere else — confirm that is intended.
[
    (cap_by_area.year.min(), cap_by_area.year.max()),
    (fp7_projects.start_date.min(), fp7_projects.start_date.max()),
    (creative.start_date.min(), creative.start_date.max()),
    (esif.start_date.min(), esif.start_date.max()),
]

### Aggregate by Year

In [None]:
# CAP is already reported per year, so it only needs trimming to the
# funds / year / total columns shared by the other funds' annual frames.
cap_annual_total = cap_by_area[['funds', 'year', 'total']].copy()

# Preview the yearly CAP totals. Only 'total' is summed: 'funds' is a string
# column, and summing it would concatenate the labels on pandas >= 2.0
# (older versions silently dropped it, giving this same one-column result).
cap_annual_total.groupby('year')[['total']].sum()

In [None]:
def find_annual_sum(df, keys, column):
    """Spread each row's amount evenly over its date span, totalled per year.

    Every row of ``df`` is treated as covering the half-open daily interval
    ``[start_date, end_date)``. ``row[column]`` is shared equally between
    those days and the shares are added up within each calendar year.

    Parameters
    ----------
    df : pandas.DataFrame
        Must contain ``start_date``, ``end_date``, ``column`` and every
        column named in ``keys``.
    keys : list of str
        Columns whose values are copied unchanged onto each output row.
    column : str
        Name of the amount column to distribute.

    Returns
    -------
    pandas.DataFrame
        Columns ``['year'] + keys + [column]`` with a fresh 0..n-1 index:
        one row per input row per calendar year that the row's span touches.
        An empty input yields an empty frame with those columns.

    Notes
    -----
    A row whose span contains no days (``end_date <= start_date``) has its
    whole amount attributed to the year of ``start_date`` rather than
    triggering a division by zero.
    """
    output_columns = ['year'] + list(keys) + [column]

    def reweight(row):
        start = pd.Timestamp(row.start_date)
        end = pd.Timestamp(row.end_date)

        # Daily timestamps from start up to, but not including, end. The end
        # point is trimmed by hand instead of via date_range's ``closed=`` /
        # ``inclusive=`` keyword so this works on any pandas version.
        days = pd.date_range(start, end)
        if len(days) > 0 and days[-1] == end:
            days = days[:-1]

        if len(days) == 0:
            # Zero-length span: keep the money, book it to the start year.
            years = [start.year]
            amounts = [row[column]]
        else:
            # A year's share is proportional to how many of the span's days
            # fall in it; this equals summing equal daily shares per year
            # without materialising one row per day.
            days_per_year = pd.Series(days.year).value_counts().sort_index()
            years = days_per_year.index.astype('int64')
            amounts = row[column] * days_per_year.values / len(days)

        annual = pd.DataFrame({'year': years, column: amounts})
        for key in keys:
            annual[key] = row[key]
        return annual

    frames = [reweight(row) for _, row in df.iterrows()]
    if not frames:
        # pd.concat refuses an empty list; return the right shape instead.
        return pd.DataFrame(columns=output_columns)
    result = pd.concat(frames, ignore_index=True)
    return result[output_columns]

# Split each FP7 row's GBP contribution across the calendar years between its
# start_date and end_date, carrying the postcode area and row id along.
fp7_annual = find_annual_sum(fp7, ['postcode_area', 'my_eu_id'], 'contribution_gbp')
fp7_annual.shape

In [None]:
# Label every FP7 row with its fund name; the aggregation helpers in this
# notebook group on the 'funds' column.
fp7_annual['funds'] = 'FP7'
def find_fund_annual_totals(annual, column):
    """Sum ``column`` for each (funds, year) pair.

    Returns a new frame with columns ``funds``, ``year`` and ``total``
    (the summed ``column``, renamed); ``annual`` is not modified.
    """
    per_fund_year = annual.groupby(['funds', 'year'])[column].sum()
    return per_fund_year.reset_index().rename(columns={column: 'total'})
# FP7 contribution (GBP) summed per fund and year.
fp7_annual_total = find_fund_annual_totals(fp7_annual, 'contribution_gbp')
fp7_annual_total.head()

In [None]:
# Same yearly split and per-fund totals for Creative Europe. 'funds' is passed
# as a key, so the creative frame must already carry a 'funds' column.
creative_annual = find_annual_sum(creative, ['funds', 'postcode_area', 'my_eu_id'], 'max_contribution_gbp')
creative_annual_total = find_fund_annual_totals(creative_annual, 'max_contribution_gbp')
creative_annual_total

In [None]:
# Same yearly split and per-fund totals for ESIF, using 'eu_investment' as the
# amount. 'funds' and 'my_eu_id' are passed as keys, so the esif frame must
# already carry both columns.
esif_annual = find_annual_sum(esif, ['funds', 'postcode_area', 'my_eu_id'], 'eu_investment')
esif_annual_total = find_fund_annual_totals(esif_annual, 'eu_investment')
esif_annual_total.head()

In [None]:
# Stack the per-fund frames, sum 'total' per (funds, year) — the CAP frame
# still has one row per area — and pivot to a year x fund table.
annual_totals = (
    pd.concat([
        cap_annual_total,
        fp7_annual_total,
        creative_annual_total,
        esif_annual_total,
    ])
    .groupby(['funds', 'year'])
    .sum()
    .reset_index()
    .pivot(index='year', columns='funds', values='total')
)
annual_totals

In [None]:
# Stacked bar chart: one bar per year (the index), one segment per fund
# (the columns).
annual_totals.plot.bar(stacked=True)

### Aggregate by Area and Year

In [None]:
# Preview the CAP frame again before combining it with the other funds.
cap_by_area.head()

In [None]:
def find_fund_annual_area_totals(annual, column):
    """Sum ``column`` for each (funds, postcode_area, year) combination.

    Returns a new frame with columns ``funds``, ``postcode_area``, ``year``
    and ``total`` (the summed ``column``, renamed); ``annual`` is not
    modified.
    """
    group_keys = ['funds', 'postcode_area', 'year']
    per_area_year = annual.groupby(group_keys)[column].sum()
    return per_area_year.reset_index().rename(columns={column: 'total'})
# FP7 contribution (GBP) summed per fund, postcode area and year.
fp7_annual_area_total = find_fund_annual_area_totals(fp7_annual, 'contribution_gbp')
fp7_annual_area_total.head()

In [None]:
# Creative Europe maximum contribution (GBP) summed per fund, postcode area
# and year.
creative_annual_area_total = find_fund_annual_area_totals(creative_annual, 'max_contribution_gbp')
creative_annual_area_total.head()

In [None]:
# ESIF 'eu_investment' summed per fund, postcode area and year.
esif_annual_area_total = find_fund_annual_area_totals(esif_annual, 'eu_investment')
esif_annual_area_total.head()

In [None]:
# Stack all funds into one long table with the shared columns
# funds / postcode_area / year / total. CAP already has a per-row 'total', so
# it is taken straight from cap_by_area. The original row indexes are kept,
# so index labels repeat across the stacked frames.
annual_area_total = pd.concat([
    cap_by_area[['funds', 'postcode_area', 'year', 'total']],
    fp7_annual_area_total,
    creative_annual_area_total,
    esif_annual_area_total
])
annual_area_total.head()

In [None]:
# Spot check: the CAP rows for postcode area 'BT'.
cap_by_area[cap_by_area.postcode_area == 'BT']

In [None]:
# Spot check: 'total' summed per fund for postcode area 'BT', over every year
# present in the combined table.
annual_area_total[annual_area_total.postcode_area == 'BT'].groupby(['funds']).total.sum()

In [None]:
# Average annual funding per postcode area over 2014-2017 (inclusive).
#
# annual_area_total holds separate rows for each fund within an area and
# year, so the funds are first summed within each (area, year) and only then
# averaged across years. Averaging the raw rows directly would yield the mean
# size of a single fund-year row, which shrinks as an area receives money
# from more funds.
# NOTE(review): years in which an area has no rows at all are left out of the
# mean rather than counted as zero — confirm this is the intended definition.
area_total = (
    annual_area_total[annual_area_total.year.between(2014, 2017)]
    .groupby(['postcode_area', 'year']).total.sum()
    .groupby(level='postcode_area').mean()
    .reset_index()
)
# Whole units are enough for the map output.
area_total.total = area_total.total.round().astype('int32')
area_total.sort_values('total', ascending=False).head()

### Save for Map

In [None]:
# Load the simplified postcode-area boundaries (GeoJSON) and show how many
# features the file contains.
with open('../postcodes/output/postcode-area-boundaries-simplified.geo.json') as file:
    postcode_areas_json = json.load(file)
len(postcode_areas_json['features'])

In [None]:
# One row per boundary feature: the 'name' property, plus the raw geometry
# dict kept as-is so it can be written back out later.
postcode_areas_features = pd.DataFrame(
    {
        'name': [
            feature['properties']['name']
            for feature in postcode_areas_json['features']
        ],
        'geometry': [
            feature['geometry']
            for feature in postcode_areas_json['features']
        ],
    }
)
postcode_areas_features.head()

In [None]:
# Attach each area's funding total to its boundary feature. The default inner
# join keeps only names present on both sides; validate='m:1' checks that
# area_total has at most one row per postcode_area.
postcode_areas_data = postcode_areas_features.merge(
    area_total,
    left_on='name',
    right_on='postcode_area',
    validate='m:1',
)
postcode_areas_data.head()

In [None]:
# Lookup table of postcode area names, read from a spreadsheet.
postcode_area_names = pd.read_excel('../postcodes/input/postcode-area-names.xlsx')
postcode_area_names.head()

In [None]:
# (rows, columns) of the name lookup table.
postcode_area_names.shape

In [None]:
# Add the area names. No join key is given, so pandas joins on every column
# the two frames have in common; validate='1:1' checks the key is unique on
# both sides.
# NOTE(review): presumably the only shared column is 'postcode_area' — verify
# against the spreadsheet's columns, and consider passing on= explicitly.
# This cell reassigns postcode_areas_data from itself; rerun the previous
# merge cell first if you need to rerun it.
postcode_areas_data = pd.merge(
    postcode_areas_data,
    postcode_area_names,
    validate='1:1'
)
postcode_areas_data.head()

In [None]:
def make_area_geo_json(data):
    """Build a GeoJSON FeatureCollection dict from a per-area frame.

    Parameters
    ----------
    data : pandas.DataFrame
        Needs the columns ``geometry``, ``total``, ``postcode_area`` and
        ``postcode_area_name``.

    Returns
    -------
    dict
        ``{'type': 'FeatureCollection', 'features': [...]}`` with one
        Feature per row, in row order. Each feature carries the row's
        geometry and the properties ``total``, ``postcodeArea`` and
        ``postcodeAreaName``. An empty frame gives an empty feature list.
    """
    def make_feature(row):
        return {
            'type': 'Feature',
            'geometry': row['geometry'],
            'properties': {
                'total': row['total'],
                'postcodeArea': row['postcode_area'],
                'postcodeAreaName': row['postcode_area_name'],
            },
        }

    # Iterate rows explicitly rather than via DataFrame.apply(axis=1): on an
    # empty frame apply hands back the frame itself, and list() of that is
    # the column names, which would be emitted as bogus "features".
    features = [make_feature(row) for _, row in data.iterrows()]
    return {'type': 'FeatureCollection', 'features': features}
# Write the boundary features with their totals. sort_keys=True keeps the key
# order stable between runs.
with open('output/area.geo.json', 'w') as file:
    json.dump(make_area_geo_json(postcode_areas_data), file, sort_keys=True)

In [None]:
# Load the postcode-area centroids (GeoJSON) and show how many features the
# file contains.
with open('../postcodes/output/postcode-area-boundaries-centroids.geo.json') as file:
    postcode_area_centroids_json = json.load(file)
len(postcode_area_centroids_json['features'])

In [None]:
# One row per centroid feature: the 'name' property, plus the raw geometry
# dict kept as-is so it can be written back out later.
postcode_area_centroid_features = pd.DataFrame(
    {
        'name': [
            feature['properties']['name']
            for feature in postcode_area_centroids_json['features']
        ],
        'geometry': [
            feature['geometry']
            for feature in postcode_area_centroids_json['features']
        ],
    }
)
postcode_area_centroid_features.head()

In [None]:
# Attach each area's funding total to its centroid feature. The default inner
# join keeps only names present on both sides; validate='m:1' checks that
# area_total has at most one row per postcode_area.
postcode_area_centroids_data = postcode_area_centroid_features.merge(
    area_total,
    left_on='name',
    right_on='postcode_area',
    validate='m:1',
)
postcode_area_centroids_data.head()

In [None]:
# Add the area names. No join key is given, so pandas joins on every column
# the two frames have in common; validate='1:1' checks the key is unique on
# both sides.
# NOTE(review): presumably the only shared column is 'postcode_area' — verify
# against the spreadsheet's columns, and consider passing on= explicitly.
# This cell reassigns postcode_area_centroids_data from itself; rerun the
# previous merge cell first if you need to rerun it.
postcode_area_centroids_data = pd.merge(
    postcode_area_centroids_data,
    postcode_area_names,
    validate='1:1'
)
postcode_area_centroids_data.head()

In [None]:
# Write the centroid features with their totals. sort_keys=True keeps the key
# order stable between runs.
with open('output/area_centroids.geo.json', 'w') as file:
    json.dump(make_area_geo_json(postcode_area_centroids_data), file, sort_keys=True)