# Build Aggregate Data

In [None]:
%matplotlib inline

import pandas as pd

# Widen the notebook's DataFrame rendering: show up to 50 columns and up to
# 100 characters per cell before pandas truncates the output.
pd.options.display.max_columns = 50
pd.options.display.max_colwidth = 100

## Load Cleaned Data

### CAP

TODO: Need to account for co-financing.

In [None]:
# Load the pickled CAP-by-area table from the sibling ``cap`` output directory
# and preview its first rows.
cap_by_area = pd.read_pickle('../cap/output/cap_by_area.pkl.gz')
cap_by_area.head()

### CORDIS

In [None]:
def add_postcode_area(df):
    """Add a ``postcode_area`` column derived from ``df['postcode']``.

    The postcode area is the leading one or two capital letters of the
    postcode (e.g. ``'SW1A 1AA'`` -> ``'SW'``). Values that do not match the
    pattern are copied through unchanged, and missing values stay missing.

    Args:
        df: DataFrame with a string ``postcode`` column. It is modified in
            place.

    Returns:
        The same ``df`` object, to allow call chaining.
    """
    # ``regex=True`` must be explicit: pandas >= 2.0 treats the pattern as a
    # literal string by default, which would leave every postcode untouched.
    df['postcode_area'] = df['postcode'].str.replace(
        r'^([A-Z]{1,2}).+$', r'\1', regex=True
    )
    return df

In [None]:
# Load the FP7 organisation and project tables, then join every project to its
# participating organisations (one project row to many organisation rows).
fp7_organizations = pd.read_pickle('../cordis/output/fp7_organizations.pkl.gz')
fp7_projects = pd.read_pickle('../cordis/output/fp7_projects.pkl.gz')
# add_postcode_area mutates the merged frame and returns it, so it can wrap
# the merge directly.
fp7 = add_postcode_area(
    fp7_projects.merge(
        fp7_organizations,
        left_on='rcn',
        right_on='project_rcn',
        validate='1:m',
    )
)
# Convert the EUR contribution to GBP using the per-row exchange rate column.
fp7['contribution_gbp'] = fp7['contribution_eur'] * fp7['eur_gbp']
fp7.head()

### Creative Europe

TODO: Need to estimate the per-partner contribution by dividing each project's maximum contribution by its number of partners.

In [None]:
# Load the Creative Europe organisation and project tables and attach each
# project's fields to every one of its organisation rows.
creative_organisations = pd.read_pickle('../creative/output/creative_europe_organisations.pkl.gz')
creative_projects = pd.read_pickle('../creative/output/creative_europe_projects.pkl.gz')
creative = creative_projects.merge(
    creative_organisations,
    on='project_number',
    validate='1:m',
)
creative.shape

In [None]:
# Derive the postcode area and the GBP value of the maximum contribution.
creative = add_postcode_area(creative)
creative['max_contribution_gbp'] = (
    creative['max_contribution_eur'] * creative['eur_gbp']
)
# Build a row identifier from the project number and the partner number.
# A missing partner number formats as 'nan' and is relabelled 'coordinator'.
creative['my_eu_id'] = (
    'creative_' + creative['project_number'] + '_'
    + creative['partner_number'].apply('{:.0f}'.format)
    .str.replace('nan', 'coordinator', regex=False)
)
# Every row must have received a distinct identifier.
assert len(creative) == len(creative['my_eu_id'].unique())
creative.head()

### ESIF (ESF/ERDF)

In [None]:
# Load the England ESIF table and add its postcode area column.
# add_postcode_area mutates the frame and returns it, so the calls nest.
esif_england = add_postcode_area(
    pd.read_pickle('../esif/output/esif_england_2014_2020.pkl.gz')
)
esif_england.head()

In [None]:
# Load the Northern Ireland ESIF table and add its postcode area column.
# add_postcode_area mutates the frame and returns it, so the calls nest.
esif_ni = add_postcode_area(
    pd.read_pickle('../esif/output/esif_ni_2014_2020.pkl.gz')
)
esif_ni.head()

In [None]:
# Load the Scotland ESIF table and add its postcode area column.
# add_postcode_area mutates the frame and returns it, so the calls nest.
esif_scotland = add_postcode_area(
    pd.read_pickle('../esif/output/esif_scotland.pkl.gz')
)
esif_scotland.head()

In [None]:
# Load the Wales ESIF table and add its postcode area column.
# add_postcode_area mutates the frame and returns it, so the calls nest.
esif_wales = add_postcode_area(
    pd.read_pickle('../esif/output/esif_wales.pkl.gz')
)
esif_wales.head()

## Idea 2: Aggregate over each Area

### Time Ranges

In [None]:
# (earliest, latest) pairs per data set: CAP is keyed by year, every other
# table by its start_date column.
[(cap_by_area.year.min(), cap_by_area.year.max())] + [
    (frame.start_date.min(), frame.start_date.max())
    for frame in (
        fp7_projects,
        creative,
        esif_england,
        esif_ni,
        esif_scotland,
        esif_wales,
    )
]

### Annual Aggregates

In [None]:
# Total the numeric CAP columns per year and label the rows with their fund.
# ``numeric_only=True`` is explicit because pandas >= 2.0 no longer drops
# non-numeric columns by default: strings would be concatenated and other
# dtypes could raise.
cap_annual_total = cap_by_area.groupby('year').sum(numeric_only=True)
cap_annual_total['funds'] = 'CAP'
cap_annual_total

In [None]:
# Summary statistics of the FP7 end dates, shown before annualising.
fp7['end_date'].describe()

In [None]:
def find_annual_sum(df, keys, column):
    """Spread each row's ``column`` value evenly over its days, then total it per year.

    For every row, the value in ``column`` is divided equally across the days
    from ``start_date`` (inclusive) to ``end_date`` (exclusive). The daily
    amounts are then summed within each calendar year.

    Args:
        df: DataFrame with ``start_date`` and ``end_date`` columns plus the
            columns named in ``keys`` and ``column``.
        keys: List of column names whose values are copied from each input
            row onto the annual rows generated for it.
        column: Name of the column holding the amount to distribute.

    Returns:
        DataFrame with columns ``['year'] + keys + [column]``, holding one row
        per (input row, calendar year) pair. An empty ``df`` yields an empty
        frame with those columns.
    """
    output_columns = ['year'] + keys + [column]

    def reweight(row):
        # One entry per day from start to end; the end date is excluded.
        # ``inclusive='left'`` replaces ``closed='left'``, which pandas 2.0
        # removed.
        days = pd.date_range(row.start_date, row.end_date, inclusive='left')
        # NOTE(review): a row whose start_date equals its end_date produces no
        # days, so the division below is by zero. Confirm such rows cannot occur.
        daily = pd.DataFrame({
            'year_start': days,
            column: row[column] / days.shape[0]
        })
        # 'YS' (year start) is the current name of the deprecated 'AS' alias.
        annual = daily.resample('YS', on='year_start').sum()
        for key in keys:
            annual[key] = row[key]
        return annual

    # Iterate rows explicitly rather than relying on ``DataFrame.apply`` to
    # box one DataFrame per row into an object Series.
    annual_frames = [reweight(row) for _, row in df.iterrows()]
    if not annual_frames:
        # pd.concat raises on an empty list; return an empty, well-shaped frame.
        return pd.DataFrame(columns=output_columns)

    result = pd.concat(annual_frames)
    result.reset_index(inplace=True)
    result['year'] = result.year_start.dt.year
    result.drop('year_start', axis=1, inplace=True)
    return result[output_columns]

# Annualise the FP7 GBP contributions, keeping the 'rcn' and
# 'organization_id' columns on every generated row.
fp7_annual = find_annual_sum(
    df=fp7,
    keys=['rcn', 'organization_id'],
    column='contribution_gbp',
)
fp7_annual.shape

In [None]:
# Label every FP7 annual row with its fund name; the totals below are grouped
# on this 'funds' column.
fp7_annual['funds'] = 'FP7'
def find_fund_annual_totals(annual, column):
    """Sum ``column`` for each (funds, year) pair.

    Args:
        annual: DataFrame with ``funds`` and ``year`` columns plus the column
            named by ``column``.
        column: Name of the column to total.

    Returns:
        A new DataFrame with columns ``funds``, ``year`` and ``total``, one
        row per (funds, year) pair. ``annual`` is not modified.
    """
    per_fund_year = annual.groupby(['funds', 'year'])[column].sum()
    # Name the summed series 'total' before flattening the group index back
    # into ordinary columns.
    return per_fund_year.rename('total').reset_index()
# Collapse the FP7 annual rows to one GBP total per fund and year.
fp7_annual_total = find_fund_annual_totals(
    annual=fp7_annual,
    column='contribution_gbp',
)
fp7_annual_total

In [None]:
# Annualise the Creative Europe maximum contributions and total them per fund
# and year.
# NOTE(review): 'funds' is used as a key here but is never assigned to
# `creative` in this notebook; presumably it comes from the loaded pickles.
# Confirm.
creative_annual = find_annual_sum(
    df=creative,
    keys=['funds', 'my_eu_id'],
    column='max_contribution_gbp',
)
creative_annual_total = find_fund_annual_totals(
    annual=creative_annual,
    column='max_contribution_gbp',
)
creative_annual_total

In [None]:
def find_esif_annual_total(esif):
    """Return per-fund, per-year ``eu_investment`` totals for one ESIF table.

    Args:
        esif: DataFrame passed to ``find_annual_sum``; it is read for the
            ``funds``, ``my_eu_id`` and ``eu_investment`` columns, along with
            the date columns that helper uses.

    Returns:
        The frame produced by ``find_fund_annual_totals`` for the annualised
        ``eu_investment`` values.
    """
    yearly_rows = find_annual_sum(esif, ['funds', 'my_eu_id'], 'eu_investment')
    fund_totals = find_fund_annual_totals(yearly_rows, 'eu_investment')
    return fund_totals
# Annual EU investment totals for the England ESIF table.
esif_england_annual_total = find_esif_annual_total(esif=esif_england)
esif_england_annual_total

In [None]:
# Annual EU investment totals for the Northern Ireland ESIF table.
esif_ni_annual_total = find_esif_annual_total(esif=esif_ni)
esif_ni_annual_total

In [None]:
# Annual EU investment totals for the Scotland ESIF table.
esif_scotland_annual_total = find_esif_annual_total(esif=esif_scotland)
esif_scotland_annual_total

In [None]:
# Annual EU investment totals for the Wales ESIF table.
esif_wales_annual_total = find_esif_annual_total(esif=esif_wales)
esif_wales_annual_total

In [None]:
# Stack every fund's (funds, year, total) rows, add up rows that share a fund
# and year, then reshape to one row per year and one column per fund.
# cap_annual_total carries 'year' as its index, so it is reset and trimmed to
# the three shared columns first.
annual_totals = (
    pd.concat([
        cap_annual_total.reset_index()[['funds', 'year', 'total']],
        fp7_annual_total,
        creative_annual_total,
        esif_england_annual_total,
        esif_ni_annual_total,
        esif_scotland_annual_total,
        esif_wales_annual_total,
    ])
    .groupby(['funds', 'year'])
    .sum()
    .reset_index()
    .pivot(index='year', columns='funds', values='total')
)
annual_totals

In [None]:
# Stacked bar chart: one bar per year, one segment per fund.
annual_totals.plot(kind='bar', stacked=True)