# Build Aggregate Data

In [None]:
import pandas as pd

## Load Cleaned Data

### CAP

In [None]:
# Load the cleaned CAP table from the pickled output and preview it.
cap_by_area = pd.read_pickle('../cap/output/cap_by_area.pkl.gz')
cap_by_area.head()

### CORDIS

In [None]:
# Load the cleaned FP7 organisations table from the pickled output and preview it.
fp7_organizations = pd.read_pickle('../cordis/output/fp7_organizations.pkl.gz')
fp7_organizations.head()

In [None]:
# Load the FP7 projects and rename the date columns to the start_date /
# end_date names that the other sources in this notebook are accessed by.
fp7_projects = pd.read_pickle(
    '../cordis/output/fp7_projects.pkl.gz'
).rename(columns={'startDate': 'start_date', 'endDate': 'end_date'})
fp7_projects.head()

In [None]:
# Attach each organisation row to its project. validate='1:m' makes pandas
# raise if a project rcn is duplicated on the projects side.
fp7 = fp7_projects.merge(
    fp7_organizations,
    left_on='rcn',
    right_on='projectRcn',
    validate='1:m',
)
fp7.head()

In [None]:
# Summary statistics for the merged FP7 table.
fp7.describe()

### Creative Europe

In [None]:
# Load the cleaned Creative Europe organisations table and show its size.
creative_organisations = pd.read_pickle('../creative/output/creative_europe_organisations.pkl.gz')
creative_organisations.shape

In [None]:
# Load the cleaned Creative Europe projects table and show its size.
creative_projects = pd.read_pickle('../creative/output/creative_europe_projects.pkl.gz')
creative_projects.shape

In [None]:
# Attach each organisation row to its project. validate='1:m' makes pandas
# raise if a project_number is duplicated on the projects side.
creative = creative_projects.merge(
    creative_organisations,
    on='project_number',
    validate='1:m',
)
creative.shape

In [None]:
# Preview the merged Creative Europe table.
creative.head()

### ESIF (ESF/ERDF)

In [None]:
# Load the cleaned ESIF table for England and preview it.
esif_england = pd.read_pickle('../esif/output/esif_england_2014_2020.pkl.gz')
esif_england.head()

In [None]:
# Load the cleaned ESIF table for Northern Ireland and preview it.
esif_ni = pd.read_pickle('../esif/output/esif_ni_2014_2020.pkl.gz')
esif_ni.head()

In [None]:
# Load the cleaned ESIF table for Scotland and preview it.
esif_scotland = pd.read_pickle('../esif/output/esif_scotland.pkl.gz')
esif_scotland.head()

In [None]:
# Load the cleaned ESIF table for Wales and preview it.
esif_wales = pd.read_pickle('../esif/output/esif_wales.pkl.gz')
esif_wales.head()

## Idea 2: Aggregate over each Area

To do this, we need to convert from EUR to GBP. Exchange rates have been quite variable, so if we want to get a reasonable estimate we first need to disaggregate over time. It looks like we can go down to monthly, which should be enough.

### Time Ranges

In [None]:
# Earliest point covered by each source. CAP is read from its `year` column;
# every other source is read from its `start_date` column.
[cap_by_area.year.min()] + [
    source.start_date.min()
    for source in (
        fp7_projects,
        creative,
        esif_england,
        esif_ni,
        esif_scotland,
        esif_wales,
    )
]

In [None]:
# Latest point covered by each source. CAP is read from its `year` column;
# every other source is read from its `end_date` column.
[cap_by_area.year.max()] + [
    source.end_date.max()
    for source in (
        fp7_projects,
        creative,
        esif_england,
        esif_ni,
        esif_scotland,
        esif_wales,
    )
]

### Exchange Rates

In [None]:
# Load the exchange-rate table and look at its last rows.
# The conversion helper later in this notebook joins it on `month_start`
# and multiplies amounts by its `rate` column.
euro_gbp = pd.read_pickle('../exchange_rates/output/exchange_rates.pkl.gz')
euro_gbp.tail()

### Monthly Aggregates

#### CAP

TODO: Need to account for co-financing.

In [None]:
# Turn each annual CAP amount into an even per-month amount (annual / 12).
# `assign` returns a new frame, so cap_by_area itself is left untouched.
monthly_cap_by_area = cap_by_area.assign(**{
    amount_column: cap_by_area[amount_column] / 12
    for amount_column in (
        'otherEAGF',
        'directEAGF',
        'ruralDevelopment',
        'total',
    )
})
monthly_cap_by_area.head()

In [None]:
# Lookup table with one row per calendar month of 2015, keyed by `year` so it
# can be joined onto the per-year CAP rows.
# NOTE(review): month_start is a plain 'YYYY-MM-01' string here, whereas the
# CORDIS helper in this notebook produces timestamps -- confirm before
# combining the two.
months_2015 = pd.DataFrame({
    'month_start': [
        '{:4d}-{:02d}-01'.format(2015, month_number)
        for month_number in range(1, 13)
    ],
})
months_2015.insert(0, 'year', 2015)
months_2015

In [None]:
# Row/column count before the month expansion, for comparison with the next cell.
monthly_cap_by_area.shape

In [None]:
# Repeat every CAP row once per month of its year. This is an inner join on
# `year`, so rows whose year is not present in months_2015 are dropped.
monthly_cap_by_area = monthly_cap_by_area.merge(months_2015, on='year')
monthly_cap_by_area.shape

In [None]:
# Preview the per-month CAP rows.
monthly_cap_by_area.head()

#### CORDIS

In [None]:
def find_monthly_money_by_area(df):
    """Spread each record's EU money evenly over its days and total it per month.

    Parameters
    ----------
    df : pandas.DataFrame
        Must have ``postcode``, ``start_date``, ``end_date`` and ``eu_money``
        columns. Rows missing the amount or either date are ignored. The
        input frame is not modified.

    Returns
    -------
    pandas.DataFrame
        One row per input record and calendar month, with columns
        ``month_start`` (first day of the month), ``eu_money`` (the part of
        the record's amount that falls in that month) and ``postcode_area``
        (the leading one or two capital letters of the postcode; postcodes
        that do not match that pattern are passed through unchanged).
        An input with no usable rows yields an empty frame with these columns.

    Notes
    -----
    The funding period is half-open: ``start_date`` is included and
    ``end_date`` is excluded. A record whose period contains no days
    (``end_date <= start_date``) has its whole amount assigned to the month of
    ``start_date``, so no money is lost and no division by zero occurs.
    """
    columns = ['month_start', 'eu_money', 'postcode_area']

    usable = df.copy()
    # regex=True must be explicit: pandas >= 2.0 treats the pattern as a
    # literal string by default, which would leave every postcode unchanged.
    usable['postcode_area'] = usable.postcode.str.replace(
        r'^([A-Z]{1,2}).+$', r'\1', regex=True)
    complete = (
        usable.eu_money.notna()
        & usable.start_date.notna()
        & usable.end_date.notna()
    )
    usable = usable.loc[
        complete, ['start_date', 'end_date', 'eu_money', 'postcode_area']]

    def expand_monthly(start_date, end_date, eu_money, postcode_area):
        """Return the monthly totals for one record, indexed by month start."""
        start = pd.Timestamp(start_date)
        end = pd.Timestamp(end_date)
        # Daily range with the end date excluded. Filtering explicitly avoids
        # the `closed=` / `inclusive=` keyword, which differs between pandas
        # versions.
        days = pd.date_range(start, end)
        days = days[days < end]
        if days.empty:
            # Zero-length (or inverted) period: book everything on the start day.
            days = pd.DatetimeIndex([start])
        daily = pd.DataFrame({
            'month_start': days,
            'eu_money': eu_money / len(days),
        })
        monthly = daily.resample('MS', on='month_start').sum()
        monthly['postcode_area'] = postcode_area
        return monthly

    pieces = [
        expand_monthly(*record)
        for record in usable.itertuples(index=False, name=None)
    ]
    if not pieces:
        # pd.concat raises on an empty list, so return an empty result instead.
        return pd.DataFrame(columns=columns)

    result = pd.concat(pieces)
    result.reset_index(inplace=True)
    return result
    
# FP7 stores the EU amount as `ecContribution`; the helper expects `eu_money`.
monthly_fp7_by_area = find_monthly_money_by_area(
    fp7.rename(columns={'ecContribution': 'eu_money'}))
monthly_fp7_by_area.head()

In [None]:
# Number of (record, month) rows produced for FP7.
monthly_fp7_by_area.shape

In [None]:
def convert_eur_to_gbp(monthly_eu_money, euro_gbp):
    """Multiply each row's ``eu_money`` by the exchange rate of its month.

    Parameters
    ----------
    monthly_eu_money : pandas.DataFrame
        Must have ``month_start`` and ``eu_money`` columns. Not modified.
    euro_gbp : pandas.DataFrame
        Must provide ``month_start`` and ``rate``, with at most one row per
        ``month_start`` (enforced by ``validate='m:1'``).

    Returns
    -------
    pandas.DataFrame
        The merged rows, in the same order as ``monthly_eu_money``, with
        ``eu_money`` multiplied by ``rate`` and the ``rate`` column removed.

    Warns
    -----
    UserWarning
        If any row with an amount has no matching rate. Such rows keep their
        place (left join) but their ``eu_money`` becomes NaN, which a later
        ``sum`` would silently skip.
    """
    import warnings

    with_rate = pd.merge(
        monthly_eu_money, euro_gbp, on='month_start', how='left', validate='m:1')

    unmatched = with_rate.rate.isna() & with_rate.eu_money.notna()
    if unmatched.any():
        warnings.warn(
            '{} row(s) have no exchange rate for their month_start; '
            'their eu_money is set to NaN'.format(int(unmatched.sum())),
            stacklevel=2)

    with_rate['eu_money'] = with_rate.eu_money * with_rate.rate
    return with_rate.drop('rate', axis=1)

# Convert the FP7 monthly amounts to GBP. The shape check guards against the
# join with the rate table adding or dropping rows or columns.
monthly_fp7_by_area_gbp_records = convert_eur_to_gbp(monthly_fp7_by_area, euro_gbp)
assert monthly_fp7_by_area_gbp_records.shape == monthly_fp7_by_area.shape
monthly_fp7_by_area_gbp_records.head()

In [None]:
# Total GBP per month and postcode area, with the group keys as ordinary columns.
monthly_fp7_by_area_gbp = (
    monthly_fp7_by_area_gbp_records
    .groupby(['month_start', 'postcode_area'])
    .sum()
    .reset_index()
)
monthly_fp7_by_area_gbp.head()

In [None]:
# Total GBP per postcode area over the whole period. Only eu_money is summed:
# month_start is a date column with no meaningful total, and recent pandas
# (2.0+) raises when asked to sum it instead of silently dropping it.
all_time_fp7_by_area_gbp = (
    monthly_fp7_by_area_gbp_records
    .groupby('postcode_area')['eu_money']
    .sum()
    .reset_index()
)
all_time_fp7_by_area_gbp.head()

#### Creative Europe

TODO: need to estimate per-partner contribution by dividing through by number of partners.

In [None]:
# Creative Europe stores the EU amount as `eu_investment`; the helper expects `eu_money`.
monthly_creative_by_area = find_monthly_money_by_area(
    creative.rename(columns={'eu_investment': 'eu_money'}))
monthly_creative_by_area.head()

In [None]:
# Convert the Creative Europe monthly amounts to GBP. The shape check guards
# against the join with the rate table adding or dropping rows or columns.
monthly_creative_by_area_gbp_records = convert_eur_to_gbp(monthly_creative_by_area, euro_gbp)
assert monthly_creative_by_area_gbp_records.shape == monthly_creative_by_area.shape
monthly_creative_by_area_gbp_records.head()

In [None]:
# Total GBP per month and postcode area, with the group keys as ordinary columns.
monthly_creative_by_area_gbp = (
    monthly_creative_by_area_gbp_records
    .groupby(['month_start', 'postcode_area'])
    .sum()
    .reset_index()
)
monthly_creative_by_area_gbp.head()

In [None]:
# Total GBP per postcode area over the whole period. Only eu_money is summed:
# month_start is a date column with no meaningful total, and recent pandas
# (2.0+) raises when asked to sum it instead of silently dropping it.
all_time_creative_by_area_gbp = (
    monthly_creative_by_area_gbp_records
    .groupby('postcode_area')['eu_money']
    .sum()
    .reset_index()
)
all_time_creative_by_area_gbp.head()

#### ESIF (ESF/ERDF)

These are already in GBP, but we might as well get the monthly numbers to see what they look like.