In [None]:
import pandas as pd
import ezodf

import numpy as np

## Revenue from each region

Let's look at revenue from each region. 

In [None]:
contributions = pd.read_excel('input/b794db89.xls', skiprows=3)
contributions.shape

In [None]:
contributions.head()

In [None]:
list(contributions)

We're not interested in revenue per person per region, so we'll remove these figures

In [None]:
contributions = contributions.drop(['Unnamed: 4', '2014/15.1', '2015/16.1', ' 2016/17.1'], axis=1)
contributions.head()

In [None]:
contributions

We'll use the figures that divide North Sea oil revenue by population rather than by geography, but it doesn't make much difference.

In [None]:
# Keep only the rows at positions 16-28 of the table loaded above.
# NOTE(review): presumably these are the rows for the population-based North
# Sea oil split mentioned in the text above -- confirm against the spreadsheet.
# This cell rebinds `contributions` to a slice of itself, so re-running it
# slices the already-sliced frame; reload the data before re-running.
contributions = contributions[16:29].copy()
contributions.head()

The spend on the EU in the 2016/17 tax year was 13.8bn, not including the rebate, which we have ignored here as it is paid straight back. As a % of spending, this is 1.69%, but as a proportion of revenue, this is 1.90% (figures from https://www.ons.gov.uk/economy/governmentpublicsectorandtaxes/publicsectorfinance/articles/theukcontributiontotheeubudget/2017-10-31)

In [None]:
# 0.019 is the 1.90% share of revenue quoted in the text above for the
# 2016/17 EU contribution. Note that the source column name ' 2016/17'
# starts with a space.
contributions['EU_funding_2016'] = contributions[' 2016/17']*0.019
contributions.head(20)

## CAP data

In [None]:
raw_cap_by_area = pd.concat([
    pd.read_pickle('../cap/output/cap_by_area_{}.pkl.gz'.format(year))
    for year in range(2016, 2017)
])
raw_cap_by_area.head()

In [None]:
nuts = pd.read_csv('input/pc2018_uk_NUTS-2016_v1.0.zip', sep=';')
nuts.head()

In [None]:
# Strip the single quotes that wrap every value in the lookup file.
# Done column-wise with the vectorised string accessor instead of
# DataFrame.applymap, which is deprecated in recent pandas and whose lambda
# raised on any non-string cell such as a missing value. regex=False makes
# the literal replacement explicit.
nuts = nuts.apply(lambda column: column.str.replace("'", '', regex=False))
nuts.head()

In [None]:
# Reduce each postcode to its leading one or two letters (the postcode area).
# regex=True is required: since pandas 2.0 str.replace treats the pattern as
# a literal string by default, which would leave the codes unchanged.
nuts['postcode_area'] = nuts['CODE'].str.replace(
    r'^([A-Z]{1,2}).+$', r'\1', regex=True
)
nuts.head()

In [None]:
# Reduce each NUTS3 code to its leading (up to three) letters, used here as
# the NUTS1 region code.
# regex=True is required: since pandas 2.0 str.replace treats the pattern as
# a literal string by default, which would leave the codes unchanged.
nuts['nuts_1'] = nuts['NUTS3'].str.replace(
    r'^([A-Z]{1,3}).+$', r'\1', regex=True
)
nuts.head()

In [None]:

group = nuts.groupby('nuts_1')['postcode_area'].unique()
group.head()

In [None]:
group_df = pd.DataFrame.from_dict(group)
group_df.head()

In [None]:
group_df.postcode_area['UKC']

In [None]:
group_df.postcode_area['UKC'][np.isin(group_df.postcode_area['UKC'], group_df.postcode_area['UKD'])]

In [None]:
group_df.postcode_area['UKC'][np.isin(group_df.postcode_area['UKC'], group_df.postcode_area['UKE'])]

In [None]:
group_df.postcode_area['UKD'][np.isin(group_df.postcode_area['UKD'], group_df.postcode_area['UKE'])]

In [None]:
group_df.postcode_area['UKD'][np.isin(group_df.postcode_area['UKD'], group_df.postcode_area['UKF'])]

In [None]:
group_df.postcode_area['UKD'][np.isin(group_df.postcode_area['UKD'], group_df.postcode_area['UKG'])]

In [None]:
group_df.postcode_area['UKE'][np.isin(group_df.postcode_area['UKE'], group_df.postcode_area['UKF'])]

It turns out, even Scotland had crossovers!

In [None]:
group_df.postcode_area['UKM'][np.isin(group_df.postcode_area['UKM'], group_df.postcode_area['UKC'])]

In [None]:
group_df.postcode_area['UKM'][np.isin(group_df.postcode_area['UKM'], group_df.postcode_area['UKD'])]

Tried this but it didn't work - saving in case I want to come back to it

for nuts_1 in group_df:
    print(group_df.iloc[nuts_1]['nuts_1'])
    

I'm pretty sure Northern Ireland has just one postcode area, which should be unique to it. Let's check that there is a single postcode area.

In [None]:
group_df.postcode_area['UKN']

In [None]:
raw_cap_by_area[raw_cap_by_area['postcode_area'] == 'BT']

So, in 2016, Northern Ireland gave £316,692,000 and received £283,568,147 in CAP - this means we need to look at other data to see if they are a net receiver or giver. Interestingly, if we use the 1.69% (share of spending) figure instead, then they are a net receiver already.

In [None]:
def add_postcode_area(df):
    """Add a ``postcode_area`` column derived from the ``postcode`` column.

    The postcode area is the leading one or two capital letters of the
    postcode. Values that do not match the pattern are copied through
    unchanged.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame with a string ``postcode`` column. It is modified in place.

    Returns
    -------
    pandas.DataFrame
        The same ``df`` object, with the new column added.
    """
    # regex=True must be explicit: since pandas 2.0 str.replace defaults to a
    # literal match, which would silently copy the full postcode instead.
    df['postcode_area'] = df['postcode'].str.replace(
        r'^([A-Z]{1,2}).+$', r'\1', regex=True
    )
    return df

In [None]:
esif = pd.concat([
    pd.read_pickle('../esif/output/esif_{}.pkl.gz'.format(dataset))
    for dataset in ['ni_2014_2020']
], sort=True)
add_postcode_area(esif)
esif.head()

In [None]:
def find_annual_sum(df, keys, column):
    """Spread each row's ``column`` evenly over its date span and total it per year.

    Each row's amount is divided equally across the days from ``start_date``
    up to, but not including, ``end_date``; the daily shares are then summed
    by calendar year.

    Parameters
    ----------
    df : pandas.DataFrame
        Input rows; must contain ``start_date``, ``end_date``, ``column`` and
        every name in ``keys``.
    keys : list of str
        Columns copied unchanged from each input row onto its yearly rows.
    column : str
        Name of the numeric column to apportion.

    Returns
    -------
    pandas.DataFrame
        One row per (input row, calendar year) with columns
        ``['year'] + keys + [column]``. Empty input gives an empty frame
        with those columns.
    """
    keys = list(keys)
    out_columns = ['year'] + keys + [column]

    def reweight(row):
        # One entry per day in [start_date, end_date). The end day is dropped
        # by filtering rather than with date_range's `closed=` argument,
        # which was removed in pandas 2.0.
        days = pd.date_range(row.start_date, row.end_date)
        days = days[days < row.end_date]
        daily = pd.DataFrame({
            'year_start': days,
            column: row[column] / days.shape[0]
        })
        # 'YS' (year start) is the non-deprecated spelling of the 'AS' alias.
        annual = daily.resample('YS', on='year_start').sum()
        for key in keys:
            annual[key] = row[key]
        return annual

    # Iterate rows explicitly instead of relying on DataFrame.apply to hand
    # back a collection of DataFrames.
    pieces = [reweight(row) for _, row in df.iterrows()]
    if not pieces:
        # pd.concat raises on an empty list; return an empty, well-formed frame.
        return pd.DataFrame(columns=out_columns)

    result = pd.concat(pieces).reset_index()
    result['year'] = result['year_start'].dt.year
    return result.drop(columns='year_start')[out_columns]

In [None]:
def find_fund_annual_totals(annual, column):
    """Total ``column`` for every (funds, year) pair.

    Parameters
    ----------
    annual : pandas.DataFrame
        Frame containing ``funds``, ``year`` and ``column``.
    column : str
        Name of the numeric column to sum.

    Returns
    -------
    pandas.DataFrame
        Columns ``funds``, ``year`` and ``total``, one row per pair.
    """
    return (
        annual
        .groupby(['funds', 'year'])[column]
        .sum()
        .reset_index()
        .rename(columns={column: 'total'})
    )

In [None]:
esif_annual = find_annual_sum(esif, ['funds', 'postcode_area', 'my_eu_id'], 'eu_investment')
esif_annual_total = find_fund_annual_totals(esif_annual, 'eu_investment')
esif_annual_total.head()

In [None]:
# Filter the totals frame with a mask built from that same frame. The previous
# mask came from `esif_annual`, whose rows do not line up with
# `esif_annual_total`, so it selected the wrong rows or failed to align.
esif_annual_total[esif_annual_total['year'] == 2016]

OK, so this is £21,215,000 - adding it to the £283,568,147 we already have, that's £304,783,147 - still not quite a net receiver

#### Horizon 2020

In [None]:
h2020_organizations = pd.read_pickle('../cordis/output/h2020_organizations.pkl.gz')
h2020_projects = pd.read_pickle('../cordis/output/h2020_projects.pkl.gz')
h2020 = pd.merge(
    h2020_projects, h2020_organizations,
    left_on='rcn', right_on='project_rcn', validate='1:m'
)
add_postcode_area(h2020)
h2020['my_eu_id'] = 'h2020_' + h2020.project_rcn.astype('str') + '_' + h2020.organization_id.astype('str')
h2020['funds'] = 'H2020'
h2020.head()

In [None]:
def estimate_missing_cordis_contributions():
    """Add ``estimated_contribution_eur`` to the module-level ``h2020`` frame.

    The new column is ``contribution_eur`` where that is present; where it is
    missing, it is ``max_contribution_eur`` divided by ``num_organizations``
    for the same row. ``h2020`` is modified in place; nothing is returned.
    """
    missing = h2020.contribution_eur.isna()
    estimate = h2020.contribution_eur.copy()
    # Fill only the gaps; known contributions are left untouched.
    estimate[missing] = h2020.max_contribution_eur[missing] / h2020.num_organizations[missing]
    h2020['estimated_contribution_eur'] = estimate
estimate_missing_cordis_contributions()
h2020[h2020.contribution_eur.isna()].head()

In [None]:
h2020['contribution_gbp'] = h2020.contribution_eur * h2020.eur_gbp
h2020['estimated_contribution_gbp'] = h2020.estimated_contribution_eur * h2020.eur_gbp

In [None]:
h2020_ni = h2020[h2020['postcode_area'] == 'BT'].copy()
h2020_ni.head()

In [None]:
h2020_ni.columns

In [None]:
h2020_ni_annual = find_annual_sum(h2020_ni, ['postcode_area', 'my_eu_id', 'funds'], 'estimated_contribution_gbp')


In [None]:
 h2020_ni_annual.head()

In [None]:
# `find_fund_annual_totals` is already defined earlier in this notebook (it is
# used for the ESIF totals); the byte-identical redefinition that lived in this
# cell has been removed so there is a single definition to maintain.
cordis_annual_total = find_fund_annual_totals(h2020_ni_annual, 'estimated_contribution_gbp')
cordis_annual_total.head()

That's £7,961,557, so now we're at £312,744,704 in total
