In [None]:
import pandas as pd
import ezodf

import numpy as np

## Revenue from each region

Let's look at revenue from each region. We've got this data from here: https://www.ons.gov.uk/economy/governmentpublicsectorandtaxes/publicsectorfinance/articles/countryandregionalpublicsectorfinances/2016to2017

In [None]:
contributions = pd.read_excel('input/b794db89.xls', skiprows=3)
contributions.shape

In [None]:
contributions.head(20)

In [None]:
list(contributions)

We're not interested in revenue per person per region, so we'll remove these figures

In [None]:
# Remove the unnamed column and the ".1"-suffixed year columns, which the
# notebook text describes as the per-person revenue figures.
per_person_columns = ['Unnamed: 4', '2014/15.1', '2015/16.1', ' 2016/17.1']
contributions = contributions.drop(columns=per_person_columns)
contributions.head()

In [None]:
contributions

We'll use the figures that divide North Sea oil by population rather than by geography, but it doesn't make much difference.

In [None]:
# Keep rows at positions 16..28 only. .copy() detaches the slice so that
# columns added later are written to an independent frame.
contributions = contributions.iloc[16:29].copy()
contributions.head()

The spend on the EU in the 2016/17 tax year was 13.8bn, not including the rebate, which we have ignored here as it is paid straight back. As a % of spending, this is 1.69%. As a proportion of revenue, this is 1.90%, but after reflection I think the 1.69% figure is the right one to use, as it is reflective of the proportion of our income that we spend on the EU, the other 0.21% is borrowed. (figures from https://www.ons.gov.uk/economy/governmentpublicsectorandtaxes/publicsectorfinance/articles/theukcontributiontotheeubudget/2017-10-31)

In [None]:
# 1.69% is the share of spending that goes to the EU (see the text above);
# apply it to each row's 2016/17 revenue figure.
eu_share_of_spending = 0.0169
contributions['EU_funding_2016'] = eu_share_of_spending * contributions[' 2016/17']
contributions.head(20)

## CAP data

CAP data is all at outcode level, so we will need to translate from outcode, where we have the investment figures, to NUTS1 region, where we have the tax figures. 

Let's import the CAP Data

In [None]:
# One pickled CAP-by-area frame per year, stacked into a single frame.
# range(2016, 2017) covers 2016 only; widen the range to load more years.
cap_paths = [
    '../cap/output/cap_by_area_{}.pkl.gz'.format(year)
    for year in range(2016, 2017)
]
raw_cap_by_area = pd.concat([pd.read_pickle(path) for path in cap_paths])
raw_cap_by_area.head()

And the link from NUTS data to postcode - this is at NUTS3 level, but the first 3 digits of the NUTS3 code are the NUTS1 code. We'll clean the data and find the postcode area

In [None]:
nuts = pd.read_csv('input/pc2018_uk_NUTS-2016_v1.0.zip', sep=';')
nuts.head()

In [None]:
# Strip every single-quote character from every value.
# Column-wise .str.replace is vectorised, leaves missing values alone instead
# of raising, and avoids DataFrame.applymap, which newer pandas deprecates.
# regex=False: the quote is a literal character, not a pattern.
nuts = nuts.apply(lambda column: column.str.replace("'", '', regex=False))
nuts.head()

In [None]:
# Postcode area = the leading one or two capital letters of CODE.
# regex=True must be explicit: from pandas 2.0 str.replace treats the pattern
# as a literal string by default, which would silently leave the whole code
# in place instead of extracting the area.
nuts['postcode_area'] = \
   nuts['CODE'].str.replace(r'^([A-Z]{1,2}).+$', r'\1', regex=True)
nuts.head()

In [None]:
# NUTS1 code = the leading (up to) three capital letters of the NUTS3 code.
# regex=True must be explicit: from pandas 2.0 str.replace treats the pattern
# as a literal string by default, which would silently leave the full NUTS3
# code in place.
nuts['nuts_1'] = \
   nuts['NUTS3'].str.replace(r'^([A-Z]{1,3}).+$', r'\1', regex=True)
nuts.head()

Let's see if there is overlap with the same postcode area in different NUTS1 areas. 

In [None]:

group = nuts.groupby('nuts_1')['postcode_area'].unique()
group.head()

In [None]:
# Wrap the per-region Series in a one-column frame; the column keeps the
# Series' name ('postcode_area') and the index stays the NUTS1 code.
group_df = group.to_frame()
group_df.head()

In [None]:
group_df.postcode_area['UKC']

In [None]:
group_df.postcode_area['UKC'][np.isin(group_df.postcode_area['UKC'], group_df.postcode_area['UKD'])]

In [None]:
group_df.postcode_area['UKC'][np.isin(group_df.postcode_area['UKC'], group_df.postcode_area['UKE'])]

In [None]:
group_df.postcode_area['UKD'][np.isin(group_df.postcode_area['UKD'], group_df.postcode_area['UKE'])]

In [None]:
group_df.postcode_area['UKD'][np.isin(group_df.postcode_area['UKD'], group_df.postcode_area['UKF'])]

In [None]:
group_df.postcode_area['UKD'][np.isin(group_df.postcode_area['UKD'], group_df.postcode_area['UKG'])]

In [None]:
group_df.postcode_area['UKE'][np.isin(group_df.postcode_area['UKE'], group_df.postcode_area['UKF'])]

It turns out, even Scotland had crossovers!

In [None]:
group_df.postcode_area['UKM'][np.isin(group_df.postcode_area['UKM'], group_df.postcode_area['UKC'])]

In [None]:
group_df.postcode_area['UKM'][np.isin(group_df.postcode_area['UKM'], group_df.postcode_area['UKD'])]

Tried this but it didn't work - saving in case I want to come back to it

for nuts_1 in group_df:
    print(group_df.iloc[nuts_1]['nuts_1'])
    

In [None]:
group_df.postcode_area['UKN']

Let's look at Wales to see if there is any crossover with its bordering regions

In [None]:
group_df.postcode_area['UKL'][np.isin(group_df.postcode_area['UKL'], group_df.postcode_area['UKD'])]

In [None]:
group_df.postcode_area['UKL'][np.isin(group_df.postcode_area['UKL'], group_df.postcode_area['UKG'])]

In [None]:
group_df.postcode_area['UKL'][np.isin(group_df.postcode_area['UKL'], group_df.postcode_area['UKK'])]

I'm pretty sure Northern Ireland has just one postcode area (BT), which should be unique to it. Let's check that the CAP data has a single entry for it.

In [None]:
raw_cap_by_area[raw_cap_by_area['postcode_area'] == 'BT']

# Using Northern Ireland as an example 

Northern Ireland has a simple mapping of a single postcode area, so we can try out the data with it to see what sorts of figures we're looking at. It also receives a really high amount of CAP funding, so if Northern Ireland is not a net receiver then no one is likely to be!

So, in 2016, Northern Ireland gave £281,689,000 and received £283,568,147 in CAP. If we're using the 1.69% figure, then they are a net receiver already. However, it'll be interesting to see how much higher we can get it, and whether we can get past the 1.9% figure.

### NI ESIF

Let's look at ESIF first, and find the total amount spent in Northern Ireland in 2016. 

In [None]:
def add_postcode_area(df):
    """Add a ``postcode_area`` column derived from ``df['postcode']``.

    The postcode area is the leading one or two capital letters of the
    postcode (e.g. ``'BT'`` for ``'BT1 1AA'``). Values that do not match the
    pattern are copied through unchanged.

    The frame is modified in place and is also returned, so the call can be
    used either as a statement or as an expression.
    """
    # regex=True must be explicit: from pandas 2.0 str.replace defaults to
    # literal matching, which would silently copy the full postcode instead
    # of extracting the area.
    df['postcode_area'] = df['postcode'].str.replace(
        r'^([A-Z]{1,2}).+$', r'\1', regex=True)
    return df

In [None]:
# Only the 'ni_2014_2020' ESIF dataset is loaded here; add more dataset names
# to the list to stack further pickles into the same frame.
esif_datasets = ['ni_2014_2020']
esif_frames = [
    pd.read_pickle('../esif/output/esif_{}.pkl.gz'.format(name))
    for name in esif_datasets
]
# add_postcode_area mutates and returns the same frame.
esif = add_postcode_area(pd.concat(esif_frames, sort=True))
esif.head()

In [None]:
def find_annual_sum(df, keys, column):
    """Spread each row's amount evenly over its date span and total it per year.

    For every row, the value in ``column`` is divided equally between the
    days from ``start_date`` (inclusive) to ``end_date`` (exclusive); the
    daily amounts are then summed per calendar year.

    Parameters
    ----------
    df : DataFrame
        Must have ``start_date``, ``end_date``, ``column`` and every name in
        ``keys`` as columns.
    keys : list of str
        Columns copied unchanged onto each yearly row.
    column : str
        Name of the numeric column to apportion.

    Returns
    -------
    DataFrame
        Columns ``['year'] + keys + [column]``, with one row per
        (input row, calendar year bucket of that row's span). An empty input
        gives an empty frame with those columns. Rows whose span contains no
        days are not handled specially.
    """
    import inspect

    # pandas 1.4 renamed date_range's `closed` argument to `inclusive`, and
    # pandas 2.0 removed `closed`; use whichever this installation accepts.
    if 'inclusive' in inspect.signature(pd.date_range).parameters:
        left_closed = {'inclusive': 'left'}
    else:
        left_closed = {'closed': 'left'}

    def reweight(row):
        # One entry per day in [start_date, end_date).
        days = pd.date_range(row.start_date, row.end_date, **left_closed)
        daily = pd.DataFrame({
            'year_start': days,
            column: row[column] / days.shape[0]
        })
        # 'YS' (year start) is the non-deprecated spelling of 'AS'.
        annual = daily.resample('YS', on='year_start').sum()
        for key in keys:
            annual[key] = row[key]
        return annual

    # Iterate explicitly instead of relying on how DataFrame.apply wraps
    # DataFrame-valued results.
    frames = [reweight(row) for _, row in df.iterrows()]
    output_columns = ['year'] + list(keys) + [column]
    if not frames:
        # pd.concat refuses an empty list; return the documented shape instead.
        return pd.DataFrame(columns=output_columns)

    result = pd.concat(frames).reset_index()
    result['year'] = result['year_start'].dt.year
    return result[output_columns]

In [None]:
def find_fund_annual_totals(annual, column):
    """Sum ``column`` for each (funds, year) pair.

    Returns a frame with columns ``funds``, ``year`` and ``total``, where
    ``total`` holds the summed values of ``column``.
    """
    totals = annual.groupby(['funds', 'year'], as_index=False)[column].sum()
    return totals.rename(columns={column: 'total'})

In [None]:
esif_annual = find_annual_sum(esif, ['funds', 'postcode_area', 'my_eu_id'], 'eu_investment')
esif_annual_total = find_fund_annual_totals(esif_annual, 'eu_investment')
esif_annual_total.head()

In [None]:
# Build the mask from the frame being filtered. A mask taken from
# esif_annual (a different, longer frame) is aligned by index label and so
# picks rows of esif_annual_total that have nothing to do with 2016.
esif_annual_total[esif_annual_total['year'] == 2016]

OK, so this is £21,215,000 - adding it to the £283,568,147 we already have, that's £304,783,147 - still not quite up to 1.9%

#### Horizon 2020

In [None]:
h2020_projects = pd.read_pickle('../cordis/output/h2020_projects.pkl.gz')
h2020_organizations = pd.read_pickle('../cordis/output/h2020_organizations.pkl.gz')

# validate='1:m' makes the merge raise if 'rcn' is not unique on the
# projects side, guarding against accidental row multiplication.
h2020 = h2020_projects.merge(
    h2020_organizations,
    left_on='rcn', right_on='project_rcn', validate='1:m'
)
h2020 = add_postcode_area(h2020)

# Identifier built from the project and organisation ids of each row.
project_text = h2020.project_rcn.astype('str')
organization_text = h2020.organization_id.astype('str')
h2020['my_eu_id'] = 'h2020_' + project_text + '_' + organization_text
h2020['funds'] = 'H2020'
h2020.head()

In [None]:
def estimate_missing_cordis_contributions():
    """Add ``estimated_contribution_eur`` to the notebook-level ``h2020`` frame.

    Rows with a ``contribution_eur`` value keep it. Rows where it is missing
    get ``max_contribution_eur`` divided by ``num_organizations`` instead.
    """
    equal_share = h2020.max_contribution_eur / h2020.num_organizations
    h2020['estimated_contribution_eur'] = h2020.contribution_eur.fillna(equal_share)


estimate_missing_cordis_contributions()
# Show some of the rows whose contribution had to be estimated.
h2020.loc[h2020.contribution_eur.isna()].head()

In [None]:
# Multiply both the reported and the estimated euro amounts by the row's
# eur_gbp value to get the corresponding *_gbp columns.
h2020['contribution_gbp'] = h2020['contribution_eur'].mul(h2020['eur_gbp'])
h2020['estimated_contribution_gbp'] = h2020['estimated_contribution_eur'].mul(h2020['eur_gbp'])

In [None]:
# Keep only rows in the 'BT' postcode area; copy so later edits to the
# subset do not touch the full h2020 frame.
in_bt_area = h2020['postcode_area'] == 'BT'
h2020_ni = h2020.loc[in_bt_area].copy()
h2020_ni.head()

In [None]:
h2020_ni.columns

In [None]:
h2020_ni_annual = find_annual_sum(h2020_ni, ['postcode_area', 'my_eu_id', 'funds'], 'estimated_contribution_gbp')


In [None]:
 h2020_ni_annual.head()

In [None]:
# find_fund_annual_totals is already defined earlier in the notebook (in the
# ESIF section). The byte-identical second definition that sat in this cell
# has been removed so the two copies cannot drift apart.
cordis_annual_total = find_fund_annual_totals(h2020_ni_annual, 'estimated_contribution_gbp')
cordis_annual_total.head()

That's £7,961,557, so now we're at £312,744,704 in total


#### Erasmus

In [None]:
erasmus_projects = pd.read_pickle('../erasmus/output/erasmus_mobility_projects.pkl.gz')
erasmus_organisations = pd.read_pickle('../erasmus/output/erasmus_mobility_organisations.pkl.gz')

# validate='1:m' raises if project_identifier is not unique among projects.
erasmus = erasmus_projects.merge(
    erasmus_organisations, on='project_identifier', validate='1:m'
)
erasmus.shape

In [None]:
erasmus = add_postcode_area(erasmus)

# The data must carry exactly one distinct 'funds' value before that column
# is overwritten with the label used in this notebook.
assert erasmus.funds.nunique(dropna=False) == 1
erasmus['funds'] = 'Erasmus'

# A missing partner_number formats as 'nan'; such rows are labelled
# 'coordinator' instead.
partner_label = erasmus.partner_number.map('{:.0f}'.format).str.replace(
    'nan', 'coordinator', regex=False)
erasmus['my_eu_id'] = 'erasmus_' + erasmus.project_identifier + '_' + partner_label

# Every row must end up with its own identifier.
assert erasmus.my_eu_id.is_unique
erasmus.head()

In [None]:
[erasmus.shape[0], erasmus.max_contribution_eur.isna().sum()]

In [None]:
# Estimate each row's share as the project maximum divided by the number of
# organisations, then derive the *_gbp columns via the row's eur_gbp value.
share_eur = erasmus.max_contribution_eur / erasmus.num_organisations
erasmus['estimated_contribution_eur'] = share_eur
erasmus['estimated_contribution_gbp'] = share_eur * erasmus.eur_gbp
erasmus['max_contribution_gbp'] = erasmus.max_contribution_eur * erasmus.eur_gbp
erasmus.head()

In [None]:
# Dates are synthesised from call_year alone: each row is given a span from
# 1 January to 31 December of its call year.
call_year_text = erasmus.call_year.apply(str)
erasmus['start_date'] = pd.to_datetime(call_year_text + '-01-01')
erasmus['end_date'] = pd.to_datetime(call_year_text + '-12-31')
erasmus.head()

In [None]:
erasmus_annual = find_annual_sum(erasmus, ['funds', 'postcode_area', 'my_eu_id'], 'estimated_contribution_gbp')
erasmus_annual_total = find_fund_annual_totals(erasmus_annual, 'estimated_contribution_gbp')
erasmus_annual_total.head()

In [None]:
def find_fund_annual_area_totals(annual, column):
    """Sum ``column`` for each (funds, postcode_area, year) combination.

    Returns a frame with columns ``funds``, ``postcode_area``, ``year`` and
    ``total``, where ``total`` holds the summed values of ``column``.
    """
    group_keys = ['funds', 'postcode_area', 'year']
    totals = annual.groupby(group_keys, as_index=False)[column].sum()
    return totals.rename(columns={column: 'total'})


In [None]:
erasmus_annual_area_total = find_fund_annual_area_totals(erasmus_annual, 'estimated_contribution_gbp')
erasmus_annual_area_total.head()

In [None]:
erasmus_annual_area_total[erasmus_annual_area_total.postcode_area == 'BT']

So for 2016 that is £6,001,891, which takes our total to £318,746,595 - well over the 1.9% figure. Whichever way you look at it, NI is a net receiver. 

## More Postcode area Investigation

Let's look at how many postcodes are in each NUTS area, so we can see what % of CAP funding we will need to put in each region. 

In [None]:
group_count = pd.DataFrame(nuts.groupby(['nuts_1','postcode_area']).size())

In [None]:
group_count = group_count.reset_index()
group_count.head()

In [None]:
postcode_count = nuts.groupby(['postcode_area']).size()

In [None]:
postcode_count.head()

In [None]:
# Turn the per-area size Series into a frame and move 'postcode_area' out of
# the index; the unnamed count column is labelled 0.
postcode_count = postcode_count.to_frame().reset_index()
postcode_count.head()

In [None]:
postcode_merge = pd.merge(group_count, postcode_count, how='outer', on='postcode_area')
postcode_merge.head()

In [None]:
postcode_merge['percent_postcodes'] = postcode_merge['0_x']/postcode_merge['0_y']
postcode_merge.head()

## CAP per region

Now that we have the number of postcodes per region, let's look at CAP funding per region.

In [None]:
cap_per_postcode = pd.merge(postcode_merge, raw_cap_by_area, how='outer', on='postcode_area')
cap_per_postcode.head()

In [None]:
cap_per_postcode['cap_in_area'] = cap_per_postcode['total']*cap_per_postcode['percent_postcodes']

In [None]:
# Total per (NUTS1 region, postcode area). The amount itself must not be a
# grouping key: grouping by cap_in_area collapses rows of a region that
# happen to share the same amount into one, so the later per-region sum
# counts that amount only once.
# Rows with no cap_in_area are dropped first, as grouping on that column
# used to do implicitly (groupby discards missing keys).
cap_grouped = (
    cap_per_postcode
    .dropna(subset=['cap_in_area'])
    .groupby(['nuts_1', 'postcode_area'])
    .sum()
    .reset_index()
)
cap_grouped.head()

In [None]:
# Collapse to one row per NUTS1 region by summing every remaining column,
# then move 'nuts_1' back out of the index.
cap_totals = cap_grouped.groupby('nuts_1').sum().reset_index()

cap_totals.head()

In [None]:
contributions['Country or region']

In [None]:
# NUTS1 code for each of the 13 rows, assigned by position. The tenth row
# gets an empty code (presumably an aggregate row with no NUTS1 region of
# its own; confirm against the 'Country or region' column).
nuts_1_by_row = [
    'UKC', 'UKD', 'UKE', 'UKF', 'UKG', 'UKH', 'UKI', 'UKJ', 'UKK',
    '',
    'UKL', 'UKM', 'UKN',
]
contributions['nuts_1'] = nuts_1_by_row
contributions.head()


In [None]:
cap_vs_contribution = pd.merge(contributions, cap_totals, how='outer', on='nuts_1')
cap_vs_contribution.head()

In [None]:
# Drop the columns that are not wanted in the final comparison table.
surplus_columns = [
    '2014/15', '2015/16', ' 2016/17',
    '0_x', '0_y', 'percent_postcodes',
    'otherEAGF', 'directEAGF', 'ruralDevelopment',
    'count', 'year', 'total',
]
cap_vs_contribution = cap_vs_contribution.drop(columns=surplus_columns)


In [None]:
# Rescale cap_in_area by one million (i.e. express it in millions).
# Note: re-running this cell divides again.
cap_vs_contribution['cap_in_area'] = cap_vs_contribution['cap_in_area'].div(1000000)
cap_vs_contribution.head(20)