# Import Interreg NWE Data 
### http://www.nweurope.eu/media/4533/list-of-beneficiaries-for-website.xlsx

In [None]:
import json

import pandas as pd
import numpy as np

pd.set_option('display.max_columns', 50)
pd.set_option('display.max_colwidth', 100)

In [None]:
all_benefs = pd.read_excel('input/list-of-beneficiaries-for-website.xlsx')
all_benefs.shape

## Load and Restrict Data to United Kingdom

In [None]:
#all_benefs.columns

In [None]:
# Map the spreadsheet's four-language headers (the same label in English,
# French, German and Dutch, joined by newlines) to short column names.
# Each key must match the header character for character, including the
# trailing spaces some of them carry; with rename()'s default settings a
# key that does not match is ignored rather than reported.
all_benefs.rename({
    'No\nNo\nNr.\nNr.': 'Id',
    'Beneficiary name\nNom du bénéfictiaire\nName des Begünstigten \nNaam van de begunstigde ': 'beneficiary',
    "Operation name\nNom de l'opération\nBezeichnung des Vorhabens\nNaam van de concrete actie": 'project',
    "Operation summary\nRésumé de l'opération\nZusammenfassung des Vorhabens\nSamenvatting van de concrete actie": 'project_summary',
    "Operation start date\nDate de début de l'opération\nDatum des Beginns des Vorhabens\nBegindatum van de concrete actie": 'start_date',
    "Operation end date\nDate de fin de l'opération \nDatum des Endes des Vorhabens \nEinddatum van de concrete actie": 'end_date',
    'Total eligible expenditure allocated to the beneficiary\nTotal des dépenses éligibles attribué au bénéficiaire\nGesamtbetrag der förderfähigen Ausgaben an Begünstigte\nTotale subsidiabele uitgaven toegewezen aan begunstigde': 'funding',
    "Union co-financing rate\nTaux de cofinancement par l'Union \nUnions-Kofinanzierungssatz pro Prioritätsachse\nMedefinancieringspercentage van de Unie (per prioritaire as)": 'union_cofinancing',
    "Operation post code\nCode postal de l\'opération\nPostleitzahl des Vorhabens\nPostcode van de concrete actie": 'raw_postcode',
    'Country\nPays\nLand\nLand': 'country',
    "Name of category of intervention for the operation in accordance with point (b) (vi) of the first subparagraph of Article 96(2)\nDénomination de la catégorie d'intervention dont relève l'opération conformément à l'article 96, paragraphe 2,\npremier alinéa, point b) vi)\nBezeichnung der Interventionskategorie für das Vorhaben gemäß Artikel 96 Absatz 2 Unterabsatz 1 Buchstabe b Ziffer vi;\nNaam van de categorie steunverlening voor de concrete actie, overeenkomstig artikel 96, lid 2, eerste alinea,\nonder b), vi)": 'category',
    'Date of last update of the list of operations\nDate de la dernière mise à jour de la liste des opérations\nDatum der letzten Aktualisierung der Liste der Vorhaben\nDatum van de laatste bijwerking van de lijst van concrete acties': 'last_update'
}, axis=1, inplace=True)


In [None]:
all_benefs.columns

In [None]:
all_benefs.country.sort_values().unique()

In [None]:
# Keep only the rows whose country is the United Kingdom. The result is
# copied so that later column assignments work on an independent frame.
benefs = all_benefs.loc[all_benefs['country'] == 'United Kingdom'].copy()

In [None]:
#benefs.head()

## Columns

In [None]:
benefs.count()

In [None]:
benefs.Id.unique()

#### Id column appears useless - not unique and doesn't list members of particular projects

In [None]:
#benefs.beneficiary.unique()

In [None]:
#benefs.category.unique()

In [None]:
benefs.category.unique().shape

In [None]:
#benefs.groupby('category').size().reset_index(name='size').sort_values('size', ascending=False).head(18)

# Categories need cleaning up - done in the cell below

In [None]:
#Do clean up here

## Import postcode data and check imported data

In [None]:
ukpostcodes = pd.read_csv('../postcodes/input/ukpostcodes.csv.gz')

In [None]:
ukpostcodes.shape

In [None]:
benefs.raw_postcode.isin(ukpostcodes.postcode).sum()

In [None]:
# Normalise the raw postcodes before matching them against ukpostcodes.
# Order matters: a leading 'UK' country prefix has to be removed *before*
# the separators are stripped, otherwise it ends up fused to the outward
# code and can no longer be recognised. No UK postcode area is 'UK', so
# dropping the prefix cannot damage a genuine postcode.
# regex=True is passed explicitly because newer pandas releases treat the
# pattern as a literal string by default.
benefs['postcode'] = (
    benefs.raw_postcode
    .str.upper()
    .str.strip()
    .str.replace(r'^UK[^A-Z0-9]*', '', regex=True)
    # Drop everything that is not a letter or digit (spaces, dashes, ...).
    .str.replace(r'[^A-Z0-9]', '', regex=True)
    # Re-insert the single space in front of the 3-character inward code.
    .str.replace(r'^(\S+)([0-9][A-Z]{2})$', r'\1 \2', regex=True)
)

In [None]:
benefs.postcode.isin(ukpostcodes.postcode).sum()

In [None]:
benefs.raw_postcode[~benefs.postcode.isin(ukpostcodes.postcode)].unique()

In [None]:
# Show the rows whose normalised postcode is present in the reference list.
# NOTE(review): the original cell was not valid Python; prefix the mask with
# `~` if the intent was to inspect the rows that failed to match instead.
benefs[benefs.postcode.isin(ukpostcodes.postcode)]

In [None]:
benefs.count()

### Typos, deprecated postcodes, one mistake, and one prefixed with UK - can't seem to get rid of it.

## Import exchange rate.

In [None]:
eur_gbp = pd.read_pickle('../exchange_rates/output/exchange_rates.pkl.gz')
eur_gbp.tail()

In [None]:
# Call count() so the cell reports non-null counts per column rather than
# echoing the bound method object.
eur_gbp.count()

In [None]:
eur_gbp.rate.isna().sum()

## Check that everything has start and end dates

In [None]:
benefs.start_date.isna().sum()

In [None]:
# Missing start dates get a fixed placeholder. The result is assigned back
# to the column: calling fillna(inplace=True) on `benefs.start_date` acts
# on an intermediate Series and is not guaranteed to update `benefs`
# (it does not under pandas copy-on-write).
benefs['start_date'] = benefs['start_date'].fillna('01.01.2015')

In [None]:
benefs.head()

In [None]:
benefs.end_date.isna().sum()

In [None]:
# Missing end dates get a fixed placeholder. The result is assigned back
# to the column: calling fillna(inplace=True) on `benefs.end_date` acts on
# an intermediate Series and is not guaranteed to update `benefs`
# (it does not under pandas copy-on-write).
benefs['end_date'] = benefs['end_date'].fillna('31.01.2015')

In [None]:
benefs.start_date.unique()

In [None]:
# Number of missing values per column. isna() yields a boolean frame with
# no gaps of its own, so count() would just return the row count for every
# column; summing the True flags gives the actual number of NaNs.
benefs.isna().sum()

In [None]:
benefs.head()

In [None]:
benefs.end_date.unique()

#### They don't! What to do? We're setting missing start dates to 01/01/2015 (under advice from John Lees-Miller) and missing end dates to 31/01/2015.

## Strip € symbol from funding?

In [None]:
benefs.funding.unique()

#### No need - funding is already imported as floats

In [None]:
eur_gbp.columns

In [None]:
eur_gbp.dtypes

In [None]:
benefs.dtypes

In [None]:
def find_average_eur_gbp_rate(row, rates=None):
    """Return the mean exchange rate over the months a project spans.

    Parameters
    ----------
    row : mapping or pandas.Series
        Must provide 'start_date' and 'end_date', parsed as ``dd.mm.YYYY``.
    rates : pandas.DataFrame, optional
        Table with a 'month_start' column (assumed to hold datetimes, since
        it is compared against parsed timestamps) and a 'rate' column.
        Defaults to the notebook-level ``eur_gbp`` frame.

    Returns
    -------
    float
        Mean of 'rate' for every month from the one containing the start
        date up to the end date (inclusive). NaN when no month falls in
        that range or when either date is missing.
    """
    if rates is None:
        rates = eur_gbp

    start = pd.to_datetime(row['start_date'], format='%d.%m.%Y')
    end = pd.to_datetime(row['end_date'], format='%d.%m.%Y')
    if pd.isna(start) or pd.isna(end):
        return float('nan')

    # Rates are keyed by the first day of each month, so snap the start date
    # back to the 1st. Without this, the month in which a project starts
    # mid-month would be left out, and a project lying entirely inside one
    # month would match nothing at all.
    first_month = start.replace(day=1)

    in_project = rates['month_start'].between(first_month, end)
    return rates.loc[in_project, 'rate'].mean()
    
# axis=1 passes each row to the function, so every beneficiary gets the
# rate averaged over its own start/end dates.
benefs['eur_gbp'] = benefs.apply(find_average_eur_gbp_rate, axis=1)

benefs.head()

## Drop uninteresting columns

In [None]:
# Leave out the columns that are not carried forward; `benefs` itself is
# not modified.
clean_benefs = benefs.drop(
    columns=['Id', 'country', 'project_summary', 'raw_postcode'],
)

In [None]:
clean_benefs.head()

In [None]:
#clean_benefs.Id.unique()

In [None]:
clean_benefs.beneficiary.unique()

In [None]:
clean_benefs.columns[0]

In [None]:
clean_benefs.shape

In [None]:
# Only rows whose postcode appears in the reference postcode list are kept
# for the output; copy so the new frame can be extended independently.
output_benefs = clean_benefs.loc[
    clean_benefs['postcode'].isin(ukpostcodes['postcode'])
].copy()

In [None]:
output_benefs.count()

In [None]:
# Identifier = fixed prefix + the row's index label. The index is never
# reset after the earlier row filters, so the ids need not be consecutive.
output_benefs['nwreg_benefs_id'] = 'nwreg_benefs_' + output_benefs.index.map(str)

In [None]:
output_benefs.nwreg_benefs_id.head()

In [None]:
output_benefs.to_pickle('output/interreg_beneficiaries.pkl.gz')