In [None]:
import json
import re
import urllib
import urllib.parse

import numpy as np
import pandas as pd

pd.set_option('display.max_columns', 50)
pd.set_option('display.max_colwidth', 100)

In [None]:
# Erasmus+ project overview workbooks: KA1 for each call year 2014-2018,
# followed by KA2, Sports and Jean Monnet.
erasmus_plus_files = [
    'input/ErasmusPlus_KA1_2014_LearningMobilityOfIndividuals_Projects_Overview_2018-09-18.xls',
    'input/ErasmusPlus_KA1_2015_LearningMobilityOfIndividuals_Projects_Overview_2018-09-18.xls',
    'input/ErasmusPlus_KA1_2016_LearningMobilityOfIndividuals_Projects_Overview_2018-09-18.xls',
    'input/ErasmusPlus_KA1_2017_LearningMobilityOfIndividuals_Projects_Overview_2018-09-15.xls',
    'input/ErasmusPlus_KA1_2018_LearningMobilityOfIndividuals_Projects_Overview_2018-09-18.xls',
    'input/ErasmusPlus_KA2_CooperationForInnovationAndTheExchangeOfGoodPractices_Projects_Overview_2018-09-21.xls',
    'input/ErasmusPlus_Sports_Projects_Overview_2018-09-10.xls',
    'input/ErasmusPlus_JeanMonnet_Projects_Overview_2018-09-10.xls',
]
# Read every workbook and stack the rows under a fresh 0..n-1 index.
erasmus_plus_mobility = pd.concat(
    [pd.read_excel(path) for path in erasmus_plus_files],
    ignore_index=True,
)
erasmus_plus_mobility.shape

In [None]:
erasmus_plus_mobility.head()

In [None]:
list(erasmus_plus_mobility)

In [None]:
# Map the verbose spreadsheet headers onto short snake_case names.
# 'Results Platform Project Card' used to appear twice in this mapping. Python
# keeps only the last value for a repeated dict key, so that column has always
# been renamed to 'results_platform_project_card'; the dead first entry
# (-> 'results_url') is removed here without changing the result.
# rename() already returns a new frame, so no extra .copy() is needed.
erasmus_plus_mobility = erasmus_plus_mobility.rename(columns={
    'Programme': 'funds',
    'Call year': 'call_year',
    'Project Identifier': 'project_identifier',
    'Project Title': 'project',
    'Project Summary': 'summary',
    'Project Status': 'project_status',
    "EU Grant award in euros (This amount represents the grant awarded after the selection stage and is indicative. Please note that any changes made during or after the project's lifetime will not be reflected here.)": 'max_contribution_eur',
    'Project Website': 'project_url',
    'Results Available': 'results_available',
    'Participating countries': 'participating_countries',
    'Coordinating organisation name': 'coord_name',
    'Coordinating organisation type': 'coord_org_type',
    "Coordinator's address": 'coord_address',
    "Coordinator's region": 'coord_region',
    "Coordinator's country": 'coord_country',
    "Coordinator's website": 'coord_website',
    'Key Action': 'key_action',
    'Action Type': 'action_type',
    'Is Good Practice': 'is_good_practice',
    'Is Success Story': 'is_success_story',
    'Results Platform Project Card': 'results_platform_project_card',
    'Topics': 'topics'
})
erasmus_plus_mobility.head()

Let's check whether these projects are already on the map. I looked at the first few, and they don't seem to be there.

In [None]:
erasmus_plus_mobility_check = erasmus_plus_mobility[erasmus_plus_mobility.coord_country == 'UK'].copy()
erasmus_plus_mobility_check = erasmus_plus_mobility_check[['coord_address', 'project', 'coord_country']].copy()
erasmus_plus_mobility_check.head()

### Unnamed Column

Apparently a placeholder for projects with more than 38 partners.

In [None]:
[erasmus_plus_mobility.shape, erasmus_plus_mobility['Unnamed: 250'].isna().sum()]

In [None]:
erasmus_plus_mobility['Unnamed: 250'][~erasmus_plus_mobility['Unnamed: 250'].isna()]

In [None]:
# Give the unnamed overflow column a descriptive name.
erasmus_plus_mobility = erasmus_plus_mobility.rename(
    columns={'Unnamed: 250': 'extra_partners'}
)

### Project Identifier

Fortunately, this looks to be an ID.

In [None]:
erasmus_plus_mobility.project_identifier.isna().sum()

In [None]:
(erasmus_plus_mobility.project_identifier.str.strip() != erasmus_plus_mobility.project_identifier).sum()

In [None]:
[
    erasmus_plus_mobility.shape,
    erasmus_plus_mobility.project_identifier.nunique(),
    erasmus_plus_mobility.project_identifier.str.upper().nunique()
]

## Extract Projects from Partners and Coordinators

In [None]:
# Project-level attributes only; coordinator and partner details are
# extracted into their own frames further down.
project_columns = [
    'project_identifier', 'funds', 'call_year', 'project', 'summary',
    'project_status', 'max_contribution_eur', 'project_url',
    'participating_countries', 'extra_partners',
]
projects = erasmus_plus_mobility.loc[:, project_columns].copy()
projects.shape

### Funds

In [None]:
projects.funds.isna().sum()

In [None]:
projects.funds.unique()

### Call Year

In [None]:
projects.call_year.isna().sum()

In [None]:
projects.call_year.unique()

In [None]:
projects.call_year = projects.call_year.astype('int32')

### Project

In [None]:
projects.project.isna().sum()

In [None]:
(projects.project != projects.project.str.strip()).sum()

In [None]:
projects.project = projects.project.str.strip()

### Summary

In [None]:
projects.summary.isna().sum()

In [None]:
projects[projects.summary.isna()]

In [None]:
# Strip surrounding whitespace from the summaries and report how many changed.
# The count is the cell's last expression so that it is actually displayed.
# Missing summaries are excluded from it: NaN never compares equal to NaN, so
# they would otherwise all be reported as changed.
summary_stripped = projects.summary.str.strip()
summary_changed = projects.summary.notna() & (summary_stripped != projects.summary)
projects.summary = summary_stripped
summary_changed.sum()  # lots

### Project Status

In [None]:
projects.project_status.isna().sum()

In [None]:
projects.project_status.unique()

### EU Investment


In [None]:
projects.max_contribution_eur.isna().sum()

In [None]:
# Normalise the amounts to trimmed text so malformed values can be spotted
# before casting. map(str) also stringifies missing values (a float NaN
# becomes 'nan'), so those are flagged by the check below as well.
projects.max_contribution_eur = projects.max_contribution_eur.map(str).str.strip()
# True where the text contains any character other than a digit or a dot.
max_contribution_eur_bad = projects.max_contribution_eur.str.match(re.compile(r'.*[^0-9.].*'))
projects.max_contribution_eur[max_contribution_eur_bad]

In [None]:
projects.max_contribution_eur = projects.max_contribution_eur.astype('float')

In [None]:
projects.max_contribution_eur.describe()

In [None]:
(projects.max_contribution_eur < 1000).value_counts()

In [None]:
# Keep only projects awarded at least EUR 1,000. Rows with a missing amount
# compare False here and are therefore dropped as well.
projects = projects[projects.max_contribution_eur >= 1000]
projects.shape

### Project URL

In [None]:
(~projects.project_url.isna()).sum()

In [None]:
projects.project_url[~projects.project_url.isna()].head()

In [None]:
def is_valid_url(url):
    """Return True if `url` parses as an absolute URL with a scheme and a host.

    The value is converted with str() first, so non-string cells (for example
    NaN) are accepted and simply come back as invalid.

    Strings that urlparse rejects outright, such as a host with an unbalanced
    '[' (reported as ValueError: Invalid IPv6 URL), are treated as invalid
    instead of aborting a whole Series.apply() run.
    """
    try:
        result = urllib.parse.urlparse(str(url))
    except ValueError:
        return False
    return bool(result.scheme and result.netloc)

(~projects.project_url.isna() & ~projects.project_url.apply(is_valid_url)).sum()

### Participating Countries

In [None]:
projects.participating_countries.isna().sum()

In [None]:
projects.participating_countries.head()

## Extract Coordinators

The coordinator is like a special partner, so make the names consistent, and we can treat partners and coordinators the same for cleaning purposes.


In [None]:
coordinators = erasmus_plus_mobility[[
    'project_identifier',
    'coord_name', 
    'coord_org_type',
    'coord_address',
    'coord_region',
    'coord_country', 
    'coord_website'
]].copy()
coordinators.shape

In [None]:
# Give the coordinator columns the same generic names the partner columns get,
# so both kinds of organisation can be cleaned with the same steps.
coordinator_column_names = {
    'coord_name': 'name',
    'coord_org_type': 'type',
    'coord_address': 'address',
    'coord_region': 'region',
    'coord_country': 'country',
    'coord_website': 'website',
}
coordinators = coordinators.rename(columns=coordinator_column_names)
# Mark every row in this frame as a coordinator.
coordinators = coordinators.assign(coordinator=True)
coordinators.head()

In [None]:
coordinators.count()

### Name

In [None]:
(coordinators.name.str.strip() != coordinators.name).sum()

In [None]:
coordinators.name = coordinators.name.str.strip()
coordinators.name.unique().shape

### Type

In [None]:
coordinators.type.isna().sum()

In [None]:
(coordinators.type[~coordinators.type.isna()] != coordinators.type[~coordinators.type.isna()].str.strip()).sum()

In [None]:
coordinators[~coordinators.type.isna()].type.sort_values().unique()[0:10]

### Country

In [None]:
coordinators.country.isna().sum()

In [None]:
# Row count, then: values with surrounding whitespace, values that are not
# upper case, and values that are exactly two capital letters.
[
    coordinators.shape[0],
    (coordinators.country != coordinators.country.str.strip()).sum(),
    (coordinators.country != coordinators.country.str.upper()).sum(),
    # str.match only anchors at the start of the value, so the pattern is
    # closed with \Z; without it any longer string that merely begins with
    # two capitals would be counted as a valid code.
    (coordinators.country.str.match(r'[A-Z]{2}\Z')).sum()
]

### Website

In [None]:
(~coordinators.website.isna() & ~coordinators.website.apply(is_valid_url)).sum()

In [None]:
[
    coordinators.website.str.startswith('http').sum(),
    (~coordinators.website.isna() & coordinators.website.apply(is_valid_url)).sum()
]

In [None]:
coordinators.head()

In [None]:
coordinators.website[~coordinators.website.isna() & ~coordinators.website.apply(is_valid_url)].head()

In [None]:
# Websites that are present but do not yet parse as an absolute URL get an
# 'http://' prefix; the assignment aligns on the index, so only the selected
# rows receive the prefixed value.
website_needs_scheme = coordinators.website.notna() & ~coordinators.website.apply(is_valid_url)
coordinators.loc[website_needs_scheme, 'website'] = 'http://' + coordinators.website
# How many websites are still invalid after the prefix was added.
(coordinators.website.notna() & ~coordinators.website.apply(is_valid_url)).sum()

In [None]:
coordinators.website[~coordinators.website.isna() & ~coordinators.website.apply(is_valid_url)]

In [None]:
# Collapse a stray third slash after the scheme ('http:///host' -> 'http://host').
# regex=True is passed explicitly: pandas 2.0 changed the default of
# Series.str.replace to literal matching, under which this pattern would never
# match and the cell would silently do nothing.
coordinators.website = coordinators.website.str.replace(
    r'^(https?://)/', r'\1', regex=True)
(~coordinators.website.isna() & ~coordinators.website.apply(is_valid_url)).sum()

In [None]:
coordinators.website.head()

### Postcodes for UK Coordinators

Some addresses confuse the letter 'O' and the digit '0' in the postcode; this could be cleaned up later.

In [None]:
coordinators_uk = coordinators[coordinators.country == 'UK'].copy()
[coordinators_uk.shape[0], coordinators.shape[0]]

In [None]:
ukpostcodes = pd.read_csv('../postcodes/input/ukpostcodes.csv.gz')
ukpostcodes.shape

In [None]:
# Pattern for a UK postcode in either letter case, with or without the single
# space between the two halves, plus the special code GIR 0AA. The whole
# pattern is one capture group so that str.extract() can pull a postcode out
# of a longer address string.
VALID_POSTCODE_RE = re.compile(
    r'([A-Za-z][A-Ha-hJ-Yj-y]?[0-9][A-Za-z0-9]? ?[0-9][A-Za-z]{2}|[Gg][Ii][Rr] ?0[Aa]{2})'
)
# Sanity check: every reference postcode must match the pattern. Note that
# Series.str.match only anchors at the start of each value.
assert ukpostcodes.postcode.str.match(VALID_POSTCODE_RE).sum() == ukpostcodes.shape[0]

In [None]:
coordinators_uk['raw_postcode'] = \
    coordinators_uk.address.str.extract(VALID_POSTCODE_RE)[0]
coordinators_uk.raw_postcode.head()

In [None]:
coordinators_uk[coordinators_uk.raw_postcode.isna()]

In [None]:
[
    (~coordinators_uk.raw_postcode.isna()).sum(),
    coordinators_uk.raw_postcode.isin(ukpostcodes.postcode).sum(),
]

In [None]:
def find_postcode_from_raw_postcode(raw_postcode):
    """Normalise a Series of raw postcodes to upper case with one inner space.

    Each value is upper-cased and trimmed, every character that is not A-Z or
    0-9 is removed, and a single space is then inserted in front of the final
    digit-letter-letter group (e.g. 'sw1a1aa' -> 'SW1A 1AA'). Missing values
    stay missing.

    Both replacements are regular expressions, so regex=True is passed
    explicitly: pandas 2.0 changed the default of Series.str.replace to
    literal matching, which would silently skip both steps.

    Parameters
    ----------
    raw_postcode : pandas.Series of str

    Returns
    -------
    pandas.Series of str, indexed like the input.
    """
    compact = raw_postcode.\
        str.upper().\
        str.strip().\
        str.replace(r'[^A-Z0-9]', '', regex=True)
    return compact.str.replace(r'^(\S+)([0-9][A-Z]{2})$', r'\1 \2', regex=True)

coordinators_uk['postcode'] = find_postcode_from_raw_postcode(coordinators_uk.raw_postcode)
coordinators_uk.postcode.isin(ukpostcodes.postcode).sum()

In [None]:
coordinators_uk.postcode[~coordinators_uk.postcode.isin(ukpostcodes.postcode)].unique()

In [None]:
coordinators_uk[~coordinators_uk.postcode.isin(ukpostcodes.postcode)]

In [None]:
# Keep only the UK coordinators whose normalised postcode exists in the
# reference list; the raw extracted postcode is no longer needed afterwards.
has_known_postcode = coordinators_uk.postcode.isin(ukpostcodes.postcode)
clean_coordinators_uk = coordinators_uk[has_known_postcode].drop(columns='raw_postcode')
clean_coordinators_uk.shape

## Extract Partners

In [None]:
# Move the partner number to the end of each partner column name
# ('Partner 3 name' -> 'Partner_name_3'); all other columns are left alone.
partner_column_re = re.compile(r'^Partner (\d+) (.+)$')
erasmus_plus_mobility.columns = [
    partner_column_re.sub(r'Partner_\2_\1', name)
    for name in erasmus_plus_mobility.columns
]
erasmus_plus_mobility.head()

In [None]:
partner_columns = [
    column for column in erasmus_plus_mobility.columns
    if column.startswith('Partner_')
]
partners_wide = erasmus_plus_mobility[['project_identifier'] + partner_columns]
partners_wide.head()

In [None]:
# Reshape from one column per (attribute, partner number) to one row per
# (project, partner): the suffix after the last '_' of each listed stub
# becomes the 'partner_number' index level next to 'project_identifier'.
partners = pd.wide_to_long(
    partners_wide,
    ['Partner_name','Partner_organisation type', 'Partner_address', 'Partner_country', 'Partner_region', 'Partner_website'],
    'project_identifier', 'partner_number',
    sep='_'
)
partners.head()

In [None]:
partners = partners.rename(columns={
    'Partner_name': 'name',
    'Partner_organisation type': 'type', 
    'Partner_address': 'address', 
    'Partner_country': 'country',
    'Partner_region': 'region', 
    'Partner_website': 'website'
    }).copy()
partners['coordinator'] = False
partners.head()

In [None]:
partners.count()

In [None]:
partners = partners[~partners.name.isna()].copy()

In [None]:
partners.count()

### Name

In [None]:
(partners.name.str.strip() != partners.name).sum()

In [None]:
partners.name = partners.name.str.strip()
partners.name.unique().shape

### Type

In [None]:
partners.type.isna().sum()

In [None]:
(partners.type[~partners.type.isna()] != partners.type[~partners.type.isna()].str.strip()).sum()

In [None]:
partners[~partners.type.isna()].type.sort_values().unique()[0:10]

### Country

In [None]:
partners.country.isna().sum()

In [None]:
# Row count, then: values with surrounding whitespace, values that are not
# upper case, and values that are exactly two capital letters.
[
    partners.shape[0],
    (partners.country != partners.country.str.strip()).sum(),
    (partners.country != partners.country.str.upper()).sum(),
    # str.match only anchors at the start of the value, so the pattern is
    # closed with \Z; without it any longer string that merely begins with
    # two capitals would be counted as a valid code.
    (partners.country.str.match(r'[A-Z]{2}\Z')).sum()
]

### Website

In [None]:
(~partners.website.isna() & ~partners.website.apply(is_valid_url)).sum()

In [None]:
[
    partners.website.str.startswith('http').sum(),
    (~partners.website.isna() & partners.website.apply(is_valid_url)).sum()
]

In [None]:
partners_copy = partners.copy()

In [None]:
partners = partners_copy.copy()

In [None]:
partners.website[
    partners.website.str.startswith('http') &
    ~partners.website.apply(is_valid_url)]

In [None]:
# Repair common typos in the 'http://' prefix. The rules run in order, each
# one on the output of the previous one.
# regex=True is passed explicitly: pandas 2.0 changed the default of
# Series.str.replace to literal matching, which would silently disable every
# rule below.
# NOTE(review): the '[^/]' classes in the 4th and 5th rules are part of the
# match, so the characters they match are replaced too
# (e.g. 'http:/www.x.org' -> 'http://ww.x.org'). Confirm against the data
# whether those characters should be preserved instead.
http_prefix_fixes = [
    (r'^http:\\', 'http://'),
    (r'^http:://', 'http://'),
    (r'^http: //', 'http://'),
    (r'^http:/[^/]', 'http://'),
    (r'^http:[^/][^/]', 'http://'),
    (r'^http//:', 'http://'),
    (r'^http//', 'http://'),
    (r'^http/', 'http://'),
    (r'^http.www', 'http://www'),
]
for prefix_pattern, prefix_fix in http_prefix_fixes:
    partners.website = partners.website.str.replace(
        prefix_pattern, prefix_fix, regex=True)

In [None]:
# Websites that are present but still do not parse as an absolute URL get an
# 'http://' prefix; the assignment aligns on the index, so only the selected
# rows receive the prefixed value.
website_needs_scheme = partners.website.notna() & ~partners.website.apply(is_valid_url)
partners.loc[website_needs_scheme, 'website'] = 'http://' + partners.website
# How many websites are still invalid after the prefix was added.
(partners.website.notna() & ~partners.website.apply(is_valid_url)).sum()

In [None]:
partners.website.head()

### Separating out UK partners

In [None]:
partners_uk = partners[partners.country == 'UK'].copy()
[partners_uk.shape, partners.shape]

In [None]:
partners_uk['raw_postcode'] = \
    partners_uk.address.str.extract(VALID_POSTCODE_RE)[0]
partners_uk.raw_postcode.head()

In [None]:
partners_uk[partners_uk.raw_postcode.isna()]

Quite a few UK partners have no recognisable postcode in their address.

In [None]:
partners_uk.raw_postcode.isin(ukpostcodes.postcode).sum()

In [None]:
partners_uk['postcode'] = find_postcode_from_raw_postcode(partners_uk.raw_postcode)
partners_uk.postcode.isin(ukpostcodes.postcode).sum()

In [None]:
partners_uk.postcode[~partners_uk.postcode.isin(ukpostcodes.postcode)].unique()

In [None]:
partners_uk[~partners_uk.postcode.isna() & ~partners_uk.postcode.isin(ukpostcodes.postcode)]

In [None]:
# Keep only the UK partners whose normalised postcode exists in the reference
# list, drop the raw extracted postcode, and turn the index levels back into
# ordinary columns.
clean_partners_uk = (
    partners_uk[partners_uk.postcode.isin(ukpostcodes.postcode)]
    .drop(columns='raw_postcode')
    .reset_index()
)

In [None]:
clean_partners_uk.shape

## Count Organisations and Countries
It is useful to know the total number of organisations and the number of countries involved, to deal with cases where the contribution of each organisation is unknown.

In [None]:
organisations = pd.concat([
    partners.reset_index()[['project_identifier', 'country']],
    coordinators.reset_index()[['project_identifier', 'country']]
])
organisations.shape 

In [None]:
# Number of organisations (partners plus coordinator) per project.
# count() skips rows whose country is missing, so an organisation without a
# country is not included in the total.
project_num_organisations = organisations.groupby('project_identifier').\
    country.count().reset_index().rename(columns={'country': 'num_organisations'})
[projects.shape[0], project_num_organisations.shape]

Cross-check with partner numbers:

In [None]:
project_num_organisations_check = \
    (partners.reset_index().groupby('project_identifier').partner_number.max() + 1).\
    reset_index().rename(columns={'partner_number': 'num_organisations'})
[projects.shape[0], project_num_organisations_check.shape]

In [None]:
def compare_project_num_organisations():
    """Count the projects whose two organisation totals disagree.

    Left-joins `project_num_organisations` with
    `project_num_organisations_check` on 'project_identifier'. A project that
    is missing from the check frame is given a total of 1 (presumably the
    coordinator alone) before the two totals are compared.
    """
    merged = project_num_organisations.merge(
        project_num_organisations_check, on='project_identifier', how='left')
    check_total = merged.num_organisations_y.fillna(1)
    return (merged.num_organisations_x != check_total).sum()
compare_project_num_organisations()

In [None]:
project_num_countries = organisations.groupby('project_identifier').\
    country.nunique().reset_index().rename(columns={'country': 'num_countries'})
[projects.shape[0], project_num_countries.shape]

In [None]:
project_num_organisations_and_countries = pd.merge(
    project_num_countries, project_num_organisations,
    on='project_identifier', validate='1:1'
)
project_num_organisations_and_countries.shape

In [None]:
project_num_organisations_and_countries.head()

In [None]:
projects = pd.merge(projects, project_num_organisations_and_countries,
                    on='project_identifier', validate='1:1')
projects.head()

## Save Data

### Organisations

In [None]:
organisations_uk = pd.concat([clean_coordinators_uk, clean_partners_uk], sort=True)
[
    organisations_uk.shape,
    clean_coordinators_uk.shape,
    clean_partners_uk.shape
]

In [None]:
# Prefix every organisation attribute with 'organisation_' (presumably to keep
# them distinct from the project columns they are later merged with).
organisation_attributes = [
    'name', 'type', 'address', 'country', 'region', 'website', 'coordinator',
]
organisations_uk = organisations_uk.rename(columns={
    attribute: 'organisation_' + attribute
    for attribute in organisation_attributes
})

In [None]:
organisations_uk

In [None]:
organisations_uk.project_identifier.unique().shape

In [None]:
organisations_uk.to_pickle('output/erasmus_mobility_organisations.pkl.gz')

### Projects in the UK

In [None]:
projects_uk_full = pd.merge(projects, organisations_uk, on='project_identifier', validate='1:m')
projects_uk_full.shape

In [None]:
projects_uk_full.head()

In [None]:
projects_uk = projects[projects.project_identifier.isin(organisations_uk.project_identifier)].copy()
projects_uk.shape

#### Convert to GBP

In [None]:
eur_gbp = pd.read_pickle('../exchange_rates/output/exchange_rates.pkl.gz')
eur_gbp.tail()

In [None]:
def find_average_eur_gbp_rate(row):
    """Return the day-weighted average exchange rate for the row's call year.

    Every day of the calendar year `row.call_year` gets equal weight. The
    weights are summed per month and joined to the module-level `eur_gbp`
    frame, which is expected to hold one row per month with a 'month_start'
    and a 'rate' column.

    The result is divided by the total weight of the months that actually
    found a rate. With full coverage that total is 1 and changes nothing; with
    partial coverage (for example a year still in progress when the rates were
    collected) the average is taken over the available months instead of being
    understated. NaN is returned when no month of the year has a rate.

    Parameters
    ----------
    row : pandas.Series (or any object) with an integer-like `call_year`.

    Returns
    -------
    float
    """
    call_year = str(int(row.call_year))
    year_start = call_year + '-01-01'
    year_end = call_year + '-12-31'
    # date_range includes both endpoints by default. The former closed='left'
    # dropped 31 December, and that argument was removed in pandas 2.0.
    days = pd.date_range(year_start, year_end)
    daily = pd.DataFrame({
        'month_start': days,
        'weight': 1.0 / days.shape[0]
    })
    # One row per month; the weight is that month's share of the year's days.
    monthly = daily.resample('MS', on='month_start').sum().reset_index()
    monthly = pd.merge(monthly, eur_gbp, on='month_start', validate='1:1')
    matched_weight = monthly.weight.sum()
    if matched_weight == 0:
        return float('nan')
    return (monthly.weight * monthly.rate).sum() / matched_weight

projects_uk['eur_gbp'] = projects_uk.apply(
    find_average_eur_gbp_rate, axis=1, result_type='reduce')

In [None]:
projects_uk.head()

In [None]:
projects_uk.to_pickle('output/erasmus_mobility_projects.pkl.gz')