In [None]:
import json
import re
import urllib
import urllib.parse

import pandas as pd
import numpy as np

pd.set_option('display.max_columns', 50)
pd.set_option('display.max_colwidth', 100)

## Importing the data

In [None]:
creativeeurope = pd.read_excel('input/CreativeEurope_Projects_Overview_2018-08-01.xls')
creativeeurope.shape

In [None]:
creativeeurope.head()

In [None]:
list(creativeeurope)

In [None]:
creativeeurope = creativeeurope.rename(columns={
    'Programme': 'funds', 
    'Sub-programme': 'category',
    'Action': 'action', 
    'Activity type': 'activity_type',
    'Call year': 'call_year', 
    'Start date': 'start_date',
    'End date': 'end_date', 
    'Project Number': 'project_number',
    'Project Title': 'project',
    'Project Summary': 'summary', 
    'Project Status': 'project_status',
    "EU Grant award in euros (This amount represents the grant awarded after the selection stage and is indicative. Please note that any changes made during or after the project's lifetime will not be reflected here.)": 'eu_investment',
    'Is Success Story': 'is_success', 
    'Project Website': 'project_url', 
    'Results Available': 'results_available', 
    'Results Platform Project Card': 'results_url', 
    'Participating countries': 'participating_countries',
    "Coordinator's name": 'coord_name', 
    'Coordinator organisation type': 'coord_org_type',
    "Coordinator's address": 'coord_address',
    "Coordinator's region": 'coord_region',
    "Coordinator's country": 'coord_country', 
    "Coordinator's website": 'coord_website'
    }).copy()
creativeeurope.head()

### Unnamed Column

Apparently a placeholder for projects with more than 36 partners.

In [None]:
[creativeeurope.shape, creativeeurope['Unnamed: 251'].isna().sum()]

In [None]:
creativeeurope['Unnamed: 251'][~creativeeurope['Unnamed: 251'].isna()]

In [None]:
creativeeurope.rename(columns={'Unnamed: 251': 'extra_partners'}, inplace=True)

### Project Number

Fortunately, this looks to be an ID.

In [None]:
creativeeurope.project_number.isna().sum()

In [None]:
(creativeeurope.project_number.str.strip() != creativeeurope.project_number).sum()

In [None]:
[
    creativeeurope.shape,
    creativeeurope.project_number.nunique(),
    creativeeurope.project_number.str.upper().nunique()
]

## Extract Projects from Partners and Coordinators

In [None]:
projects = creativeeurope[[
    'project_number', 'funds', 'category', 'action', 'activity_type',
    'call_year', 'start_date', 'end_date',
    'project', 'summary', 'project_status',
    'eu_investment', 'is_success', 'project_url',
    'results_available', 'results_url',
    'participating_countries', 'extra_partners'
]].copy()
projects.shape

### Funds

Always the same.

In [None]:
projects.funds.isna().sum()

In [None]:
projects.funds.unique()

### Category

In [None]:
projects.category.isna().sum()

In [None]:
projects.category.unique()

### Action

In [None]:
projects.action.isna().sum()

In [None]:
projects.action.unique()

### Activity Type

In [None]:
projects.activity_type.isna().sum()

In [None]:
projects.activity_type[~projects.activity_type.isna()].sort_values().unique()[0:10]

### Call Year

In [None]:
projects.call_year.isna().sum()

In [None]:
projects.call_year.unique()

In [None]:
projects.call_year = projects.call_year.astype('int32')

In [None]:
projects.call_year.describe()

### Start and End Dates

In [None]:
[projects.start_date.isna().sum(), projects.start_date.dtype]

In [None]:
# Missing-value count and dtype for the end date (the previous cell covers the start date).
[projects.end_date.isna().sum(), projects.end_date.dtype]

In [None]:
(projects.start_date >= projects.end_date).sum()

In [None]:
projects.start_date.describe()

In [None]:
projects.end_date.describe()

### Project

In [None]:
projects.project.isna().sum()

In [None]:
(projects.project != projects.project.str.strip()).sum()

In [None]:
projects.project = projects.project.str.strip()

### Summary

In [None]:
projects.summary.isna().sum()

In [None]:
# Count the summaries that carry leading/trailing whitespace (lots), then
# normalise them. The count is the last expression so the cell displays it.
summary_stripped = projects.summary.str.strip()
summary_needs_strip = (summary_stripped != projects.summary).sum()
projects.summary = summary_stripped
summary_needs_strip

### Project Status

In [None]:
projects.project_status.isna().sum()

In [None]:
projects.project_status.unique()

### EU Investment


In [None]:
projects.eu_investment.isna().sum()

In [None]:
# Stringify and trim every amount so that malformed entries can be inspected as text.
projects.eu_investment = projects.eu_investment.map(str).str.strip()
# Flag values containing any character other than a digit or a dot.
eu_investment_bad = projects.eu_investment.str.match(re.compile(r'.*[^0-9.].*'))
# Show the flagged values, if any.
projects.eu_investment[eu_investment_bad]

In [None]:
projects.eu_investment = projects.eu_investment.astype('float')

In [None]:
projects.eu_investment.describe()

### Is Success

In [None]:
projects.is_success.isna().sum()

In [None]:
projects.is_success.unique()

In [None]:
(projects.is_success == 'Yes').sum()

### Project URL

In [None]:
(~projects.project_url.isna()).sum()

In [None]:
projects.project_url[~projects.project_url.isna()].head()

In [None]:
def is_valid_url(url):
    """Return True when ``url`` parses with both a scheme and a network location.

    The value is converted with ``str`` first, so non-string cells (for example
    a missing value) are accepted and simply fail the check. Input that
    ``urlparse`` rejects outright is reported as invalid instead of raising.
    """
    try:
        result = urllib.parse.urlparse(str(url))
    except ValueError:
        # urlparse raises ValueError for malformed netlocs such as an unclosed
        # IPv6 bracket; one bad cell should not abort a whole Series.apply.
        return False
    return bool(result.scheme and result.netloc)

(~projects.project_url.isna() & ~projects.project_url.apply(is_valid_url)).sum()

### Results Available

In [None]:
projects.results_available.isna().sum()

In [None]:
projects.results_available.unique()

In [None]:
(projects.results_available == 'Yes').sum()

### Results URL

It looks like every project has a page. Some projects have extra results uploaded on that page.

In [None]:
projects.results_url.isna().sum()

In [None]:
projects.results_url[projects.results_available == 'Yes'].values[0:5]

In [None]:
(~projects.results_url.isna() & ~projects.results_url.apply(is_valid_url)).sum()

### Participating Countries

In [None]:
projects.participating_countries.isna().sum()

In [None]:
projects.participating_countries.head()

## Extract Coordinators

The coordinator is like a special partner, so make the names consistent, and we can treat partners and coordinators the same for cleaning purposes.

In [None]:
coordinators = creativeeurope[[
    'project_number',
    'coord_name', 
    'coord_org_type',
    'coord_address',
    'coord_region',
    'coord_country', 
    'coord_website'
]].copy()
coordinators.shape

In [None]:
coordinators.rename(columns={
    'coord_name': 'name',
    'coord_org_type': 'type',
    'coord_address': 'address',
    'coord_region': 'region',
    'coord_country': 'country',
    'coord_website': 'website',
}, inplace=True)
coordinators['coordinator'] = True
coordinators.head()

In [None]:
coordinators.count()

### Name

In [None]:
(coordinators.name.str.strip() != coordinators.name).sum()

In [None]:
coordinators.name = coordinators.name.str.strip()
coordinators.name.unique().shape

### Type

In [None]:
coordinators.type.isna().sum()

In [None]:
(coordinators.type[~coordinators.type.isna()] != coordinators.type[~coordinators.type.isna()].str.strip()).sum()

In [None]:
coordinators[~coordinators.type.isna()].type.sort_values().unique()[0:10]

### Website

In [None]:
(~coordinators.website.isna() & ~coordinators.website.apply(is_valid_url)).sum()

In [None]:
[
    coordinators.website.str.startswith('http').sum(),
    (~coordinators.website.isna() & coordinators.website.apply(is_valid_url)).sum()
]

In [None]:
# Prefix a scheme onto every non-missing website that does not yet parse as a
# URL, then count the entries that are still invalid afterwards.
has_website = ~coordinators.website.isna()
needs_scheme = has_website & ~coordinators.website.apply(is_valid_url)
coordinators.loc[needs_scheme, 'website'] = 'http://' + coordinators.website
(~coordinators.website.isna() & ~coordinators.website.apply(is_valid_url)).sum()

In [None]:
coordinators.website.head()

### Postcodes for UK Coordinators

The postcodes are embedded in the addresses. Use the regex from [Wikipedia](https://en.wikipedia.org/w/index.php?title=Postcodes_in_the_United_Kingdom&oldid=855238661). Note: the page was recently edited with a different regex, but the one from the linked revision seems to work OK.

In [None]:
coordinators_uk = coordinators[coordinators.country == 'UK'].copy()
[coordinators_uk.shape[0], coordinators.shape[0]]

In [None]:
ukpostcodes = pd.read_csv('../postcodes/input/ukpostcodes.csv')
ukpostcodes.shape

In [None]:
# Matches a UK postcode in either letter case, with an optional space before the
# final digit-letter-letter part, or the special "GIR 0AA" code. The single
# capture group wraps the whole postcode so str.extract returns it as column 0.
VALID_POSTCODE_RE = re.compile(
    r'([A-Za-z][A-Ha-hJ-Yj-y]?[0-9][A-Za-z0-9]? ?[0-9][A-Za-z]{2}|[Gg][Ii][Rr] ?0[Aa]{2})'
)
# Sanity check: every postcode in the reference table must match (str.match
# anchors at the start of each value).
assert ukpostcodes.postcode.str.match(VALID_POSTCODE_RE).sum() == ukpostcodes.shape[0]

In [None]:
coordinators_uk['raw_postcode'] = \
    coordinators_uk.address.str.extract(VALID_POSTCODE_RE)[0]
coordinators_uk.raw_postcode.head()

In [None]:
coordinators_uk[coordinators_uk.raw_postcode.isna()]

It appears to be missing for that one.

In [None]:
coordinators_uk.raw_postcode.isin(ukpostcodes.postcode).sum()

In [None]:
def find_postcode_from_raw_postcode(raw_postcode):
    """Normalise a Series of raw postcodes to upper-case 'OUTWARD INWARD' form.

    Each value is upper-cased and trimmed, every character that is not A-Z or
    0-9 is removed, and a single space is inserted before the trailing
    digit-letter-letter group. Missing values stay missing.

    ``regex=True`` is passed explicitly: from pandas 2.0 ``Series.str.replace``
    treats the pattern as a literal string by default, which would silently
    turn both substitutions into no-ops.
    """
    return (
        raw_postcode
        .str.upper()
        .str.strip()
        .str.replace(r'[^A-Z0-9]', '', regex=True)
        .str.replace(r'^(\S+)([0-9][A-Z]{2})$', r'\1 \2', regex=True)
    )

coordinators_uk['postcode'] = find_postcode_from_raw_postcode(coordinators_uk.raw_postcode)
coordinators_uk.postcode.isin(ukpostcodes.postcode).sum()

In [None]:
coordinators_uk.postcode[~coordinators_uk.postcode.isin(ukpostcodes.postcode)].unique()

In [None]:
coordinators_uk[~coordinators_uk.postcode.isin(ukpostcodes.postcode)]

In [None]:
# Keep only the UK coordinators whose normalised postcode exists in the
# reference table, and drop the intermediate raw_postcode column.
coordinator_postcode_known = coordinators_uk.postcode.isin(ukpostcodes.postcode)
clean_coordinators_uk = (
    coordinators_uk[coordinator_postcode_known]
    .drop(columns='raw_postcode')
)
clean_coordinators_uk.shape

## Extract Partners

In [None]:
# Move the partner number to the end of each partner column name, e.g.
# 'Partner 3 name' -> 'Partner_name_3'; other column names are left untouched.
partner_column_re = re.compile(r'^Partner (\d+) (.+)$')
creativeeurope.columns = [
    partner_column_re.sub(r'Partner_\2_\1', column_name)
    for column_name in creativeeurope.columns
]
creativeeurope.head()

In [None]:
partner_columns = [
    column for column in creativeeurope.columns
    if column.startswith('Partner_')
]
partners_wide = creativeeurope[['project_number'] + partner_columns]
partners_wide.head()

In [None]:
# Reshape from one row per project to one row per (project, partner slot).
# Each stub name below, followed by '_' and a number, is gathered into a single
# column; the number goes into the new 'partner_number' index level alongside
# 'project_number'.
partners = pd.wide_to_long(
    partners_wide,
    ['Partner_name','Partner_organisation type', 'Partner_address', 'Partner_country', 'Partner_region', 'Partner_website'],
    'project_number', 'partner_number',
    sep='_'
)
partners.head()

In [None]:
partners = partners.rename(columns={
    'Partner_name': 'name',
    'Partner_organisation type': 'type', 
    'Partner_address': 'address', 
    'Partner_country': 'country',
    'Partner_region': 'region', 
    'Partner_website': 'website'
    }).copy()
partners['coordinator'] = False
partners.head()

In [None]:
partners.count()

In [None]:
partners = partners[~partners.name.isna()].copy()

In [None]:
partners.count()

### Name

In [None]:
(partners.name.str.strip() != partners.name).sum()

In [None]:
partners.name = partners.name.str.strip()
partners.name.unique().shape

### Type

In [None]:
partners.type.isna().sum()

In [None]:
(partners.type[~partners.type.isna()] != partners.type[~partners.type.isna()].str.strip()).sum()

In [None]:
partners[~partners.type.isna()].type.sort_values().unique()[0:10]

### Website

In [None]:
(~partners.website.isna() & ~partners.website.apply(is_valid_url)).sum()

In [None]:
[
    partners.website.str.startswith('http').sum(),
    (~partners.website.isna() & partners.website.apply(is_valid_url)).sum()
]

In [None]:
partners.website[
    partners.website.str.startswith('http') &
    ~partners.website.apply(is_valid_url)]

In [None]:
# Repair the 'http//:' typo (colon after the slashes) so these URLs parse correctly.
partners.website = partners.website.str.replace(r'http//:', 'http://')

In [None]:
partners.loc[
    ~partners.website.isna() &
    ~partners.website.apply(is_valid_url), 'website'] = 'http://' + partners.website
(~partners.website.isna() & ~partners.website.apply(is_valid_url)).sum()

In [None]:
# Preview the repaired partner websites (this section cleans partners, not coordinators).
partners.website.head()

### Separating out UK partners

In [None]:
partners_uk = partners[partners.country == 'UK'].copy()
[partners_uk.shape, partners.shape]

In [None]:
partners_uk['raw_postcode'] = \
    partners_uk.address.str.extract(VALID_POSTCODE_RE)[0]
partners_uk.raw_postcode.head()

In [None]:
partners_uk[partners_uk.raw_postcode.isna()]

It looks like it should be 4AA.

In [None]:
partners_uk.raw_postcode.isin(ukpostcodes.postcode).sum()

In [None]:
partners_uk['postcode'] = find_postcode_from_raw_postcode(partners_uk.raw_postcode)
partners_uk.postcode.isin(ukpostcodes.postcode).sum()

In [None]:
partners_uk.postcode[~partners_uk.postcode.isin(ukpostcodes.postcode)].unique()

In [None]:
partners_uk[~partners_uk.postcode.isin(ukpostcodes.postcode)]

In [None]:
# Keep only the UK partners whose normalised postcode exists in the reference
# table, drop the intermediate raw_postcode column, and turn the index levels
# back into ordinary columns.
partner_postcode_known = partners_uk.postcode.isin(ukpostcodes.postcode)
clean_partners_uk = (
    partners_uk[partner_postcode_known]
    .drop(columns='raw_postcode')
    .reset_index()
)

In [None]:
clean_partners_uk.shape

## Save Data

### Organisations

In [None]:
# Stack the UK coordinators and UK partners into one table; sort=True sorts the
# combined set of columns.
# NOTE(review): both inputs keep their own row labels, so index values may
# repeat across the two halves — confirm nothing downstream relies on unique labels.
organisations_uk = pd.concat([clean_coordinators_uk, clean_partners_uk], sort=True)
# Compare the shapes: the combined row count should be the sum of the two inputs.
[
    organisations_uk.shape,
    clean_coordinators_uk.shape,
    clean_partners_uk.shape
]

In [None]:
organisations_uk.rename(columns={
    'name': 'organisation_name',
    'type': 'organisation_type',
    'address': 'organisation_address',
    'country': 'organisation_country',
    'region': 'organisation_region',
    'website': 'organisation_website',
    'coordinator': 'organisation_coordinator'
}, inplace=True)

In [None]:
organisations_uk

In [None]:
organisations_uk.project_number.unique().shape

In [None]:
organisations_uk.to_pickle('output/creative_europe_organisations.pkl.gz')

### Projects in the UK

In [None]:
# Attach project details to every UK organisation row. validate='1:m' makes the
# merge raise if project_number is not unique on the projects side.
projects_uk_full = pd.merge(projects, organisations_uk, on='project_number', validate='1:m')
projects_uk_full.shape

In [None]:
projects_uk_full.head()

In [None]:
projects_uk = projects[projects.project_number.isin(organisations_uk.project_number)].copy()
projects_uk.shape

In [None]:
projects_uk['my_eu_id'] = 'creative_' + projects_uk.project_number
projects_uk.my_eu_id.head()

In [None]:
projects_uk.to_pickle('output/creative_europe_projects.pkl.gz')