In [None]:
import json
import math
import re

import pandas as pd
import numpy as np


### Importing the data

In [None]:
# Read the Creative Europe projects export (snapshot dated 2018-08-01 per the
# file name) and preview the first rows.
creativeeurope = pd.read_excel(
    io='input/CreativeEurope_Projects_Overview_2018-08-01.xls'
)
creativeeurope.head()

In [None]:
# Show every column label of the raw export as a plain Python list.
creativeeurope.columns.tolist()

In [None]:
# Give the project-level and coordinator columns short snake_case names.
# Partner columns are left untouched here; they are relabelled further down.
creativeeurope = creativeeurope.rename(columns={
    # --- project ---
    'Project Title':'project',
    'Project Summary': 'summary',
    'Project Status': 'project_status',
    'Project Website': 'project_url',
    'Results Platform Project Card': 'eu_url',
    'Results Available': 'results_available',
    'Is Success Story': 'is_success',
    'Start date': 'start_date',
    'End date': 'end_date',
    # --- programme / funding ---
    'Programme': 'funds',
    'Sub-programme': 'category',
    'Action': 'action',
    'Activity type': 'activity_type',
    'Call year': 'call_year',
    "EU Grant award in euros (This amount represents the grant awarded after the selection stage and is indicative. Please note that any changes made during or after the project's lifetime will not be reflected here.)": 'eu_investment',
    # --- coordinator ---
    "Coordinator's name": 'beneficiary_1',
    'Coordinator organisation type': 'coord_org_type',
    "Coordinator's address": 'coord_address',
    "Coordinator's region": 'coord_region',
    "Coordinator's country": 'coord_country',
    "Coordinator's website": 'coord_website',
}).copy()
creativeeurope.head()

### Beneficiary

In [None]:
# Select names that differ from their stripped form (not displayed, because it
# is not the cell's last expression), then strip them in place.
creativeeurope['beneficiary_1'][creativeeurope['beneficiary_1'] != creativeeurope['beneficiary_1'].str.strip()]  # lots
creativeeurope['beneficiary_1'] = creativeeurope['beneficiary_1'].str.strip()

### Summary

In [None]:
# Number of projects without a summary.
creativeeurope['summary'].isnull().sum()

In [None]:
# Select summaries that differ from their stripped form (not displayed), then
# strip them in place.
creativeeurope['summary'][creativeeurope['summary'] != creativeeurope['summary'].str.strip()]  # lots
creativeeurope['summary'] = creativeeurope['summary'].str.strip()

### Funds

In [None]:
# Number of rows with no programme recorded.
creativeeurope['funds'].isnull().sum()

In [None]:
# Distinct programme values present in the export.
creativeeurope['funds'].unique()

### EU Investment


In [None]:
# Number of projects with no grant amount.
creativeeurope['eu_investment'].isnull().sum()

In [None]:
# Convert the grant amount to stripped text and show every value containing a
# character other than a digit or a dot. The column is left as text afterwards.
creativeeurope['eu_investment'] = creativeeurope['eu_investment'].map(str).str.strip()
eu_investment_bad = creativeeurope['eu_investment'].str.match(re.compile(r'.*[^0-9.].*'))
creativeeurope.loc[eu_investment_bad, 'eu_investment']

### Start and End Dates


In [None]:
# Missing-value count and dtype of the start date column.
[creativeeurope['start_date'].isnull().sum(), creativeeurope['start_date'].dtype]

In [None]:
# Missing-value count and dtype of the end date column.
# (Previously this counted nulls in start_date while reporting end_date's
# dtype, so missing end dates were never checked.)
[creativeeurope.end_date.isna().sum(), creativeeurope.end_date.dtype]

In [None]:
# Projects whose start date is not strictly before their end date.
creativeeurope.loc[creativeeurope['start_date'] >= creativeeurope['end_date']]

In [None]:
# Summary statistics for the start dates.
creativeeurope['start_date'].describe()

In [None]:
# Summary statistics for the end dates.
creativeeurope['end_date'].describe()

### Category


In [None]:
# Number of projects with no sub-programme (category).
creativeeurope['category'].isnull().sum()

### Making each partner a different row

In [None]:
# Turn "Partner <n> <field>" labels into "Partner_<field>_<n>" so the partner
# number becomes a trailing suffix; all other labels pass through unchanged.
creativeeurope.columns = creativeeurope.columns.map(
    lambda label: re.sub(r'^Partner (\d+) (.+)$', r'Partner_\2_\1', label)
)
creativeeurope.head()

In [None]:
# Compare the frame shape with the number of distinct project numbers.
[creativeeurope.shape, creativeeurope['Project Number'].nunique(dropna=True)]

In [None]:
# Reshape from one row per project to one row per (project, partner slot).
# The numeric suffix of each Partner_<field>_<n> column becomes the
# 'partner_number' index level alongside 'Project Number'.
creativeeurope_long = pd.wide_to_long(
    creativeeurope,
    stubnames=[
        'Partner_name',
        'Partner_organisation type',
        'Partner_address',
        'Partner_country',
        'Partner_region',
        'Partner_website',
    ],
    i='Project Number',
    j='partner_number',
    sep='_',
)
creativeeurope_long

In [None]:
# Inspect the column labels of the long-format frame.
creativeeurope_long.columns

In [None]:
# Switch the reshaped partner columns to the same snake_case naming used for
# the coordinator columns.
creativeeurope_long = creativeeurope_long.rename(columns={
    'Partner_name': 'partner_name',
    'Partner_organisation type': 'partner_org_type',
    'Partner_website': 'partner_website',
    'Partner_address': 'partner_address',
    'Partner_region': 'partner_region',
    'Partner_country': 'partner_country',
}).copy()
creativeeurope_long.head()

### Separating out UK partners

In [None]:

# Keep only the partner rows whose country code is exactly 'UK'.
creativeeurope_long_uk = creativeeurope_long.loc[
    creativeeurope_long['partner_country'].eq('UK')
].copy()
creativeeurope_long_uk.shape

In [None]:
# Select partner names that differ from their stripped form (not displayed),
# then strip them in place.
creativeeurope_long_uk['partner_name'][creativeeurope_long_uk['partner_name'] != creativeeurope_long_uk['partner_name'].str.strip()]  # lots
creativeeurope_long_uk['partner_name'] = creativeeurope_long_uk['partner_name'].str.strip()

In [None]:
# Pull a postcode-shaped token out of the free-text address: 1-4 word
# characters, optional whitespace, then a digit and two word characters.
# The leading greedy '.*' makes the last such token in the address win.
creativeeurope_long_uk['raw_postcode'] = (
    creativeeurope_long_uk['partner_address']
    .str.extract(r'.*\b(\w{1,4}\s*\d\w\w)\b.*')[0]
    .str.strip()
    .str.replace(' ', '')
)

creativeeurope_long_uk.head()

In [None]:
# Number of UK partner rows where no postcode-shaped token was found.
creativeeurope_long_uk['raw_postcode'].isnull().sum()

In [None]:
# Load the reference postcode table used to validate extracted postcodes.
ukpostcodes = pd.read_csv(filepath_or_buffer='../postcodes/input/ukpostcodes.csv')
ukpostcodes.shape

In [None]:
# How many raw (space-free) postcodes already match the reference table.
creativeeurope_long_uk['raw_postcode'].isin(ukpostcodes['postcode']).sum()

In [None]:
# Normalise the extracted postcode: upper-case, drop anything that is not
# A-Z/0-9, then insert a single space before the final digit + two letters.
# regex=True is passed explicitly because pandas >= 2.0 treats the pattern as
# a literal string by default, which would silently skip both substitutions.
creativeeurope_long_uk['postcode'] = (
    creativeeurope_long_uk.raw_postcode
    .str.upper()
    .str.strip()
    .str.replace(r'[^A-Z0-9]', '', regex=True)
    .str.replace(r'^(\S+)([0-9][A-Z]{2})$', r'\1 \2', regex=True)
)

In [None]:
# How many normalised postcodes match the reference table.
creativeeurope_long_uk['postcode'].isin(ukpostcodes['postcode']).sum()

In [None]:
# Distinct normalised postcodes that are absent from the reference table.
creativeeurope_long_uk.loc[
    ~creativeeurope_long_uk['postcode'].isin(ukpostcodes['postcode']),
    'postcode'
].unique()

In [None]:
# Full rows for the UK partners whose postcode could not be matched.
creativeeurope_long_uk.loc[~creativeeurope_long_uk['postcode'].isin(ukpostcodes['postcode'])]

In [None]:
# Discard UK partner rows whose postcode is not in the reference table.
creativeeurope_long_uk = creativeeurope_long_uk.loc[
    creativeeurope_long_uk['postcode'].isin(ukpostcodes['postcode'])
].copy()

In [None]:
# Column labels of the UK partners frame as a plain list.
creativeeurope_long_uk.columns.tolist()

### Coordinators

In [None]:
# Shape of the full wide projects table, for comparison with the UK subset.
creativeeurope.shape

In [None]:
# Projects whose coordinator's country code is exactly 'UK'.
creativeeurope_ukcoords = creativeeurope.loc[
    creativeeurope['coord_country'].eq('UK')
].copy()
creativeeurope_ukcoords.shape

In [None]:
# Pull a postcode-shaped token out of the coordinator's free-text address:
# 1-4 word characters, optional whitespace, then a digit and two word
# characters. The leading greedy '.*' makes the last such token win.
creativeeurope_ukcoords['raw_postcode'] = (
    creativeeurope_ukcoords['coord_address']
    .str.extract(r'.*\b(\w{1,4}\s*\d\w\w)\b.*')[0]
    .str.strip()
    .str.replace(' ', '')
)


In [None]:
# Number of UK coordinators where no postcode-shaped token was found.
creativeeurope_ukcoords['raw_postcode'].isnull().sum()

In [None]:
# How many raw (space-free) coordinator postcodes match the reference table.
creativeeurope_ukcoords['raw_postcode'].isin(ukpostcodes['postcode']).sum()

In [None]:
# Normalise the extracted postcode: upper-case, drop anything that is not
# A-Z/0-9, then insert a single space before the final digit + two letters.
# regex=True is passed explicitly because pandas >= 2.0 treats the pattern as
# a literal string by default, which would silently skip both substitutions.
creativeeurope_ukcoords['postcode'] = (
    creativeeurope_ukcoords.raw_postcode
    .str.upper()
    .str.strip()
    .str.replace(r'[^A-Z0-9]', '', regex=True)
    .str.replace(r'^(\S+)([0-9][A-Z]{2})$', r'\1 \2', regex=True)
)

In [None]:
# How many normalised coordinator postcodes match the reference table.
creativeeurope_ukcoords['postcode'].isin(ukpostcodes['postcode']).sum()

In [None]:
# Distinct normalised coordinator postcodes absent from the reference table.
creativeeurope_ukcoords.loc[
    ~creativeeurope_ukcoords['postcode'].isin(ukpostcodes['postcode']),
    'postcode'
].unique()

In [None]:
# Full rows for the UK coordinators whose postcode could not be matched.
creativeeurope_ukcoords.loc[~creativeeurope_ukcoords['postcode'].isin(ukpostcodes['postcode'])]

In [None]:
# Shape of the UK coordinators frame. NOTE(review): unlike the partners frame,
# no cell shown here removes coordinators with unmatched postcodes; confirm
# that is intended.
creativeeurope_ukcoords.shape

## Save Data

### Coordinators

In [None]:
# Build the coordinators table by dropping the columns that are not saved.
# The per-partner columns (Partner_<field>_<n>) are selected by pattern rather
# than listed one by one. The previous hard-coded list covered slots 1-38 and
# would raise a KeyError with fewer slots, or leave stray columns with more.
coordinators = creativeeurope_ukcoords.drop(
    [
        # project-level fields not needed in the output
        'category',
        'action',
        'activity_type',
        'call_year',
        'Project Number',
        'project_status',
        'is_success',
        'results_available',
        'Participating countries',
        # coordinator details other than name and postcode
        'coord_org_type',
        'coord_address',
        'coord_region',
        'coord_country',
        'coord_website',
        # spreadsheet artefact and intermediate working column
        'Unnamed: 251',
        'raw_postcode',
    ] + [
        # every Partner_<field>_<n> column still present in the wide frame
        column for column in creativeeurope_ukcoords.columns
        if re.match(r'^Partner_.+_\d+$', column)
    ],
    axis=1
)
coordinators.head()

In [None]:
# Rename the coordinator name column for output. The previous mapping used the
# misspelled key 'benificiary_1', which matches no column, so the rename
# silently did nothing and the column stayed 'beneficiary_1'.
# NOTE(review): anything reading the saved pickle must now expect 'beneficiary'.
coordinators = coordinators.rename(columns={'beneficiary_1': 'beneficiary'}).copy()

In [None]:
 # Build a per-row identifier from the lower-cased programme name, a fixed
 # tag, and the row's index label.
 coordinators['my_eu_id'] = (
     coordinators['funds'].str.lower()
     + '_creative_coordinator_'
     + coordinators.index.map(str)
 )
 coordinators['my_eu_id'].head()

In [None]:
 # Persist the coordinators table; compression is inferred from the extension.
 coordinators.to_pickle(path='output/creative_europe_coordinators.pkl.gz')

### Partners

In [None]:
# Column labels of the UK partners frame, to choose which ones to drop.
creativeeurope_long_uk.columns.tolist()

In [None]:
# Build the partners table by dropping the columns that are not saved.
partners = creativeeurope_long_uk.drop(columns=[
    # project-level fields not needed in the output
    'category',
    'action',
    'activity_type',
    'call_year',
    'project_status',
    'project_url',
    'is_success',
    'results_available',
    'Participating countries',
    # coordinator details
    'coord_org_type',
    'coord_address',
    'coord_region',
    'coord_country',
    # partner details other than name and postcode
    'partner_org_type',
    'partner_address',
    'partner_region',
    'partner_country',
    'partner_website',
    # spreadsheet artefact and intermediate working column
    'Unnamed: 251',
    'raw_postcode',
]).copy()

In [None]:
# Rename the coordinator name column for output. The previous mapping used the
# misspelled key 'benificiary_1', which matches no column, so the rename
# silently did nothing and the column stayed 'beneficiary_1'.
# NOTE(review): in this table the column holds the coordinator's name (the
# partner is in 'partner_name'); anything reading the saved pickle must now
# expect 'beneficiary'.
partners = partners.rename(columns={'beneficiary_1': 'beneficiary'}).copy()

In [None]:
 # Build a per-row identifier from the lower-cased programme name, a fixed
 # tag, the lower-cased partner name, and the row's index label rendered as
 # text.
 partners['my_eu_id'] = (
     partners['funds'].str.lower()
     + '_creative_partner_'
     + partners['partner_name'].str.lower()
     + partners.index.map(str)
 )

 partners['my_eu_id'].head()

In [None]:
 # Persist the partners table; compression is inferred from the extension.
 partners.to_pickle(path='output/creative_europe_partners.pkl.gz')