# CORDIS FP7

In [None]:
import json
import re

from titlecase import titlecase
import pandas as pd

# Raise the display limit so DataFrames with up to 50 columns are shown in full.
pd.set_option('display.max_columns', 50)

## Read in Data

In [None]:
# Load the FP7 projects spreadsheet; the trailing expression shows (rows, columns).
all_projects = pd.read_excel('input/fp7/cordis-fp7projects.xlsx')
all_projects.shape

In [None]:
# Load the FP7 organizations spreadsheet; the trailing expression shows (rows, columns).
all_organizations = pd.read_excel('input/fp7/cordis-fp7organizations.xlsx')
all_organizations.shape

In [None]:
# Load the FP7 briefs spreadsheet; the trailing expression shows (rows, columns).
all_briefs = pd.read_excel('input/fp7/cordis-fp7briefs.xlsx')
all_briefs.shape

## Restrict to UK

We are only interested in projects and organizations where the coordinator or at least one participant institution is in the UK.

In [None]:
# Keep only the organization rows whose country code is 'UK'.
uk_organizations = all_organizations.loc[all_organizations['country'] == 'UK']
uk_organizations.shape

In [None]:
# Preview the first rows of the UK-only organizations.
uk_organizations.head()

In [None]:
# Keep only projects whose id appears as a projectID among the UK organizations.
uk_projects = all_projects.loc[
    all_projects['id'].isin(uk_organizations['projectID'])
]
uk_projects.shape

In [None]:
# Preview the first rows of the UK-related projects.
uk_projects.head()

In [None]:
# Keep only briefs whose projectRcn matches the rcn of a UK-related project.
uk_briefs = all_briefs.loc[
    all_briefs['projectRcn'].isin(uk_projects['rcn'])
]
uk_briefs.shape

In [None]:
# Preview the first rows of the UK-related briefs.
uk_briefs.head()

## Examples

### Coordinator outside UK

The UK has two participant institutions. It appears that `projects.ecMaxContribution` is the sum of all `organizations.ecContribution`s for all coordinator and participant institutions.

In [None]:
# Show the project record with rcn 101244.
uk_projects[uk_projects.rcn == 101244]

In [None]:
# Show the UK organization rows attached to project 101244.
uk_organizations[uk_organizations.projectRcn == 101244]

In [None]:
# Largest single ecContribution among all (UK and non-UK) organizations on project 101244.
all_organizations[all_organizations.projectRcn == 101244].ecContribution.max()

In [None]:
# Total ecContribution over all (UK and non-UK) organizations on project 101244.
all_organizations[all_organizations.projectRcn == 101244].ecContribution.sum()

In [None]:
# Show any briefs recorded against project 101244.
all_briefs[all_briefs.projectRcn == 101244]

### Coordinator in UK

This one is also interesting in that it seems to have a lot of duplicate records that don't have titles, for some reason. We will need to filter those out.

In [None]:
# Show the project record(s) with rcn 99464.
uk_projects[uk_projects.rcn == 99464]

In [None]:
# Show the UK organization rows attached to project 99464.
uk_organizations[uk_organizations.projectRcn == 99464]

In [None]:
# Sum the distinct ecContribution values among the UK rows of project 99464.
# unique() counts a value repeated across duplicate rows only once; note that
# it would equally collapse two organizations that happen to share the same amount.
uk_organizations[uk_organizations.projectRcn == 99464].ecContribution.unique().sum()

In [None]:
# Show any briefs recorded against project 99464.
all_briefs[all_briefs.projectRcn == 99464]

## Duplicate Projects

It looks like it's safe to just drop projects without titles; those seem to be the only duplicates.

In [None]:
# Compare the number of distinct rcn values and distinct id values with the
# frame's shape; fewer distinct keys than rows means duplicated projects.
[uk_projects.rcn.nunique(), uk_projects.id.nunique(), uk_projects.shape]

In [None]:
# List every row whose rcn occurs more than once (keep=False flags all copies).
uk_projects[uk_projects.duplicated('rcn', keep=False)]

In [None]:
# List the project rows that have no title.
uk_projects[pd.isnull(uk_projects.title)]

In [None]:
# Keep only the titled project rows, then discard the columns that are not
# carried forward.
# Could include coordinator and participants... would need some extra cleaning.
clean_projects = uk_projects.loc[uk_projects['title'].notna()].drop(columns=[
    'id', 'status', 'programme', 'topics', 'frameworkProgramme', 'call',
    'fundingScheme', 'coordinator', 'participants', 'subjects',
])
clean_projects.shape

In [None]:
# Summary statistics for the cleaned projects.
clean_projects.describe()

In [None]:
# Preview the first rows of the cleaned projects.
clean_projects.head()

## Clean Up Organizations

I notice several issues:

- Some are missing IDs (but do have postcodes)
- Some are missing postcodes
- Some postcodes are clearly typo'd (digit substitutions, etc.)
- Some postcodes have been terminated (searched for them with Google)

There are only 2993 unique organization IDs, so the repeated organization rows are probably the result of a join.

For now, drop all organizations that don't have both an ID and a valid postcode. (It does look possible to match names to find IDs, and many without postcodes still have addresses, which we could geocode.)

Would be interesting to try this: https://codereview.stackexchange.com/questions/117801/uk-postcode-validation-and-format-correction-tool

In [None]:
# Diagnostic counts, in order: frame shape, rows with an ID, rows without an
# ID, distinct IDs, rows without a postcode, distinct postcodes.
[
    uk_organizations.shape,
    uk_organizations['id'].notna().sum(),
    uk_organizations['id'].isna().sum(),
    uk_organizations['id'].dropna().nunique(),
    uk_organizations['postCode'].isna().sum(),
    uk_organizations['postCode'].dropna().nunique(),
]

In [None]:
# Keep only rows that have both an organization ID and a postcode, then store
# the ID as int64 and the postcode as str.
organizations = uk_organizations.dropna(subset=['id', 'postCode']).copy()
organizations['id'] = organizations['id'].astype('int64')
organizations['postCode'] = organizations['postCode'].astype('str')
[
    organizations.shape,
    organizations['id'].nunique(),
    organizations['postCode'].nunique(),
]

In [None]:
# Load the postcode reference table; the trailing expression shows (rows, columns).
ukpostcodes = pd.read_csv('../postcodes/input/ukpostcodes.csv')
ukpostcodes.shape

In [None]:
# Count organization rows whose raw postCode matches the reference table exactly.
organizations.postCode.isin(ukpostcodes.postcode).sum()

In [None]:
# Normalise postcodes: upper-case, trim, remove every character that is not
# A-Z or 0-9, then insert a single space before the trailing three characters
# (one digit followed by two letters).
# regex=True is passed explicitly because pandas >= 2.0 treats the pattern as
# a literal string by default, which would silently make both replacements
# no-ops.
organizations['cleanPostcode'] = (
    organizations.postCode
    .str.upper()
    .str.strip()
    .str.replace(r'[^A-Z0-9]', '', regex=True)
    .str.replace(r'^(\S+)([0-9][A-Z]{2})$', r'\1 \2', regex=True)
)

In [None]:
# Count organization rows whose cleaned postcode matches the reference table.
organizations.cleanPostcode.isin(ukpostcodes.postcode).sum()

In [None]:
# List the distinct cleaned postcodes that are still absent from the reference table.
organizations.cleanPostcode[~organizations.cleanPostcode.isin(ukpostcodes.postcode)].unique()

In [None]:
# Drop organization rows whose cleaned postcode is not in the reference table.
organizations = organizations.loc[
    organizations['cleanPostcode'].isin(ukpostcodes['postcode'])
]
organizations.shape

In [None]:
# Keep only projects that still have at least one remaining organization row.
clean_projects = clean_projects.loc[
    clean_projects['rcn'].isin(organizations['projectRcn'])
]
clean_projects.shape

## Clean Up Duplicate Organizations

I think there is also a join on the contacts, because we get multiple rows for some project-organization pairs. The main thing is that we want the `ecContribution` to be consistent. Otherwise, any row will do.

In [None]:
# Show every row belonging to a (project, organization) pair that occurs more
# than once.
(
    organizations
    .sort_values(['projectRcn', 'id'])
    .groupby(['projectRcn', 'id'])
    .filter(lambda group: len(group) > 1)
)

In [None]:
# Shape of the rows in (project, organization) pairs that carry more than one
# distinct ecContribution value; zero rows means the duplicates are consistent.
(
    organizations
    .groupby(['projectRcn', 'id'])
    .filter(lambda group: group['ecContribution'].nunique() > 1)
    .shape
)

In [None]:
# Collapse to one row per (project, organization) pair, discard the columns
# that are not carried forward, and rename the key columns.
# NOTE: GroupBy.first() takes the first non-null entry of each column within
# the group, so the resulting row may combine values from several source rows.
clean_organizations = (
    organizations
    .groupby(['projectRcn', 'id'])
    .first()
    .reset_index()
    .drop(columns=[
        'projectID', 'projectAcronym', 'shortName', 'activityType', 'endOfParticipation',
        'country', 'street', 'city', 'postCode',
        'contactType', 'contactTitle', 'contactFirstNames', 'contactLastNames',
        'contactFunction', 'contactTelephoneNumber', 'contactFaxNumber', 'contactEmail',
    ])
    .rename(columns={
        'id': 'organizationId',
        'cleanPostcode': 'postcode',
    })
)
# Apply title-casing to the organization names.
clean_organizations['name'] = clean_organizations['name'].apply(titlecase)
clean_organizations.shape

In [None]:
# Preview the first rows of the cleaned organizations.
clean_organizations.head()

## Briefs

Might as well merge these into the projects where we have them. We have a few duplicates to take care of.

In [None]:
# Keep briefs that belong to a cleaned project and have at least one of
# title, teaser or article filled in.
clean_briefs = uk_briefs.loc[
    uk_briefs['projectRcn'].isin(clean_projects['rcn'])
    & uk_briefs[['title', 'teaser', 'article']].notna().any(axis=1)
].copy()
clean_briefs.shape

In [None]:
# List every brief whose projectRcn occurs more than once (keep=False flags all copies).
clean_briefs[clean_briefs.projectRcn.duplicated(keep=False)]

In [None]:
# Order briefs by lastUpdateDate and keep, for each project, the last one in
# that order.
clean_briefs = (
    clean_briefs
    .sort_values('lastUpdateDate')
    .drop_duplicates(subset='projectRcn', keep='last')
)
clean_briefs.shape

In [None]:
# Discard the brief columns that are not carried forward, and rename the
# remaining key/title columns so they line up with the projects table.
clean_briefs = clean_briefs.drop(columns=[
    'rcn', 'language', 'lastUpdateDate', 'country', 'projectAcronym',
    'programme', 'topics', 'relatedReportRcn',
]).rename(columns={
    'projectRcn': 'rcn',
    'title': 'briefTitle',
})
clean_briefs.head()

In [None]:
# Left-join the briefs onto the projects by rcn; validate='1:1' makes pandas
# raise if rcn is duplicated on either side.
clean_projects_with_briefs = clean_projects.merge(
    clean_briefs, how='left', on='rcn', validate='1:1',
)
clean_projects_with_briefs.head()

## Checks

In [None]:
# Spot-check: cleaned organization rows for project 101244.
clean_organizations[clean_organizations.projectRcn == 101244]

In [None]:
# Spot-check: cleaned project (with brief) row for project 101244.
clean_projects_with_briefs[clean_projects_with_briefs.rcn == 101244]

In [None]:
# Spot-check: cleaned organization rows for project 99464.
clean_organizations[clean_organizations.projectRcn == 99464]

In [None]:
# Spot-check: cleaned project (with brief) row for project 99464.
clean_projects_with_briefs[clean_projects_with_briefs.rcn == 99464]

In [None]:
# Join each project to its organizations (one project to many organizations),
# then discard the now-redundant right-hand key column.
project_organizations = clean_projects_with_briefs.merge(
    clean_organizations,
    left_on='rcn', right_on='projectRcn', validate='1:m',
).drop(columns=['projectRcn'])
project_organizations.shape

In [None]:
# Preview the first rows of the project/organization join.
project_organizations.head()

In [None]:
# Total the ecContribution of the remaining (UK) organizations per project.
# The aggregation is named by the string 'sum' rather than the builtin `sum`:
# passing the builtin is deprecated in recent pandas and emits a FutureWarning.
uk_contributions = (
    project_organizations
    .groupby('rcn')
    .aggregate({'ecContribution': 'sum'})
    .reset_index()
    .rename(columns={'ecContribution': 'ecUKContribution'})
)
uk_contributions.head()

In [None]:
# Attach the per-project UK contribution total to each project; validate='1:1'
# makes pandas raise if rcn is duplicated on either side.
project_uk_contributions = clean_projects_with_briefs.merge(
    uk_contributions, on='rcn', validate='1:1',
)
project_uk_contributions.head()

In [None]:
# Sanity check: shape of the projects whose UK contribution total exceeds
# ecMaxContribution by more than 0.1 (the 0.1 slack presumably absorbs
# rounding differences — TODO confirm).
project_uk_contributions[project_uk_contributions.ecUKContribution > project_uk_contributions.ecMaxContribution + 0.1].shape

In [None]:
# Join projects (with UK totals) to their organizations, then join each
# organization's postcode to the postcode reference table.
project_organization_uk_contributions = (
    project_uk_contributions
    .merge(
        clean_organizations,
        left_on='rcn', right_on='projectRcn', validate='1:m',
    )
    .merge(ukpostcodes, on='postcode', validate='m:1')
)
project_organization_uk_contributions.shape

In [None]:
# Preview the first rows of the fully joined table.
project_organization_uk_contributions.head()

## Save Data

In [None]:
# Persist the cleaned projects (with briefs) as a pickle file.
clean_projects_with_briefs.to_pickle('output/fp7_projects.pkl.gz')

In [None]:
# Persist the cleaned organizations as a pickle file.
clean_organizations.to_pickle('output/fp7_organizations.pkl.gz')

In [None]:
def make_cordis_data_geo_json(data):
    """Build a GeoJSON FeatureCollection with one Point feature per row.

    Args:
        data: DataFrame providing ``longitude`` and ``latitude`` columns plus
            the property columns listed in ``property_names`` below.

    Returns:
        A dict ``{'type': 'FeatureCollection', 'features': [...]}``. Each
        feature carries only the properties whose value is present; missing
        values (NaN/None) are omitted.
    """
    property_names = [
        'acronym', 'title',
        'totalCost', 'ecContribution', 'ecUKContribution', 'ecMaxContribution',
        'name', 'organizationUrl', 'projectUrl', 'imageUri',
    ]

    def make_feature(row):
        # Use pd.notna rather than comparing the string form to 'nan': the
        # string comparison drops a legitimate 'nan' text value and lets None
        # through.
        properties = {
            name: row[name]
            for name in property_names
            if pd.notna(row[name])
        }
        return {
            'type': 'Feature',
            'geometry': {
                "type": "Point",
                "coordinates": [row['longitude'], row['latitude']]
            },
            'properties': properties
        }

    # Iterate the rows explicitly: DataFrame.apply(axis=1) on an empty frame
    # returns an empty DataFrame, and list() of that yields the column labels
    # rather than an empty list of features.
    features = [make_feature(row) for _, row in data.iterrows()]
    return {'type': 'FeatureCollection', 'features': features}
# Write the joined project/organization/postcode rows out as GeoJSON, with
# object keys sorted.
with open('output/cordis_data.geo.json', 'w') as file:
    geo_json = make_cordis_data_geo_json(project_organization_uk_contributions)
    json.dump(geo_json, file, sort_keys=True)