In [1]:
import numpy as np
import pandas as pd
import re
from datetime import date, timedelta

In [2]:
# Load the raw PKP index export. Despite the .csv extension the file is read
# as tab-delimited.
raw_records_path = 'input_files/pkp_index_record_details20180228.csv'
df = pd.read_csv(raw_records_path, sep='\t')

In [3]:
# Give the three columns working names, then discard every row that has a
# missing value in any of them (the original row labels are kept as-is).
column_names = ['date', 'source', 'identifiers']
df.columns = column_names
df = df.dropna(axis=0, how='any')

In [4]:
def _year_or_nan(parsed):
    """Return ``parsed.year`` when ``parsed`` is truthy, otherwise NaN."""
    if parsed:
        return parsed.year
    return np.nan


# Parse the raw date strings; errors='coerce' turns unparseable values into
# NaT instead of raising.
df['date'] = pd.to_datetime(df['date'], errors='coerce')

# NOTE(review): the truthiness check probably does not filter out NaT; NaN
# years for unparseable dates presumably come from NaT.year itself -- confirm.
df['year'] = df['date'].map(_year_or_nan)

In [5]:
def _split_identifiers(raw):
    """Split one '; '-separated identifier string into a list of identifiers."""
    return raw.split('; ')


# Each record carries all of its identifiers in a single '; '-joined string.
df['identifiers'] = df['identifiers'].map(_split_identifiers)

In [6]:
def _first_url(identifiers):
    """Return the first identifier starting with 'http', or NaN if there is none.

    Falling back to NaN (instead of indexing ``[0]`` on a possibly empty list)
    keeps a single record without a URL from aborting the whole cell with an
    IndexError, and mirrors how the DOI column treats records without a DOI.
    """
    return next(
        (ident for ident in identifiers if ident.startswith('http')),
        np.nan,
    )


# One URL per record: the first identifier that looks like a web address.
df['url'] = df['identifiers'].map(_first_url)

In [7]:
def _doi_candidates(identifiers):
    """Return every identifier that starts with the '10.' prefix, in order."""
    return [ident for ident in identifiers if ident.startswith('10.')]


# Collect all DOI-looking identifiers of each record (possibly an empty list).
df['dois'] = df['identifiers'].map(_doi_candidates)

In [8]:
def _first_or_nan(candidates):
    """Return the first element of the list, or NaN when the list is empty."""
    if candidates:
        return candidates[0]
    return np.nan


# Keep a single DOI per record; records without any DOI get NaN.
df['doi'] = df['dois'].map(_first_or_nan)

In [9]:
# Drop the helper columns that are no longer needed.
#
# 'source' is intentionally NOT dropped here: the year-extraction step further
# down reads row['source'], so removing it at this point makes the notebook
# fail with a KeyError when it is run top to bottom on a fresh kernel.
#
# errors='ignore' keeps the cell safe to re-run after the columns are already
# gone, without a bare `except: pass` that would also hide unrelated errors
# (and without skipping the remaining deletions when the first one fails).
df.drop(columns=['dois', 'identifiers'], inplace=True, errors='ignore')

In [10]:
# Keep only records that have a DOI, preserve the current row labels as a
# 'record_id' column, and make the DOI the index of the frame.
df = (
    df.dropna(subset=['doi'])
      .rename_axis('record_id')
      .reset_index()
      .set_index('doi')
)

In [11]:
# Captures a four-digit year (10xx-19xx or 20xx) that is not preceded by a
# digit or a hyphen and is immediately followed by ')' or ';' or the end of
# the string. Because of the leading greedy '.*', match() prefers the
# right-most qualifying year.
# Written as a raw string: the compiled pattern is unchanged, but '\d', '\-',
# '\)' and '\;' are no longer invalid *string* escapes (which newer Python
# versions warn about).
year_regex = re.compile(r'.*(?:[^\d\-]|^)((?:1\d|20)\d{2})(?:\)|\;|$).*')


def find_best_year(y, s):
    """Pick the most trustworthy publication year for a record.

    Parameters
    ----------
    y : any
        Candidate year taken from the record's date. It is converted with
        ``int()``; values that cannot be converted (None, NaN, text) or that
        fall outside ``1000 < y <= current year`` are treated as missing.
    s : any
        The record's source string. When it is a string containing a year that
        ``year_regex`` recognises (e.g. ``"... (2012)"`` or ``"...; 2012"``),
        and that year lies in ``1000 < year <= current year``, it takes
        precedence over ``y``. Non-string values are ignored.

    Returns
    -------
    int or None
        The year found in ``s`` if valid, otherwise the validated ``y``,
        otherwise ``None``.
    """
    current_year = date.today().year

    # Validate the date-derived year first; it is the fallback value.
    try:
        y = int(y)
    except (TypeError, ValueError, OverflowError):
        # None, NaN, infinities and non-numeric text all mean "no usable year".
        y = None
    else:
        if not (1000 < y <= current_year):
            y = None  # an implausible year in the record's date

    # A plausible year in the source string overrides the date-derived one.
    match = year_regex.match(s) if isinstance(s, str) else None
    if match is not None:
        year_in_source = int(match.group(1))
        if 1000 < year_in_source <= current_year:
            y = year_in_source

    return y

In [12]:
# Re-derive the year row by row, letting a year found in the source string
# override the one taken from the date. Requires the 'source' column to still
# be present on df.
df['year'] = df.apply(
    lambda record: find_best_year(record['year'], record['source']),
    axis=1,
)

In [15]:
# Write the cleaned records (indexed by DOI) to disk. The raw 'source' text is
# only an input to the year extraction, so it is dropped defensively before
# exporting; errors='ignore' makes this a no-op when the column is already gone.
df.drop(columns=['source'], errors='ignore').to_csv('input_files/PKP20180228_fixedates.csv')