# Collect additional URLs

1. PMID/PMCID
    1. Collect via [ID-Converter-API](https://www.ncbi.nlm.nih.gov/pmc/tools/id-converter-api/)
    2. Collect via [Entrez](https://www.ncbi.nlm.nih.gov/books/NBK25501/)
2. Resolve DOIs

## 1.A Collect with the ID-Converter-API

We can use this API to directly find the corresponding PMID and PMCID for a DOI.

In [1]:
import datetime
import time
import sys
import re
import requests
import json
from dateutil.parser import parse
from random import shuffle

from pathlib import Path
import pandas as pd
import numpy as np
import lxml.etree as ET

from tqdm import tqdm, tqdm_notebook
tqdm_notebook().pandas()




In [19]:
# Folder that the resulting CSV of article URLs is written to.
data_folder = Path("output_files/test/")
# CSV export that is loaded as the raw article list.
input_file = Path("input_files/PKP_20171220.csv")

In [9]:
## Functions
def load_raw(input_file, valid_dois=True):
    """Read the raw CSV export and drop rows that are exact duplicates.

    Args:
        input_file: Location of the UTF-8 encoded CSV file, in any form
            accepted by ``pandas.read_csv``.
        valid_dois: Accepted by the signature but not used by this function.

    Returns:
        A DataFrame with the file's contents, minus fully duplicated rows.
    """
    frame = pd.read_csv(input_file, encoding='utf8', parse_dates=True)
    deduplicated = frame.drop_duplicates()
    return deduplicated

# Patterns adapted from
# https://www.crossref.org/blog/dois-and-matching-regular-expressions/
# The dot of the leading "10." prefix is escaped so that it only matches a
# literal "."; unescaped, strings such as "10/1000/x" or "1001000/x" would be
# accepted. Compiled once so repeated calls do not rebuild the list.
_DOI_PATTERNS = tuple(
    re.compile(pattern, re.IGNORECASE)
    for pattern in (
        r"^10\.\d{4,9}/[-._;()/:A-Z0-9]+$",
        r"^10\.1002/[^\s]+$",
        r"^10\.\d{4}/\d+-\d+X?(\d+)\d+<[\d\w]+:[\d\w]*>\d+.\d+.\w+;\d$",
        r"^10\.1021/\w\w\d+$",
        r"^10\.1207\/[\w\d]+\&\d+_\d+$",
    )
)


def validate_doi(doi):
    """Return True if *doi* is a string matching one of the known DOI patterns.

    Args:
        doi: Candidate value. Anything that is not a ``str`` (for example
            ``None`` or a float NaN from a missing CSV cell) is rejected.

    Returns:
        ``True`` when at least one pattern matches (case-insensitively),
        ``False`` otherwise.
    """
    if not isinstance(doi, str):
        return False

    return any(pattern.match(doi) for pattern in _DOI_PATTERNS)

def remove_invalid_dois(df):
    """Keep only the rows of *df* whose ``doi`` value passes ``validate_doi``.

    Args:
        df: DataFrame with a ``doi`` column.

    Returns:
        A DataFrame with the same columns as *df*, restricted to the rows
        with a valid DOI. An empty input yields an empty frame that still
        has all of its columns.
    """
    # A boolean ndarray is always interpreted as a row mask. A plain list is
    # not: for an empty frame the list is ``[]`` and ``df[[]]`` selects zero
    # *columns*, which would silently drop the ``doi`` column.
    is_valid = np.array([validate_doi(doi) for doi in df['doi']], dtype=bool)
    return df[is_valid]

def collect_ncbi_ids(df, batchsize, rand=True, debug=False):
    """Look up PMIDs and PMCIDs for the DOIs in ``df.index`` via the NCBI ID converter.

    The DOIs are de-duplicated, optionally shuffled, and sent to the
    ID-Converter-API in comma-separated batches of at most *batchsize* IDs.

    Args:
        df: DataFrame indexed by DOI strings. Only the index is used; the
            existing columns are not carried over to the result.
        batchsize: Maximum number of DOIs per request.
        rand: If true, shuffle the DOIs before batching.
        debug: If true, show a progress bar over the batches.

    Returns:
        A new DataFrame with the index of *df* and the columns ``pmid``,
        ``pmc`` (the PMCID without its first three characters, i.e. the
        "PMC" prefix), ``ncbi_ts`` (timestamp of the batch request as a
        string) and ``ncbi_errmsg``. Fields missing from a record stay ``None``.

    Raises:
        KeyError: If the response JSON has no ``records`` entry or a record
            has no ``doi`` entry.
    """
    df = df[[]].copy()
    df['pmid'] = None
    df['pmc']  = None
    df['ncbi_ts'] = None
    df['ncbi_errmsg'] = None

    # https://www.ncbi.nlm.nih.gov/pmc/pmctopmid/
    url_base = "https://www.ncbi.nlm.nih.gov/pmc/utils/idconv/v1.0"

    params = {
        'email': 'aenkhbay@sfu.ca',
        'tool': 'ScholCommLab ID Crawler - scholcommlab.ca',
        'idtype': 'doi',
        'versions': 'no',
        'format': 'json'
    }

    dois = list(set(df.index.tolist()))
    if rand:
        shuffle(dois)

    batches = range(0, len(dois), batchsize)
    for i in tqdm_notebook(batches, disable=not debug):
        now = datetime.datetime.now()
        batch = dois[i:i+batchsize] # the result might be shorter than batchsize at the end

        params['ids'] = ",".join(batch)
        response = requests.get(url_base,
                                params=params)
        records = json.loads(response.text)['records']

        for record in records:
            doi = record['doi']
            # NOTE(review): if the API returns a DOI spelled differently from
            # the index label, .loc assignment appends a new row - confirm.
            df.loc[doi, 'ncbi_ts'] = str(now)

            # Each field is optional in a record. Check for it explicitly
            # instead of wrapping the assignment in a bare ``except``, which
            # would also swallow KeyboardInterrupt and real errors.
            if 'pmid' in record:
                df.loc[doi, "pmid"] = record['pmid']

            pmcid = record.get("pmcid")
            if isinstance(pmcid, str):
                # Drop the leading "PMC" so only the numeric part is stored.
                df.loc[doi, "pmc"] = pmcid[3:]

            if 'errmsg' in record:
                df.loc[doi, "ncbi_errmsg"] = record["errmsg"]

    return df

def resolve_dois(df, timeout, debug=False):
    """Resolve every DOI in ``df.index`` through https://doi.org and record the outcome.

    Args:
        df: DataFrame indexed by DOI. Only the index is used; the existing
            columns are not carried over to the result.
        timeout: Timeout passed to ``requests.get`` for each DOI.
        debug: If true, show a progress bar over the DOIs.

    Returns:
        A new DataFrame with the index of *df* and the columns:

        * ``doi_url``: final URL after redirects, set only for OK responses.
        * ``doi_resolve_status``: HTTP status code, when a response arrived.
        * ``doi_resolve_error``: the response reason for non-OK responses, or
          one of "Timeout", "TooManyRedirects", "RequestException" when the
          request itself failed.
        * ``doi_resolve_ts``: timestamp of the attempt as a string.
    """
    df = df[[]].copy()
    df['doi_url'] = None
    df['doi_resolve_ts']  = None
    df['doi_resolve_status'] = None
    df['doi_resolve_error'] = None

    for doi in tqdm_notebook(df.index, disable=not debug):
        now = datetime.datetime.now()

        # Init row values
        doi_resolve_status = None
        doi_resolve_error = None
        doi_url = None

        # Resolve DOI
        try:
            response = requests.get('https://doi.org/{}'.format(doi), allow_redirects=True, timeout=timeout)
            if response.ok:
                doi_resolve_status = response.status_code
                doi_url = response.url
            else:
                # Record the status for failed lookups too; this used to be
                # assigned to a misspelled attribute on the response and lost.
                doi_resolve_status = response.status_code
                doi_resolve_error = response.reason
        # The specific exceptions come before their RequestException base
        # class so the more informative label is the one recorded.
        except requests.exceptions.Timeout:
            doi_resolve_error = "Timeout"
        except requests.exceptions.TooManyRedirects:
            doi_resolve_error = "TooManyRedirects"
        except requests.exceptions.RequestException:
            doi_resolve_error = "RequestException"

        df.loc[doi, 'doi_url'] = doi_url
        df.loc[doi, 'doi_resolve_status'] = doi_resolve_status
        df.loc[doi, 'doi_resolve_error'] = doi_resolve_error
        df.loc[doi, 'doi_resolve_ts']  = str(now)
    return df

## Run the scripts

In [20]:
# Load the export, keep the rows whose DOI is well-formed, drop duplicate
# rows, and index the remaining articles by DOI.
raw = load_raw(input_file)
valid_rows = remove_invalid_dois(raw)
sample = valid_rows.drop_duplicates().set_index("doi")

In [21]:
# Work on a random subset of at most 100 articles. Capping the size at
# len(sample) avoids the ValueError that DataFrame.sample raises when asked
# for more rows than the frame contains.
sample = sample.sample(min(100, len(sample)))

In [22]:
# Look up PMID/PMCID for the sampled DOIs, sending up to 200 DOIs per request
# in shuffled order, with the progress bar enabled.
ncbi = collect_ncbi_ids(sample, batchsize=200, rand=True, debug=True)




## Resolve DOIs and create PMC/PubMed URLs

In [23]:
# Resolve each sampled DOI via doi.org, passing timeout=5 to every request,
# with the progress bar enabled.
resolved_dois = resolve_dois(sample, timeout=5, debug=True)




In [None]:
# resolved_dois = pd.read_csv(data_folder / "resolved_dois.csv", index_col="doi")

In [27]:
# Start from each article's own URL and attach the NCBI lookup results,
# joining on the shared DOI index.
urls = sample[['url']].merge(ncbi, left_index=True, right_index=True)


def _pubmed_url(pmid):
    """Return the PubMed URL for *pmid*, or None when no PMID is available."""
    if pd.notnull(pmid):
        return "https://ncbi.nlm.nih.gov/pubmed/{}".format(int(pmid))
    return None


def _pmc_url(pmc):
    """Return the PMC article URL for *pmc*, or None when no PMCID is available."""
    if pd.notnull(pmc):
        return "https://ncbi.nlm.nih.gov/pmc/articles/PMC{}/".format(int(pmc))
    return None


# Add NCBI URLs
urls['pmid_url'] = urls['pmid'].apply(_pubmed_url)
urls['pmc_url'] = urls['pmc'].apply(_pmc_url)

# Add resolved DOI URLs
with_resolved = urls.merge(resolved_dois, left_index=True, right_index=True)
urls = with_resolved.drop_duplicates()

# Misc: label the original URL column by its source, then export only the
# URL columns.
urls = urls.rename(columns={'url':'pkp_url'})
url_columns = [name for name in urls.columns if "url" in name]
urls[url_columns].to_csv(data_folder / "articles_with_urls.csv")