In [31]:
import datetime
import time
import sys
import re
import requests
import json
from dateutil.parser import parse
from random import shuffle

import pandas as pd
import numpy as np
import lxml.etree as ET
from ATB.ATB.Utils import resolve_doi

from tqdm import tqdm, tqdm_notebook
tqdm_notebook().pandas()




In [None]:
# Path of the input CSV; later cells read its `doi`, `url` and `date` columns.
input_file = "data/input_files/PKP_20171220.csv" # doi, url, date

# Input dataset

PKP...

In [26]:
# https://www.crossref.org/blog/dois-and-matching-regular-expressions/

# DOI patterns recommended by Crossref, compiled once (case-insensitive).
# The dot of the "10." prefix is escaped so it only matches a literal dot;
# unescaped, strings such as "10X1234/abc" were accepted as valid.
_DOI_PATTERNS = [re.compile(pattern, re.IGNORECASE) for pattern in (
    r"^10\.\d{4,9}/[-._;()/:A-Z0-9]+$",
    r"^10\.1002/[^\s]+$",
    r"^10\.\d{4}/\d+-\d+X?(\d+)\d+<[\d\w]+:[\d\w]*>\d+.\d+.\w+;\d$",
    r"^10\.1021/\w\w\d+$",
    r"^10\.1207\/[\w\d]+\&\d+_\d+$",
)]


def validate_doi(doi):
    """Return True if `doi` matches one of the known DOI patterns.

    Parameters
    ----------
    doi : object
        Candidate DOI. Anything that is not a string (e.g. the NaN pandas
        uses for an empty CSV cell, or None) is reported as invalid instead
        of raising a TypeError inside the regex engine.

    Returns
    -------
    bool
    """
    if not isinstance(doi, str):
        return False
    return any(pattern.match(doi) for pattern in _DOI_PATTERNS)

# Load the input CSV, parse the `date` column and drop exact duplicate rows.
raw = pd.read_csv(input_file, encoding = 'utf8', parse_dates=['date'])
raw = raw.drop_duplicates()
raw['year'] = raw.date.apply(lambda x: x.year)


def f(row):
    """Row-wise DOI check; kept so that existing callers of `f` still work."""
    row['valid_doi'] = validate_doi(row['doi'])
    return row


# Validate column-wise: only the `doi` column is needed, so there is no reason
# to rebuild every row as an object Series via `apply(axis=1)`. This is faster
# and leaves the dtypes of the other columns untouched.
raw['valid_doi'] = raw['doi'].progress_apply(validate_doi)




In [None]:
# Load a helper table; its `doi` column is used below to restrict the sample.
temp = pd.read_csv("temp.csv")

In [66]:
# Rows with a valid DOI that are also listed in `temp`, indexed by DOI.
# The helper column is dropped in the same chain and an explicit copy is taken,
# because later cells add columns to `sample` with `.loc`; writing into a
# filtered slice of `raw` would otherwise be ambiguous (SettingWithCopy).
sample = raw[raw.valid_doi].set_index("doi")
sample = sample[sample.index.isin(temp.doi)].drop('valid_doi', axis=1).copy()

# Collect additional URLs

1. PMID/PMCID
    1. Collect via [ID-Converter-API](https://www.ncbi.nlm.nih.gov/pmc/tools/id-converter-api/)
    2. Collect via [Entrez](https://www.ncbi.nlm.nih.gov/books/NBK25501/)
2. Resolve DOIs

## 1.A Collect with the ID-Converter-API

We can use this API to directly find the corresponding PMID and PMCID for a DOI

In [68]:
# https://www.ncbi.nlm.nih.gov/pmc/pmctopmid/

url_base = "https://www.ncbi.nlm.nih.gov/pmc/utils/idconv/v1.0"

params = {
    'email': 'aenkhbay@sfu.ca',
    'tool': 'ScholCommLab ID Crawler - scholcommlab.ca',
    'idtype': 'doi',
    'versions': 'no',
    'format': 'json'
}

dois = sample.index.tolist()
shuffle(dois)

# Make two runs
# 1. Search all PMC items (they also usually contain their link to PubMed)
# 2. Search for remaining ones in PubMed

batchsize = 200
batches = range(0, len(dois), batchsize)
for i in tqdm_notebook(batches, total=len(batches)):
    batch = dois[i:i+batchsize] # the result might be shorter than batchsize at the end

    params['ids'] = ",".join(batch)
    response = requests.get(url_base, params=params)
    # Fail loudly on HTTP errors instead of on an obscure JSON/KeyError later.
    response.raise_for_status()
    # A response without a 'records' entry simply contributes nothing.
    records = response.json().get('records', [])

    for record in records:
        doi = record.get('doi')
        if doi is None:
            # Without a DOI the record cannot be attached to a row of `sample`.
            continue
        # Missing fields are stored as None; `.get` replaces the former bare
        # `except:` blocks, which also hid unrelated errors.
        sample.loc[doi, "pmid"] = record.get('pmid')
        sample.loc[doi, "pmcid"] = record.get('pmcid')
        sample.loc[doi, "ncbi_errmsg"] = record.get('errmsg')




In [69]:
# NOTE: this binds a second name to the same DataFrame object; it is not a copy.
pmc_sample = sample
# Summary of the identifiers collected so far.
pmc_sample[['pmid', 'pmcid']].describe()

Unnamed: 0,pmid,pmcid
count,6886,7063
unique,6886,7063
top,27609718,PMC4900835
freq,1,1


## 1.B Collect via Entrez

This approach utilises Entrez and comprises several steps:

1. Search for the DOIs with Entrez ESearch (a batch search loses the DOI <-> response mapping)
2. Fetch the database entries for the collected PMIDs & PMCIDs
3. Match the results and existing data based on DOI

### Search for the DOIs

In [77]:
entrez_search = "https://eutils.ncbi.nlm.nih.gov/entrez/eutils/esearch.fcgi"

batchsize = 200

# ESearch returns at most 20 ids unless `retmax` is given, which would silently
# truncate the result of every 200-DOI batch; request up to one id per DOI.
pubmed_params = {
    'db': "pubmed",
    'retmode':'json',
    'retmax': batchsize,
    'term': None
}

pmc_params = {
    'db': "pmc",
    'retmode':'json',
    'retmax': batchsize,
    'term': None
}

found_pmcids = []
found_pmids = []
# (db, batch start index, error) for every batch whose response was unusable
failed_batches = []

dois = sample.index.tolist()
shuffle(dois)

batches = range(0, len(dois), batchsize)


def _esearch_ids(search_params, term):
    """Run one ESearch query and return the list of ids found.

    `search_params` is one of the parameter dicts above; its 'term' entry is
    overwritten with `term`. Raises ValueError/KeyError/TypeError when the
    response is not the expected JSON structure.
    """
    search_params['term'] = term
    response = requests.get(entrez_search, params=search_params)
    return response.json()['esearchresult']['idlist']


for i in tqdm_notebook(batches, total=len(batches)):
    batch = dois[i:i+batchsize] # the result might be shorter than batchsize at the end

    # The same query term is used for both databases.
    term = " OR ".join([x + "[doi]" for x in batch])

    for search_params, found in ((pubmed_params, found_pmids),
                                 (pmc_params, found_pmcids)):
        try:
            found.extend(_esearch_ids(search_params, term))
        except (ValueError, KeyError, TypeError) as err:
            # Stay best-effort, but remember which batches were lost.
            failed_batches.append((search_params['db'], i, repr(err)))




### Fetch all the entries based on PMID/PMCID

In [46]:
entrez_results = []

# XML parser for the Entrez responses; recover=True tolerates malformed XML
parser = ET.XMLParser(recover=True)

base_url = "https://eutils.ncbi.nlm.nih.gov/entrez/eutils/efetch.fcgi"
params = {
    'retmode': 'xml',
    'rettype': 'text'
}
batchsize = 50


def _collect_article_ids(db, ids, record_path, id_tag, type_attr):
    """Fetch `ids` from Entrez database `db` and append one id dict per record.

    For every element matching `record_path` in each response, a dict mapping
    the value of attribute `type_attr` to the text of each `id_tag` child is
    appended to `entrez_results`. The id type 'pubmed' is stored under the key
    'pmid', which is the key the later cells look for.
    """
    batches = range(0, len(ids), batchsize)
    for i in tqdm_notebook(batches, total=len(batches)):
        batch = ids[i:i+batchsize] # the result might be shorter than batchsize at the end

        query = dict(params, db=db, id=",".join(batch))
        response = requests.get(base_url, params=query)
        response.raise_for_status()

        # Parse the raw bytes: lxml rejects str input that carries an XML
        # encoding declaration.
        root = ET.fromstring(response.content, parser=parser)
        if root is None:
            # Nothing recoverable in this response.
            continue

        for record in root.findall(record_path):
            article = {}
            for id_node in record.findall(id_tag):
                id_type = id_node.get(type_attr)
                if id_type == 'pubmed':
                    id_type = 'pmid'
                article[id_type] = id_node.text
            entrez_results.append(article)


# The original cell had the two `db` values swapped: PMC ids were requested
# from "pubmed" and PMIDs from "pmc", while the parsing code expected the
# opposite record formats.

# PMC records: identifiers are <article-id pub-id-type="..."> in <article-meta>.
_collect_article_ids("pmc", found_pmcids,
                     ".//article-meta", "article-id", "pub-id-type")

# PubMed records: identifiers are <ArticleId IdType="..."> in the article's own
# <PubmedData>/<ArticleIdList>; restricting the path skips the id lists that
# PubMed also attaches to cited references.
_collect_article_ids("pubmed", found_pmids,
                     ".//PubmedData/ArticleIdList", "ArticleId", "IdType")







In [None]:
# remove PMC from pmcids
for article in entrez_results:
    pmc = article.get('pmc')
    # Only strip a real "PMC" prefix: the slice removes the first three
    # characters, so a substring test elsewhere in the value would corrupt the
    # id. Empty XML elements yield None and are left alone.
    if isinstance(pmc, str) and pmc.startswith("PMC"):
        article['pmc'] = pmc[3:]

In [None]:
def _merge_article_ids(articles):
    """Group the identifier dicts in `articles` by DOI.

    Returns a tuple `(mapping, without_doi, conflicts)`:
    - mapping: DOI -> dict with the 'pmid' / 'pmc' / 'pmcid' values seen for it
    - without_doi: the articles that carry no 'doi' key
    - conflicts: (stored ids, article) pairs where an article disagrees with a
      value already stored for its DOI (the stored value is kept)
    """
    id_keys = ('pmid', 'pmc', 'pmcid')
    mapping = {}
    without_doi = []
    conflicts = []
    for article in articles:
        if 'doi' not in article:
            without_doi.append(article)
            continue
        known = mapping.setdefault(article['doi'], {})
        for key in id_keys:
            if key not in article:
                continue
            if key not in known:
                # Previously the `else` was attached to the inner comparison,
                # so an id type new to an already-seen DOI was never stored.
                known[key] = article[key]
            elif known[key] != article[key]:
                conflicts.append((known, article))
    return mapping, without_doi, conflicts


# The records collected by the efetch cell live in `entrez_results`; the names
# `article_ids` / `article_ids_2` used here before are not defined anywhere in
# this notebook.
doi_mapping, no_doi, mismatch = _merge_article_ids(entrez_results)

In [None]:
# Sizes, in order: records without DOI, mapped DOIs, conflicting records.
for collection in (no_doi, doi_mapping, mismatch):
    print(len(collection))

In [None]:
# API results
# Rows for which the ID converter returned at least one identifier.
has_any_id = sample.pmid.notnull() | sample.pmcid.notnull()
# `to_dict` keeps both keys for every DOI, even when one value is null; drop
# the null entries so that a key's presence means the id is actually known.
api_results = {
    doi: {key: value for key, value in ids.items() if pd.notnull(value)}
    for doi, ids in sample.loc[has_any_id, ['pmid', 'pmcid']].to_dict(orient="index").items()
}
# Entrez mappings, without the DOIs for which no identifier was collected.
entrez_results = {doi: ids for doi, ids in doi_mapping.items() if ids}

In [None]:
print("Entrez mappings: {}".format(len(entrez_results)))
print("NCBI API mappings: {}".format(len(api_results)))
print("Sum of all found DOIs: {}".format(len(api_results) + len(entrez_results)))
# Union of the DOIs found by either source (name kept: later cells use it).
uniqe_dois = set(api_results) | set(entrez_results)
# This figure used to be labelled "Common DOIs" although it is the union.
print("Unique DOIs: {}".format(len(uniqe_dois)))
print("Common DOIs: {}".format(len(set(api_results) & set(entrez_results))))

In [None]:
def _known_id(ids, key):
    """Return ids[key], or None when the key is missing or its value is null."""
    value = ids.get(key)
    return None if pd.isnull(value) else value


def _without_pmc_prefix(value):
    """Return `value` as a string without a leading "PMC"."""
    text = str(value)
    return text[3:] if text.startswith("PMC") else text


mismatched_results = 0
okay_pmids = 0
okay_pmc = 0
for doi in uniqe_dois:
    # Only DOIs found by both sources can be compared.
    if doi not in api_results or doi not in entrez_results:
        continue
    api_ids = api_results[doi]
    entrez_ids = entrez_results[doi]

    pmid_mismatch = False
    pmcid_mismatch = False

    api_pmid = _known_id(api_ids, 'pmid')
    entrez_pmid = _known_id(entrez_ids, 'pmid')
    if api_pmid is not None and entrez_pmid is not None:
        pmid_mismatch = api_pmid != entrez_pmid
        if not pmid_mismatch:
            okay_pmids = okay_pmids + 1

    # Null values are treated as "unknown": slicing a null pmcid used to raise
    # a TypeError, and a null pmid was counted as a mismatch.
    api_pmc = _known_id(api_ids, 'pmcid')
    entrez_pmc = _known_id(entrez_ids, 'pmc')
    if api_pmc is not None and entrez_pmc is not None:
        pmcid_mismatch = _without_pmc_prefix(api_pmc) != _without_pmc_prefix(entrez_pmc)
        if not pmcid_mismatch:
            okay_pmc = okay_pmc + 1

    if pmid_mismatch or pmcid_mismatch:
        mismatched_results = mismatched_results + 1
print("Bad matches: {}\nMatching PMC: {}\nMatching PMID: {}".format(mismatched_results, okay_pmc, okay_pmids))

In [None]:
def _first_known(*candidates):
    """Return the first candidate that is neither None nor NaN, else None."""
    for candidate in candidates:
        if not pd.isnull(candidate):
            return candidate
    return None


def _bare_pmc(value):
    """Return a PMC id as a string without a leading "PMC"; nulls become None."""
    if pd.isnull(value):
        return None
    text = str(value)
    return text[3:] if text.startswith("PMC") else text


dois = []
pmids = []
pmcids = []
for doi in uniqe_dois:
    api_ids = api_results.get(doi, {})
    entrez_ids = entrez_results.get(doi, {})

    dois.append(doi)
    # The ID converter result takes precedence, as before, but a null API value
    # no longer overwrites an identifier that Entrez did provide.
    pmids.append(_first_known(api_ids.get('pmid'), entrez_ids.get('pmid')))
    pmcids.append(_first_known(_bare_pmc(api_ids.get('pmcid')),
                               entrez_ids.get('pmc')))

In [None]:
# Assemble the collected identifiers into one table indexed by DOI.
id_columns = {'doi': dois, 'pmid': pmids, 'pmc': pmcids}
ncbi = pd.DataFrame(id_columns).set_index('doi')

# add date and original PKP url
pkp_info = sample[['url', 'date']]
ncbi = ncbi.merge(pkp_info, left_index=True, right_index=True)

# Persist the result for later use.
ncbi.to_csv("data/pkp/pkp_collected_ids.csv")