# Extraction of papers referencing datasets using the OpenCitations API with the PubMed database

## How does this notebook work?

1. First, we read the DOI of each dataset we want to query from the `./data/datasets.csv` file.
2. We convert each DOI to a PubMed ID (PMID) using https://www.ncbi.nlm.nih.gov/pmc/tools/id-converter-api/
3. For each dataset, we make a GET request to https://opencitations.net/index/poci/api/v1/citations/{pubmedID_of_the_dataset}
4. From the API response, we get the PubMed ID and creation year of each paper that cites the dataset, and convert that ID back to a DOI.
5. We export the result to `./extracted_csv/paper_poci.csv`.

Get data from datasets.csv

In [2]:
import requests
import numpy as np
from pprint import pprint
import csv

# Dictionary with the dataset's name as key and its DOI as value
datasets_DOI = {}

# Read the dataset list inside a with-block so the file handle is closed as
# soon as every row has been consumed; newline="" is what the csv module
# asks for when it is handed a file object.
with open('./data/datasets.csv', newline="") as ds_file:
    ds_reader = csv.DictReader(ds_file)
    for ds in ds_reader:
        # Each row must provide a "name" and a "DOI" column.
        datasets_DOI[ds["name"]] = ds["DOI"]

# Base URL of the NCBI ID converter; the identifier to convert (DOI or PMID)
# is appended directly after the trailing "ids=".
base_url_pubmed = "https://www.ncbi.nlm.nih.gov/pmc/utils/idconv/v1.0/?tool=ddsa-theo&email=theo.sourget@univ-rouen.fr&format=json&ids="

pprint(datasets_DOI)


{'ACDC': '10.1109/TMI.2018.2837502',
 'BRATS': '10.1109/tmi.2014.2377694',
 'I2CVB': '10.1016/j.compbiomed.2015.02.009',
 'LA': '10.1016/j.media.2020.101832',
 'M&Ms': '10.1109/tmi.2021.3090082',
 'MSCMRSeg': '10.48550/arxiv.2006.12434',
 'Medical Decathlon': '10.1038/s41467-022-30695-9',
 'PROMISE12': '10.1016/j.media.2013.12.002',
 'Synapse': '10.7303/syn3193805'}


Query the OpenCitations API

In [3]:
# Base URL of the OpenCitations POCI endpoint; the PMID of the cited work is appended.
url_base = "https://opencitations.net/index/poci/api/v1/citations/"
# Seconds to wait for any HTTP response. Without a timeout, requests.get can
# block indefinitely on a stalled connection and freeze the whole cell.
REQUEST_TIMEOUT = 30
# Maps each dataset name to a list of (DOI, year) tuples, one per citing paper.
paper_using = {ds: [] for ds in datasets_DOI}


def _lookup_id_record(article_id):
    """Resolve one article identifier through the NCBI ID converter.

    Args:
        article_id: Identifier appended to ``base_url_pubmed`` (a DOI for the
            datasets, a PMID for the citing papers).

    Returns:
        The first entry of the response's ``records`` list as a dict, or
        ``None`` when the request fails, the HTTP status is not OK, the body
        is not a JSON object, no record is returned, or the record carries
        ``"status": "error"``.
    """
    try:
        response = requests.get(base_url_pubmed + article_id, timeout=REQUEST_TIMEOUT)
    except requests.RequestException:
        return None
    if not response.ok:
        return None
    try:
        payload = response.json()
    except ValueError:
        # Body was not valid JSON (e.g. an HTML error page).
        return None
    if not isinstance(payload, dict):
        return None
    records = payload.get("records") or []
    # Inspect the structured status rather than searching the raw body for the
    # word "error", which also matches identifiers that merely contain it.
    if not records or records[0].get("status") == "error":
        return None
    return records[0]


for ds in datasets_DOI:
    print(ds)

    # Step 1: dataset DOI -> PMID.
    dataset_record = _lookup_id_record(datasets_DOI[ds])
    pmid = dataset_record.get("pmid") if dataset_record else None
    if not pmid:
        print(f"ERROR to convert DOI to PMID for dataset {ds}")
        continue

    # Step 2: fetch every citation whose cited work is this PMID.
    try:
        req = requests.get(f"{url_base}{pmid}", timeout=REQUEST_TIMEOUT)
    except requests.RequestException as exc:
        print(f"ERROR {exc} for dataset {ds}")
        continue
    if req.status_code != 200:
        print(f"ERROR {req.status_code} for dataset {ds}")
        continue

    # Step 3: citing PMID -> DOI; citing papers that cannot be converted are
    # skipped, as before.
    for cite in req.json():
        # Drop the first 5 characters of "citing" (presumably the "pmid:"
        # prefix) to keep the bare identifier.
        citing_id = cite["citing"][5:]
        citing_record = _lookup_id_record(citing_id)
        doi = citing_record.get("doi") if citing_record else None
        if not doi:
            continue
        # First four characters of the creation date are kept as the year.
        year = cite["creation"][:4]
        paper_using[ds].append((doi, year))


ACDC
ERROR to convert DOI to PMID for dataset ACDC
LA
ERROR to convert DOI to PMID for dataset LA
PROMISE12
MSCMRSeg
ERROR to convert DOI to PMID for dataset MSCMRSeg
M&Ms
ERROR to convert DOI to PMID for dataset M&Ms
Medical Decathlon
I2CVB
ERROR to convert DOI to PMID for dataset I2CVB
BRATS
Synapse
ERROR to convert DOI to PMID for dataset Synapse


In [29]:
# Display the papers collected per dataset; the request cell appends (doi, year) tuples.
# NOTE(review): the saved output below lists 'pmid:...' identifiers and this cell's
# execution count is out of sequence, so it looks stale. Re-run after the request
# cell to confirm.
paper_using

{'ACDC': [],
 'LA': [],
 'PROMISE12': [('pmid:32355894', '2019'),
  ('pmid:33680505', '2021'),
  ('pmid:34405896', '2021'),
  ('pmid:35621897', '2022'),
  ('pmid:32743017', '2020'),
  ('pmid:32746129', '2020'),
  ('pmid:34609756', '2021'),
  ('pmid:35300345', '2022'),
  ('pmid:33672608', '2021'),
  ('pmid:32874427', '2019'),
  ('pmid:33965983', '2021'),
  ('pmid:33929265', '2021'),
  ('pmid:34897550', '2021'),
  ('pmid:35502381', '2022'),
  ('pmid:33102206', '2020'),
  ('pmid:34890013', '2022'),
  ('pmid:34891962', '2021'),
  ('pmid:34979213', '2022'),
  ('pmid:35422101', '2022'),
  ('pmid:33300190', '2021'),
  ('pmid:33507867', '2021'),
  ('pmid:33341496', '2021'),
  ('pmid:32812797', '2021'),
  ('pmid:34625565', '2021'),
  ('pmid:34872011', '2021'),
  ('pmid:33212541', '2021'),
  ('pmid:32737628', '2021'),
  ('pmid:34527506', '2021'),
  ('pmid:35150363', '2022'),
  ('pmid:34888197', '2021'),
  ('pmid:33783929', '2021'),
  ('pmid:33052737', '2020'),
  ('pmid:35204380', '2022'),
  ('pm

Export the result to paper_poci.csv

In [4]:
# Write one CSV row per (citing paper, dataset) pair.
# csv.writer quotes any field containing a comma, quote or newline (for example
# a dataset name with a comma), which hand-built f-string rows would not.
# newline="" lets the writer control line endings; "\n" matches the previous output.
with open("./extracted_csv/paper_poci.csv", "w", newline="", encoding="utf-8") as file:
    writer = csv.writer(file, lineterminator="\n")
    writer.writerow(["name", "DOI", "publication_year", "dataset_used"])
    for ds in datasets_DOI:
        for doi, year in paper_using[ds]:
            # The "name" column is not populated by this notebook; keep the
            # literal text None that was written there before.
            writer.writerow(["None", doi, year, ds])