# Get list of papers from venue that cites a dataset

## Import libs

In [2]:
# To request OpenAlex
import requests
import numpy as np

#Better print of dict
from pprint import pprint

#To load data
import csv

## Load list of datasets and list of venues
We will search for paper from venue contained in [venues.csv](../../data/venues.csv) referencing one or more of the datasets contained in [datasets.csv](../../data/datasets.csv)

In [3]:
#Dictionnary with dataset's name as key and DOI as value
datasets_doi = {}
ds_reader = csv.DictReader(open('../../data/datasets.csv'))
for ds in ds_reader:
    datasets_doi[ds["name"]] = ds["DOI"]

#Dictionnary with venues name as key and openalex id as value loaded from venues.csv
venue_id = {}
ds_reader = csv.DictReader(open('../../data/venues.csv'))
for ds in ds_reader:
    venue_id[ds["name"]] = ds["openalex_id"]

In [4]:
print("Datasets:")
pprint(datasets_doi)
print("\nVenues:")
pprint(venue_id)

Datasets:
{'ACDC': '10.1109/TMI.2018.2837502',
 'BRATS': '10.1109/tmi.2014.2377694',
 'I2CVB': '10.1016/j.compbiomed.2015.02.009',
 'LA': '10.1016/j.media.2020.101832',
 'M&Ms': '10.1109/tmi.2021.3090082',
 'MSCMRSeg': '10.48550/arxiv.2006.12434',
 'Medical Decathlon': '10.1038/s41467-022-30695-9',
 'PROMISE12': '10.1016/j.media.2013.12.002',
 'Synapse': '10.7303/syn3193805'}

Venues:
{'LNCS': 'S106296714'}


## Try to convert DOI to OpenAlex ID

In [5]:
"""
Convert a DOI to OpenAlex ID used as value in some API field such as "referenced_works"
@param
    - DOI: the doi we want to convert
@return
    The OpenAlex ID if the DOI is in OpenAlex database, None otherwise
"""
def doi_to_OpenAlexId(doi):
    base_url = f"https://api.openalex.org/works/doi:{doi}"
    r = requests.get(base_url)
    if r.status_code == 200:
        r_json = r.json()
        return r_json["id"]
    else:
        return None

#Dictionnary with dataset names as key and openalex id as value. We associate an openalex ID because it's the value in the "referenced_works" field given by the API.
datasets_id = {}

#Convert DOI to OpenAlexID
for ds  in datasets_doi:
    openalex_id = doi_to_OpenAlexId(datasets_doi[ds])
    if not openalex_id:
        print(f"Couldn't convert DOI for {ds} into OpenAlex ID")
    datasets_id[ds]=openalex_id

Couldn't convert DOI for Synapse into OpenAlex ID


In [6]:
pprint(datasets_id)

{'ACDC': 'https://openalex.org/W2804047627',
 'BRATS': 'https://openalex.org/W1641498739',
 'I2CVB': 'https://openalex.org/W2049522781',
 'LA': 'https://openalex.org/W3093394156',
 'M&Ms': 'https://openalex.org/W4226199676',
 'MSCMRSeg': 'https://openalex.org/W4312016581',
 'Medical Decathlon': 'https://openalex.org/W3172681723',
 'PROMISE12': 'https://openalex.org/W2106033751',
 'Synapse': None}


note: Synapse value is None meaning it's note in OpenAlex database, therefore we won't be able to find references to it.

## Get list of papers from each venues citing at least one of the dataset

Query OpenAlex filtering on the dataset and the venue

In [13]:
#Dictionnary containing with dataset as key and a list of papers referencing the dataset as value
paper_referencing = {ds:[] for ds in datasets_id}


for ds in datasets_id:
    for venue in venue_id:
        #The list of paper referencing the dataset is decomposed in multiple pages so we have to iterate with the query parameter "page" to get them all.
        next_page = True
        page_number = 1
        while next_page:
            #Definition of the request
            base_url = "https://api.openalex.org/works"
            query_param = {
                "filter":f"cites:{datasets_id[ds]},locations.source.id:{venue_id[venue]}",
                "page":page_number
            }
            request = requests.get(base_url,params=query_param)

            if request.status_code == 200:
                request_json = request.json()
                
                #For each paper referencing the dataset we get the title (with a little transformation to remove "," and "\n" inside of them), doi and publication year
                for res in request_json["results"]:
                    title = res["title"]
                    title = title.replace(",","")
                    title = title.replace("\n","")

                    #Remove review paper
                    if "review" in title.lower():
                        continue

                    doi = res["doi"]
                    if doi is not None:
                        doi = doi[16:] #Remove the https://doi.org/
                        paper_referencing[ds].append((title,doi,res["publication_year"],res["abstract_inverted_index"]))

                #If the results field is empty that mean we are at the last page so we can continue to the next dataset
                #otherwise we need to go to next page of the current dataset
                if not request_json["results"]:
                    next_page = False
                else:
                    page_number += 1
            else:
                next_page = False
            

In [15]:
for d in paper_referencing:
    print(f"Number of citations for {d}: {len(paper_referencing[d])}")

Number of citations for ACDC: 120
Number of citations for LA: 10
Number of citations for MSCMRSeg: 0
Number of citations for M&Ms: 34
Number of citations for PROMISE12: 29
Number of citations for Medical Decathlon: 4
Number of citations for I2CVB: 13
Number of citations for BRATS: 494
Number of citations for Synapse: 0


# Try to filter out wrong references
This part of the notebook is to obtained a more accurate list of paper to only keep paper that are actually using the dataset and not only referencing the paper for another reason.

To do that we will:
1. Search for the dataset name in the abstract, making the hypothesis that if authors put the name of a dataset inside an abstract they must be using it.
2. Look for the name of the dataset in Table of the fulltext, again if the dataset name is in a Table it should be used (especially if we removed review papers)
3. Classify the figure in the fulltext to detect the organ of the cited dataset. (does not confirm at 100% that the actual dataset is really used and not another one focusing on the same organ)

If none of the above operation validate the citations, the paper is removed from the list. Otherwise, we can keep it.  

##  1. Check abstract for dataset's name

We will now query OpenAlex again for everypaper and search for the dataset name inside the "abstract_inverted_index" field of the API if it's present.

In [19]:
"""
Reconstruct and transform the abstract of a paper using abstract_inverted_index field. We will removed non alpha numeric caracters and lower every word.
"""
def reconstruct_abstract(paper):
    # Maximum size of the abstract, if the paper abstarct is longer it will be truncated
    abstract = np.full(2500,"",dtype=object)
    # The "abstract_inverted_index" field is a dictionnary with word as key and locations of this word in the abstract
    # So we fill the abstarct variable above at the index of the word to reconstruct the abstract
    if paper[3]:
        for w in paper[3]:
            for indices in paper[3][w]:
                if indices < 2500:
                    abstract[indices] = ''.join(filter(str.isalnum, w)).lower()
        # Remove empty location mostly due to a shorter abstract 
        abstract = abstract[abstract != ""]
        #Convert array to string
    str_abstract = ' '.join(abstract)
    return str_abstract

In [26]:
paper_using_abstract = {dataset:[] for dataset in paper_referencing}
for dataset in paper_referencing:
    for paper in paper_referencing[dataset]:
        abstract = reconstruct_abstract(paper)
        if dataset.lower() in abstract:
            paper_using_abstract[dataset].append(paper[:-1])

In [28]:
for d in paper_using_abstract:
    print(f"Number of citations for {d}: {len(paper_using_abstract[d])}")

Number of citations for ACDC: 21
Number of citations for LA: 10
Number of citations for MSCMRSeg: 0
Number of citations for M&Ms: 0
Number of citations for PROMISE12: 3
Number of citations for Medical Decathlon: 0
Number of citations for I2CVB: 1
Number of citations for BRATS: 266
Number of citations for Synapse: 0


## 2. Check tables in fulltext for dataset's name

In [None]:
#TODO using code below

## 3. Check for dataset's organ in figures

In [None]:
#TODO using code below

# Check fulltext of paper for either figures or tables

## Import libs

In [1]:
#To download fulltext
import requests
import pandas as pd

#To filter invalid pdf
from pypdf import PdfReader
from pypdf.errors import PdfReadError

#To handle files
import glob
import os

#To extract images from pdf
import fitz

#To extract tables from pdf
import camelot
import ghostscript

## Download papers full text

Filter paper to get for the test

In [2]:
#Only get paper from 2023 referencing ACDC paper
df = pd.read_csv("../../results/extracted_csv/paper_openalex.csv")
df_2023 = df[df["publication_year"] == 2023]
df_acdc = df_2023[df_2023["dataset_used"] == "ACDC"]

In [18]:
url_base = "https://api.openalex.org/works/https://doi.org/"
paper_id = 1
for doi in df_acdc["DOI"]:
    url = url_base + doi 
    r_paper = requests.get(url)
    if r_paper.status_code == 200:
        r_paper_json = r_paper.json()
        fulltext_url = r_paper_json["open_access"]["oa_url"]
        if fulltext_url:
            r_fulltext = requests.get(fulltext_url,allow_redirects=True)
            if r_fulltext.status_code == 200:
                print(fulltext_url)
                open(f"../../results/papers_fulltext/{paper_id}.pdf","wb").write(r_fulltext.content)
                paper_id += 1
        #Stop after 10th download, only to test and maybe not get block by some site
        if paper_id == 10:
            break
    else:
        print(f"Error {r_paper.status_code} for {doi}")

10.1016/b978-0-32-385773-4.00023-x
10.1016/b978-0-32-385773-4.00025-3
10.1016/b978-0-32-385773-4.00009-5
https://doi.org/10.1016/b978-0-32-385773-4.00009-5
10.1016/j.compbiomed.2022.106439
10.1371/journal.pdig.0000159
https://journals.plos.org/digitalhealth/article/file?id=10.1371/journal.pdig.0000159&type=printable
10.1109/access.2023.3234241
10.1016/j.compmedimag.2022.102174
10.1088/1361-6560/acb19a
https://iopscience.iop.org/article/10.1088/1361-6560/acb19a/pdf
10.1007/s00740-022-00474-9
10.1016/j.patcog.2023.109318
https://doi.org/10.1016/j.patcog.2023.109318
10.1109/access.2023.3238058
10.1038/s41598-023-28348-y
https://www.nature.com/articles/s41598-023-28348-y.pdf
10.3389/fphys.2023.1027076
https://www.frontiersin.org/articles/10.3389/fphys.2023.1027076/pdf
10.1016/b978-0-12-821983-6.00008-4
10.3390/bioengineering10020166
https://www.mdpi.com/2306-5354/10/2/166/pdf?version=1674889337
10.1016/j.media.2023.102762
http://arxiv.org/pdf/2206.01136
10.1016/j.bspc.2023.104631
10.1109/w

## Remove invalid pdf obtain from previous step

In [19]:
#get every downloaded pdf path
pdf_path = glob.glob("../../results/papers_fulltext/*.pdf")

#Create folder where valid pdf will be moved
if not os.path.exists("../../results/papers_fulltext/valid_pdf"):
    os.makedirs("../../results/papers_fulltext/valid_pdf")

#Create folder where invalid pdf will be moved
if not os.path.exists("../../results/papers_fulltext/removed_pdf"):
    os.makedirs("../../results/papers_fulltext/removed_pdf")

#For each downloaded pdf
for file in pdf_path:
    try:
        #Try to read the pdf (Raise an error if the file is an invalid pdf)
        PdfReader(file,strict=True)
        #If valid, move the file to valid folder
        os.rename(file, file.replace("papers_fulltext/","papers_fulltext/valid_pdf/"))
    except PdfReadError:
        #If a PdfReadError is raised, the pdf is invalid and therefor moved to removed_pdf folder
        os.rename(file, file.replace("papers_fulltext/","papers_fulltext/removed_pdf/"))


## Extract images from valid pdf

In [4]:
#get every valid pdf path
valid_pdf_path = glob.glob("../../results/papers_fulltext/valid_pdf/*.pdf")

#path of folder where images will be stored
images_path = "../../results/papers_fulltext/images"
#Create this folder if it does not exist
if not os.path.exists(images_path):
    os.makedirs(images_path)

#For each pdf file
for file in valid_pdf_path:
    #Open the file
    pdf_file = fitz.open(file)

    #Get the number of pages in PDF file
    page_nums = len(pdf_file)

    #Create empty list to store images information
    images_list = []

    #Extract all images information from each page
    for page_num in range(page_nums):
        page_content = pdf_file[page_num]
        images_list.extend(page_content.get_images())

    #If there is at least one image in the pdf
    if len(images_list)!=0:
        #Create a subfolder for the article, this way we easily know from which paper the images is coming from
        if not os.path.exists(os.path.join(images_path, os.path.basename(file.replace(".pdf","")))):
            os.makedirs(os.path.join(images_path, os.path.basename(file.replace(".pdf",""))))

        #Save all the extracted images
        for i, img in enumerate(images_list, start=1):
            #Extract the image object number
            xref = img[0]
            #Extract image
            base_image = pdf_file.extract_image(xref)
            #Store image bytes
            image_bytes = base_image['image']
            #Store image extension
            image_ext = base_image['ext']
            #Generate image file name
            image_name = str(i) + '.' + image_ext
            #Save image
            with open(os.path.join(images_path, os.path.basename(file.replace(".pdf","")),image_name) , 'wb') as image_file:
                image_file.write(image_bytes)
                image_file.close()



## Extract tables from valid PDF

In [11]:
valid_pdf_path = glob.glob("../../results/papers_fulltext/valid_pdf/*.pdf")
for pdf in valid_pdf_path:
    tables = camelot.read_pdf(pdf,"all",flavor="stream",suppress_stdout=True)
    for t in tables:
        t_str = t.df.to_string()
        if "ACDC" in t_str:
            print("ACDC IN",pdf)
            break

In [29]:
for pdf in valid_pdf_path:
    tables = camelot.read_pdf(pdf,"all",flavor="stream",suppress_stdout=True)
    for t in tables:
        t_str = t.df.to_string()
        if "ACDC" in t_str:
            print("ACDC IN",pdf)
            break

ACDC IN ../../results/papers_fulltext/valid_pdf/5.pdf
ACDC IN ../../results/papers_fulltext/valid_pdf/9.pdf
ACDC IN ../../results/papers_fulltext/valid_pdf/6.pdf
ACDC IN ../../results/papers_fulltext/valid_pdf/7.pdf
ACDC IN ../../results/papers_fulltext/valid_pdf/8.pdf
