## Import section

In [1]:
import glob
import bs4 as bs
import csv
import re
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

## Grobid extraction information

In [3]:
print("Number of pdf with complete extraction:",len(glob.glob("../../Results/extraction/grobid_extraction/*.xml")))
print("Number of pdf with error extraction:",len(glob.glob("../../Results/extraction/grobid_extraction/*.txt")))
print("Total number of paper:",len(glob.glob("../../Results/extraction/fulltext/*.pdf")))

Number of pdf with complete extraction: 2678
Number of pdf with error extraction: 0
Total number of paper: 2678


## Load dataset information

In [4]:
datasets_info = {}
with open('../../Resources/data/datasets.csv') as ds_csv:
    ds_reader = csv.DictReader(ds_csv)
    for ds in ds_reader:
        datasets_info[ds["name"]] = {
                                        "doi":ds["doi"],
                                        "title":ds["paper_title"],
                                        "name":ds["name"],
                                        "aliases":ds["aliases"].split(","),
                                        "url":ds["url"].split(",")
                                     }
datasets_info

{'ACDC': {'doi': '10.1109/TMI.2018.2837502',
  'title': 'Deep Learning Techniques for Automatic MRI Cardiac Multi-Structures Segmentation and Diagnosis: Is the Problem Solved?',
  'name': 'ACDC',
  'aliases': ['ACDC', 'Automated Cardiac Diagnosis Challenge', 'AC17'],
  'url': ['https://www.creatis.insa-lyon.fr/Challenge/acdc',
   'https://www.creatis.insa-lyon.fr/Challenge/acdc/databases.html']},
 'Sunnybrook': {'doi': 'https://doi.org/10.54294/g80ruo',
  'title': 'Evaluation Framework for Algorithms Segmenting Short Axis Cardiac MRI',
  'name': 'Sunnybrook',
  'aliases': ['Sunnybrook'],
  'url': ['https://www.cardiacatlas.org/sunnybrook-cardiac-data']},
 'STACOM’11': {'doi': '10.1007/978-3-642-28326-0_9',
  'title': 'Left Ventricular Segmentation Challenge from Cardiac MRI: A Collation Study',
  'name': 'STACOM’11',
  'aliases': ['STACOM’11', "STACOM'11"],
  'url': ['https://www.satdl.com/download/37618']},
 'RVSC': {'doi': 'https://doi.org/10.1016/j.media.2014.10.004',
  'title': 'Ri

## Specify datasets selected and sections considered as Method

In [6]:
datasets_columns = ["ACDC","BRATS","LIDC-IDRI","DRIVE","PROMISE12","Chexpert","PadChest","PAD-UFES-20","CAMELYON","CADDementia","MRNet","PROSTATEx","MIMIC","CBIS-DDSM"]
lst_keywords = ["data","method","result","setup","material","experiment","evaluat"]

In [5]:
#Get the results of grobid parsing
xml_paths = glob.glob("../../Results/extraction/grobid_extraction/*")

#res will contain an element (dict) per paper, in each element keys will be datasets' name and value a list of mention location
res = []
for i,path in enumerate(xml_paths):
    paper_name = path.removeprefix("../../Results/extraction/grobid_extraction/").removesuffix(".grobid.tei.xml").removesuffix(".txt")
    
    #Check that the result is an xml, otherwise grobid had an error during the parsing and generate a .txt file
    if path.endswith(".xml"):
        #Load the xml file with BeautifulSoup for parsing
        with open(path) as fp:
            soup = bs.BeautifulSoup(fp,features="xml")


        #Get main section name, it will be used to assign subsection to the main one (e.g section 3.2 match with section 3.)
        sections = {}
        for elem in soup.findAll("head"):
            if elem.get("n") and elem.get("n")[0] not in sections:
                sections[elem.get("n")[0]] = elem.text.lower()
          
        #Search for the mention of every dataset one by one
        for ds in datasets_info:
            #Construct the regex for matching name or aliases, example for ACDC: (?<![^_\W])(ACDC)|(Automated Cardiac Diagnosis Challenge)|(AC17)(?![^_\s\d\.\),'])
            ds_urls = [f"((https://)?{re.escape(url.removeprefix('https://').removeprefix("http://"))}(/)?)" for url in datasets_info[ds]['url']]
            ds_name_aliases = [f"({re.escape(a)})" for a in datasets_info[ds]["aliases"]]
            
            ds_name_aliases_url_regex = "|".join(ds_name_aliases+ds_urls)
            
            #For every element in the xml with a matching, try to associate it with a location or a structure (Figure, Table, Footnote)
            for elem in soup.findAll(string=re.compile(f"(?<![^_\\W])({ds_name_aliases_url_regex})(?![^_\\s\\d\\.\\),'-])")):
                #To detect in "normal" text for which the parent elements is a <div>
                parent_div = elem.find_parent("div")
                if parent_div:
                    #If the element was part of the abstract or an appendix, the div will have a parent abstract or annex.
                    #Otherwise it's part of a fulltext's section
                    if parent_div.find_parent("abstract"):
                        res.append([paper_name,ds,"In Abstract",True])
                    elif (parent_div.find_parent("div")) and (parent_div.find_parent("div").get("type") == "annex"):
                        res.append([paper_name,ds,"Elsewhere",True])
                    elif parent_div.find("head"):
                        #Get the section number
                        head_level = parent_div.find("head").get("n")
                        
                        #Interpolate the section in case there is no "n" attribute by looking at the closest previous div with this attribute
                        if not head_level:
                            for div in parent_div.find_previous_siblings("div"):
                                if div.find("head") and div.find("head").get("n"):
                                    head_level = div.find("head").get("n")
                                    break
                        
                        #Match with a main section if it was a subsection (3.2 -> 3) and get the section name
                        if head_level:
                            head_text = sections[head_level[0]]
                        else:
                            head_text = parent_div.find("head").text.lower()
                        
                        
                        if any([kw in head_text for kw in lst_keywords]):
                            to_append = "In Method"
                        else:
                            to_append = "Elsewhere"
                        res.append([paper_name,ds,to_append,True])
                

                #To detect in figures or tables, for which parent element is <figure>
                parent_figure = elem.find_parent("figure")
                if parent_figure:
                    if parent_figure.get("type") == "table":
                        res.append([paper_name,ds,"In Table",True])                        
                    else:
                        res.append([paper_name,ds,"In Figure",True])
                        

                #To detect footnotes for which parent element is <note>
                parent_footnote = elem.find_parent("note")
                if parent_footnote:
                    if parent_footnote.get("place") == "foot":
                        res.append([paper_name,ds,"In Footnote",True])

In [6]:
df_res = pd.DataFrame(res,columns=["doc_name","label1","label2","value"])
df_res = df_res.drop_duplicates(["doc_name","label1","label2"])
df_res.to_csv("../../Results/extraction/grobid_fulltext_detection.csv",index=False)

## Example of regex used for detection

In [8]:
ds = "MIMIC"
ds_urls = [f"((https://)?{re.escape(url.removeprefix('https://').removeprefix("http://"))}(/)?)" for url in datasets_info[ds]['url']]
ds_name_aliases = [f"({re.escape(a)})" for a in datasets_info[ds]["aliases"]]
            

ds_name_aliases_url_regex = "|".join(ds_name_aliases+ds_urls)
regex = re.compile(f"(?<![^_\\W])({ds_name_aliases_url_regex})(?![^_\\s\\d\\.\\),'])")
regex

re.compile(r"(?<![^_\W])((MIMIC)|(MIMIC\-II)|(MIMIC\-III)|(MIMIC\-CXR)|((https://)?archive\.physionet\.org/physiobank/database/mimicdb(/)?)|((https://)?archive\.physionet\.org/physiobank/database/mimic2cdb(/)?)|((https://)?physionet\.org/content/mimiciii/1\.4(/)?)|((https://)?physionet\.org/content/mimic\-cxr/2\.0\.0(/)?)|((https://)?www\.physionet\.org/content/mimic\-cxr\-jpg/2\.0\.0(/)?))(?![^_\s\d\.\),'])",
           re.UNICODE)

# Reference matching

In [21]:
#Get the results of grobid parsing
xml_paths = glob.glob("../../Results/extraction/grobid_extraction/*")

#res will contain an element (dict) per paper, in each element keys will be datasets' name and value boolean indicating a citation
res = []
for i,path in enumerate(xml_paths):
    paper_name = path.removeprefix("../../Results/extraction/grobid_extraction/").removesuffix(".grobid.tei.xml").removesuffix(".txt")
    
    #Check that the result is an xml, otherwise grobid had an error during the parsing and generate a .txt file
    if path.endswith(".xml"):
        #Load the xml file with BeautifulSoup for parsing
        with open(path) as fp:
            soup = bs.BeautifulSoup(fp,features="xml")
          
        #Search for the mention of every dataset one by one
        for ds in datasets_info:
            #Construct the regex for matching name or aliases, example for ACDC: (?<![^_\W])(ACDC)|(Automated Cardiac Diagnosis Challenge)|(AC17)(?![^_\s\d\.\),'])
            ds_title = re.escape(datasets_info[ds]["title"])
            if soup.find(string=re.compile(ds_title,re.IGNORECASE)):
                res.append((paper_name,ds,True))
            else:
                res.append((paper_name,ds,False))
    

In [22]:
df_res = pd.DataFrame(res,columns=["doc_name","label1","value"])
df_res.to_csv("../../Results/extraction/grobid_reference_detection.csv",index=False)