# Create Dataset
This notebook builds the dataset of abstracts used for task classification.
For each paper, the plain-text abstract is reconstructed from the OpenAlex `abstract_inverted_index` field, and the paper is labelled with the task associated with the dataset it references.

# Using the list of papers that reference a dataset, with OpenAlex

In [1]:
import pandas as pd
import requests
import numpy as np
import csv

# Dictionary with the dataset name as key and its context (organ) as value (e.g. ACDC: Cardiac)
datasets_context = {}
# A context manager guarantees the CSV handle is closed once the mapping is built
with open('../../data/datasets.csv', newline="") as datasets_file:
    for ds in csv.DictReader(datasets_file):
        datasets_context[ds["name"]] = ds["context"]

# Table loaded from paper_openalex.csv, kept as a DataFrame for later inspection
df = pd.read_csv("../../results/extracted_csv/paper_openalex.csv")

In [4]:
# NOTE(review): the DOI is appended to this prefix with no separator; presumably the
# doi column of paper_openalex.csv already starts with the separator OpenAlex expects
# (e.g. ":10.xxxx/...") - confirm against the CSV.
query_url = "https://api.openalex.org/works/doi"

# Maximum number of word positions kept when rebuilding an abstract; anything beyond is truncated
max_abstract_len = 1000

# Dictionary with doi as key and tuple (abstract, task) as value
paper_abstract = {}
# (doi, task) pairs already written, so each pair appears only once in the output
written_pairs = set()

with open("../../results/extracted_csv/paper_openalex.csv", "r", newline="") as paper_csv:
    with open("../../data/abstract_dataset.csv", "w", newline="") as abstract_file:
        # csv.writer quotes fields when needed, so titles containing commas or quotes
        # no longer shift the columns of the output file
        writer = csv.writer(abstract_file, lineterminator="\n")
        writer.writerow(["title", "doi", "publication_year", "abstract", "task"])

        rows = csv.reader(paper_csv)
        header = next(rows, None)  # skip the header line of the input file
        for row in rows:
            if not row:
                continue
            # Column layout implied by how the fields are written out below:
            # title, doi, publication year, ..., dataset name (last column)
            title, doi, year, dataset = row[0], row[1], row[2], row[-1]

            organ = datasets_context.get(dataset)
            if organ is None:
                # Unknown dataset: report it and skip the row instead of aborting the whole run
                print(f"Unknown dataset {dataset!r} for {doi}, row skipped")
                continue

            # Only query OpenAlex the first time a paper is met; a failed lookup is not
            # cached, so it is retried if the same paper shows up again
            if doi not in paper_abstract:
                r = requests.get(query_url + doi)
                if r.status_code != 200:
                    continue
                inverted_index = r.json().get("abstract_inverted_index")
                if not inverted_index:
                    continue

                # "abstract_inverted_index" maps each word to the positions where it occurs;
                # place every (cleaned, lower-cased) word back at its positions
                words = {}
                for w, positions in inverted_index.items():
                    cleaned = ''.join(filter(str.isalnum, w)).lower()
                    for position in positions:
                        if position < max_abstract_len:
                            words[position] = cleaned
                # Join in position order, dropping words that became empty after cleaning
                str_abstract = ' '.join(words[p] for p in sorted(words) if words[p])
                paper_abstract[doi] = (str_abstract, organ)

            # Write one row per distinct (paper, task) pair, always with the publication year
            if (doi, organ) not in written_pairs:
                written_pairs.add((doi, organ))
                writer.writerow([title, doi, year, paper_abstract[doi][0], organ])

## Using the list of papers that cite a dataset or an organ in their abstract

In [13]:
# Lookup table read from keywords.csv: maps each "keyword" value to its "organ" value.
# If a keyword appears more than once, the last row wins.
with open("../../data/keywords.csv",'r') as f:
    keyword_dict = {entry["keyword"]: entry["organ"] for entry in csv.DictReader(f)}

In [None]:
query_url = "https://api.openalex.org/works"

# Maximum number of word positions kept per abstract; longer abstracts are truncated
max_abstract_len = 2500

with open('../../results/classification/raw_abstract.csv', "w") as abstract_file:
    abstract_file.write('title,doi,publication_year,abstract,task')
    for year in range(2014, 2024):
        print(year)
        # Cursor paging is used instead of page numbers: OpenAlex basic paging stops at
        # 10,000 results, which would silently truncate a year with more matches.
        cursor = "*"
        while cursor:
            # Papers tagged with both concept ids below (described by the original author as
            # "Machine Learning" and "Segmentation"), having an abstract, published in `year`
            query_param = {
                'cursor': cursor,
                'per-page': 200,
                'filter': f"concepts.id:C89600930,concepts.id:C154945302,has_abstract:true,publication_year:{year}",
            }
            r_year = requests.get(query_url, params=query_param)
            if r_year.status_code != 200:
                # Report the failure instead of silently ending the year
                print(f"Request for {year} failed with status {r_year.status_code}, moving on to the next year")
                break

            r_json = r_year.json()
            # An empty "results" field means the end of the current year has been reached
            if not r_json["results"]:
                break

            for paper in r_json["results"]:
                # "abstract_inverted_index" maps each word to the positions where it occurs
                # in the abstract; put every cleaned word back at its positions
                inverted_index = paper["abstract_inverted_index"] or {}
                words = {}
                for w, positions in inverted_index.items():
                    cleaned = ''.join(filter(str.isalnum, w)).lower()
                    for position in positions:
                        if position < max_abstract_len:
                            words[position] = cleaned
                # Join in position order, dropping words that became empty after cleaning
                str_abstract = ' '.join(words[p] for p in sorted(words) if words[p])

                # Remove "," and "\n" from the title, they would break the result csv
                title = paper["title"]
                if title:
                    title = title.replace(",", "").replace("\n", "")

                # Keywords are matched as substrings of the abstract. Several keywords can
                # map to the same organ, so keep each organ once (in first-match order).
                organs = dict.fromkeys(
                    keyword_dict[k] for k in keyword_dict if k in str_abstract
                )
                for organ in organs:
                    abstract_file.write(f'\n{title},{paper["doi"]},{paper["publication_year"]},"{str_abstract}",{organ}')

            # The cursor is null once the last page has been served
            cursor = r_json.get("meta", {}).get("next_cursor")

In [4]:
# Several keywords can point to the same organ, so the raw csv may hold the same paper
# more than once for one task; keep only the first row of each (title, task) pair.
# NOTE(review): to_csv is called with its defaults, so the DataFrame index is also
# written as an extra unnamed first column - confirm downstream readers expect it.
raw_abstract_path = "../../results/classification/raw_abstract.csv"
df = pd.read_csv(raw_abstract_path).drop_duplicates(subset=['title','task'])
df.to_csv("../../data/abstract_dataset.csv")

## Manually selected non-relevant papers, added to the dataset so that the model can learn to detect unrelated papers

In [6]:
# NOTE(review): the DOI is appended to this prefix with no separator; presumably the
# first column of paper_other.csv already starts with the separator OpenAlex expects
# (e.g. ":10.xxxx/...") - confirm against the CSV.
query_url = "https://api.openalex.org/works/doi"

# Maximum number of word positions kept when rebuilding an abstract; anything beyond is truncated
max_abstract_len = 1000

with open("../../data/paper_other.csv", "r") as paper_csv:
    with open("../../data/abstract_dataset_nonrelevant.csv", "w") as abstract_file:
        abstract_file.write("title,doi,publication_year,abstract,task")
        for row in csv.reader(paper_csv):
            # Blank lines come back as empty lists
            if not row:
                continue
            doi = row[0]

            r = requests.get(query_url + doi)
            if r.status_code != 200:
                continue
            r_json = r.json()
            inverted_index = r_json["abstract_inverted_index"]
            if not inverted_index:
                continue

            # "abstract_inverted_index" maps each word to the positions where it occurs;
            # place every (cleaned, lower-cased) word back at its positions
            words = {}
            for w, positions in inverted_index.items():
                cleaned = ''.join(filter(str.isalnum, w)).lower()
                for position in positions:
                    if position < max_abstract_len:
                        words[position] = cleaned
            # Join in position order, dropping words that became empty after cleaning
            str_abstract = ' '.join(words[p] for p in sorted(words) if words[p])

            # Remove "," and "\n" from the title, they would break the result csv;
            # the title may be missing, in which case it is written as-is
            title = r_json["title"]
            if title:
                title = title.replace(",", "").replace("\n", "")

            # Every row carries the five header columns; all these papers get the "Other" task
            publication_year = r_json.get("publication_year", "")
            abstract_file.write(f'\n{title},{doi},{publication_year},"{str_abstract}",Other')