# Fusion sources

The goal of this file is to merge the information extracted from the different sources, such as OpenAlex or OpenCitations, in order to generate the final output of the process, which contains all the paper-dataset-task associations and allows further analysis afterwards.
It also computes the "coverage" of each paper, which tells whether a given source has returned that paper or not; this information will be used to investigate why some references are missing and to compare the sources.

## How does this proceed ?
1. Load every generated csv present in the extracted_csv folder (except paper_dimensions.csv, which is skipped for time reasons)
2. For each line in a csv:
    1. Test if the association DOI-Dataset has already been seen
    2. If not, create a new line in merged.csv, adding the task information found in the "context" field of datasets.csv


In [1]:
import glob
import csv
import pandas as pd
import requests
from itertools import product

In [6]:
# Dictionary with the dataset's name as key and its task as value
# (e.g. ACDC: cardiac), read from the "name" and "context" columns.
# The file is opened in a `with` block so the handle is closed once read.
with open('../../data/datasets.csv', newline='') as datasets_file:
    datasets_context = {
        ds["name"]: ds["context"] for ds in csv.DictReader(datasets_file)
    }

# Dictionaries keeping track, per dataset, of the DOIs / titles already
# encountered so that duplicates are not added to the merged output.
doi_using_dataset = {ds: [] for ds in datasets_context}
title_using_dataset = {ds: [] for ds in datasets_context}

# List of the csv files produced by the different sources, except the
# Dimensions one, which is skipped for time reasons.
# Filtering on the file name (instead of list.remove on a hard-coded path)
# does not fail when that file is absent or when glob returns a different
# path separator.
extracted_csv = [
    path
    for path in glob.glob("../../results/extracted_csv/*.csv")
    if path.replace("\\", "/").rsplit("/", 1)[-1] != "paper_dimensions.csv"
]

# Dictionary with the paper DOI as key and a list as value: one boolean per
# source (True if the source returned that paper, False otherwise).
sources_coverage = {}

# Paper titles as keys and DOI as value, used for papers with multiple DOIs
# when doing the coverage analysis.
name_doi = {}

# URL prefix used to request a DOI from OpenAlex in order to get the venue.
base_url_venue = "https://api.openalex.org/works/https://doi.org/"

def _fetch_venue_and_oa(doi):
    """Ask OpenAlex for the venue name and open-access flag of a DOI.

    Returns a ``(venue, is_oa)`` tuple; both values are None when the request
    fails, does not answer with HTTP 200, or the field is missing.
    """
    try:
        response = requests.get(base_url_venue + doi, timeout=30)
    except requests.RequestException as error:
        # A network problem on one paper should not abort the whole merge.
        print(f"OpenAlex request failed for {doi!r}: {error}")
        return None, None
    if response.status_code != 200:
        return None, None

    work = response.json()
    # "host_venue" may be null or absent; fall back to primary_location.source.
    venue = (work.get("host_venue") or {}).get("display_name")
    if venue is None:
        source = (work.get("primary_location") or {}).get("source") or {}
        venue = source.get("display_name")
    is_oa = (work.get("open_access") or {}).get("is_oa")
    return venue, is_oa


# /!\ WARNING /!\: If merged.csv already exists, its previous content will be removed!
with open("../../results/processed_csv/merged.csv", "w", newline="") as merged:
    # csv.writer quotes fields when needed, so titles containing commas or
    # quotes no longer break the columns of merged.csv.
    writer = csv.writer(merged, lineterminator="\n")
    writer.writerow(["name", "DOI", "publication_year", "dataset_used", "task"])

    for i, csv_path in enumerate(extracted_csv):
        print(csv_path)
        with open(csv_path, newline="") as source_file:
            for line in csv.DictReader(source_file):
                doi = line['DOI'].lower()
                name = line['name'].lower()
                dataset = line['dataset_used']

                # Check if we've already seen this paper before (from any
                # source) for the same dataset, either by DOI or by title.
                doi_not_encountered = doi not in doi_using_dataset[dataset]
                title_not_encountered = name not in title_using_dataset[dataset]

                # If not, add a line to the output csv and make sure the
                # paper has a coverage row.
                if doi_not_encountered and title_not_encountered:
                    doi_using_dataset[dataset].append(doi)
                    title_using_dataset[dataset].append(name)

                    writer.writerow([
                        name,
                        doi,
                        line['publication_year'].lower(),
                        dataset,
                        datasets_context[dataset],
                    ])

                    # A DOI can be new for this dataset but already known
                    # through another dataset: keep its existing coverage row
                    # (and skip the extra API call) instead of resetting the
                    # flags recorded for the previous sources.
                    if doi not in sources_coverage:
                        venue, is_oa = _fetch_venue_and_oa(line['DOI'])
                        sources_coverage[doi] = (
                            [False] * len(extracted_csv)
                            + [venue, line['publication_year'], is_oa]
                        )
                    sources_coverage[doi][i] = True
                    name_doi[name] = doi

                # Otherwise just record that this source also returned the paper.
                elif name in name_doi:
                    sources_coverage[name_doi[name]][i] = True
                elif doi in sources_coverage:
                    sources_coverage[doi][i] = True

../extracted_csv/paper_poci.csv
../extracted_csv/paper_openalex.csv
../extracted_csv/paper_coci.csv


### Generation of coverage.csv

In [7]:
from pathlib import Path

# Build the coverage.csv header: one column per source, named after its csv
# file ("paper_poci.csv" -> "poci"), followed by the metadata columns stored
# at the end of every sources_coverage row.
# The name is derived from the file name rather than from a fixed character
# offset, so it stays correct whatever directory the csv files live in.
columns_name = []
for e in extracted_csv:
    stem = Path(e).stem
    columns_name.append(stem[len("paper_"):] if stem.startswith("paper_") else stem)
columns_name += ["venue", "publication_year", "is_oa"]

# One row per DOI; orient="index" builds the rows directly from the dict and
# also works when no paper was collected.
df = pd.DataFrame.from_dict(sources_coverage, orient="index", columns=columns_name)
df.index.name = 'DOI'
df.to_csv("../../results/processed_csv/coverage.csv")