# Fusion sources

The goal of this notebook is to merge the information extracted from the different sources (such as OpenAlex or OpenCitations) into the final output of the process: a single file containing every paper-dataset-task association, ready for further analysis.

## How does it proceed?
1. Load every generated CSV present in the extracted_csv folder (paper_dimensions.csv is excluded)
2. For each line in a csv:
    1. Test if the association DOI-Dataset has already been seen
    2. If not, create a new line in merged.csv, adding the task information found in the "context" field of datasets.csv


In [1]:
import glob
import csv
import numpy as np
import pandas as pd
import requests
from itertools import product
import random
random.seed(1907)

In [13]:
# List every per-source CSV in ../extracted_csv, leaving out paper_dimensions.csv.
# Filtering (rather than list.remove) avoids a ValueError when that file is
# absent, and matching on the file name works whichever path separator glob
# returns.
extracted_csv = [
    csv_path
    for csv_path in glob.glob("../extracted_csv/*.csv")
    if not csv_path.endswith(("/paper_dimensions.csv", "\\paper_dimensions.csv"))
]
# Bare expression: displays the list as the notebook cell output.
extracted_csv

['../extracted_csv/paper_poci.csv',
 '../extracted_csv/paper_openalex.csv',
 '../extracted_csv/paper_coci.csv']

In [15]:
# Dictionary with the dataset's name as key and its task as value
# (e.g. ACDC: cardiac), read from the "name" and "context" columns.
with open('../data/datasets.csv', newline='') as datasets_file:
    ds_reader = csv.DictReader(datasets_file)
    datasets_context = {ds["name"]: ds["context"] for ds in ds_reader}

# Per-dataset lists of the DOIs / titles already written to the merged file,
# used to avoid adding the same paper-dataset association twice.
doi_using_dataset = {ds: [] for ds in datasets_context}
title_using_dataset = {ds: [] for ds in datasets_context}

# Get the list of CSVs from the different sources, leaving out
# paper_dimensions.csv. Filtering (rather than list.remove) avoids a
# ValueError when that file is absent and works with either path separator.
extracted_csv = [
    path
    for path in glob.glob("../extracted_csv/*.csv")
    if not path.endswith(("/paper_dimensions.csv", "\\paper_dimensions.csv"))
]

# Dictionary with the paper DOI as key and, as value, one boolean per source
# CSV (True when that source returned the paper) followed by the venue and
# the publication year.
sources_coverage = {}

# Paper title as key and DOI as value; used for papers that appear under
# several DOIs so the coverage analysis credits a single entry.
name_doi = {}

# URL prefix used to request a DOI from OpenAlex in order to get the venue
base_url_venue = "https://api.openalex.org/works/https://doi.org/"


def _fetch_venue(doi):
    """Return the venue display name OpenAlex reports for *doi*, or None.

    The lookup is best-effort: a network failure, a non-200 answer, an
    unparsable body or a missing field all yield None instead of aborting
    the whole merge.
    """
    try:
        response = requests.get(base_url_venue + doi, timeout=30)
    except requests.RequestException:
        return None
    if response.status_code != 200:
        return None
    try:
        work = response.json()
    except ValueError:
        return None

    venue = (work.get("host_venue") or {}).get("display_name")
    if venue is None:
        # NOTE(review): OpenAlex documents "host_venue" as deprecated in
        # favour of "primary_location.source" -- confirm against the current
        # Work object schema.
        source = (work.get("primary_location") or {}).get("source") or {}
        venue = source.get("display_name")
    return venue


# /!\ WARNING /!\: If merged2.csv already exists, its previous content will be removed!
with open("../processed_csv/merged2.csv", "w", newline="") as merged:
    # csv.writer quotes fields containing commas or quotes (frequent in paper
    # titles), which a hand-built comma-joined line would silently corrupt.
    merged_writer = csv.writer(merged, lineterminator="\n")
    merged_writer.writerow(
        ["name", "DOI", "publication_year", "dataset_used", "task"]
    )
    for i, csv_path in enumerate(extracted_csv):
        print(csv_path)
        with open(csv_path, newline="") as source_file:
            for line in csv.DictReader(source_file):
                dataset = line['dataset_used']
                doi = line['DOI'].lower()
                name = line['name'].lower()

                # Has this paper already been recorded for this dataset,
                # under either the same DOI or the same title?
                is_new_doi = doi not in doi_using_dataset[dataset]
                is_new_title = name not in title_using_dataset[dataset]

                if is_new_doi and is_new_title:
                    doi_using_dataset[dataset].append(doi)
                    title_using_dataset[dataset].append(name)

                    merged_writer.writerow([
                        name,
                        doi,
                        line['publication_year'].lower(),
                        dataset,
                        datasets_context[dataset],
                    ])

                    # A DOI can already be known through another dataset:
                    # keep its existing coverage flags (and skip a second
                    # venue request) instead of resetting them.
                    if doi not in sources_coverage:
                        venue = _fetch_venue(line['DOI'])
                        sources_coverage[doi] = (
                            [False] * len(extracted_csv)
                            + [venue, line['publication_year']]
                        )
                    sources_coverage[doi][i] = True
                    name_doi[name] = doi

                elif name in name_doi:
                    # Same title under another DOI: credit the first DOI seen.
                    sources_coverage[name_doi[name]][i] = True
                elif doi in sources_coverage:
                    sources_coverage[doi][i] = True

../extracted_csv/paper_poci.csv
../extracted_csv/paper_openalex.csv


## Study of coverage
Analyse the results from the different sources and compare them with each other.

In [11]:
# Column labels: one per source CSV, followed by the two metadata columns.
# The [23:-4] slice drops the 23-character "../extracted_csv/paper_" prefix
# and the ".csv" suffix from each path.
columns_name = [csv_path[23:-4] for csv_path in extracted_csv]
columns_name += ["venue", "publication_year"]

# One row per DOI, one column per entry of the coverage lists.
df = pd.DataFrame.from_dict(sources_coverage).T
df.index.name = 'DOI'
df.columns = columns_name
df.to_csv("../processed_csv/coverage.csv")

# Every True/False pattern over the source columns (all columns except the
# last two), ordered by how many sources are True, then numbered in that
# order; the key of each code is the string form of the pattern array.
possibleCombination = sorted(
    np.array(list(product([True, False], repeat=df.shape[1] - 2))),
    key=sum,
)
possibleCombination_code = {
    str(pattern): code for code, pattern in enumerate(possibleCombination)
}


def compute_combination(row):
    """Return the code of the source combination found in *row*.

    The leading ``df.shape[1] - 2`` values of *row* are taken as the
    per-source flags (``df`` is the module-level frame, read at call time),
    and their NumPy string form is looked up in ``possibleCombination_code``.

    Raises:
        KeyError: if the flags do not match any key of
            ``possibleCombination_code``.
    """
    # Take the values by position through iteration: row[i] on a
    # label-indexed Series relies on a deprecated positional fallback.
    flags = list(row)[:df.shape[1] - 2]
    combination = np.array(flags)
    return possibleCombination_code[str(combination)]


# Tag every row with its combination code, then order the rows from the
# highest code to the lowest.
df["combination"] = df.apply(compute_combination, axis="columns")
df = df.sort_values(by='combination', ascending=False)

def color_background(row):
    """Return one CSS declaration string per cell of *row*.

    Cells equal to ``True`` are painted green and every other cell red. The
    text colour is set to the background colour, so the cell text itself is
    not visible.
    """
    found_css = 'background-color: green; color: green'
    missing_css = 'background-color: red; color: red'

    styles = []
    for cell in row:
        # Equality rather than identity, so values that merely compare equal
        # to True (e.g. NumPy booleans) are painted green as well.
        styles.append(found_css if cell == True else missing_css)
    return styles
# Style every column except the last one, colouring only the per-source
# columns (all labels but the final two), and write the result as HTML.
df_html = (
    df.iloc[:, :-1]
    .style
    .apply(color_background, subset=columns_name[:-2])
)
# NOTE(review): col_space is passed through as-is; confirm that
# Styler.to_html actually honours it.
df_html.to_html("../res/coverage.html", col_space='100px')