# Fusion sources

The goal of this file is to merge the information extracted from the different sources (such as OpenAlex or OpenCitations) and generate the final output of the process: a single file containing every paper-dataset-task association, ready for further analysis.

## How does this proceed?
1. Load every generated csv present in the extracted_csv folder
2. For each line in a csv:
    1. Test whether the DOI-Dataset association has already been seen
    2. If not, add a new line to merged.csv, enriched with the task found in the "context" field of datasets.csv


In [1]:
import glob
import csv
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from matplotlib.colors import LinearSegmentedColormap
import pandas as pd

In [10]:
# Dictionary with the dataset's name as key and its task as value (e.g. ACDC:cardiac)
datasets_context = {}
# newline='' is what the csv module expects for files it parses; the
# with-block makes sure the handle is closed once the rows are read.
with open('../data/datasets.csv', newline='') as datasets_file:
    for ds in csv.DictReader(datasets_file):
        datasets_context[ds["name"]] = ds["context"]

# For each dataset, the DOIs / titles already written to merged.csv
# (kept as lists, in first-seen order, for any later cell that reads them).
doi_using_dataset = {ds: [] for ds in datasets_context}
title_using_dataset = {ds: [] for ds in datasets_context}

# Set mirrors of the two dicts above, used only for O(1) membership tests.
seen_dois = {ds: set() for ds in datasets_context}
seen_titles = {ds: set() for ds in datasets_context}

# Get the list of csv from the different sources.
# glob returns paths in arbitrary order; sorting keeps the deduplication
# winner and the coverage column order stable from one run to the next.
extracted_csv = sorted(glob.glob("../extracted_csv/*.csv"))

# Dictionary with paper DOI as key and, as value, one boolean per source csv
# (same order as extracted_csv) telling whether that source returned the paper
sources_coverage = {}

# Title of papers as keys and DOI as value, used for papers with multiple DOIs
# when doing the coverage analysis
name_doi = {}

# /!\ WARNING /!\: If merged.csv already exists, its previous content is removed!
with open("../processed_csv/merged.csv", "w", newline='') as merged:
    # csv.writer quotes fields containing commas, quotes or newlines; paper
    # titles frequently do, and raw string concatenation would corrupt the row.
    merged_writer = csv.writer(merged, lineterminator="\n")
    merged_writer.writerow(["name", "DOI", "publication_year", "dataset_used", "task"])

    for i, csv_path in enumerate(extracted_csv):
        with open(csv_path, newline='') as source_file:
            for line in csv.DictReader(source_file):
                dataset = line['dataset_used']
                doi = line['DOI']
                title = line['name']

                # A row is new only if neither its DOI nor its title has
                # already been recorded for this dataset.
                is_new_doi = doi not in seen_dois[dataset]
                is_new_title = title not in seen_titles[dataset]

                if is_new_doi and is_new_title:
                    seen_dois[dataset].add(doi)
                    seen_titles[dataset].add(title)
                    doi_using_dataset[dataset].append(doi)
                    title_using_dataset[dataset].append(title)

                    merged_writer.writerow([
                        title,
                        doi,
                        line['publication_year'],
                        dataset,
                        datasets_context[dataset],
                    ])

                    # setdefault: the same DOI can show up again for another
                    # dataset; re-creating the list would erase the flags
                    # already recorded for earlier sources.
                    coverage = sources_coverage.setdefault(
                        doi, [False] * len(extracted_csv)
                    )
                    coverage[i] = True
                    name_doi[title] = doi

                elif title in name_doi:
                    # Known title (possibly under another DOI): credit this
                    # source on the DOI the title was first registered with.
                    sources_coverage[name_doi[title]][i] = True

                elif doi in sources_coverage:
                    # Known DOI whose title differs from the registered one:
                    # this source still returned the paper.
                    sources_coverage[doi][i] = True

## Study of coverage
Analyze the results from the different sources and compare them with each other.

In [9]:

# Apply the seaborn theme / font scale BEFORE anything is drawn. Setting it
# after the heatmap only affects later plots, so the first run of this cell
# and every re-run would otherwise produce differently styled figures.
sns.set(font_scale=2)

# One row per DOI, one boolean column per source csv.
df = pd.DataFrame.from_dict(sources_coverage, orient="index", columns=extracted_csv)
df.index.name = 'DOI'
df.to_csv("../processed_csv/coverage.csv")

# Two-colour map for the heatmap: red at the low end (False), green at the
# high end (True).
cmap = LinearSegmentedColormap.from_list(
    name='test',
    colors=['red', 'green'],
)

# Very tall figure: one heatmap row per paper.
plt.figure(figsize=(10, 650))
plt.title("Presence of a paper in the API's response")
# Label every source column, but only every 5th DOI on the y axis.
sns.heatmap(df, cbar=False, cmap=cmap, xticklabels=1, yticklabels=5)
plt.savefig('../res/coverage.png', bbox_inches='tight')

# Release the figure once it has been written to disk.
plt.close()