# Evaluating the sensitivity of the Zenodo and Dryad APIs

In [190]:
import json
import os

import pandas as pd
import regex as re
import requests

In [191]:
def extract_dryad(entry, i_query):
    """Flatten one Dryad search hit into a row of values.

    Parameters
    ----------
    entry : dict
        One dataset record from a Dryad search response. It must contain
        ``_links.self.href``, ``title``, ``abstract`` and ``publicationDate``;
        ``keywords`` is optional.
    i_query : hashable
        Identifier of the query that produced this hit; stored as the last
        element of the row.

    Returns
    -------
    list
        ``[href, title, abstract, keywords, publicationDate, i_query]`` where
        ``keywords`` is the keyword list joined with ``"_"``.

    Raises
    ------
    KeyError
        If one of the mandatory fields is missing from ``entry``.
    """
    # Always return a string for the keywords so that the resulting column has
    # a single type: an entry without keywords yields "" rather than a list.
    keywords = "_".join(entry.get("keywords") or [])
    return [
        entry['_links']['self']['href'],
        entry["title"],
        entry['abstract'],
        keywords,
        entry["publicationDate"],
        i_query,
    ]

def extract_zenodo(entry, i_query):
    """Flatten the ``metadata`` of one Zenodo search hit into a row of values.

    Optional fields (``keywords``, ``method``, ``notes``, ``locations``,
    ``related_identifiers``) fall back to an empty string when absent.
    List-valued fields are joined with ``"; "``.

    Parameters
    ----------
    entry : dict
        One hit of a Zenodo search response; only ``entry["metadata"]`` is read.
    i_query : hashable
        Identifier of the query that produced this hit; stored as the last
        element of the row.

    Returns
    -------
    list
        ``[doi, title, description, method, notes, keywords, locations,
        publication_date, cited_articles, i_query]``.
    """
    meta = entry["metadata"]

    keyword_text = "; ".join(meta.get("keywords", []))
    method_text = meta.get("method", "")
    notes_text = meta.get("notes", "")

    # Each location is a dict; only its "place" label is kept.
    place_text = "; ".join(loc["place"] for loc in meta.get("locations", []))

    # Every related identifier is turned into a doi.org link.
    citation_text = "; ".join(
        "https://doi.org/" + related["identifier"]
        for related in meta.get("related_identifiers", [])
    )

    row = [meta['doi'], meta["title"], meta['description']]
    row += [method_text, notes_text, keyword_text, place_text]
    row += [meta["publication_date"], citation_text, i_query]
    return row

def retrieve_zenodo(queries, access_token=None):
    """Search Zenodo datasets for each query and collect the hits in a DataFrame.

    Parameters
    ----------
    queries : dict
        Maps a query id to a query string whose terms are separated by
        ``"AND "`` (e.g. ``"Quebec AND occurrence AND species"``). Each
        ``"AND "`` is rewritten to ``"+"`` and a leading ``"+"`` is added
        before the string is sent as the ``q`` parameter.
    access_token : str, optional
        Zenodo personal access token. When omitted, the value of the
        ``ZENODO_ACCESS_TOKEN`` environment variable is used; if that is not
        set either, the request is sent without a token.

    Returns
    -------
    pandas.DataFrame
        One row per hit, with the fields produced by ``extract_zenodo`` plus a
        ``source`` column set to ``"zenodo"``. ``url`` holds the record DOI
        prefixed with ``https://doi.org/``. The frame is empty (but has all
        columns) when no query returns a hit.

    Raises
    ------
    requests.HTTPError
        If Zenodo answers with an error status.
    """
    columns = ["url", "title", "description", "method", "notes", "keywords",
               "locations", "publication_date", "cited_articles", "id_query"]

    # Credentials must never live in the notebook: take the token from the
    # caller or from the environment instead of a hardcoded literal.
    if access_token is None:
        access_token = os.environ.get("ZENODO_ACCESS_TOKEN")

    rows = []
    for i, q in queries.items():
        params = {'q': "+" + q.replace('AND ', "+"),
                  "type": "dataset",
                  "size": 1000}
        if access_token:
            params['access_token'] = access_token
        response = requests.get('https://zenodo.org/api/records', params=params)
        # Fail with the HTTP error rather than an obscure KeyError below.
        response.raise_for_status()
        # Decode the JSON body once instead of once per hit.
        hits = response.json()["hits"]["hits"]
        rows.extend(extract_zenodo(entry, i) for entry in hits)

    # Passing the column names here keeps an empty result set from raising.
    df = pd.DataFrame(rows, columns=columns)
    df["source"] = "zenodo"
    df['url'] = df['url'].apply(lambda doi: "https://doi.org/" + doi)
    return df


def retrieve_dryad(queries):
    """Search Dryad for each query, walk all result pages and build a DataFrame.

    Parameters
    ----------
    queries : dict or sequence
        Query strings. For a dict the keys are used as query ids; for a
        sequence the positions are used.

    Returns
    -------
    pandas.DataFrame
        One row per hit, with the fields produced by ``extract_dryad`` plus a
        ``source`` column set to ``"dryad"``. ``url`` is rewritten from the API
        path of the dataset to its ``https://doi.org/10.5061/...`` link. The
        frame is empty (but has all columns) when no query returns a hit.

    Raises
    ------
    requests.HTTPError
        If Dryad answers with an error status.
    """
    base_url = "https://datadryad.org/api/v2/search"
    columns = ["url", "title", "content", "keywords", "publication_date", "id_query"]

    # Accept both {id: query} mappings and plain sequences of queries.
    if hasattr(queries, "items"):
        id_query_pairs = list(queries.items())
    else:
        id_query_pairs = list(enumerate(queries))

    rows = []
    for i_query, query in id_query_pairs:
        # Dryad numbers result pages from 1; the "last" link of the first
        # response gives the number of the final page.
        page, n_last = 1, 1
        while page <= n_last:
            # Let requests encode the query (spaces, accents) instead of
            # formatting the URL by hand.
            response = requests.get(base_url, params={'q': query, 'page': page})
            response.raise_for_status()
            payload = response.json()

            if page == 1:
                last_href = payload.get('_links', {}).get('last', {}).get('href', '')
                match = re.search(r'[?&]page=([0-9]+)', last_href)
                if match:
                    n_last = int(match.group(1))

            # A response without embedded datasets contributes no rows.
            for entry in payload.get('_embedded', {}).get("stash:datasets", []):
                rows.append(extract_dryad(entry, i_query))
            page += 1

    # Passing the column names here keeps an empty result set from raising.
    df = pd.DataFrame(rows, columns=columns)
    df['url'] = df['url'].apply(
        lambda href: href.replace("/api/v2/datasets/doi%3A10.5061%2F", "https://doi.org/10.5061/"))
    df["source"] = "dryad"
    return df

Let's create example queries. We will compare the results returned by the Zenodo and Dryad APIs for queries 0 and 1 ("occurrence" and "occurrences"), queries 0 and 2 ("Quebec" and "Québec") and queries 3 and 4 ("inventory" and "inventories").

In [192]:
# Query variants, keyed by the query id used throughout the notebook.
queries = {
    0: "Quebec AND occurrence AND species",   # reference query
    1: "Quebec AND occurrences AND species",  # plural of "occurrence"
    2: "Québec AND occurrence AND species",   # accented place name
    3: "Quebec AND inventory AND species",    # singular "inventory"
    4: "Quebec AND inventories AND species",  # plural "inventories"
}

# Testing the sensitivity to singular/plural forms

## Zenodo

In [193]:
# Run every query against the Zenodo API and gather all hits in one DataFrame.
df_zenodo = retrieve_zenodo(queries)

In [194]:
# Preview the first rows of the Zenodo results.
df_zenodo.head()

Unnamed: 0,url,title,description,method,notes,keywords,locations,publication_date,cited_articles,id_query,source
0,https://doi.org/10.5061/dryad.24rj8,"Data from: Aspicilia bicensis (Megasporaceae),...",Aspicilia bicensis is described as new to scie...,,"<div class=""o-metadata__file-usage-entry"">data...",Anthropocene; Aspicilia; sterile crustose lich...,Canada; Quebec; Boreal Forest; North America,2016-12-23,https://doi.org/10.1639/0007-2745-119.1.008,0,zenodo
1,https://doi.org/10.5061/dryad.q810f,Data from: Ecological and evolutionary diversi...,The concept of limiting similarity predicts th...,,"<div class=""o-metadata__file-usage-entry"">Supp...",nuclear ribosomal spacers; Tertiary and Quater...,Central Quebec-Labrador peninsula,2017-04-29,https://doi.org/10.1600/036364416x692514,0,zenodo
2,https://doi.org/10.5061/dryad.t11f5,Data from: Habitat-based polymorphism is commo...,1. Morphological differences (size and shape) ...,,"<div class=""o-metadata__file-usage-entry"">Data...",Catostomus commersoni; Exoglossum maxillingua;...,Canada; Québec,2015-07-14,https://doi.org/10.1111/1365-2656.12269,0,zenodo
3,https://doi.org/10.5061/dryad.q573n5tdx,Priority effects will impede range shifts of t...,<p> Temperate tree species are expected to exp...,<p>A grid of 20 m x 20 m quadrats were establi...,<p><b>Permanent Plots - Tree Species Codes</b>...,long-term permanent plot; Ecotone; seedlings; ...,,2019-12-01,,0,zenodo
4,https://doi.org/10.5061/dryad.s4mw6m97h,Lesser Yellowlegs location data describing the...,<p>Shorebirds have experienced a precipitous r...,<p>Location data was collected using Lotek Arg...,<p>The dataset does not include any missing va...,Tringa flavipes; Random walk model; harvest; m...,,2021-11-18,https://doi.org/10.5281/zenodo.5661999,0,zenodo


In [195]:
# Map each query id to the list of result URLs Zenodo returned for it.
url_zenodo = {
    query_id: df_zenodo.loc[df_zenodo["id_query"] == query_id, "url"].tolist()
    for query_id in queries
}

Let's see if searching for the singular form ("occurrence") with query 0 also matches the plural form ("occurrences") returned by query 1.

In [196]:
# Count the URLs of query 0 (singular) that also appear in query 1 (plural),
# then show how many results query 1 has in total.
print("Results from query 1 detected by query 0 :",
      sum(url in url_zenodo[1] for url in url_zenodo[0]))
print("Results of query 1 :", len(url_zenodo[1]))

Results from query 1 detected by query 0 : 1
Results of query 1 : 3


The query with the singular form did not match the plural form (2 results missing).

## Dryad

In [197]:
# Run every query against the Dryad API and gather all hits in one DataFrame.
df_dryad = retrieve_dryad(queries)

In [198]:
# Map each query id to the list of result URLs Dryad returned for it.
url_dryad = {
    query_id: df_dryad.loc[df_dryad["id_query"] == query_id, "url"].tolist()
    for query_id in queries
}

In [199]:
# Count the URLs of query 0 (singular) that also appear in query 1 (plural),
# then show how many results query 1 has in total.
print("Results from query 1 detected by query 0 :",
      sum(url in url_dryad[1] for url in url_dryad[0]))
print("Results of query 1 :", len(url_dryad[1]))

Results from query 1 detected by query 0 : 6
Results of query 1 : 6


It's a match! Let's check if it works with other types of plural forms ("inventory" and "inventories").

In [200]:
# Same comparison for "inventory" (query 3) versus "inventories" (query 4).
print("Results from query 4 detected by query 3 :",
      sum(url in url_dryad[4] for url in url_dryad[3]))
print("Results of query 4 :", len(url_dryad[4]))

Results from query 4 detected by query 3 : 2
Results of query 4 : 2


Still a match !

# Testing the sensitivity to accent

## Zenodo

In [201]:
# Number of Zenodo results for the unaccented (0) and accented (2) queries.
print("Results from query 0 :", len(url_zenodo[0]))
print("Results from query 2 :", len(url_zenodo[2]))

Results from query 0 : 10
Results from query 2 : 15


In [202]:
# Overlap between the unaccented (0) and accented (2) Zenodo result lists,
# counted from each side.
print("Results from query 0 detected by query 2 :",
      sum(url in url_zenodo[2] for url in url_zenodo[0]))
print("Results from query 2 detected by query 0 :",
      sum(url in url_zenodo[0] for url in url_zenodo[2]))

Results from query 0 detected by query 2 : 3
Results from query 2 detected by query 0 : 3


## Dryad

In [203]:
# Number of Dryad results for the unaccented (0) and accented (2) queries.
print("Results from query 0 :", len(url_dryad[0]))
print("Results from query 2 :", len(url_dryad[2]))

Results from query 0 : 6
Results from query 2 : 5


In [204]:
# Overlap between the unaccented (0) and accented (2) Dryad result lists,
# counted from each side.
print("Results from query 0 detected by query 2 :",
      sum(url in url_dryad[2] for url in url_dryad[0]))
print("Results from query 2 detected by query 0 :",
      sum(url in url_dryad[0] for url in url_dryad[2]))

Results from query 0 detected by query 2 : 1
Results from query 2 detected by query 0 : 1


Both APIs are sensitive to accents: the accented and unaccented queries share only a few results (3 for Zenodo, 1 for Dryad).

# Testing the retrieval of Dryad datasets by Zenodo API

Let's check that queries 0 and 1 from Zenodo match the same query from Dryad (we have to merge queries 0 and 1 for Zenodo, as its API is sensitive to singular/plural forms).

In [205]:
# Dryad side: the results of query 0 alone.
dryad_result = url_dryad[0]
# Zenodo side: pool the singular (0) and plural (1) queries and drop duplicate URLs.
zenodo_result = list(set([*url_zenodo[0], *url_zenodo[1]]))

In [206]:
# How many of the Dryad URLs are also present in the pooled Zenodo results.
print("Results from Dryad:", len(dryad_result))
print("Results from Dryad detected by Zenodo:",
      len(set(zenodo_result).intersection(dryad_result)))

Results from Dryad: 6
Results from Dryad detected by Zenodo: 6


It's a match !