In [6]:
# SPARQL endpoint hosting previous version of ITO.owl
import yaml

# The endpoint URL is read from config.yml (key path: blazegraph -> endpoint).
with open('config.yml', 'r') as config_file:
    config = yaml.safe_load(config_file)

endpoint = config['blazegraph']['endpoint']
endpoint

'http://193.171.177.138:9999/blazegraph/namespace/ito/sparql'

In [7]:
import os
import random
import string
import re
import gzip
import shutil
import json
import types
import datetime
import decimal
import pandas as pd
from SPARQLWrapper import SPARQLWrapper, JSON
import matplotlib.pyplot as plt
from collections import defaultdict
import numpy as np
import statsmodels.api as sm
import pylab as py
import scipy.stats as stats
import seaborn as sns
from IPython.display import display, HTML

# Output directory for the plots and CSV files written further down.
# exist_ok=True replaces the check-then-create pattern, which can race with
# another process and is needlessly verbose; re-running the cell stays a no-op.
os.makedirs('artefacts', exist_ok=True)

In [8]:

# Prefix declarations prepended to every SPARQL query sent by query()/query_df().
# `rdf:` is declared explicitly because queries in this notebook use rdf:type;
# without it they only work on stores that predefine the prefix.
prefixes = """
prefix owl: <http://www.w3.org/2002/07/owl#>
prefix rdf: <http://www.w3.org/1999/02/22-rdf-syntax-ns#>
prefix rdfs: <http://www.w3.org/2000/01/rdf-schema#>
prefix xsd: <http://www.w3.org/2001/XMLSchema#>
prefix ito: <https://identifiers.org/ito:>
prefix edam: <http://edamontology.org/>
prefix obo: <http://www.geneontology.org/formats/oboInOwl#>
prefix dc: <http://purl.org/dc/elements/1.1/>
"""


def query(query, return_format = JSON):
    """Send a SPARQL query to the configured endpoint and return the converted result.

    Args:
        query: SPARQL query text; the module-level ``prefixes`` block is
            prepended before sending.
        return_format: SPARQLWrapper return format constant (defaults to JSON).

    Returns:
        Whatever ``SPARQLWrapper.query().convert()`` yields for ``return_format``.
    """
    client = SPARQLWrapper(endpoint)
    client.setReturnFormat(return_format)
    # POST is used instead of the wrapper's default request method.
    client.method = 'POST'
    client.setQuery(prefixes + query)
    return client.query().convert()


def query_df(query, numeric_cols=None):
    """Run a SPARQL SELECT query and return the bindings as a pandas DataFrame.

    Args:
        query: SPARQL query text; the module-level ``prefixes`` block is
            prepended before sending.
        numeric_cols: optional iterable of column names to convert with
            ``pd.to_numeric``. ``None`` (the default) converts nothing.

    Returns:
        DataFrame with one column per variable listed in the result head and
        one row per binding. Variables unbound in a row become ``None``.
    """
    sparql = SPARQLWrapper(endpoint)
    sparql.method = 'POST'
    sparql.setReturnFormat(JSON)
    sparql.setQuery(prefixes + query)
    results = sparql.query()
    processed_results = json.load(results.response)
    cols = processed_results['head']['vars']

    # OPTIONAL patterns can leave a variable out of a binding, hence .get().
    rows = [
        [binding.get(c, {}).get('value') for c in cols]
        for binding in processed_results['results']['bindings']
    ]

    df = pd.DataFrame(rows, columns=cols)
    for col in numeric_cols or ():
        df[col] = pd.to_numeric(df[col])

    return df

def tasks_stat_dict(df):
    """Count task occurrences over the ``tasks`` entries of every row of ``df``.

    Returns:
        dict mapping task -> number of rows whose ``tasks`` iterable contains
        it (counted once per occurrence), ordered by descending count.
    """
    from collections import defaultdict
    occurrences = defaultdict(int)
    every_task = (task for _, record in df.iterrows() for task in record["tasks"])
    for task in every_task:
        occurrences[task] += 1
    ranked = sorted(occurrences.items(), key=lambda pair: pair[1], reverse=True)
    return dict(ranked)

def get_variant_to_parent():
    """Load the dataset dump and derive variant->parent and name->metadata lookups.

    Reads ``./data/datasets.json``, which must already exist. It is the
    unpacked form of
    https://paperswithcode.com/media/about/datasets.json.gz (the download
    step is not performed here).

    Returns:
        tuple ``(variant_to_parent, variant_to_meta)``:
            * ``variant_to_parent``: DataFrame with columns ``variant`` and
              ``parent``; one row per variant name that differs from its
              dataset's own name.
            * ``variant_to_meta``: dict mapping dataset name to the tuple
              ``(url, homepage, paper title or None, paper url or None)``.
    """
    def _meta(dataset):
        # Metadata tuple for one dataset record; "paper" may be missing/None.
        paper = dataset.get("paper", None)
        return (
            dataset["url"],
            dataset["homepage"],
            paper["title"] if paper else None,
            paper["url"] if paper else None,
        )

    # Explicit UTF-8: dataset names contain non-ASCII characters, and the
    # platform default encoding is not guaranteed to be UTF-8.
    with open('./data/datasets.json', encoding='utf-8') as f:
        datasets_ = json.load(f)

    variant_to_meta = {dataset["name"]: _meta(dataset) for dataset in datasets_}

    variant_to_parent = []
    for dataset in datasets_:
        for variant in dataset["variants"]:
            if dataset["name"] != variant:
                variant_to_parent.append([variant, dataset["name"]])
                # Only matters when several records share a name: a record
                # that has real variants then wins over one that has none.
                variant_to_meta[dataset["name"]] = _meta(dataset)

    variant_to_parent = pd.DataFrame(variant_to_parent, columns=["variant", "parent"])
    return variant_to_parent, variant_to_meta

def extract_df(endpoint, prefix, root, sort_by):
    """Fetch benchmark results below ``root`` and attach dataset metadata.

    Args:
        endpoint: unused here; ``query_df`` reads the module-level ``endpoint``.
        prefix: unused; kept for call compatibility.
        root: IRI of the class whose (transitive) subclasses are queried. It is
            concatenated into the query text, so pass trusted values only.
        sort_by: unused; kept for call compatibility.

    Returns:
        DataFrame sorted by ``date`` with the query variables plus
        ``task_label``, ``benchmark_parent_label``, ``url``, ``homepage``,
        ``paper_name`` and ``paper_url``.
    """
    # `a` is SPARQL's built-in shorthand for rdf:type, so the query does not
    # depend on an `rdf:` prefix being declared or predefined by the store.
    df = query_df("""
    SELECT * WHERE {
        ?benchmark rdfs:subClassOf+ <""" + root + """> .
        ?benchmark rdfs:subClassOf <https://identifiers.org/ito:Benchmarking> .
        ?benchmark rdfs:label ?benchmark_label .
        ?result a ?benchmark .
        ?result ito:has_input ?dataset .
        OPTIONAL{
            ?result obo:date ?date .
            ?result rdfs:seeAlso ?paper .
            ?paper a edam:data_0971 .
            ?paper rdfs:label ?paper_label .
        }
    }
    """)
    # Literal (non-regex) removal of the " benchmarking" suffix text.
    df["benchmark_label"] = df["benchmark_label"].str.replace(" benchmarking", "", regex=False)
    # Split at the first ' - ' only: the left part stays in benchmark_label, the
    # rest becomes task_label. `n` must be a keyword: pandas 2.x no longer
    # accepts it positionally.
    df[["benchmark_label", "task_label"]] = df["benchmark_label"].str.split(' - ', n=1, expand=True)
    df["date"] = pd.to_datetime(df['date'], format='%Y-%m-%d')
    df = df.sort_values(by="date")

    variant_to_parent, variant_to_meta = get_variant_to_parent()

    # Map each benchmark label to its parent dataset; labels without a known
    # parent fall back to themselves.
    df = df.merge(variant_to_parent, left_on="benchmark_label", right_on="variant", how="left")
    df = df.rename(columns={"parent": "benchmark_parent_label"}).drop(columns="variant")
    df["benchmark_parent_label"] = df["benchmark_parent_label"].fillna(df["benchmark_label"])
    df = df.dropna(subset=["benchmark_parent_label"])

    # variant_to_meta values are (url, homepage, paper title, paper url) tuples.
    for position, column in enumerate(["url", "homepage", "paper_name", "paper_url"]):
        df[column] = df["benchmark_parent_label"].apply(
            lambda x, position=position: variant_to_meta[x][position] if x in variant_to_meta else None
        )
    return df

def aggregate_metrics(df, prefix, root, sort_by):
    """Summarise result rows per (benchmark_parent_label, url) group.

    Args:
        df: frame with at least the columns benchmark_parent_label, url,
            result, date, benchmark_label, paper_label and task_label.
        prefix: used in the name of the CSV written to ``artefacts/``.
        root: unused.
        sort_by: summary column to sort by (descending); rows where it is
            not greater than 0 are dropped.

    Returns:
        The sorted and filtered summary frame, which is also written to
        ``artefacts/{prefix}_all_unfiltered.csv``.
    """
    def summarise(group):
        # Sorting by date puts the earliest dated row first.
        ordered = group.sort_values(by="date")
        rows_per_variant = ordered.groupby(["benchmark_label"])["benchmark_label"].count()

        distinct_papers = ordered[["benchmark_label", "paper_label"]].dropna().drop_duplicates()
        papers_per_variant = distinct_papers.groupby(["benchmark_label"])["paper_label"].count()
        task_set = set(ordered["task_label"])

        return pd.Series({
            "first_date": ordered.iloc[0]["date"],
            "n_variant": len(set(ordered["benchmark_label"])),
            "n_row_sum": rows_per_variant.sum(),
            "n_row_min": rows_per_variant.min(),
            "n_row_max": rows_per_variant.max(),
            "n_row_mean": rows_per_variant.mean(),
            "n_unique_paper_sum": len(set(distinct_papers["paper_label"])),
            "n_paper_min": papers_per_variant.min(),
            "n_paper_max": papers_per_variant.max(),
            "n_paper_mean": papers_per_variant.mean(),
            "n_tasks": len(task_set),
            "tasks": task_set})

    selected = ["benchmark_parent_label", "result", "date", "benchmark_label", "paper_label", "task_label"]
    grouped = df.groupby(["benchmark_parent_label", "url"])[selected]
    summary = grouped.apply(summarise).reset_index()
    summary = summary.sort_values(by=sort_by, ascending=False)
    summary = summary[summary[sort_by] > 0]
    summary.to_csv(f"artefacts/{prefix}_all_unfiltered.csv", index=None)
    return summary

def split_popular_unpopular(df, sort_by):
    """Split datasets into a "popular" head and an equally sized "unpopular" sample.

    Rows are sorted ascending by ``sort_by`` and cumulated. The first row
    whose cumulative sum exceeds half of the total marks the cut: rows from
    there on are "popular", rows before it are the "unpopular" pool.

    Args:
        df: frame with a numeric ``sort_by`` column and a ``tasks`` column.
        sort_by: name of the column to rank and cumulate.

    Returns:
        tuple ``(popular, unpopular, popular_task_types, unpopular_task_types)``.
        ``unpopular`` is a random sample (``random_state=1``) of the pool with
        as many rows as ``popular``, or the whole pool if it is smaller.

    Raises:
        ValueError: if no row exceeds half of the total (empty input or a
            total that is not positive), so no split point exists.
    """
    cumsum = df.sort_values(by=[sort_by]).reset_index(drop=True)
    cumsum["cumsum"] = cumsum[sort_by].cumsum()
    half = cumsum["cumsum"].max() / 2.

    above_half = (cumsum["cumsum"] > half).to_numpy(dtype=bool)
    if not above_half.any():
        # Previously this surfaced as an unbound-variable error further down.
        raise ValueError(
            f"Cannot split on '{sort_by}': input is empty or its total is not positive"
        )
    # Position of the first row beyond the halfway mark (index is 0..n-1).
    corr_ix = int(above_half.argmax())

    print(f"{corr_ix * 100 / len(cumsum)} % of smallest datasets have the same amount of papers as {100 - (corr_ix * 100 / len(cumsum))} % of biggest datasets")
    print(f"Length popular: {len(cumsum) - corr_ix}")
    popular = cumsum.iloc[corr_ix:].sort_values(by=sort_by, ascending=False)

    # With flat distributions the pool can be smaller than the popular set;
    # sampling more rows than exist without replacement would raise.
    unpopular_pool = cumsum.iloc[:corr_ix]
    n_unpopular = min(len(popular), len(unpopular_pool))
    unpopular = unpopular_pool.sample(n_unpopular, random_state=1).sort_values(by=sort_by, ascending=False)

    popular_task_types = pd.DataFrame(tasks_stat_dict(popular).items(), columns=["task_type", "n"])
    unpopular_task_types = pd.DataFrame(tasks_stat_dict(unpopular).items(), columns=["task_type", "n"])

    return popular, unpopular, popular_task_types, unpopular_task_types

# plot and write all datasets
def plot_and_write_all(df, prefix, sort_by, plot=True):
    """Plot the ``sort_by`` distribution of all datasets and write the popularity split.

    Saves a linear and a log-scale histogram to ``artefacts/`` and writes the
    popular/unpopular frames and their task-type counts as CSV files.

    Args:
        df: aggregated per-dataset frame (needs ``sort_by``, ``n_tasks``,
            ``n_variant``, ``tasks``, ``url`` and ``benchmark_parent_label``).
        prefix: file name prefix for everything written to ``artefacts/``.
        sort_by: column that is plotted and used for the split.
        plot: when True, figures and summary tables are also shown inline.
    """
    label_x = "Number of papers utilizing dataset"
    label_y = "Number of datasets"

    def _label_save_close(grid, target):
        # Label the axes, save the current figure, optionally show it, then close.
        grid.set_axis_labels(label_x, label_y)
        plt.savefig(target)
        if plot:
            plt.show()
        plt.close()

    _label_save_close(
        sns.displot(df[sort_by], color='g', kind="hist", kde=True, aspect=2, bins=12),
        f"artefacts/{prefix}.png",
    )
    _label_save_close(
        sns.displot(df[sort_by], color='g', kind="hist", kde=True, aspect=2, log_scale=True, bins=12),
        f"artefacts/{prefix}_log.png",
    )

    popular, unpopular, popular_task_types, unpopular_task_types = split_popular_unpopular(df, sort_by)

    if plot:
        count_cols = ["n_tasks", "n_variant"]
        overview_cols = ["benchmark_parent_label", sort_by, "n_variant", "n_tasks", "tasks", "url"]
        display(df.describe())
        display(popular[count_cols].describe())
        display(unpopular[count_cols].describe())
        display(popular[overview_cols].head(5).style.set_caption("Popular benchmarks"))
        display(unpopular[overview_cols].head(5).style.set_caption("Unpopular benchmarks"))
        display(unpopular_task_types.head(5).style.set_caption("Task types in unpopular"))
        display(popular_task_types.head(5).style.set_caption("Task types in popular"))

    popular.to_csv(f"artefacts/{prefix}_popular.csv", index=None)
    unpopular.to_csv(f"artefacts/{prefix}_unpopular.csv", index=None)
    popular_task_types.to_csv(f"artefacts/{prefix}_popular_task_types.csv", index=None)
    unpopular_task_types.to_csv(f"artefacts/{prefix}_unpopular_task_types.csv", index=None)


# plot and write only the datasets whose first result dates from the given year (2018 in this notebook)
def plot_and_write_starting_year(df, prefix, sort_by, first_year, plot=True):
    """Plot and write the popularity split for datasets first seen in ``first_year``.

    Args:
        df: aggregated per-dataset frame with a datetime ``first_date`` column
            (plus ``sort_by``, ``n_tasks``, ``n_variant``, ``tasks``, ``url``
            and ``benchmark_parent_label``).
        prefix: file name prefix for everything written to ``artefacts/``.
        sort_by: column that is plotted and used for the split.
        first_year: keep only rows whose ``first_date`` falls in exactly this
            year. A value of -1 or lower disables the filter.
        plot: when True, the figure and summary tables are also shown inline.

    Output files carry the year as suffix (e.g. ``{prefix}_popular_2018.csv``
    for ``first_year=2018``), or ``all_years`` when the filter is disabled.
    """
    if first_year > -1:
        filtered_df = df[df['first_date'].dt.year == first_year]
        suffix = str(first_year)
    else:
        # Previously this path left filtered_df undefined and crashed.
        filtered_df = df
        suffix = "all_years"

    label_x = "Number of papers utilizing dataset"
    label_y = "Number of datasets"
    g = sns.displot(filtered_df[sort_by], color='g', kind="hist", kde=True, aspect=2, bins=12)
    g.set_axis_labels(label_x, label_y)
    plt.savefig(f"artefacts/{prefix}_{suffix}.png")
    if plot:
        plt.show()
    plt.close()

    popular, unpopular, popular_task_types, unpopular_task_types = split_popular_unpopular(filtered_df, sort_by)

    if plot:
        overview_cols = ["benchmark_parent_label", sort_by, "n_variant", "n_tasks", "tasks", "url"]
        display(filtered_df.describe())
        display(popular[["n_tasks", "n_variant"]].describe())
        display(unpopular[["n_tasks", "n_variant"]].describe())
        display(popular[overview_cols].head(5).style.set_caption("Popular benchmarks"))
        display(unpopular[overview_cols].head(5).style.set_caption("Unpopular benchmarks"))
        display(unpopular_task_types.head(5).style.set_caption("Task types in unpopular"))
        display(popular_task_types.head(5).style.set_caption("Task types in popular"))

    popular.to_csv(f"artefacts/{prefix}_popular_{suffix}.csv", index=None)
    unpopular.to_csv(f"artefacts/{prefix}_unpopular_{suffix}.csv", index=None)
    popular_task_types.to_csv(f"artefacts/{prefix}_popular_task_types_{suffix}.csv", index=None)
    unpopular_task_types.to_csv(f"artefacts/{prefix}_unpopular_task_types_{suffix}.csv", index=None)

## Plots for all datasets

In [4]:
# Build the per-dataset statistics, plots and CSV artefacts for each
# top-level class: first over all datasets, then restricted to 2018.
sort_by = "n_unique_paper_sum"
top_level_classes = (
    ("natural_language_processing", "https://identifiers.org/ito:ITO_00141"),
    ("vision_process", "https://identifiers.org/ito:ITO_00101"),
)
for tlc, root in top_level_classes:
    print(tlc)
    df = aggregate_metrics(extract_df(endpoint, tlc, root, sort_by), tlc, root, sort_by)
    print("ALL")
    plot_and_write_all(df, tlc, sort_by, plot=False)
    print("START 2018")
    plot_and_write_starting_year(df, tlc, sort_by, 2018, plot=False)
    print()

natural_language_processing
ALL
90.31719532554257 % of smallest datasets have the same amount of papers as 9.68280467445743 % of biggest datasets
Length popular: 58
START 2018
78.125 % of smallest datasets have the same amount of papers as 21.875 % of biggest datasets
Length popular: 14

vision_process
ALL
94.76744186046511 % of smallest datasets have the same amount of papers as 5.232558139534888 % of biggest datasets
Length popular: 54
START 2018
79.23076923076923 % of smallest datasets have the same amount of papers as 20.769230769230774 % of biggest datasets
Length popular: 27



# Incompleteness

In [9]:
# Fetch the raw (un-aggregated) benchmark results for both top-level classes.
df1 = extract_df(endpoint, "natural_language_processing", "https://identifiers.org/ito:ITO_00141", "n_unique_paper_sum")
df2 = extract_df(endpoint, "vision_process", "https://identifiers.org/ito:ITO_00101", "n_unique_paper_sum")
#df.groupby(["benchmark_label", "benchmark_parent_label","url", "task_label"])
# Stack both result sets into one frame; each part keeps its own row index
# (no ignore_index), so index values can repeat.
df = pd.concat([df1, df2])
df.head()

Unnamed: 0,benchmark,benchmark_label,result,dataset,date,paper,paper_label,task_label,benchmark_parent_label,url,homepage,paper_name,paper_url
0,https://identifiers.org/ito:ITO_53501,Penn Treebank,https://identifiers.org/ito:ITO_iqwDxdruDLC3K2ewW,https://identifiers.org/ito:ITO_04494,2004-07-01,https://identifiers.org/ito:ITO_53505,Corpus-Based Induction of Syntactic Structure:...,Unsupervised dependency parsing,Penn Treebank,https://paperswithcode.com/dataset/penn-treebank,https://catalog.ldc.upenn.edu/docs/LDC95T7/cl9...,Building a Large Annotated Corpus of English: ...,http://dl.acm.org/citation.cfm?id=972470.972475
1,https://identifiers.org/ito:ITO_56230,Penn Treebank,https://identifiers.org/ito:ITO_izjN0bn2si9MTQk8H,https://identifiers.org/ito:ITO_04494,2006-06-01,https://identifiers.org/ito:ITO_34554,Effective Self-Training for Parsing,Constituency parsing,Penn Treebank,https://paperswithcode.com/dataset/penn-treebank,https://catalog.ldc.upenn.edu/docs/LDC95T7/cl9...,Building a Large Annotated Corpus of English: ...,http://dl.acm.org/citation.cfm?id=972470.972475
2,https://identifiers.org/ito:ITO_45414,Linux IRC (Ch2 Elsner),https://identifiers.org/ito:ITO_iwcpMJE5wRevyLqJV,https://identifiers.org/ito:ITO_21330,2008-06-01,https://identifiers.org/ito:ITO_45413,You Talking to Me? A Corpus and Algorithm for ...,Conversation disentanglement,irc-disentanglement,https://paperswithcode.com/dataset/irc-disenta...,https://github.com/jkkummerfeld/irc-disentangl...,A Large-Scale Corpus for Conversation Disentan...,https://paperswithcode.com/paper/analyzing-ass...
3,https://identifiers.org/ito:ITO_45412,Linux IRC (Ch2 Kummerfeld),https://identifiers.org/ito:ITO_iPxkvmyiiIdT4pNeV,https://identifiers.org/ito:ITO_21324,2008-06-01,https://identifiers.org/ito:ITO_45413,You Talking to Me? A Corpus and Algorithm for ...,Conversation disentanglement,irc-disentanglement,https://paperswithcode.com/dataset/irc-disenta...,https://github.com/jkkummerfeld/irc-disentangl...,A Large-Scale Corpus for Conversation Disentan...,https://paperswithcode.com/paper/analyzing-ass...
4,https://identifiers.org/ito:ITO_45415,irc-disentanglement,https://identifiers.org/ito:ITO_iujJfxeLdxFdVFNpL,https://identifiers.org/ito:ITO_21309,2008-06-01,https://identifiers.org/ito:ITO_45413,You Talking to Me? A Corpus and Algorithm for ...,Conversation disentanglement,irc-disentanglement,https://paperswithcode.com/dataset/irc-disenta...,https://github.com/jkkummerfeld/irc-disentangl...,A Large-Scale Corpus for Conversation Disentan...,https://paperswithcode.com/paper/analyzing-ass...


In [182]:
# Benchmarks with a known paper, dated up to the end of 2021, keeping only
# (benchmark, parent, metadata) groups that have at least 3 rows.
benchmark_key_cols = ["benchmark_label", "benchmark_parent_label","url", "homepage", "paper_name", "paper_url"]
benchmarks = (
    df.dropna(subset=["paper_label"])
    .loc[:, ["date"] + benchmark_key_cols + ["task_label", "paper_label"]]
    .drop_duplicates()
    .loc[lambda frame: frame["date"] <= datetime.datetime(2021, 12, 31)]
    .loc[:, benchmark_key_cols + ["task_label", "paper_label"]]
    .fillna("")
    .groupby(benchmark_key_cols, as_index=False, dropna=False)
    .filter(lambda group: len(group) >= 3)
    .reset_index(drop=True)
)

# Reproducible random pick of 10 distinct benchmarks.
random_benchmarks = (
    benchmarks[benchmark_key_cols]
    .drop_duplicates()
    .sample(10, random_state=1)
)
random_benchmarks

Unnamed: 0,benchmark_label,benchmark_parent_label,url,homepage,paper_name,paper_url
7126,VIDIT’20 validation set,VIDIT’20 validation set,,,,
7764,n-MNIST,n-MNIST,,,,
6113,CAT 256x256,CAT 256x256,,,,
8056,FashionIQ,Fashion IQ,https://paperswithcode.com/dataset/fashion-iq,https://github.com/XiaoxiaoGuo/fashion-iq,Fashion IQ: A New Dataset Towards Retrieving I...,https://paperswithcode.com/paper/the-fashion-i...
5723,YouTube-VOS 2018 val,YouTube-VOS 2018 val,https://paperswithcode.com/dataset/youtube-vos,https://youtube-vos.org/,YouTube-VOS: A Large-Scale Video Object Segmen...,https://arxiv.org/pdf/1809.03327.pdf
5063,SAT-4,SAT-4,,,,
5332,Oxf5k,Oxf5k,,,,
71,Persona-Chat,Persona-Chat,,,,
7576,FC100 5-way (1-shot),FC100,https://paperswithcode.com/dataset/fc100,https://github.com/ElementAI/TADAM,TADAM: Task dependent adaptive metric for impr...,https://paperswithcode.com/paper/tadam-task-de...
2474,TabFact,TabFact,https://paperswithcode.com/dataset/tabfact,https://tabfact.github.io/,TabFact: A Large-scale Dataset for Table-based...,https://paperswithcode.com/paper/tabfact-a-lar...


In [11]:

def select_benchmarks(df, tlc, selection):
    """Summarise the hand-picked parent datasets in ``selection``.

    Args:
        df: frame as returned by ``extract_df``.
        tlc: top-level class name; ``"natural_language_processing"`` selects
            ``data/sota_papers_ITO_00141.csv``, anything else
            ``data/sota_papers_ITO_00101.csv``.
        selection: iterable of ``benchmark_parent_label`` values to keep.

    Returns:
        DataFrame sorted by ``benchmark_parent_label`` with one row per parent
        dataset and metadata combination, containing the joined variant
        labels (``benchmark_label``), joined task labels (``tasks``), the
        number of distinct papers (``n_unique_papers``) and the number of
        those flagged in the SOTA paper file (``n_unique_sota_papers``).
    """
    meta_cols = ["benchmark_parent_label", "url", "homepage", "paper_name", "paper_url"]

    # Rows with a paper label and dataset paper URL, deduplicated, up to end of 2021.
    benchmarks = df.dropna(subset=["paper_label", "paper_url"])
    benchmarks = benchmarks[["date", "benchmark_label", "benchmark_parent_label","url", "homepage", "paper_name", "paper_url", "task_label", "paper_label"]].drop_duplicates()
    benchmarks = benchmarks[benchmarks["date"] <= datetime.datetime(2021, 12, 31)]
    benchmarks = benchmarks[["benchmark_label", "benchmark_parent_label","url", "homepage", "paper_name", "paper_url", "task_label", "paper_label"]]
    benchmarks = benchmarks.fillna("")

    # Candidate parents need at least 3 rows; then restrict to the requested ones.
    selection_ = (
        benchmarks.groupby(meta_cols, as_index=False, dropna=False)
        .filter(lambda x: len(x) >= 3)
        .reset_index(drop=True)[meta_cols]
        .drop_duplicates()
    )
    selected_benchmarks = selection_[selection_["benchmark_parent_label"].isin(selection)]
    selected_benchmarks = benchmarks.merge(selected_benchmarks, how="inner", on=meta_cols)

    if tlc == "natural_language_processing":
        sota_papers = pd.read_csv("data/sota_papers_ITO_00141.csv")
    else:
        sota_papers = pd.read_csv("data/sota_papers_ITO_00101.csv")
    sota_papers = sota_papers.rename(columns={"dataset_label": "benchmark_label"})[["benchmark_label", "paper_label"]]

    # Resolve the SOTA file's dataset labels to parent datasets the same way
    # extract_df does: unknown labels are their own parent.
    variant_to_parent, variant_to_meta = get_variant_to_parent()
    sota_papers = sota_papers.merge(variant_to_parent, left_on="benchmark_label", right_on="variant", how="left")
    sota_papers = sota_papers.rename(columns={"parent": "benchmark_parent_label"}).drop(columns="variant")
    sota_papers["benchmark_parent_label"] = sota_papers["benchmark_parent_label"].fillna(sota_papers["benchmark_label"])
    sota_papers = sota_papers.dropna(subset=["benchmark_parent_label"])
    sota_papers["sota"] = 1

    # Left merge: rows without a match in the SOTA file get NaN in "sota".
    selected_benchmarks = selected_benchmarks.merge(sota_papers, on=["benchmark_parent_label", "benchmark_label", "paper_label"], how="left")

    def agg(ex):
        # One "sota" value per distinct paper; NaN counts as 0 in the sum.
        sota_per_paper = ex[["paper_label", "sota"]].drop_duplicates(subset="paper_label")
        return pd.Series({
            "benchmark_label": ", ".join(set(ex["benchmark_label"].to_list())),
            "task_label": ", ".join(set(ex["task_label"].to_list())),
            "paper_label": len(set(ex["paper_label"].to_list())),
            "sota": int(sota_per_paper["sota"].sum()),
        })

    selected_benchmarks = selected_benchmarks.groupby(meta_cols, dropna=False)[["benchmark_label", "task_label", "paper_label", "sota"]].apply(agg).reset_index()
    selected_benchmarks = selected_benchmarks.rename(columns={"task_label": "tasks", "paper_label": "n_unique_papers", "sota": "n_unique_sota_papers"})
    selected_benchmarks = selected_benchmarks.sort_values(by="benchmark_parent_label")
    return selected_benchmarks

# Hand-picked parent datasets per top-level class.
nlp_selection = ["CAS-VSR-W1k (LRW-1000)", "CLEVR", "Spoken-SQuAD", "NNE", "NewsQA", "HellaSwag", "Sepehr_RumTel01", "Billion Word Benchmark", "WebQuestionsSP", "WikiHow"]
vision_selection = ["ActivityNet Captions", "AdobeVFR syn", "Birdsnap", "PACS", "CHASE_DB1", "CrowdHuman", "DeepFashion", "Google Refexp", "MTL-AQA", "UCF-Crime"]

per_class_frames = []
for tlc, root in (
    ("natural_language_processing", "https://identifiers.org/ito:ITO_00141"),
    ("vision_process", "https://identifiers.org/ito:ITO_00101"),
):
    selection = nlp_selection if tlc == "natural_language_processing" else vision_selection

    print(tlc)
    sort_by = "n_unique_paper_sum"
    df = extract_df(endpoint, tlc, root, sort_by)
    # Number of distinct parent datasets, plus a CSV listing them.
    print(len(df["benchmark_parent_label"].unique()))
    parent_labels = df[["benchmark_parent_label"]].drop_duplicates()
    parent_labels.to_csv("benchmark_parent_label" + "_" + tlc + ".csv")
    per_class_frames.append(select_benchmarks(df, tlc, selection))

selected_benchmarks = pd.concat(per_class_frames, ignore_index=True)


display(selected_benchmarks)
selected_benchmarks.to_csv("test.csv", index=False)

natural_language_processing
1077
vision_process
1837


Unnamed: 0,benchmark_parent_label,url,homepage,paper_name,paper_url,benchmark_label,tasks,n_unique_papers,n_unique_sota_papers
0,Billion Word Benchmark,https://paperswithcode.com/dataset/billion-wor...,https://code.google.com/archive/p/1-billion-wo...,One Billion Word Benchmark for Measuring Progr...,https://paperswithcode.com/paper/one-billion-w...,One Billion Word,"Text generation, Language modelling",16,8
1,CAS-VSR-W1k (LRW-1000),https://paperswithcode.com/dataset/lrw-1000,https://vipl.ict.ac.cn/en/view_database.php?id=13,LRW-1000: A Naturally-Distributed Large-Scale ...,https://paperswithcode.com/paper/lrw-1000-a-na...,CAS-VSR-W1k (LRW-1000),Lip reading,7,4
2,CLEVR,https://paperswithcode.com/dataset/clevr,https://cs.stanford.edu/people/jcjohns/clevr/,CLEVR: A Diagnostic Dataset for Compositional ...,https://paperswithcode.com/paper/clevr-a-diagn...,CLEVR,Image question answering,14,4
3,HellaSwag,https://paperswithcode.com/dataset/hellaswag,https://rowanzellers.com/hellaswag/,HellaSwag: Can a Machine Really Finish Your Se...,https://paperswithcode.com/paper/hellaswag-can...,HellaSwag,Sentence Completion,3,2
4,NNE,https://paperswithcode.com/dataset/nne,,NNE: A Dataset for Nested Named Entity Recogni...,https://paperswithcode.com/paper/nne-a-dataset...,NNE,Nested named entity recognition,5,3
5,NewsQA,https://paperswithcode.com/dataset/newsqa,https://www.microsoft.com/en-us/research/proje...,NewsQA: A Machine Comprehension Dataset,https://paperswithcode.com/paper/newsqa-a-mach...,NewsQA,Question answering,5,5
6,Sepehr_RumTel01,https://paperswithcode.com/dataset/sepehr-rumt...,,A Speech Act Classifier for Persian Texts and ...,https://paperswithcode.com/paper/a-speech-act-...,Sepehr_RumTel01,Rumor Detection,4,3
7,Spoken-SQuAD,https://paperswithcode.com/dataset/spoken-squad,https://github.com/chiahsuan156/Spoken-SQuAD,Spoken SQuAD: A Study of Mitigating the Impact...,https://paperswithcode.com/paper/spoken-squad-...,Spoken-SQuAD,Spoken language understanding,3,3
8,WebQuestionsSP,https://paperswithcode.com/dataset/webquestionssp,https://www.microsoft.com/en-us/download/detai...,The Value of Semantic Parse Labeling for Knowl...,https://paperswithcode.com/paper/the-value-of-...,WebQuestionsSP,"Knowledge base question answering, Semantic pa...",6,4
9,WikiHow,https://paperswithcode.com/dataset/wikihow,https://github.com/mahnazkoupaee/WikiHow-Dataset,WikiHow: A Large Scale Text Summarization Dataset,https://paperswithcode.com/paper/wikihow-a-lar...,WikiHow,Text summarization,3,3


: 

In [161]:
import pandas as pd
# Load the citation spreadsheet through Google Sheets' CSV export.
sheet_name = "Tabellenblatt1"
url = f"https://docs.google.com/spreadsheets/d/18sBVxSW4fFe1ZCjvo_7b6RpUSJHCrNZJNX3trBJi-jo/gviz/tq?tqx=out:csv&sheet={sheet_name}"
sheet_raw = pd.read_csv(url)
# The two trailing unnamed columns are dropped.
df = sheet_raw.drop(columns=["Unnamed: 12", "Unnamed: 13"])

In [162]:
import math

# Sample size n0 / (1 + n0 / N), with n0 = z^2 * p(1-p) / e^2 and N the total
# number of citations.
# NOTE(review): 1.96 is presumably the z-score for 95 % confidence, 0.25 is
# p(1-p) at p = 0.5 and 0.05 the margin of error; confirm.
z_sq_times_pq = math.pow(1.96,2)*0.25
margin_sq = math.pow(0.05,2)
total_citations = df["gs_citations"].sum()
n_samples = math.ceil((z_sq_times_pq/margin_sq)/(1+(z_sq_times_pq/(margin_sq*total_citations))))
n_samples

366

In [167]:
# Draw a citation sample per dataset, proportional to its share of all
# citations, and write one Google Scholar URL per sampled citation position.
# The legacy global seed is kept on purpose so the drawn sample stays stable.
np.random.seed(42)
distro = df[["benchmark_parent_label", "gs_citations", "gsid"]].copy()
citations = df["gs_citations"].to_numpy()
distro["n_sample"] = (citations / citations.sum() * n_samples).round()
distro["n_sample_100"] = (citations / citations.sum() * 100).round()

# For every dataset pick n_sample distinct positions out of 1..gs_citations.
samples = {}
for label, n_citations, n_draws in zip(distro["benchmark_parent_label"], distro["gs_citations"], distro["n_sample"]):
    samples[label] = np.random.choice(np.arange(1, int(n_citations) + 1), size=int(n_draws), replace=False)

# A position i could be mapped to (page, position on page) for 10 results per
# page via (ceil(i / 10), i - (ceil(i / 10) - 1) * 10); not needed because
# the URLs below request one result at a time.

sample_df = pd.DataFrame([(key, val) for key in samples for val in samples[key]], columns=["benchmark_parent_label", "idx"])
sample_df = sample_df.sort_values(by=["benchmark_parent_label", "idx"])

sample_df = sample_df.merge(distro[["benchmark_parent_label", "gsid", "n_sample", "n_sample_100"]], how="left", on="benchmark_parent_label")
# NOTE(review): idx is 1-based while the `start` URL parameter is presumably a
# 0-based offset; confirm that start=idx addresses the intended result.
sample_df["gsid"] = "http://scholar.google.com/scholar?lr=lang_en&hl=en&as_sdt=2005&as_yhi=2021&cites=" + sample_df["gsid"].astype(str) + "&num=1&start=" + sample_df["idx"].astype(str)

# Flag the first n_sample_100 rows of every dataset (column "100").
# This is computed vectorised rather than by assigning through
# ex["100"].iloc[...] inside groupby.apply: that chained assignment writes to
# a temporary under pandas Copy-on-Write and leaves the flag at 0.
per_label_quota = sample_df.groupby("benchmark_parent_label")["n_sample_100"]
assert per_label_quota.nunique().eq(1).all()
position_in_group = sample_df.groupby("benchmark_parent_label").cumcount()
sample_df["100"] = (position_in_group < sample_df["n_sample_100"]).astype(int)

sample_df[["benchmark_parent_label", "idx", "gsid", "100"]].to_csv("sample.csv", index=False)


In [11]:
import requests
from bs4 import BeautifulSoup
import math
import re
import time

cite = "1033880884200484288"

# Making a GET request; the timeout keeps a stalled connection from hanging
# the kernel indefinitely.
r = requests.get(f'https://scholar.google.com/scholar?cites={cite}&as_yhi=2021&lr=lang_en', timeout=30)

# check status code for response received
# success code - 200
print(r.status_code)
r.raise_for_status()

# Parsing the HTML
soup = BeautifulSoup(r.content, 'html.parser')

# The result summary (e.g. "Ungefähr 1 190 Ergebnisse (0,02 Sek.)") sits in
# div#gs_ab_md > div.gs_ab_mdw. Fail with a clear message if it is missing
# instead of an AttributeError on None.
summary_bar = soup.find("div", id="gs_ab_md")
count_box = summary_bar.find("div", class_="gs_ab_mdw") if summary_bar is not None else None
if count_box is None:
    raise RuntimeError("Result count element not found in the Google Scholar response")

number = count_box.text
print(number)
# Take the text before the timing info in parentheses and keep only digits,
# so the parse does not depend on the UI language or the thousands separator.
digits = re.sub(r"\D", "", number.split("(")[0])
if not digits:
    raise RuntimeError(f"Could not parse a result count from: {number!r}")
number = int(digits)
# Number of result pages, at 10 results per page.
n_pages = math.ceil(number/10)
n_pages



200
Ungefähr 1 190 Ergebnisse (0,02 Sek.)


119