In [1]:
import seaborn as sns
import json

# Load the list of per-paper result records (path is relative to this notebook).
# JSON is UTF-8 by specification, so decode explicitly rather than relying on
# the platform's default locale encoding.
with open('../data/2k_p1_v23_f20.json', encoding='utf-8') as f:
    data = json.load(f)["results"]

In [2]:
# Pie-chart helper: plot the share of loaded records in which a given field is True (matplotlib draws the pie; seaborn only supplies the colour palette)

import matplotlib.pyplot as plt

def make_pie_chart(field, title, field_name, items=None):
    """Draw a True/False pie chart for one field and print the underlying counts.

    A record is counted as "True" only when ``item.get(field) is True``; a
    missing key, ``None`` or any other value is counted as "False".

    Args:
        field: Key looked up on every record (e.g. ``"is_code_used"``).
        title: Title displayed above the chart.
        field_name: Human-readable label used for the two pie slices.
        items: Optional iterable of dict records to summarise. When omitted the
            notebook-level ``data`` list is used, so existing calls keep working.
    """
    records = data if items is None else list(items)

    # Count occurrences
    true_count = sum(1 for item in records if item.get(field) is True)
    false_count = len(records) - true_count

    # A pie chart of zero records is meaningless, so only draw when there is data.
    if records:
        labels = [f'{field_name}: True', f'{field_name}: False']
        sizes = [true_count, false_count]

        # First two colours of seaborn's pastel palette, one per slice.
        colors = sns.color_palette('pastel')[0:2]
        explode = (0.05, 0)  # "explode" the 'True' slice slightly for emphasis

        plt.figure(figsize=(8, 8))
        plt.pie(sizes, explode=explode, labels=labels, colors=colors,
                autopct='%1.1f%%', shadow=True, startangle=140,
                textprops={'fontsize': 12})
        plt.axis('equal')  # Equal aspect ratio ensures that pie is drawn as a circle.
        plt.title(title, fontsize=14)
        plt.show()

    # Report the counts for the field that was actually charted (the summary
    # previously always said 'code_needed', regardless of `field`).
    print(f"Total objects: {len(records)}")
    print(f"Objects with '{field}': True - {true_count}")
    print(f"Objects with '{field}': False - {false_count}")
    

In [None]:
# Share of records whose "is_code_used" field is exactly True; records come from the notebook-level `data`.
make_pie_chart("is_code_used", "Proportion of papers that used code", "Code Used")

In [None]:
# Share of records whose "is_data_used" field is exactly True; records come from the notebook-level `data`.
make_pie_chart("is_data_used", "Proportion of papers that used data", "Data Used")

In [None]:
# Sanity check: report every record where exactly one of the two usage flags is set.
# Only records that carry both flags are compared. The previous guard required both
# values to be truthy, which made the mismatch test unreachable for boolean flags.
for item in data:
    data_used = item.get("is_data_used")
    code_used = item.get("is_code_used")
    if data_used is None or code_used is None:
        continue
    if bool(data_used) != bool(code_used):
        print("Different!!")

In [None]:
# Load the forms JSON (path is relative to this notebook). It is bound to its own
# name so the `data` list read by make_pie_chart is not silently replaced.
with open('../data/form_MVD/forms_rematched_5_20.json', encoding='utf-8') as f:
    forms = json.load(f)


# The two fields are compared against different literals ("TRUE" for data,
# "Yes" for code). Print the DOI of every form where exactly one of them matches.
# Forms where either field is missing or empty are skipped.
for form in forms:
    data_needed = form.get("is_data_needed_for_reproduce")
    code_needed = form.get("is_code_needed_for_reproduce")
    if data_needed and code_needed:
        if (data_needed == "TRUE") ^ (code_needed == "Yes"):
            print("Different!! ", form.get("doi"))

In [3]:
import plotly.graph_objects as go


# Reload the results so this cell does not depend on what `data` currently holds.
# JSON is UTF-8 by specification, so decode explicitly instead of using the locale default.
with open('../data/2k_p1_v23_f20.json', encoding='utf-8') as f:
    data = json.load(f)["results"]

# Sankey node labels; a label's position in this list is the index used for it in the links.
nodes = [
    "All",
    "Data Used",
    "Data Not Used",
    "0 datasets",
    "1-2 datasets",
    "3-5 datasets",
    "6+ datasets",
    "All Data Available",
    "Some Data Available",
    "No Data Available",
]
node_map = {label: index for index, label in enumerate(nodes)}
from collections import defaultdict
# (source index, target index) -> accumulated flow on that link
aggregated_flows = defaultdict(int)

# Raw per-call endpoint lists appended to by add_flow; the figure itself is
# built from aggregated_flows.
sankey_source = []
sankey_target = []
sankey_value = []

# A helper to add flows
def add_flow(source_node, target_node, count=1):
    """Record ``count`` units of flow on the link ``source_node -> target_node``.

    Args:
        source_node: Label of the link's origin; must be a key of ``node_map``.
        target_node: Label of the link's destination; must be a key of ``node_map``.
        count: Amount to add to the link (defaults to 1). This argument was
            previously ignored and every call added exactly 1.

    Raises:
        KeyError: If either label is not present in ``node_map``.
    """
    src = node_map[source_node]
    dst = node_map[target_node]
    # Keep the three raw per-call lists aligned with each other.
    sankey_source.append(src)
    sankey_target.append(dst)
    sankey_value.append(count)
    aggregated_flows[(src, dst)] += count

def _dataset_bucket(num_datasets):
    """Return the Sankey node label for a given number of datasets."""
    if num_datasets == 0:
        return "0 datasets"
    if num_datasets <= 2:
        return "1-2 datasets"
    if num_datasets <= 5:
        return "3-5 datasets"
    return "6+ datasets"


def _availability_label(num_available, num_datasets):
    """Return the final-column node label: none, all, or some datasets available."""
    if num_available == 0:
        return "No Data Available"
    if num_available == num_datasets:
        return "All Data Available"
    return "Some Data Available"


counter = 0  # number of non-editorial items that entered the diagram

for item in data:
    # Editorials are left out of the diagram entirely.
    if item["is_editorial"]:
        continue
    counter += 1

    # Stage 1: "All" -> whether the paper used data.
    usage_node = "Data Used" if item["is_data_used"] else "Data Not Used"
    add_flow("All", usage_node)

    # Stage 2: usage -> dataset-count bucket. A missing (or null) "datasets"
    # entry counts as zero datasets for both usage branches; the "Data Used"
    # branch previously raised KeyError when the key was absent.
    datasets = item.get("datasets") or []
    bucket_node = _dataset_bucket(len(datasets))
    add_flow(usage_node, bucket_node)

    # Stage 3: bucket -> availability. Only an explicit True counts as
    # available. With zero datasets num_avail is 0, so the item lands in
    # "No Data Available", as before.
    num_avail = sum(
        1 for dataset in datasets
        if dataset["is_dataset_publically_available"] is True
    )
    add_flow(bucket_node, _availability_label(num_avail, len(datasets)))
        

# Split the (source, target) -> value mapping into the three parallel lists
# used for the Sankey links; all three follow the mapping's iteration order.
final_source = [source_index for source_index, _ in aggregated_flows]
final_target = [target_index for _, target_index in aggregated_flows]
final_value = list(aggregated_flows.values())

# Assemble the Sankey trace from separately named node and link settings.
node_style = dict(
    pad=15,
    thickness=20,
    line=dict(color="black", width=0.5),
    label=nodes,   # labels come from the `nodes` list
    color="blue",  # single colour applied to every node
)
link_style = dict(
    source=final_source,
    target=final_target,
    value=final_value,
    color="rgba(0,0,0,0.2)",  # translucent black links
)
sankey_trace = go.Sankey(node=node_style, link=link_style)

fig = go.Figure(data=[sankey_trace])
fig.update_layout(title_text="Data Availability", font_size=10)
fig.show()

# `counter` is incremented once per non-editorial item by the loop above.
print(counter)

1965


In [4]:
import plotly.graph_objects as go


# NOTE(review): this cell appears to repeat the preceding Sankey cell; consider keeping only one.
# Reload the results so this cell does not depend on what `data` currently holds.
# JSON is UTF-8 by specification, so decode explicitly instead of using the locale default.
with open('../data/2k_p1_v23_f20.json', encoding='utf-8') as f:
    data = json.load(f)["results"]

# Sankey node labels; a label's position in this list is the index used for it in the links.
nodes = [
    "All",
    "Data Used",
    "Data Not Used",
    "0 datasets",
    "1-2 datasets",
    "3-5 datasets",
    "6+ datasets",
    "All Data Available",
    "Some Data Available",
    "No Data Available",
]
node_map = {label: index for index, label in enumerate(nodes)}
from collections import defaultdict
# (source index, target index) -> accumulated flow on that link
aggregated_flows = defaultdict(int)

# Raw per-call endpoint lists appended to by add_flow; the figure itself is
# built from aggregated_flows.
sankey_source = []
sankey_target = []
sankey_value = []

# A helper to add flows
def add_flow(source_node, target_node, count=1):
    """Record ``count`` units of flow on the link ``source_node -> target_node``.

    Args:
        source_node: Label of the link's origin; must be a key of ``node_map``.
        target_node: Label of the link's destination; must be a key of ``node_map``.
        count: Amount to add to the link (defaults to 1). This argument was
            previously ignored and every call added exactly 1.

    Raises:
        KeyError: If either label is not present in ``node_map``.
    """
    src = node_map[source_node]
    dst = node_map[target_node]
    # Keep the three raw per-call lists aligned with each other.
    sankey_source.append(src)
    sankey_target.append(dst)
    sankey_value.append(count)
    aggregated_flows[(src, dst)] += count

def _dataset_bucket(num_datasets):
    """Return the Sankey node label for a given number of datasets."""
    if num_datasets == 0:
        return "0 datasets"
    if num_datasets <= 2:
        return "1-2 datasets"
    if num_datasets <= 5:
        return "3-5 datasets"
    return "6+ datasets"


def _availability_label(num_available, num_datasets):
    """Return the final-column node label: none, all, or some datasets available."""
    if num_available == 0:
        return "No Data Available"
    if num_available == num_datasets:
        return "All Data Available"
    return "Some Data Available"


counter = 0  # number of non-editorial items that entered the diagram

for item in data:
    # Editorials are left out of the diagram entirely.
    if item["is_editorial"]:
        continue
    counter += 1

    # Stage 1: "All" -> whether the paper used data.
    usage_node = "Data Used" if item["is_data_used"] else "Data Not Used"
    add_flow("All", usage_node)

    # Stage 2: usage -> dataset-count bucket. A missing (or null) "datasets"
    # entry counts as zero datasets for both usage branches; the "Data Used"
    # branch previously raised KeyError when the key was absent.
    datasets = item.get("datasets") or []
    bucket_node = _dataset_bucket(len(datasets))
    add_flow(usage_node, bucket_node)

    # Stage 3: bucket -> availability. Only an explicit True counts as
    # available. With zero datasets num_avail is 0, so the item lands in
    # "No Data Available", as before.
    num_avail = sum(
        1 for dataset in datasets
        if dataset["is_dataset_publically_available"] is True
    )
    add_flow(bucket_node, _availability_label(num_avail, len(datasets)))
        

# Split the (source, target) -> value mapping into the three parallel lists
# used for the Sankey links; all three follow the mapping's iteration order.
final_source = [source_index for source_index, _ in aggregated_flows]
final_target = [target_index for _, target_index in aggregated_flows]
final_value = list(aggregated_flows.values())

# Assemble the Sankey trace from separately named node and link settings.
node_style = dict(
    pad=15,
    thickness=20,
    line=dict(color="black", width=0.5),
    label=nodes,   # labels come from the `nodes` list
    color="blue",  # single colour applied to every node
)
link_style = dict(
    source=final_source,
    target=final_target,
    value=final_value,
    color="rgba(0,0,0,0.2)",  # translucent black links
)
sankey_trace = go.Sankey(node=node_style, link=link_style)

fig = go.Figure(data=[sankey_trace])
fig.update_layout(title_text="Data Availability", font_size=10)
fig.show()

# `counter` is incremented once per non-editorial item by the loop above.
print(counter)

1965
