### Exploring the annotated data.


In [2]:
from datasets import load_dataset

# Pointing load_dataset at the directory is enough: the generic loader
# script is inferred from the data files found there.
ds = load_dataset(path="./data/annotated/")
ds

Resolving data files:   0%|          | 0/95 [00:00<?, ?it/s]

DatasetDict({
    train: Dataset({
        features: ['id', 'name', 'author', 'description', 'tags', 'likes', 'viewed', 'date', 'time_retrieved', 'image_code', 'image_inputs', 'common_code', 'sound_code', 'sound_inputs', 'buffer_a_code', 'buffer_a_inputs', 'buffer_b_code', 'buffer_b_inputs', 'buffer_c_code', 'buffer_c_inputs', 'buffer_d_code', 'buffer_d_inputs', 'cube_a_code', 'cube_a_inputs', 'license', 'thumbnail', 'access', 'wgpu-test'],
        num_rows: 19622
    })
})

In [3]:
# how much faster would datasets be...

from annotate import annotate_shader, try_shader
import os

# Load every raw .jsonl dump found in ./data/raw/ into a single dataset.
raw_files = [f for f in os.listdir("./data/raw/") if f.endswith(".jsonl")]
raw_ds = load_dataset("json", data_files=["./data/raw/" + f for f in raw_files])

# Lift the fields of each record's Shader -> info dict to top-level columns.
info_ds = raw_ds.map(lambda x: x["Shader"]["info"])
# annotated_ds = raw_ds.map(annotate, fn_kwargs={"test": False, "access": "shaders20k"}, batched=False, num_proc=4)

# Merge the "published" column from info_ds into ds, aligned by "id".
# add_column() attaches values purely by row position, and the raw files are
# listed in arbitrary os.listdir() order, so look each value up by shader id
# instead of trusting that both datasets share the same row order.
# NOTE(review): assumes the info dict carries an "id" key matching ds's "id"
# column - confirm against the raw data. A missing id raises KeyError on purpose.
published_by_id = dict(zip(info_ds["train"]["id"], info_ds["train"]["published"]))
# Guard so that re-running this cell does not fail on an already-present column.
if "published" not in ds["train"].column_names:
    ds["train"] = ds["train"].add_column(
        "published",
        [published_by_id[shader_id] for shader_id in ds["train"]["id"]],
    )

ds

Resolving data files:   0%|          | 0/95 [00:00<?, ?it/s]

DatasetDict({
    train: Dataset({
        features: ['id', 'name', 'author', 'description', 'tags', 'likes', 'viewed', 'date', 'time_retrieved', 'image_code', 'image_inputs', 'common_code', 'sound_code', 'sound_inputs', 'buffer_a_code', 'buffer_a_inputs', 'buffer_b_code', 'buffer_b_inputs', 'buffer_c_code', 'buffer_c_inputs', 'buffer_d_code', 'buffer_d_inputs', 'cube_a_code', 'cube_a_inputs', 'license', 'thumbnail', 'access', 'wgpu-test', 'published'],
        num_rows: 19622
    })
})

In [4]:
from annotate import try_shader
# TODO: make async for multithreading?
def run_test(ex):
    """Run the image pass of one raw shader record through ``try_shader``.

    Args:
        ex: A raw record. Only ``ex["Shader"]["renderpass"]`` is read here: an
            iterable of passes, each with a ``"type"`` and a ``"code"`` entry.
            The whole record is forwarded to ``try_shader`` as ``shader_data``.

    Returns:
        ``{"wgpu-test": "untested"}`` when the record has no image pass or the
        image pass has empty code; otherwise ``{"wgpu-test": res}`` where
        ``res`` is whatever ``try_shader`` returned.
    """
    # Must be bound before the loop: a record without any "image" pass used to
    # reach the check below with image_code undefined (UnboundLocalError).
    image_code = None
    for rp in ex["Shader"]["renderpass"]:
        if rp["type"] == "image":
            # If several image passes exist, the last one wins.
            image_code = rp["code"]
    if not image_code:
        return {"wgpu-test": "untested"}

    res = try_shader(shader_data=ex, image_code=image_code)
    return {"wgpu-test": res}
# hangs on 344
# raw_ds = raw_ds.map(run_test, num_proc=1, batched=False)

In [7]:
from wgpu_shadertoy import Shadertoy
# Row index of the shader to inspect.
k = 344
# Read the single row once instead of materialising two whole columns.
meta = ds["train"][k]
print(f'https://shadertoy.com/view/{meta["id"]}, {meta["published"]}')
# Use k here as well (it was hard-coded to 344), so the printed URL and the
# rendered shader always refer to the same row index.
# NOTE(review): this still assumes ds and raw_ds share the same row order - confirm.
shader = Shadertoy.from_json(raw_ds["train"][k])
# shader = Shadertoy.from_id("lXK3WV")
shader.show()

https://shadertoy.com/view/XdfSDB


RFBOutputContext()

In [4]:
# Close the canvas object held by the shader. This reaches into the private
# `_canvas` attribute - presumably the widget opened by shader.show(); TODO confirm.
shader._canvas.close()

In [None]:
# list of permissive licenses like: https://huggingface.co/datasets/bigcode/the-stack-v2/blob/main/license_stats.csv
print(ds["train"].unique("license"))
# Filter permissive licenses with an exact membership test. `in` against a
# plain string is a substring check, so values such as "", "it" or "apache"
# would wrongly have passed the filter.
permissive_licenses = {"mit", "apache-2.0"}
ds["train"].filter(lambda x: x["license"] in permissive_licenses)

In [None]:
# Keep only shaders where at least one of the buffer A-D passes has code,
# i.e. the four code strings concatenated are non-empty (seemingly none?).
ds["train"].filter(
    lambda x: "".join(x[f"buffer_{letter}_code"] for letter in "abcd") != ""
)

In [None]:
import pandas as pd
# Convert to pandas and parse the "date" column, read as a unix timestamp
# in seconds, into datetimes.
df = ds["train"].to_pandas().assign(
    date=lambda frame: pd.to_datetime(frame["date"].astype(int), unit="s")
)
# Bar plot of the number of entries per calendar month.
df["date"].groupby(df["date"].dt.to_period("M")).count().plot(
    kind="bar", figsize=(14, 6)
)

In [None]:
# Share of each license per month: count entries per (month, license), pivot
# licenses into columns, then scale every row by its total so the stacked
# bars are stretched to the same height.
licenses = (
    df["date"]
    .groupby([df["date"].dt.to_period("M"), df["license"]])
    .count()
    .unstack()
    .pipe(lambda counts: counts.div(counts.sum(axis=1), axis=0))
)
licenses.plot(kind="bar", stacked=True, figsize=(14, 6), mark_right=True, legend=False)

In [None]:
# Share of each publishing privacy setting per month: count entries per
# (month, published), pivot the settings into columns, then scale every row
# by its total so the stacked bars are stretched to the same height.
privacies = (
    df["date"]
    .groupby([df["date"].dt.to_period("M"), df["published"]])
    .count()
    .unstack()
    .pipe(lambda counts: counts.div(counts.sum(axis=1), axis=0))
)
privacies.plot(kind="bar", stacked=True, figsize=(14, 6), mark_right=True, legend=True)