In [None]:
from pathlib import Path
import sys
# Locate the project root: the nearest directory, starting at the current
# working directory and walking up through its parents, that contains a
# pyproject.toml. Falls back to the current working directory when no such
# directory exists.
_cwd = Path.cwd()
project_root = next(
    (candidate for candidate in (_cwd, *_cwd.parents) if (candidate / "pyproject.toml").exists()),
    _cwd,
)

# Make the root importable. Skip the append when the entry is already present
# so that re-running this cell does not pile up duplicate sys.path entries.
if str(project_root) not in sys.path:
    sys.path.append(str(project_root))

In [None]:
# Local location of the SuperKing parquet file. It is passed to
# read_soar_parquet below (and to download_superking when that call is enabled).
# NOTE: hard-coded to /tmp; adjust on systems where that directory is unavailable.
superking_path = "/tmp/superking.parquet"

In [None]:
from llm_python.datasets.superking import download_superking

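# Download step, left disabled. Uncomment the call below to run it against
# `superking_path` (presumably it writes the dataset there; confirm in
# llm_python.datasets.superking).
# download_superking(superking_path)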

In [None]:
from llm_python.datasets.io import read_soar_parquet
from llm_python.utils.task_loader import get_task_loader


# Read the SuperKing dataset from the parquet file at superking_path.
superking_df = read_soar_parquet(superking_path)

# Collect the ids of the tasks in the "arc-prize-2024/training" subset.
# get_subset_tasks yields pairs; only the first element (the id) is kept.
task_loader = get_task_loader()
arc_1_train_task_ids = [
    subset_task_id
    for subset_task_id, _subset_task in task_loader.get_subset_tasks("arc-prize-2024/training")
]

# Keep only the rows whose task_id is in that subset. The copy gives an
# independent frame, so later column assignments do not touch the unfiltered data.
in_arc_1_train = superking_df["task_id"].isin(arc_1_train_task_ids)
superking_df = superking_df[in_arc_1_train].copy()

In [None]:
import matplotlib.pyplot as plt

# Length of every "code" entry, computed once and reused for both the
# summary line and the histogram.
code_lengths = superking_df["code"].str.len()

# Quick summary of the filtered dataset.
print(len(superking_df), "rows in SuperKing after filtering to ARC-1 training tasks.")
print("Unique task_ids:", superking_df["task_id"].nunique())
print("Max code length:", code_lengths.max())

# Histogram of code lengths, drawn on an explicit figure/axes pair rather than
# through pyplot's implicit "current figure" state.
fig, ax = plt.subplots(figsize=(10, 6))
ax.hist(code_lengths, bins=50, color='skyblue', edgecolor='black')
ax.set_title("Distribution of Code Lengths")
ax.set_xlabel("Code Length")
ax.set_ylabel("Frequency")
plt.show()

In [None]:
# Upper bound on the length of a "code" entry. Defined once so the filter and
# the message below cannot drift apart.
MAX_CODE_LENGTH = 5000

# Drop rows whose code is longer than the cap. Rows with a missing code value
# are dropped as well, because their length compares as False against the cap.
within_cap = superking_df["code"].str.len() <= MAX_CODE_LENGTH
superking_df = superking_df[within_cap].copy()

# Order rows from longest to shortest code.
# NOTE(review): presumably so that texts of similar length are embedded
# together in the next step; confirm the intent.
superking_df = superking_df.sort_values(by="code", key=lambda codes: codes.str.len(), ascending=False)
print(f"Rows after capping code length to {MAX_CODE_LENGTH}: {len(superking_df)}")

In [None]:
from sentence_transformers import SentenceTransformer

# Instantiate the sentence-transformers model identified by
# "Qwen/Qwen3-Embedding-0.6B"; it is used further down to embed the "code" column.
# NOTE(review): presumably the weights are fetched on first use, so this cell
# may need network access; confirm for offline runs.
model = SentenceTransformer("Qwen/Qwen3-Embedding-0.6B")

In [None]:
import numpy as np
import tqdm
import torch

# Compute one embedding per "code" entry.
# astype(str) guarantees that every item handed to the model is a Python string.
code_texts = superking_df["code"].astype(str).tolist()

batch_size = 64

# encode() splits its input into batches of `batch_size` and can show its own
# progress bar, so no manual slicing loop is needed. Wrapping the result in
# list() keeps `embeddings` as a list with one vector per input text, in the
# same order as `code_texts`.
embeddings = list(model.encode(code_texts, batch_size=batch_size, show_progress_bar=True))

# Attach the embeddings as a new column. A plain list is assigned by position,
# so row i of the dataframe receives the embedding of code_texts[i].
superking_df["code_embedding"] = embeddings

In [None]:
# File that receives the dataframe together with its "code_embedding" column.
# The path is relative, so it resolves against the kernel's current working directory.
embeddings_path = "superking_with_embeddings.parquet"

In [None]:
# Persist the dataframe (including "code_embedding") as parquet.
# index=False leaves the dataframe index out of the written file.
superking_df.to_parquet(embeddings_path, index=False)


In [None]:
import pandas as pd

# Read the file written above back into a fresh dataframe.
# dtype_backend="pyarrow" asks pandas for pyarrow-backed column dtypes.
# NOTE(review): presumably a round-trip check of the saved file; confirm the intent.
df_with_embeddings = pd.read_parquet(embeddings_path, dtype_backend="pyarrow")

In [None]:
# Preview the first rows of the reloaded dataframe.
df_with_embeddings.head()