24 changes: 24 additions & 0 deletions .env.example
@@ -0,0 +1,24 @@
# API Keys for LLM Providers
# Copy this file to .env and fill in your actual API keys
# DO NOT commit your .env file with real keys!

# OpenAI (for GPT models and o1/o3 reasoning models)
OPENAI_API_KEY=sk-...

# Anthropic (for Claude models)
ANTHROPIC_API_KEY=sk-ant-api03-...

# Google Gemini
GEMINI_API_KEY=...

# DeepSeek
DEEPSEEK_API_KEY=sk-...

# Together AI
TOGETHER_API_KEY=...

# Fireworks AI
FIREWORKS_AI_API_KEY=...

# Local Server Deployment (SGLang, vLLM, Tokasaurus)
SGLANG_API_KEY=...
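
For illustration, here is a minimal sketch of how a script can pick these keys up at runtime using `python-dotenv` (listed as `dotenv` in `requirements.txt`); the key checked below is just the OpenAI example from above.

```python
# Minimal sketch: load API keys from a local .env file using python-dotenv.
import os

from dotenv import load_dotenv

# Reads .env from the working directory (or a parent) and populates os.environ,
# without overwriting variables already exported in the shell.
load_dotenv()

openai_key = os.environ.get("OPENAI_API_KEY")
if openai_key is None:
    raise RuntimeError("OPENAI_API_KEY is not set; copy .env.example to .env and fill it in.")
```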
4 changes: 2 additions & 2 deletions README.md
@@ -80,7 +80,7 @@ pip install -r requirements.txt
pip install -e .
```

To call LLM API providers, set your `{INFERENCE_SERVER_PROVIDER}_API_KEY` API key.
We use `litellm` for API calls. Please set your keys by creating a `.env` file following our `.env.example`.
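
As a quick illustration, here is a minimal `litellm` call that picks up the provider key from the environment; the model string and prompt below are examples, not KernelBench presets.

```python
# Minimal sketch: one-off completion through litellm, assuming OPENAI_API_KEY is set via .env.
import litellm
from dotenv import load_dotenv

load_dotenv()  # populate os.environ from the local .env file

response = litellm.completion(
    model="openai/gpt-4o-mini",  # example model string; the scripts select this via server presets
    messages=[{"role": "user", "content": "Write a CUDA kernel for vector addition."}],
    temperature=0.0,
    max_tokens=512,
)
print(response.choices[0].message.content)
```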

Running and profiling kernels require a GPU.
If you don't have a GPU available locally, you can set up [Modal](https://modal.com/). After creating an account, set up your Modal token by running `modal token new`. Then use the `generate_and_eval_single_sample_modal.py` script.
@@ -122,7 +122,7 @@ If you are using a different hardware, you can generate the baseline time with `
We provide some reference baseline times for a variety of NVIDIA GPUs across generations in `results/timing`, but we recommend you generate your own baseline times for more accurate results (cluster power, software version, etc. all affect timing results). See `results/timing/README.md` for more details.

### Multi-Turn Framework
We have also released the test-time framework [Caesar](https://github.com/simonguozirui/caesar) that is used in the multi-turn / iterative refinement experiments in our paper. You can use or modify this framework for high-throughput test-time scaling (both sequential and parallel) targeting KernelBench problems.
We have also released the test-time framework [Caesar](https://github.com/ScalingIntelligence/caesar) that is used in the multi-turn / iterative refinement experiments in our paper. You can use or modify this framework for high-throughput test-time scaling (both sequential and parallel) targeting KernelBench problems.

## 🛣️ Upcoming Roadmap
Check out our [roadmap](https://github.com/ScalingIntelligence/KernelBench/issues/74) for what we plan to add as features. We welcome community contributions in these directions.
5 changes: 1 addition & 4 deletions requirements.txt
@@ -20,9 +20,6 @@ einops
dotenv
numpy

# to deprecate with litellm
google-generativeai
together
openai
anthropic
litellm[proxy]

31 changes: 27 additions & 4 deletions scripts/generate_and_eval_single_sample.py
@@ -50,10 +50,15 @@ def __init__(self):
self.gpu_arch = ["Ada"]

# Inference config
self.server_type = "deepseek"
self.model_name = "deepseek-coder"
self.max_tokens = 4096
self.temperature = 0.0
self.server_type = None
self.model_name = None
self.max_tokens = None
self.temperature = None

# Reasoning model specific parameters
self.is_reasoning_model = False # set to True for o1, o3, Gemini 2.5 thinking, etc.
self.reasoning_effort = None # for o1/o3: "low", "medium", "high"
self.budget_tokens = 0 # for Claude extended thinking mode

# Logging
self.logdir = os.path.join(REPO_TOP_DIR, "results/eval_logs")
@@ -81,6 +86,21 @@ def main(config: EvalConfig):
"""
Keep it simple: Generate and evaluate a single sample
"""
from src.utils import SERVER_PRESETS

if config.server_type and config.server_type in SERVER_PRESETS:
preset = SERVER_PRESETS[config.server_type]
if config.model_name is None or config.model_name == "None":
config.model_name = preset.get("model_name", "None")
if config.max_tokens is None or config.max_tokens == "None":
config.max_tokens = preset.get("max_tokens", "None")
if config.temperature is None or config.temperature == "None":
config.temperature = preset.get("temperature", "None")

# Convert string boolean to actual boolean for reasoning model flag
if isinstance(config.is_reasoning_model, str):
config.is_reasoning_model = config.is_reasoning_model.lower() in ['true', '1', 'yes']

print(f"Starting Eval with config: {config}")

# Configurations
@@ -143,6 +163,9 @@ def main(config: EvalConfig):
max_tokens=config.max_tokens,
verbose=config.verbose,
time_generation=True,
is_reasoning_model=config.is_reasoning_model,
reasoning_effort=config.reasoning_effort,
budget_tokens=config.budget_tokens,
)

# Use appropriate prompt constructor based on backend
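
The preset fallback above assumes `SERVER_PRESETS` in `src/utils.py` maps each `server_type` to default generation parameters. A sketch of that shape is below; the entries and values are illustrative, not the repository's actual presets.

```python
# Illustrative sketch of the preset structure the fallback logic relies on.
# The keys mirror what the script reads via preset.get(...); the concrete values
# are hypothetical examples, not the real entries in src/utils.py.
SERVER_PRESETS = {
    "deepseek": {"model_name": "deepseek-chat", "max_tokens": 4096, "temperature": 0.0},
    "anthropic": {"model_name": "claude-3-7-sonnet", "max_tokens": 8192, "temperature": 0.0},
}

def apply_preset(config, presets=SERVER_PRESETS):
    """Fill unset config fields from the preset for config.server_type, if one exists."""
    preset = presets.get(config.server_type, {})
    for field in ("model_name", "max_tokens", "temperature"):
        if getattr(config, field) in (None, "None"):
            setattr(config, field, preset.get(field))
    return config
```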
38 changes: 30 additions & 8 deletions scripts/generate_and_eval_single_sample_modal.py
@@ -56,10 +56,15 @@ def __init__(self):


# Inference config
self.server_type = "deepseek"
self.model_name = "deepseek-coder"
self.max_tokens = 4096
self.temperature = 0.0
self.server_type = None
self.model_name = None
self.max_tokens = None
self.temperature = None

# Reasoning model specific parameters
self.is_reasoning_model = False # set to True for o1, o3, Gemini 2.5 thinking, etc.
self.reasoning_effort = None # for o1/o3: "low", "medium", "high"
self.budget_tokens = 0 # for Claude extended thinking mode

# Logging
self.logdir = os.path.join(REPO_TOP_DIR, "results/eval_logs")
@@ -94,7 +99,6 @@ def __repr__(self):
"clang" # note i skip a step
)
.pip_install( # required to build flash-attn
"anthropic",
"numpy",
"openai",
"packaging",
@@ -103,15 +107,15 @@ def __repr__(self):
"tqdm",
"datasets",
"transformers",
"google-generativeai",
"together",
"pytest",
"ninja",
"utils",
# "tilelang", # commented out - not working currently
#"apache-tvm",
"python-dotenv",
"nvidia-cutlass-dsl",
"litellm[proxy]", # Unified LLM interface
"einops", # for numerics

)
.add_local_python_source("src")
@@ -140,6 +144,21 @@ def main(config: EvalConfig):
"""
Keep it simple: Generate and evaluate a single sample
"""
from src.utils import SERVER_PRESETS

if config.server_type and config.server_type in SERVER_PRESETS:
preset = SERVER_PRESETS[config.server_type]
if config.model_name is None or config.model_name == "None":
config.model_name = preset.get("model_name", "None")
if config.max_tokens is None or config.max_tokens == "None":
config.max_tokens = preset.get("max_tokens", "None")
if config.temperature is None or config.temperature == "None":
config.temperature = preset.get("temperature", "None")

# Convert string boolean to actual boolean for reasoning model flag
if isinstance(config.is_reasoning_model, str):
config.is_reasoning_model = config.is_reasoning_model.lower() in ['true', '1', 'yes']

print(f"Starting Eval with config: {config}")

# Configurations
@@ -187,7 +206,10 @@ def main(config: EvalConfig):
temperature=config.temperature,
max_tokens=config.max_tokens,
verbose=config.verbose,
time_generation=True)
time_generation=True,
is_reasoning_model=config.is_reasoning_model,
reasoning_effort=config.reasoning_effort,
budget_tokens=config.budget_tokens)



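
The new `is_reasoning_model`, `reasoning_effort`, and `budget_tokens` fields are passed through to the inference server factory. One plausible way such parameters map onto a `litellm` call is sketched below; which kwargs are actually accepted depends on the provider and `litellm` version, so treat this as an assumption rather than the repository's implementation.

```python
# Hedged sketch: threading reasoning-model parameters into a litellm call.
# reasoning_effort / thinking are provider-specific; verify against your litellm version.
import litellm

def query_reasoning_model(model_name, prompt, is_reasoning_model=False,
                          reasoning_effort=None, budget_tokens=0, max_tokens=4096):
    kwargs = {
        "model": model_name,
        "messages": [{"role": "user", "content": prompt}],
        "max_tokens": max_tokens,
    }
    if is_reasoning_model:
        if reasoning_effort is not None:
            kwargs["reasoning_effort"] = reasoning_effort  # o1/o3 style: "low" / "medium" / "high"
        if budget_tokens:
            kwargs["thinking"] = {"type": "enabled", "budget_tokens": budget_tokens}  # Claude extended thinking
    response = litellm.completion(**kwargs)
    return response.choices[0].message.content
```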
56 changes: 48 additions & 8 deletions scripts/generate_samples.py
@@ -55,10 +55,15 @@ def __init__(self):
self.api_query_interval = 0.0

# Inference config
self.server_type = "deepseek"
self.model_name = "deepseek-coder"
self.max_tokens = 4096
self.server_type = None
self.model_name = None
[Inline review thread]
Collaborator: So are we not specifying a default? This could also work, and we'd just expect them to use a preset here?
Author (pythonomar22, Oct 31, 2025): Updated to reflect the Slack thread, but yeah.

self.max_tokens = None
self.temperature = 0.0

# Reasoning model specific parameters
self.is_reasoning_model = False # set to True for o1, o3, Gemini 2.5 thinking, etc.
self.reasoning_effort = "low" # for o1/o3: "low", "medium", "high"
self.budget_tokens = 0 # for Claude extended thinking mode

# Logging
# Top Directory to Store Runs
@@ -192,6 +197,21 @@ def main(config: GenerationConfig):
Batch Generate Samples for Particular Level
Store generated kernels in the specified run directory
"""
from src.utils import SERVER_PRESETS

if config.server_type and config.server_type in SERVER_PRESETS:
preset = SERVER_PRESETS[config.server_type]
if config.model_name is None or config.model_name == "None":
config.model_name = preset.get("model_name", "None")
if config.max_tokens is None or config.max_tokens == "None":
config.max_tokens = preset.get("max_tokens", "None")
if config.temperature is None or config.temperature == "None":
config.temperature = preset.get("temperature", "None")

# Convert string boolean to actual boolean for reasoning model flag
if isinstance(config.is_reasoning_model, str):
config.is_reasoning_model = config.is_reasoning_model.lower() in ['true', '1', 'yes']

print(f"Starting Batch Generation with config: {config}")

# Dataset Configurations
@@ -217,6 +237,10 @@

# set up run directory
run_dir = os.path.join(config.runs_dir, config.run_name)
run_exists = os.path.exists(run_dir)
if run_exists:
print(f"\n⚠️ WARNING: Run directory already exists: {run_dir}")
print(f" Existing kernels will be skipped. Use a different run_name for a fresh run.\n")
os.makedirs(run_dir, exist_ok=True)
pydra.save_yaml(config.to_dict(), os.path.join(run_dir, "generation_config.yaml"))

@@ -225,14 +249,22 @@
), "supporting local file-system based storage for now" # database integreation coming soon, need to migrate from CUDA Monkeys code

problems_to_run = []
total_problems = 0
already_completed = 0
for problem_id in range(
problem_id_range.start, problem_id_range.stop + 1
): # end index is inclusive
for sample_id in range(config.num_samples):
total_problems += 1
if not check_kernel_exists(run_dir, config.level, problem_id, sample_id):
problems_to_run.append(
WorkArgs(problem_id=int(problem_id), sample_id=sample_id)
)
else:
already_completed += 1

if already_completed > 0:
print(f"📁 Found {already_completed}/{total_problems} kernels already generated. Generating remaining {len(problems_to_run)} kernels.")

# Create inference function with config parameters
# We provide some presets in utils but you can also pass in your own, see query_server for more details
@@ -242,6 +274,9 @@
temperature=config.temperature,
max_tokens=config.max_tokens,
verbose=config.verbose,
is_reasoning_model=config.is_reasoning_model,
reasoning_effort=config.reasoning_effort,
budget_tokens=config.budget_tokens,
)

# Launch workers
@@ -258,11 +293,16 @@
)

num_generated_samples = len(generation_results)
total_problems = len(problems_to_run)
num_failed_problems = total_problems - num_generated_samples
print(
f"Generated {num_generated_samples} samples for total {total_problems} problems, Please retry for the {num_failed_problems} failed problems."
)
num_attempted = len(problems_to_run)
num_failed_problems = num_attempted - num_generated_samples

if num_attempted == 0:
print(f"\n✅ All {total_problems} kernels already exist in {run_dir}")
print(f" Use a different run_name if you want to generate fresh samples.\n")
else:
print(
f"\nGenerated {num_generated_samples} samples for total {num_attempted} problems, Please retry for the {num_failed_problems} failed problems."
)


if __name__ == "__main__":
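
The resume behavior above hinges on `check_kernel_exists`. A minimal sketch of that kind of check is below, assuming a hypothetical one-file-per-sample layout; the repository's actual naming scheme may differ.

```python
# Minimal sketch of a resume check, using a *hypothetical* file layout
# (one generated kernel file per problem/sample pair).
import os

def check_kernel_exists(run_dir: str, level: int, problem_id: int, sample_id: int) -> bool:
    """Return True if a generated kernel for this (level, problem, sample) is already on disk."""
    path = os.path.join(
        run_dir, f"level_{level}_problem_{problem_id}_sample_{sample_id}_kernel.py"
    )
    return os.path.isfile(path) and os.path.getsize(path) > 0
```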