8 changes: 5 additions & 3 deletions requirements.txt
@@ -1,5 +1,7 @@
# Frameworks
torch==2.5.0
# we use the latest PyTorch stable release
torch==2.9.0

# we shall upgrade torch for blackwell when it is stable
transformers
datasets
@@ -8,7 +10,6 @@ modal
# DSLs
nvidia-cutlass-dsl
tilelang
apache-tvm

# helper
tqdm
@@ -22,6 +23,7 @@ einops
dotenv
numpy

openai
# use litellm for cloud providers and openai for local
openai
litellm[proxy]

4 changes: 3 additions & 1 deletion results/timing/README.md
@@ -6,7 +6,9 @@ This folder contains a set of baseline timing results for the KernelBench problems
Since KernelBench measures the speedup between Runtime(reference architecture) and Runtime(LLM-generated architecture), it is important to measure the baseline reference module runtime.

We have provided a set of baseline results for the KernelBench problems on a variety of hardware as well as various PyTorch configurations.
All baselines are run with PyTorch `2.5.0+cu124` and CUDA `12.4`.
All (current) baselines are run with PyTorch `2.5.0+cu124` and CUDA `12.4`.

Note: we will update these baselines soon with PyTorch `2.9.0` and CUDA `12.8`.

For timing, we measure wall clock time. We warm up 3 times and collect runtime statistics for 100 trials.

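As a rough illustration of this protocol (not the repository's exact timing code; the function and parameter names below are hypothetical), wall-clock timing with warm-up runs and repeated trials could look like:

```python
import time
import torch

def measure_runtime(module, inputs, num_warmup=3, num_trials=100):
    # Warm-up runs are not recorded, so compilation and caching do not skew results
    for _ in range(num_warmup):
        module(*inputs)
    torch.cuda.synchronize()

    times_ms = []
    for _ in range(num_trials):
        start = time.perf_counter()
        module(*inputs)
        torch.cuda.synchronize()  # wait for queued GPU work before stopping the clock
        times_ms.append((time.perf_counter() - start) * 1e3)

    return {"mean": sum(times_ms) / len(times_ms), "min": min(times_ms), "max": max(times_ms)}
```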
43 changes: 20 additions & 23 deletions scripts/eval_from_generations.py
@@ -55,7 +55,7 @@
app = modal.App("eval_from_generations_modal")
gpu_arch_mapping = {"L40S": ["Ada"], "H100": ["Hopper"], "A100": ["Ampere"], "L4": ["Ada"], "T4": ["Turing"], "A10G": ["Ampere"]}

cuda_version = "12.4.0" # should be no greater than host CUDA version
cuda_version = "12.8.0" # should be no greater than host CUDA version
flavor = "devel" # includes full CUDA toolkit
operating_sys = "ubuntu22.04"
tag = f"{cuda_version}-{flavor}-{operating_sys}"
@@ -67,19 +67,7 @@
"g++-10",
"clang"
)
.pip_install(
"numpy",
"packaging",
"pydra_config",
"torch==2.5.0",
"tqdm",
"datasets",
"transformers",
"pytest",
"ninja",
"utils",
"python-dotenv",
)
.pip_install_from_requirements(os.path.join(REPO_TOP_DIR, "requirements.txt"))
.add_local_dir(
KERNEL_BENCH_PATH,
remote_path="/root/KernelBench"
@@ -165,17 +153,18 @@ class WorkArgs:
# Modal Evaluation Class
# GPU must be specified here for all instances
# Retries are configured at the class level to handle GPU attachment failures
# @modal.concurrent: Each container handles exactly ONE evaluation at a time - prevents memory leaks
# scaledown_window=5 kills idle containers after 5 seconds
# Combined with 10s sleep between batches, this prevents container reuse and GPU corruption spread
@app.cls(
image=image,
image=image,
gpu="A10G",
scaledown_window=5, # Kill idle containers after 5 seconds
retries=modal.Retries(
max_retries=3,
backoff_coefficient=2.0,
initial_delay=1.0,
)
)
@modal.concurrent(max_inputs=1) # One input per container - prevents GPU memory leaks
class ModalEvaluator:

@modal.method()
@@ -230,11 +219,11 @@ def evaluate_single_sample_modal(
backend=backend,
precision=get_torch_dtype_from_string(precision),
)
# Force cleanup and exit to prevent container reuse and memory leaks

# Cleanup GPU cache before returning
torch.cuda.empty_cache()
return result # Never reached, but needed for type checking

return result


def fetch_ref_arch_from_problem_id(
@@ -482,7 +471,8 @@ def batch_eval_modal(
evaluator_cls = ModalEvaluator.with_options(gpu=config.gpu) if config.gpu != "A10G" else ModalEvaluator

# Spawn all tasks in parallel
# Each spawn creates a NEW container instance with a GPU
# Modal assigns these to available containers (may reuse warm containers from previous batches)
# To prevent GPU corruption spread, we sleep between batches to ensure containers scale down
futures = []
for item in work_items:
if item is None:
@@ -538,7 +528,14 @@ def batch_eval_modal(

print("-" * 128)
print(f"[Modal Batch] Evaluation took {end_time - start_time:.2f} seconds")


# Wait for containers to scale down before next batch
# This prevents container reuse and GPU corruption from spreading between batches
if len(total_work) > 0: # Only sleep if there are more batches
scaledown_wait = 10 # Wait 10 seconds (2x the scaledown_window) to ensure containers are killed
print(f"[Modal] Waiting {scaledown_wait}s for containers to scale down before next batch...")
time.sleep(scaledown_wait)

pbar.update(len(curr_work_batch))


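To summarize the batching strategy described in the comments above, here is a simplified sketch (not the script's actual code; `work_batches` and the helper name are hypothetical) of how each batch is spawned on Modal and how the pause lets idle containers scale down before the next batch:

```python
import time

def run_batches(evaluator, work_batches, scaledown_wait=10):
    all_results = []
    for batch in work_batches:
        # One remote call per work item; Modal may route these to warm containers
        futures = [evaluator.evaluate_single_sample_modal.spawn(*item) for item in batch]

        # Block until every evaluation in this batch has finished
        all_results.extend(f.get() for f in futures)

        # Sleep ~2x the scaledown_window so idle containers are torn down,
        # preventing a corrupted container/GPU from being reused by the next batch
        time.sleep(scaledown_wait)
    return all_results
```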
27 changes: 4 additions & 23 deletions scripts/generate_and_eval_single_sample_modal.py
@@ -86,7 +86,7 @@ def verbose_logging(self):
def __repr__(self):
return f"EvalConfig({self.to_dict()})"

cuda_version = "12.4.0" # should be no greater than host CUDA version
cuda_version = "12.8.0" # should be no greater than host CUDA version
flavor = "devel" # includes full CUDA toolkit
operating_sys = "ubuntu22.04"
tag = f"{cuda_version}-{flavor}-{operating_sys}"
@@ -96,29 +96,10 @@ def __repr__(self):
.apt_install("git",
"gcc-10",
"g++-10",
"clang" # note i skip a step
"clang" # note i skip a step
)
.pip_install( # required to build flash-attn
"numpy",
"openai",
"packaging",
"pydra_config",
"torch==2.5.0",
"tqdm",
"datasets",
"transformers",
"pytest",
"ninja",
"utils",
"tilelang",
"apache-tvm",
"python-dotenv",
"nvidia-cutlass-dsl",
"litellm[proxy]", # Unified LLM interface
"einops", # for numerics

)
.add_local_python_source("src")
.pip_install_from_requirements(os.path.join(REPO_TOP_DIR, "requirements.txt"))
.add_local_python_source("src")
)

@app.cls(image=image)
22 changes: 4 additions & 18 deletions scripts/generate_baseline_time_modal.py
@@ -55,7 +55,7 @@
batch_size = 10
gpu = "L40S"
timeout = 1800
cuda_version = "12.4.0" # should be no greater than host CUDA version
cuda_version = "12.8.0" # should be no greater than host CUDA version
flavor = "devel" # includes full CUDA toolkit
operating_sys = "ubuntu22.04"
tag = f"{cuda_version}-{flavor}-{operating_sys}"
@@ -65,28 +65,14 @@
.apt_install("git",
"gcc-10",
"g++-10",
"clang" # note i skip a step
"clang" # note i skip a step
)
.pip_install( # required to build flash-attn
# Let's unify these dependencies somewhere
"numpy",
"packaging",
"pydra_config",
"torch==2.5.0",
"tqdm",
"datasets",
"transformers",
"pytest",
"ninja",
"utils",
"einops",
"python-dotenv",
)
.pip_install_from_requirements(os.path.join(REPO_TOP_PATH, "requirements.txt"))
.add_local_dir(
KERNEL_BENCH_PATH,
remote_path="/root/KernelBench"
)
.add_local_python_source("src")
.add_local_python_source("src")
)

def write_batch_to_json(entries_to_write: list, f_path: str):
18 changes: 2 additions & 16 deletions scripts/run_and_check.py
@@ -27,29 +27,15 @@
REPO_TOP_PATH = os.path.abspath(os.path.join(os.path.dirname(__file__), ".."))
KERNEL_BENCH_PATH = os.path.join(REPO_TOP_PATH, "KernelBench")

cuda_version = "12.4.0"
cuda_version = "12.8.0"
flavor = "devel"
operating_sys = "ubuntu22.04"
tag = f"{cuda_version}-{flavor}-{operating_sys}"

image = (
modal.Image.from_registry(f"nvidia/cuda:{tag}", add_python="3.10")
.apt_install("git", "gcc-10", "g++-10", "clang")
.pip_install(
"numpy",
"packaging",
"pydra_config",
"torch==2.5.0",
"tqdm",
"datasets",
"transformers",
"pytest",
"ninja",
"utils",
"einops",
"python-dotenv",
"litellm[proxy]",
)
.pip_install_from_requirements(os.path.join(REPO_TOP_PATH, "requirements.txt"))
.add_local_dir(KERNEL_BENCH_PATH, remote_path="/root/KernelBench")
.add_local_python_source("src")
.add_local_python_source("scripts")
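Each of the scripts above now builds its Modal image from the same `requirements.txt` instead of a per-script `pip_install` list. A hypothetical shared helper (not present in the repository) capturing the common pattern might look like:

```python
import os
import modal

def build_eval_image(repo_top: str, cuda_tag: str = "12.8.0-devel-ubuntu22.04") -> modal.Image:
    # CUDA devel base image; the CUDA version should be no greater than the host's
    return (
        modal.Image.from_registry(f"nvidia/cuda:{cuda_tag}", add_python="3.10")
        .apt_install("git", "gcc-10", "g++-10", "clang")
        # Single source of truth for Python dependencies
        .pip_install_from_requirements(os.path.join(repo_top, "requirements.txt"))
        .add_local_dir(os.path.join(repo_top, "KernelBench"), remote_path="/root/KernelBench")
        .add_local_python_source("src")
    )
```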
9 changes: 1 addition & 8 deletions src/utils.py
@@ -71,14 +71,7 @@ def query_server(
):
"""
Query various sorts of LLM inference API providers
Supports:
- OpenAI
- Deepseek
- Together
- Sambanova
- Anthropic
- Gemini / Google AI Studio
- Fireworks (OpenAI compatibility)
Done through liteLLM:
- Local Server (SGLang, vLLM, Tokasaurus)
"""
# Local Server (SGLang, vLLM, Tokasaurus) - special handling
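The updated docstring reflects that cloud providers are routed through liteLLM while local OpenAI-compatible servers (SGLang, vLLM, Tokasaurus) get special handling. A minimal sketch of that split, with a hypothetical function name and arguments, might look like:

```python
from typing import Optional

from litellm import completion
from openai import OpenAI

def query_llm(prompt: str, model: str, local_base_url: Optional[str] = None) -> str:
    messages = [{"role": "user", "content": prompt}]

    if local_base_url is not None:
        # Local OpenAI-compatible server (e.g. SGLang or vLLM), addressed directly
        client = OpenAI(base_url=local_base_url, api_key="EMPTY")
        resp = client.chat.completions.create(model=model, messages=messages)
        return resp.choices[0].message.content

    # Cloud providers go through liteLLM's unified interface
    resp = completion(model=model, messages=messages)
    return resp.choices[0].message.content
```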