8 changes: 5 additions & 3 deletions requirements.txt
@@ -1,5 +1,7 @@
# Frameworks
torch==2.5.0
# we use the latest PyTorch stable release
torch==2.9.0

# we shall upgrade torch for blackwell when it is stable
transformers
datasets
@@ -8,7 +10,6 @@ modal
# DSLs
nvidia-cutlass-dsl
tilelang
apache-tvm

# helper
tqdm
@@ -22,6 +23,7 @@ einops
dotenv
numpy

openai
# use litellm for cloud providers and openai for local
openai
litellm[proxy]

4 changes: 3 additions & 1 deletion results/timing/README.md
@@ -6,7 +6,9 @@ This folder contains a set of baseline timing results for the KernelBench problems
Since KernelBench measures the speedup between Runtime(reference architecture) and Runtime(LLM-generated architecture), it is important to measure the baseline reference module runtime.

We have provided a set of baseline results for the KernelBench problems on a variety of hardware as well as various PyTorch configurations.
All baselines are run with PyTorch `2.5.0+cu124` and CUDA `12.4`.
All (current) baselines are run with PyTorch `2.5.0+cu124` and CUDA `12.4`.

Note: we will update these baselines soon with PyTorch `2.9.0` and CUDA `12.8`.

For timing, we measure wall clock time. We warm up 3 times and collect runtime statistics for 100 trials.

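As a rough illustration of this protocol (not the repository's exact timing code; the function and parameter names below are hypothetical), wall-clock timing with warm-up runs and repeated trials could look like:

```python
import time
import torch

def measure_runtime(module, inputs, num_warmup=3, num_trials=100):
    # Warm-up runs are not recorded, so compilation and caching do not skew results
    for _ in range(num_warmup):
        module(*inputs)
    torch.cuda.synchronize()

    times_ms = []
    for _ in range(num_trials):
        start = time.perf_counter()
        module(*inputs)
        torch.cuda.synchronize()  # wait for queued GPU work before stopping the clock
        times_ms.append((time.perf_counter() - start) * 1e3)

    return {"mean": sum(times_ms) / len(times_ms), "min": min(times_ms), "max": max(times_ms)}
```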
43 changes: 20 additions & 23 deletions scripts/eval_from_generations.py
@@ -55,7 +55,7 @@
app = modal.App("eval_from_generations_modal")
gpu_arch_mapping = {"L40S": ["Ada"], "H100": ["Hopper"], "A100": ["Ampere"], "L4": ["Ada"], "T4": ["Turing"], "A10G": ["Ampere"]}

cuda_version = "12.4.0" # should be no greater than host CUDA version
cuda_version = "12.8.0" # should be no greater than host CUDA version
flavor = "devel" # includes full CUDA toolkit
operating_sys = "ubuntu22.04"
tag = f"{cuda_version}-{flavor}-{operating_sys}"
@@ -67,19 +67,7 @@
"g++-10",
"clang"
)
.pip_install(
"numpy",
"packaging",
"pydra_config",
"torch==2.5.0",
"tqdm",
"datasets",
"transformers",
"pytest",
"ninja",
"utils",
"python-dotenv",
)
.pip_install_from_requirements(os.path.join(REPO_TOP_DIR, "requirements.txt"))
.add_local_dir(
KERNEL_BENCH_PATH,
remote_path="/root/KernelBench"
@@ -165,17 +153,18 @@ class WorkArgs:
# Modal Evaluation Class
# GPU must be specified here for all instances
# Retries are configured at the class level to handle GPU attachment failures
# @modal.concurrent: Each container handles exactly ONE evaluation at a time - prevents memory leaks
# scaledown_window=5 kills idle containers after 5 seconds
# Combined with 10s sleep between batches, this prevents container reuse and GPU corruption spread
@app.cls(
image=image,
image=image,
gpu="A10G",
scaledown_window=5, # Kill idle containers after 5 seconds
retries=modal.Retries(
max_retries=3,
backoff_coefficient=2.0,
initial_delay=1.0,
)
)
@modal.concurrent(max_inputs=1) # One input per container - prevents GPU memory leaks
class ModalEvaluator:

@modal.method()
@@ -230,11 +219,11 @@ def evaluate_single_sample_modal(
backend=backend,
precision=get_torch_dtype_from_string(precision),
)
# Force cleanup and exit to prevent container reuse and memory leaks

# Cleanup GPU cache before returning
torch.cuda.empty_cache()
return result # Never reached, but needed for type checking

return result


def fetch_ref_arch_from_problem_id(
@@ -482,7 +471,8 @@ def batch_eval_modal(
evaluator_cls = ModalEvaluator.with_options(gpu=config.gpu) if config.gpu != "A10G" else ModalEvaluator

# Spawn all tasks in parallel
# Each spawn creates a NEW container instance with a GPU
# Modal assigns these to available containers (may reuse warm containers from previous batches)
# To prevent GPU corruption spread, we sleep between batches to ensure containers scale down
futures = []
for item in work_items:
if item is None:
@@ -538,7 +528,14 @@ def batch_eval_modal(

print("-" * 128)
print(f"[Modal Batch] Evaluation took {end_time - start_time:.2f} seconds")


# Wait for containers to scale down before next batch
# This prevents container reuse and GPU corruption from spreading between batches
if len(total_work) > 0: # Only sleep if there are more batches
scaledown_wait = 10 # Wait 10 seconds (2x the scaledown_window) to ensure containers are killed
print(f"[Modal] Waiting {scaledown_wait}s for containers to scale down before next batch...")
time.sleep(scaledown_wait)

pbar.update(len(curr_work_batch))


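To summarize the batching strategy described in the comments above, here is a simplified sketch (not the script's actual code; `work_batches` and the helper name are hypothetical) of how each batch is spawned on Modal and how the pause lets idle containers scale down before the next batch:

```python
import time

def run_batches(evaluator, work_batches, scaledown_wait=10):
    all_results = []
    for batch in work_batches:
        # One remote call per work item; Modal may route these to warm containers
        futures = [evaluator.evaluate_single_sample_modal.spawn(*item) for item in batch]

        # Block until every evaluation in this batch has finished
        all_results.extend(f.get() for f in futures)

        # Sleep ~2x the scaledown_window so idle containers are torn down,
        # preventing a corrupted container/GPU from being reused by the next batch
        time.sleep(scaledown_wait)
    return all_results
```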
27 changes: 4 additions & 23 deletions scripts/generate_and_eval_single_sample_modal.py
@@ -86,7 +86,7 @@ def verbose_logging(self):
def __repr__(self):
return f"EvalConfig({self.to_dict()})"

cuda_version = "12.4.0" # should be no greater than host CUDA version
cuda_version = "12.8.0" # should be no greater than host CUDA version
flavor = "devel" # includes full CUDA toolkit
operating_sys = "ubuntu22.04"
tag = f"{cuda_version}-{flavor}-{operating_sys}"
@@ -96,29 +96,10 @@ def __repr__(self):
.apt_install("git",
"gcc-10",
"g++-10",
"clang" # note i skip a step
"clang" # note i skip a step
)
.pip_install( # required to build flash-attn
"numpy",
"openai",
"packaging",
"pydra_config",
"torch==2.5.0",
"tqdm",
"datasets",
"transformers",
"pytest",
"ninja",
"utils",
"tilelang",
"apache-tvm",
"python-dotenv",
"nvidia-cutlass-dsl",
"litellm[proxy]", # Unified LLM interface
"einops", # for numerics

)
.add_local_python_source("src")
.pip_install_from_requirements(os.path.join(REPO_TOP_DIR, "requirements.txt"))
.add_local_python_source("src")
)

@app.cls(image=image)
22 changes: 4 additions & 18 deletions scripts/generate_baseline_time_modal.py
@@ -55,7 +55,7 @@
batch_size = 10
gpu = "L40S"
timeout = 1800
cuda_version = "12.4.0" # should be no greater than host CUDA version
cuda_version = "12.8.0" # should be no greater than host CUDA version
flavor = "devel" # includes full CUDA toolkit
operating_sys = "ubuntu22.04"
tag = f"{cuda_version}-{flavor}-{operating_sys}"
@@ -65,28 +65,14 @@
.apt_install("git",
"gcc-10",
"g++-10",
"clang" # note i skip a step
"clang" # note i skip a step
)
.pip_install( # required to build flash-attn
# Let's unify these dependencies somewhere
"numpy",
"packaging",
"pydra_config",
"torch==2.5.0",
"tqdm",
"datasets",
"transformers",
"pytest",
"ninja",
"utils",
"einops",
"python-dotenv",
)
.pip_install_from_requirements(os.path.join(REPO_TOP_PATH, "requirements.txt"))
.add_local_dir(
KERNEL_BENCH_PATH,
remote_path="/root/KernelBench"
)
.add_local_python_source("src")
.add_local_python_source("src")
)

def write_batch_to_json(entries_to_write: list, f_path: str):
18 changes: 2 additions & 16 deletions scripts/run_and_check.py
@@ -27,29 +27,15 @@
REPO_TOP_PATH = os.path.abspath(os.path.join(os.path.dirname(__file__), ".."))
KERNEL_BENCH_PATH = os.path.join(REPO_TOP_PATH, "KernelBench")

cuda_version = "12.4.0"
cuda_version = "12.8.0"
flavor = "devel"
operating_sys = "ubuntu22.04"
tag = f"{cuda_version}-{flavor}-{operating_sys}"

image = (
modal.Image.from_registry(f"nvidia/cuda:{tag}", add_python="3.10")
.apt_install("git", "gcc-10", "g++-10", "clang")
.pip_install(
"numpy",
"packaging",
"pydra_config",
"torch==2.5.0",
"tqdm",
"datasets",
"transformers",
"pytest",
"ninja",
"utils",
"einops",
"python-dotenv",
"litellm[proxy]",
)
.pip_install_from_requirements(os.path.join(REPO_TOP_PATH, "requirements.txt"))
.add_local_dir(KERNEL_BENCH_PATH, remote_path="/root/KernelBench")
.add_local_python_source("src")
.add_local_python_source("scripts")
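Each of the scripts above now builds its Modal image from the same `requirements.txt` instead of a per-script `pip_install` list. A hypothetical shared helper (not present in the repository) capturing the common pattern might look like:

```python
import os
import modal

def build_eval_image(repo_top: str, cuda_tag: str = "12.8.0-devel-ubuntu22.04") -> modal.Image:
    # CUDA devel base image; the CUDA version should be no greater than the host's
    return (
        modal.Image.from_registry(f"nvidia/cuda:{cuda_tag}", add_python="3.10")
        .apt_install("git", "gcc-10", "g++-10", "clang")
        # Single source of truth for Python dependencies
        .pip_install_from_requirements(os.path.join(repo_top, "requirements.txt"))
        .add_local_dir(os.path.join(repo_top, "KernelBench"), remote_path="/root/KernelBench")
        .add_local_python_source("src")
    )
```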
9 changes: 1 addition & 8 deletions src/utils.py
@@ -71,14 +71,7 @@ def query_server(
):
"""
Query various sorts of LLM inference API providers
Supports:
- OpenAI
- Deepseek
- Together
- Sambanova
- Anthropic
- Gemini / Google AI Studio
- Fireworks (OpenAI compatibility)
Done through liteLLM:
- Local Server (SGLang, vLLM, Tokasaurus)
"""
# Local Server (SGLang, vLLM, Tokasaurus) - special handling
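The updated docstring reflects that cloud providers are routed through liteLLM while local OpenAI-compatible servers (SGLang, vLLM, Tokasaurus) get special handling. A minimal sketch of that split, with a hypothetical function name and arguments, might look like:

```python
from typing import Optional

from litellm import completion
from openai import OpenAI

def query_llm(prompt: str, model: str, local_base_url: Optional[str] = None) -> str:
    messages = [{"role": "user", "content": prompt}]

    if local_base_url is not None:
        # Local OpenAI-compatible server (e.g. SGLang or vLLM), addressed directly
        client = OpenAI(base_url=local_base_url, api_key="EMPTY")
        resp = client.chat.completions.create(model=model, messages=messages)
        return resp.choices[0].message.content

    # Cloud providers go through liteLLM's unified interface
    resp = completion(model=model, messages=messages)
    return resp.choices[0].message.content
```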