In [None]:
import torch
print(f"PyTorch Version: {torch.__version__}")
print(f"CUDA Available: {torch.cuda.is_available()}")
print(f"GPU Count: {torch.cuda.device_count()}")

try:
    import flash_attn
    print("✅ Flash Attention 2 installed successfully!")
except ImportError:
    print("❌ Flash Attention 2 NOT found.")

try:
    import vllm
    print("✅ vLLM installed successfully!")
except ImportError:
    print("❌ vLLM NOT found.")

# PyTorch Version: 2.5.1+cu124
# CUDA Available: True
# GPU Count: 4
# ✅ Flash Attention 2 installed successfully!
# INFO 12-07 13:41:06 __init__.py:190] Automatically detected platform cuda.
# ✅ vLLM installed successfully!

PyTorch Version: 2.5.1+cu124
CUDA Available: True
GPU Count: 4
✅ Flash Attention 2 installed successfully!
INFO 12-07 13:41:06 __init__.py:190] Automatically detected platform cuda.
✅ vLLM installed successfully!


In [1]:
import os
import json
from vllm import LLM, SamplingParams

os.environ["CUDA_VISIBLE_DEVICES"] = "3"

MODEL_PATH = "/home/jxy/.cache/modelscope/hub/models/Qwen/Qwen2.5-Math-1.5B"
DATA_PATH = "data/MATH/validation.jsonl"

def main():
    prompts = []

    with open(DATA_PATH, "r", encoding="utf-8") as f:
        for i, line in enumerate(f):
            if i >= 5:
                break
            item = json.loads(line)
            prompt = (
                "A conversation between User and Assistant. The User asks a question, and the Assistant solves it. "
                "The Assistant first thinks about the reasoning process in the mind and then provides the User with the answer. "
                "The reasoning process is enclosed within <think> </think> and answer is enclosed within <answer> </answer> tags, respectively, "
                "i.e., <think> reasoning process here </think> <answer> answer here </answer>.\n"
                f"User: {item['question']}\n"
                "Assistant: <think>"
            )
            prompts.append(prompt)

    llm = LLM(
        model=MODEL_PATH,
        trust_remote_code=True,
        dtype="bfloat16",
        gpu_memory_utilization=0.90,
        tensor_parallel_size=1
    )

    sampling_params = SamplingParams(
        temperature=1.0,
        max_tokens=1024,
        stop=["</answer>"],
        include_stop_str_in_output=True
    )

    outputs = llm.generate(prompts, sampling_params)

    generated_text = outputs[0].outputs[0].text
    full_response = "<think>" + generated_text
    print(full_response)

if __name__ == "__main__":
    main()

INFO 12-07 14:00:48 __init__.py:190] Automatically detected platform cuda.
INFO 12-07 14:00:55 config.py:542] This model supports multiple tasks: {'score', 'embed', 'classify', 'generate', 'reward'}. Defaulting to 'generate'.
INFO 12-07 14:00:55 llm_engine.py:234] Initializing a V0 LLM engine (v0.7.2) with config: model='/home/jxy/.cache/modelscope/hub/models/Qwen/Qwen2.5-Math-1.5B', speculative_config=None, tokenizer='/home/jxy/.cache/modelscope/hub/models/Qwen/Qwen2.5-Math-1.5B', skip_tokenizer_init=False, tokenizer_mode=auto, revision=None, override_neuron_config=None, tokenizer_revision=None, trust_remote_code=True, dtype=torch.bfloat16, max_seq_len=4096, download_dir=None, load_format=LoadFormat.AUTO, tensor_parallel_size=1, pipeline_parallel_size=1, disable_custom_all_reduce=False, quantization=None, enforce_eager=False, kv_cache_dtype=auto,  device_config=cuda, decoding_config=DecodingConfig(guided_decoding_backend='xgrammar'), observability_config=ObservabilityConfig(otlp_trace

Loading safetensors checkpoint shards:   0% Completed | 0/1 [00:00<?, ?it/s]


INFO 12-07 14:00:58 model_runner.py:1115] Loading model weights took 2.8797 GB
INFO 12-07 14:00:58 worker.py:267] Memory profiling takes 0.61 seconds
INFO 12-07 14:00:58 worker.py:267] the current vLLM instance can use total_gpu_memory (23.68GiB) x gpu_memory_utilization (0.90) = 21.32GiB
INFO 12-07 14:00:58 worker.py:267] model weights take 2.88GiB; non_torch_memory takes 0.06GiB; PyTorch activation peak memory takes 1.40GiB; the rest of the memory reserved for KV Cache is 16.98GiB.
INFO 12-07 14:00:59 executor_base.py:110] # CUDA blocks: 39748, # CPU blocks: 9362
INFO 12-07 14:00:59 executor_base.py:115] Maximum concurrency for 4096 tokens per request: 155.27x
INFO 12-07 14:01:01 model_runner.py:1434] Capturing cudagraphs for decoding. This may lead to unexpected consequences if the model is not static. To run the model in eager mode, set 'enforce_eager=True' or use '--enforce-eager' in the CLI. If out-of-memory error occurs during cudagraph capture, consider decreasing `gpu_memory_u

Capturing CUDA graph shapes: 100%|██████████| 35/35 [00:13<00:00,  2.52it/s]

INFO 12-07 14:01:15 model_runner.py:1562] Graph capturing finished in 14 secs, took 0.20 GiB
INFO 12-07 14:01:15 llm_engine.py:431] init engine (profile, create kv cache, warmup model) took 17.43 seconds



Processed prompts: 100%|██████████| 5/5 [00:01<00:00,  3.37it/s, est. speed input: 399.10 toks/s, output: 509.84 toks/s]

<think> We can find the vertical asymptotes by setting the denominator equal to zero and solving for x. </think> <answer> Setting the denominator equal to zero gives $x^2+x-6=0$. Factoring the quadratic equation, we get $(x+3)(x-2)=0$. Therefore, the solutions are $x = -3$ and $x = 2$. Since these are the only solutions, the number of vertical asymptotes is <answer count icons>2</answer count icons><more> What are the asymptotes here? In other words, why doesn’t the graph cross the line that $x$ is never, which is the $y$-axis? </answer>



