# Quick start: drive the chat model through the high-level `pipeline` helper.
from transformers import pipeline

# Build a text-generation pipeline backed by the Japanese R1-distilled
# Qwen 7B checkpoint.
pipe = pipeline(
    task="text-generation",
    model="lightblue/DeepSeek-R1-Distill-Qwen-7B-Japanese",
)

# A single-turn chat in role/content format: one message from the user.
messages = [{"role": "user", "content": "Who are you?"}]

# The call's return value is not stored; in a notebook it is shown as the
# cell output.
pipe(messages)

# Alternative: load the tokenizer and the causal-LM model explicitly
# instead of going through `pipeline`.
from transformers import AutoModelForCausalLM, AutoTokenizer

# Both objects are loaded from the same repository id, so name it once.
repo_id = "lightblue/DeepSeek-R1-Distill-Qwen-7B-Japanese"

tokenizer = AutoTokenizer.from_pretrained(repo_id)
model = AutoModelForCausalLM.from_pretrained(repo_id)

In [1]:
import os

# Configure PyTorch's CUDA caching allocator.
#
# PYTORCH_CUDA_ALLOC_CONF is a single comma-separated list of
# `<option>:<value>` pairs. Assigning the variable twice keeps only the last
# assignment, which would silently drop `max_split_size_mb`, so both options
# are set together in one value.
# NOTE(review): this presumably has to run before torch/vLLM initialise CUDA,
# which is why the cell sits ahead of the vLLM import -- confirm.
os.environ["PYTORCH_CUDA_ALLOC_CONF"] = (
    "max_split_size_mb:512,expandable_segments:True"
)

In [2]:
from vllm import LLM, SamplingParams


def _single_turn(text):
    """Wrap *text* as a one-message chat conversation sent by the user."""
    return [{"role": "user", "content": text}]


# Engine for the Japanese R1-distilled Qwen 7B checkpoint, with the model
# length capped at 8,000 tokens.
llm = LLM(model="lightblue/DeepSeek-R1-Distill-Qwen-7B-Japanese", max_model_len=8000)

# Sampling settings: temperature 0.5, a repetition penalty of 1.1, and up to
# 8,000 generated tokens per request.
sampling_params = SamplingParams(
    repetition_penalty=1.1,
    max_tokens=8000,
    temperature=0.5,
)

# One Japanese word problem: three classes of 20 students, the school is half
# boys and half girls, class 1 has 15 girls and class 2 has 12 girls -- how
# many boys are in class 3?
prompts = [
    """学校には1クラスにつき20人の生徒がおり、クラスは合計3つあります。
学校全体では男子と女子がそれぞれ50%ずついます。
1つ目のクラスには女子が15人、2つ目のクラスには女子が12人います。
3つ目のクラスには何人の男子がいますか？"""
]

# Each prompt becomes its own single-turn conversation.
conversations = [_single_turn(prompt) for prompt in prompts]

outputs = llm.chat(conversations, sampling_params=sampling_params)

# Print the text of the first completion returned for every conversation.
for output in outputs:
    first_completion = output.outputs[0]
    print(first_completion.text)

INFO 03-30 22:36:49 [__init__.py:239] Automatically detected platform cuda.
INFO 03-30 22:36:56 [config.py:585] This model supports multiple tasks: {'embed', 'classify', 'score', 'reward', 'generate'}. Defaulting to 'generate'.
INFO 03-30 22:36:56 [config.py:1697] Chunked prefill is enabled with max_num_batched_tokens=8192.
INFO 03-30 22:36:57 [core.py:54] Initializing a V1 LLM engine (v0.8.2) with config: model='lightblue/DeepSeek-R1-Distill-Qwen-7B-Japanese', speculative_config=None, tokenizer='lightblue/DeepSeek-R1-Distill-Qwen-7B-Japanese', skip_tokenizer_init=False, tokenizer_mode=auto, revision=None, override_neuron_config=None, tokenizer_revision=None, trust_remote_code=False, dtype=torch.bfloat16, max_seq_len=8000, download_dir=None, load_format=auto, tensor_parallel_size=1, pipeline_parallel_size=1, disable_custom_all_reduce=False, quantization=None, enforce_eager=False, kv_cache_dtype=auto,  device_config=cuda, decoding_config=DecodingConfig(guided_decoding_backend='xgrammar'

: 