## Load the model

In [3]:
base_model = "Qwen/Qwen3-0.6B"
adapter_path = "TechitoTamani/Qwen3-0.6B_FinetuneWithMyData"

In [4]:
from transformers import AutoTokenizer, AutoModelForCausalLM
from peft import PeftModel
import torch

# base_model = "scb10x/typhoon2.1-gemma3-12b"
# adapter_path = "/mnt/mydrive/Audio/outputs/typhoon_finetune_all_data_001/checkpoint-200"

# Load tokenizer and base model
tokenizer = AutoTokenizer.from_pretrained(adapter_path)  # to include any tokenizer special tokens
model = AutoModelForCausalLM.from_pretrained(base_model, device_map="auto", torch_dtype=torch.bfloat16)

# Attach LoRA adapter
model = PeftModel.from_pretrained(model, adapter_path)



In [5]:
# Merge the LoRA adapter weights into the model
model = model.merge_and_unload()

# Save the merged model
save_path = "merged_model"
model.save_pretrained(save_path)

In [6]:
from transformers import AutoTokenizer
tokenizer = AutoTokenizer.from_pretrained(adapter_path)
tokenizer.save_pretrained(save_path)

('merged_model/tokenizer_config.json',
 'merged_model/special_tokens_map.json',
 'merged_model/vocab.json',
 'merged_model/merges.txt',
 'merged_model/added_tokens.json',
 'merged_model/tokenizer.json')

## Run Inference

In [7]:
from vllm import LLM, SamplingParams

llm = LLM(
    model="merged_model",  # Local directory with merged weights 
    max_model_len=1024,
)

INFO 06-01 02:03:38 [config.py:793] This model supports multiple tasks: {'score', 'embed', 'reward', 'generate', 'classify'}. Defaulting to 'generate'.
INFO 06-01 02:03:38 [llm_engine.py:230] Initializing a V0 LLM engine (v0.9.0.1) with config: model='merged_model', speculative_config=None, tokenizer='merged_model', skip_tokenizer_init=False, tokenizer_mode=auto, revision=None, override_neuron_config={}, tokenizer_revision=None, trust_remote_code=False, dtype=torch.float16, max_seq_len=1024, download_dir=None, load_format=LoadFormat.AUTO, tensor_parallel_size=1, pipeline_parallel_size=1, disable_custom_all_reduce=False, quantization=None, enforce_eager=False, kv_cache_dtype=auto,  device_config=cuda, decoding_config=DecodingConfig(backend='auto', disable_fallback=False, disable_any_whitespace=False, disable_additional_properties=False, reasoning_backend=''), observability_config=ObservabilityConfig(show_hidden_metrics_for_version=None, otlp_traces_endpoint=None, collect_detailed_traces

[W601 02:03:51.007308638 socket.cpp:200] [c10d] The hostname of the client socket cannot be retrieved. err=-3


INFO 06-01 02:04:01 [parallel_state.py:1064] rank 0 in world size 1 is assigned as DP rank 0, PP rank 0, TP rank 0, EP rank 0
INFO 06-01 02:04:01 [model_runner.py:1170] Starting to load model merged_model...


[W601 02:04:01.018116613 socket.cpp:200] [c10d] The hostname of the client socket cannot be retrieved. err=-3


Loading safetensors checkpoint shards:   0% Completed | 0/1 [00:00<?, ?it/s]


INFO 06-01 02:04:02 [default_loader.py:280] Loading weights took 0.88 seconds
INFO 06-01 02:04:02 [model_runner.py:1202] Model loading took 1.1207 GiB and 0.997261 seconds
INFO 06-01 02:04:04 [worker.py:291] Memory profiling takes 0.99 seconds
INFO 06-01 02:04:04 [worker.py:291] the current vLLM instance can use total_gpu_memory (14.74GiB) x gpu_memory_utilization (0.90) = 13.27GiB
INFO 06-01 02:04:04 [worker.py:291] model weights take 1.12GiB; non_torch_memory takes 0.02GiB; PyTorch activation peak memory takes 1.38GiB; the rest of the memory reserved for KV Cache is 10.74GiB.
INFO 06-01 02:04:04 [executor_base.py:112] # cuda blocks: 6286, # CPU blocks: 2340
INFO 06-01 02:04:04 [executor_base.py:117] Maximum concurrency for 1024 tokens per request: 98.22x
INFO 06-01 02:04:10 [model_runner.py:1512] Capturing cudagraphs for decoding. This may lead to unexpected consequences if the model is not static. To run the model in eager mode, set 'enforce_eager=True' or use '--enforce-eager' in t

Capturing CUDA graph shapes:   0%|          | 0/35 [00:00<?, ?it/s]

INFO 06-01 02:04:43 [model_runner.py:1670] Graph capturing finished in 33 secs, took 0.21 GiB
INFO 06-01 02:04:43 [llm_engine.py:428] init engine (profile, create kv cache, warmup model) took 40.56 seconds


In [15]:
sampling_params = SamplingParams(
    temperature=0.0,
    top_p=1.0,
    max_tokens=128
)

In [17]:
%%time
prompt = "วันนี้วันอะไร"
outputs = llm.generate(prompt, sampling_params)
print(outputs)

Adding requests:   0%|          | 0/1 [00:00<?, ?it/s]

Processed prompts:   0%|          | 0/1 [00:00<?, ?it/s, est. speed input: 0.00 toks/s, output: 0.00 toks/s]

[RequestOutput(request_id=2, prompt='วันนี้วันอะไร', prompt_token_ids=[37213, 66256, 20184, 124032, 37213, 66256, 128682], encoder_prompt=None, encoder_prompt_token_ids=None, prompt_logprobs=None, outputs=[CompletionOutput(index=0, text='บ้าง ตอนนี้มีนัดอะไรบ้าง ช่วยลบทลบทลบทลบทลบทลบทลบทลบทลบทลบทลบทลบทลบทลบทลบทลบทลบทลบทลบทลบทลบทลบทลบทลบทลบทลบทลบทลบทลบทลบทลบทลบทลบทลบทลบทลบทลบทลบทลบทลบทลบทลบทลบทลบทลบทลบทลบทลบทลบทลบทลบทลบทลบทลบทลบทลบท', token_ids=(36142, 124150, 220, 125634, 20184, 124032, 26283, 28319, 20184, 124090, 128682, 36142, 124150, 220, 48120, 126168, 31782, 126829, 31782, 126829, 31782, 126829, 31782, 126829, 31782, 126829, 31782, 126829, 31782, 126829, 31782, 126829, 31782, 126829, 31782, 126829, 31782, 126829, 31782, 126829, 31782, 126829, 31782, 126829, 31782, 126829, 31782, 126829, 31782, 126829, 31782, 126829, 31782, 126829, 31782, 126829, 31782, 126829, 31782, 126829, 31782, 126829, 31782, 126829, 31782, 126829, 31782, 126829, 31782, 126829, 31782, 126829, 31782, 126829, 3

In [27]:
outputs[0].outputs[0].text

'บ้าง ตอนนี้มีนัดอะไรบ้าง ช่วยลบทลบทลบทลบทลบทลบทลบทลบทลบทลบทลบทลบทลบทลบทลบทลบทลบทลบทลบทลบทลบทลบทลบทลบทลบทลบทลบทลบทลบทลบทลบทลบทลบทลบทลบทลบทลบทลบทลบทลบทลบทลบทลบทลบทลบทลบทลบทลบทลบทลบทลบทลบทลบทลบทลบทลบท'