In [1]:
from vllm import LLM, SamplingParams
from jiwer import wer, cer

import pandas as pd
from tqdm import tqdm

import json
import re
import os

INFO 08-26 05:59:37 [__init__.py:241] Automatically detected platform cuda.


## Data Prep

In [2]:
# Single place for the dataset location so both input files stay in sync.
data_dir = '/project/lt200304-dipmt/paweekorn/data'

# Test set: one row per item, indexed by the spreadsheet's ID column.
test_df = pd.read_excel(f'{data_dir}/test_set.xlsx', index_col='ID')

# JSON is UTF-8 by specification; pin the encoding instead of relying on the
# process locale. Keys arrive as strings, so cast them to int to match the
# integer class numbers stored in test_df['NAME'].
with open(f'{data_dir}/WIPO.json', 'r', encoding='utf-8') as f:
    wipo_data = {int(k): v for k, v in json.load(f).items()}

# Attach the class description for each row's class number.
test_df['WIPO'] = test_df['NAME'].map(wipo_data)

# A class number missing from the mapping would become NaN here and later be
# rendered as the literal text "nan" inside the prompt, so fail early instead.
unmapped_classes = test_df.loc[test_df['WIPO'].isna(), 'NAME'].unique()
assert len(unmapped_classes) == 0, f"No WIPO description for class(es): {list(unmapped_classes)}"

test_df.head()

Unnamed: 0_level_0,NAME,ENG,THA,WIPO
ID,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
1,9,"sockets, plugs and other contacts (electric co...",เต้ารับ ปลั๊กและตัวติดต่อ (ตัวเชื่อมต่อไฟฟ้า),"Scientific, research, navigation, surveying, p..."
2,9,"material for electricity mains (wires, cables)",วัสดุสำหรับส่วนควบคุมไฟฟ้าหลัก (ลวด สายเคเบิล),"Scientific, research, navigation, surveying, p..."
3,9,electrical instrument element and electrical i...,ส่วนประกอบอุปกรณ์ไฟฟ้าและวัสดุที่เฉพาะกับอุปกร...,"Scientific, research, navigation, surveying, p..."
4,12,"parts and accessories for automobiles, in part...",ชิ้นส่วนและอุปกรณ์เสริมสำหรับยานยนต์ โดยเฉพาะอ...,"Vehicles; apparatus for locomotion by land, ai..."
5,42,industrial analysis and research services.,บริการวิเคราะห์และวิจัยทางอุตสาหกรรม,Scientific and technological services and rese...


In [3]:
# Prompt template consumed with str.format(). It has two positional `{}` slots,
# filled in order by formatting_prompt():
#   1. the product-domain text (row['WIPO'])
#   2. the English source text to translate (row['ENG'])
# Because str.format() is used, any literal brace added to this template must
# be doubled ({{ or }}).
instruction = """## Instructions:
You are an expert in the classification of goods and services under the WIPO Nice Classification system. Your task is to translate product names from English to accurate and direct Thai.

## Translation Guidelines:
- Maintain the original format of the input text.
- Use Thai legal and commercial terminology appropriate for trademarks and product classification.
- Do not include explanations, commentary, or any information beyond the translation output.
- Answer in Thai language only.

## Product Domain:
{}

## Example:
Input: material for electricity mains (wires, cables)
Output: วัสดุสำหรับส่วนควบคุมไฟฟ้าหลัก (ลวด สายเคเบิล)

## Source Text:
{}
"""

def formatting_prompt(df):
    """Render one translation prompt per row of ``df``.

    Args:
        df: DataFrame with a ``WIPO`` column (product-domain text) and an
            ``ENG`` column (English source text). No other column is read;
            in particular the reference translation is not needed to build
            a prompt.

    Returns:
        list[str]: the filled-in ``instruction`` template for every row, in
        the same order as the rows of ``df``.
    """
    # Walk the two needed columns directly instead of materialising a Series
    # per row with iterrows().
    pairs = zip(df['WIPO'], df['ENG'])

    # zip() has no length, so hand tqdm the total to get a real progress bar.
    return [
        instruction.format(domain, src)
        for domain, src in tqdm(pairs, total=len(df))
    ]

# Build one prompt per test row; list order follows test_df's row order.
test_set = formatting_prompt(test_df)
# Print (rather than display) so the multi-line prompt keeps its line breaks.
print(test_set[0])

8392it [00:00, 29900.80it/s]

## Instructions:
You are an expert in the classification of goods and services under the WIPO Nice Classification system. Your task is to translate product names from English to accurate and direct Thai.

## Translation Guidelines:
- Maintain the original format of the input text.
- Use Thai legal and commercial terminology appropriate for trademarks and product classification.
- Do not include explanations, commentary, or any information beyond the translation output.
- Answer in Thai language only.

## Product Domain:
Scientific, research, navigation, surveying, photographic, cinematographic, audiovisual, optical, weighing, measuring, signalling, detecting, testing, inspecting, life-saving and teaching apparatus and instruments; apparatus and instruments for conducting, switching, transforming, accumulating, regulating or controlling the distribution or use of electricity; apparatus and instruments for recording, transmitting, reproducing or processing sound, images or data; recorded




## Inference Time!

In [6]:
# Project root on the cluster; reused later when writing results.
root_dir = '/project/lt200304-dipmt/paweekorn'
# Directory name of the checkpoint to evaluate; also used in the output file name.
model_id = "typhoon-translate-4b"

# Local checkpoint directory handed to vLLM.
model_path = f"{root_dir}/models/fine-tuned/{model_id}"

model = LLM(
    model=model_path,
    # No quantization, a single GPU and 0.9 memory utilisation: these three do
    # not show up in the engine's "non-default args" log line for this run.
    quantization=None,
    tensor_parallel_size=1,
    gpu_memory_utilization=0.9,
    # Context window cap (prompt + generated tokens) for every request.
    max_model_len=1024,
    # All prompts are built from the same template.
    enable_prefix_caching=True,
    # Eager mode: the engine log reports that CUDA graphs are disabled.
    enforce_eager=True,
)

INFO 08-26 06:00:19 [utils.py:326] non-default args: {'model': '/project/lt200304-dipmt/paweekorn/models/fine-tuned/typhoon-translate-4b', 'max_model_len': 1024, 'enable_prefix_caching': True, 'disable_log_stats': True, 'enforce_eager': True}
INFO 08-26 06:00:26 [__init__.py:711] Resolved architecture: Gemma3ForCausalLM
INFO 08-26 06:00:26 [__init__.py:1750] Using max model len 1024
INFO 08-26 06:00:29 [scheduler.py:222] Chunked prefill is enabled with max_num_batched_tokens=8192.
INFO 08-26 06:00:29 [__init__.py:3565] Cudagraph is disabled under eager mode
[1;36m(EngineCore_0 pid=1390942)[0;0m INFO 08-26 06:00:30 [core.py:636] Waiting for init message from front-end.
[1;36m(EngineCore_0 pid=1390942)[0;0m INFO 08-26 06:00:30 [core.py:74] Initializing a V1 LLM engine (v0.10.1.1) with config: model='/project/lt200304-dipmt/paweekorn/models/fine-tuned/typhoon-translate-4b', speculative_config=None, tokenizer='/project/lt200304-dipmt/paweekorn/models/fine-tuned/typhoon-translate-4b', s

Loading safetensors checkpoint shards:   0% Completed | 0/2 [00:00<?, ?it/s]


[1;36m(EngineCore_0 pid=1390942)[0;0m INFO 08-26 06:00:37 [default_loader.py:262] Loading weights took 4.18 seconds
[1;36m(EngineCore_0 pid=1390942)[0;0m INFO 08-26 06:00:38 [gpu_model_runner.py:2007] Model loading took 7.7902 GiB and 4.323328 seconds
[1;36m(EngineCore_0 pid=1390942)[0;0m INFO 08-26 06:00:41 [gpu_worker.py:276] Available KV cache memory: 25.31 GiB
[1;36m(EngineCore_0 pid=1390942)[0;0m INFO 08-26 06:00:41 [kv_cache_utils.py:1013] GPU KV cache size: 189,584 tokens
[1;36m(EngineCore_0 pid=1390942)[0;0m INFO 08-26 06:00:41 [kv_cache_utils.py:1017] Maximum concurrency for 1,024 tokens per request: 182.70x
[1;36m(EngineCore_0 pid=1390942)[0;0m INFO 08-26 06:00:41 [core.py:214] init engine (profile, create kv cache, warmup model) took 3.48 seconds
[1;36m(EngineCore_0 pid=1390942)[0;0m INFO 08-26 06:00:43 [__init__.py:3565] Cudagraph is disabled under eager mode
INFO 08-26 06:00:43 [llm.py:298] Supported_tasks: ['generate']


In [7]:
# Sampling configuration for the evaluation run.
# temperature > 0 means sampling is stochastic, so fix a seed to make the
# predictions (and any metric computed from them) reproducible across runs.
decoding_params = SamplingParams(
    temperature=0.2,
    max_tokens=512,
    skip_special_tokens=True,
    repetition_penalty=1.15,
    seed=42,
)

# NOTE(review): the prompts are passed to `generate` as raw text, i.e. no chat
# template is applied in this notebook. The sample predictions recorded below
# are English rather than Thai - confirm that this prompt format matches the
# one the checkpoint was fine-tuned with.
results = model.generate(test_set, decoding_params)

# Peek at the first completion as a quick sanity check.
results[0].outputs[0].text

Adding requests:   0%|          | 0/8392 [00:00<?, ?it/s]

Processed prompts:   0%|          | 0/8392 [00:00<?, ?it/s, est. speed input: 0.00 toks/s, output: 0.00 toks/s…

'Plug connectors'

## Evaluation

In [8]:
def clean_thai_text(text):
    """Reduce a raw model response to its Thai content.

    Processing order:
      1. Cut the text at the first ``[COT]`` marker (the marker and everything
         after it are discarded).
      2. Strip HTML-style tags (``<...>``).
      3. Keep only characters from the Thai Unicode block (U+0E00-U+0E7F),
         whitespace, and the punctuation ``. , / ! ? ; ' " ( ) [ ]``.
      4. Remove newline characters.

    Args:
        text: Raw response string. Non-string values (e.g. ``None`` or NaN
            from a DataFrame) are treated as an empty response.

    Returns:
        str: The cleaned text; may be empty or whitespace-only when the
        response contains no Thai characters.
    """
    if not isinstance(text, str):
        return ""

    # The marker must be handled BEFORE the character filter: the filter drops
    # Latin letters, which would turn "[COT]" into "[]" and hide the marker.
    cot_start = text.find("[COT]")
    if cot_start != -1:
        text = text[:cot_start]

    # Match any <...> tag; [^>] also spans newlines inside a tag.
    text = re.sub(r'<[^>]*>', '', text)

    # Keep Thai characters plus common punctuation/whitespace, drop the rest.
    kept = re.findall(r'[\u0E00-\u0E7F\s\.,\/!?;\'"()\[\]]', text)

    return "".join(kept).replace('\n', "")

# Take the text of the first candidate from every request output; the list
# is then assigned positionally onto test_df's rows.
response = [request_output.outputs[0].text for request_output in results]

# Store the raw prediction next to its cleaned counterpart so both can be inspected.
test_df['PRED'] = response
test_df['PRED_cleaned'] = test_df['PRED'].map(clean_thai_text)

test_df.head()

Unnamed: 0_level_0,NAME,ENG,THA,WIPO,PRED,PRED_cleaned
ID,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
1,9,"sockets, plugs and other contacts (electric co...",เต้ารับ ปลั๊กและตัวติดต่อ (ตัวเชื่อมต่อไฟฟ้า),"Scientific, research, navigation, surveying, p...",Plug connectors,
2,9,"material for electricity mains (wires, cables)",วัสดุสำหรับส่วนควบคุมไฟฟ้าหลัก (ลวด สายเคเบิล),"Scientific, research, navigation, surveying, p...",computer programs,
3,9,electrical instrument element and electrical i...,ส่วนประกอบอุปกรณ์ไฟฟ้าและวัสดุที่เฉพาะกับอุปกร...,"Scientific, research, navigation, surveying, p...",and components,
4,12,"parts and accessories for automobiles, in part...",ชิ้นส่วนและอุปกรณ์เสริมสำหรับยานยนต์ โดยเฉพาะอ...,"Vehicles; apparatus for locomotion by land, ai...",and parts thereof,
5,42,industrial analysis and research services.,บริการวิเคราะห์และวิจัยทางอุตสาหกรรม,Scientific and technological services and rese...,material testing,


In [7]:
# `model_type` was not defined in any cell of this notebook, so this cell
# raised NameError on a fresh kernel.
# NOTE(review): "fine-tuned" is taken from the models/fine-tuned/ directory the
# checkpoint is loaded from - confirm this is the intended label.
model_type = "fine-tuned"

# Make sure the output directory exists before writing.
results_dir = f"{root_dir}/results"
os.makedirs(results_dir, exist_ok=True)

# Persist reference, raw prediction and cleaned prediction for offline scoring.
fname = f"{model_id}_{model_type}"
test_df[['THA', 'PRED', 'PRED_cleaned']].to_csv(f"{results_dir}/{fname}.csv", index=False)