<a href="https://colab.research.google.com/github/Ptuancuong/TH-TimeSeries.csv/blob/main/C%C3%A2u3.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
!pip -q install transformers sentencepiece

In [3]:
from datasets import Dataset, DatasetDict

train_en = [
    "Hello, how are you?",
    "This product is very good.",
    "I need to check my order status.",
    "The weather is nice today.",
    "Please help me reset my password.",
    "I love drinking coffee in the morning.",
    "The package arrived late yesterday.",
    "Can you recommend a cheap hotel?",
    "The application crashed during login.",
    "Thank you for your support!"
]
train_vi = [
    "Xin chào, bạn khỏe không?",
    "Sản phẩm này rất tốt.",
    "Tôi cần kiểm tra trạng thái đơn hàng.",
    "Thời tiết hôm nay thật đẹp.",
    "Vui lòng giúp tôi đặt lại mật khẩu.",
    "Tôi thích uống cà phê vào buổi sáng.",
    "Bưu kiện đã đến muộn vào hôm qua.",
    "Bạn có thể gợi ý một khách sạn giá rẻ không?",
    "Ứng dụng bị lỗi khi đăng nhập.",
    "Cảm ơn bạn vì sự hỗ trợ!"
]

valid_en = [
    "I would like to change my delivery address.",
    "It is raining heavily in the city."
]
valid_vi = [
    "Tôi muốn thay đổi địa chỉ giao hàng.",
    "Trời đang mưa rất to trong thành phố."
]

train_ds = Dataset.from_dict({"en": train_en, "vi": train_vi})
valid_ds = Dataset.from_dict({"en": valid_en, "vi": valid_vi})
ds = DatasetDict({"train": train_ds, "validation": valid_ds})
ds


DatasetDict({
    train: Dataset({
        features: ['en', 'vi'],
        num_rows: 10
    })
    validation: Dataset({
        features: ['en', 'vi'],
        num_rows: 2
    })
})

In [4]:
import torch
from transformers import AutoTokenizer, AutoModelForSeq2SeqLM

device = "cuda" if torch.cuda.is_available() else "cpu"
checkpoint = "Helsinki-NLP/opus-mt-en-vi"

tokenizer = AutoTokenizer.from_pretrained(checkpoint)
model = AutoModelForSeq2SeqLM.from_pretrained(checkpoint).to(device)

max_src_len = 96
max_tgt_len = 96

def preprocess(batch):
    model_inputs = tokenizer(batch["en"], max_length=max_src_len, truncation=True)
    with tokenizer.as_target_tokenizer():
        labels = tokenizer(batch["vi"], max_length=max_tgt_len, truncation=True)
    model_inputs["labels"] = labels["input_ids"]
    return model_inputs

tok = ds.map(preprocess, batched=True, remove_columns=ds["train"].column_names)
tok


tokenizer_config.json:   0%|          | 0.00/44.0 [00:00<?, ?B/s]

config.json: 0.00B [00:00, ?B/s]

source.spm:   0%|          | 0.00/809k [00:00<?, ?B/s]

target.spm:   0%|          | 0.00/756k [00:00<?, ?B/s]

vocab.json: 0.00B [00:00, ?B/s]



pytorch_model.bin:   0%|          | 0.00/289M [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/289M [00:00<?, ?B/s]

generation_config.json:   0%|          | 0.00/293 [00:00<?, ?B/s]

Map:   0%|          | 0/10 [00:00<?, ? examples/s]



Map:   0%|          | 0/2 [00:00<?, ? examples/s]

DatasetDict({
    train: Dataset({
        features: ['input_ids', 'attention_mask', 'labels'],
        num_rows: 10
    })
    validation: Dataset({
        features: ['input_ids', 'attention_mask', 'labels'],
        num_rows: 2
    })
})

In [5]:
from torch.utils.data import DataLoader

pad_id = tokenizer.pad_token_id
def collate_fn(features):
    # pad input_ids & attention_mask
    batch = {}
    batch["input_ids"] = torch.nn.utils.rnn.pad_sequence(
        [torch.tensor(f["input_ids"]) for f in features],
        batch_first=True, padding_value=pad_id
    )
    batch["attention_mask"] = torch.nn.utils.rnn.pad_sequence(
        [torch.ones(len(f["input_ids"]), dtype=torch.long) for f in features],
        batch_first=True, padding_value=0
    )
    # pad labels và đổi pad_id -> -100 (để bỏ qua khi tính CE loss)
    labels = torch.nn.utils.rnn.pad_sequence(
        [torch.tensor(f["labels"]) for f in features],
        batch_first=True, padding_value=pad_id
    )
    labels[labels==pad_id] = -100
    batch["labels"] = labels
    return {k:v.to(device) for k,v in batch.items()}

train_loader = DataLoader(tok["train"], batch_size=8, shuffle=True, collate_fn=collate_fn)
valid_loader = DataLoader(tok["validation"], batch_size=8, shuffle=False, collate_fn=collate_fn)

optim = torch.optim.AdamW(model.parameters(), lr=3e-5)

epochs = 5
model.train()
for ep in range(1, epochs+1):
    total_loss = 0.0
    for batch in train_loader:
        optim.zero_grad()
        out = model(**batch)          # model tự tính Cross-Entropy theo token khi có 'labels'
        loss = out.loss
        loss.backward()
        optim.step()
        total_loss += loss.item()
    print(f"Epoch {ep} - Train loss: {total_loss/len(train_loader):.4f}")


Epoch 1 - Train loss: 1.3186
Epoch 2 - Train loss: 1.0715
Epoch 3 - Train loss: 0.7391
Epoch 4 - Train loss: 0.3979
Epoch 5 - Train loss: 0.3277


In [6]:
model.eval()
def translate(sentences, num_beams=5, max_new_tokens=96):
    enc = tokenizer(sentences, return_tensors="pt", truncation=True, padding=True, max_length=max_src_len).to(device)
    gen = model.generate(
        **enc,
        num_beams=num_beams,
        max_new_tokens=max_new_tokens
    )
    return tokenizer.batch_decode(gen, skip_special_tokens=True)

tests = [
    "I have not received the package yet.",
    "Could you please provide a refund?",
    "The battery drains too fast when gaming."
]

preds = translate(tests)
for en, vi in zip(tests, preds):
    print(f"EN: {en}\nVI: {vi}\n")


EN: I have not received the package yet.
VI: Tôi vẫn chưa nhận được bưu kiện.

EN: Could you please provide a refund?
VI: Bạn có thể cung cấp một điện thoại không?

EN: The battery drains too fast when gaming.
VI: Áp suất quá nhanh khi lên cơn.

