In [1]:
# !pip install torch

In [2]:
# !pip install transformers

# import library

In [3]:
import pandas as pd
import numpy as np
import string
import re
import matplotlib.pyplot as plt 
import nltk
from transformers import T5Tokenizer, T5ForConditionalGeneration, Trainer, TrainingArguments
import torch
from sklearn.model_selection import train_test_split
from torch.utils.data import Dataset

# read data

In [4]:
df=pd.read_csv('/kaggle/input/data-translator/data.csv')

In [5]:
df.head(1)

Unnamed: 0,english,vietnamese,len_en,len_vi
0,please put the dustpan in the broom closet,xin vui lòng đặt người quét rác trong tủ chổi,8,10


# Device Setup

In [6]:
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
print(f"Using device: {device}")

Using device: cuda


# Train-validation split

In [7]:
x = df['english']
y = df['vietnamese']

In [8]:
train_texts, val_texts, train_labels, val_labels = train_test_split(
    x, y, test_size=0.2, random_state=42
)

# Initialize Tokenizer and Model

In [9]:
tokenizer = T5Tokenizer.from_pretrained("t5-small")
model = T5ForConditionalGeneration.from_pretrained("t5-small")
model.to(device)

Downloading:   0%|          | 0.00/773k [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/2.27k [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/1.18k [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/231M [00:00<?, ?B/s]

T5ForConditionalGeneration(
  (shared): Embedding(32128, 512)
  (encoder): T5Stack(
    (embed_tokens): Embedding(32128, 512)
    (block): ModuleList(
      (0): T5Block(
        (layer): ModuleList(
          (0): T5LayerSelfAttention(
            (SelfAttention): T5Attention(
              (q): Linear(in_features=512, out_features=512, bias=False)
              (k): Linear(in_features=512, out_features=512, bias=False)
              (v): Linear(in_features=512, out_features=512, bias=False)
              (o): Linear(in_features=512, out_features=512, bias=False)
              (relative_attention_bias): Embedding(32, 8)
            )
            (layer_norm): T5LayerNorm()
            (dropout): Dropout(p=0.1, inplace=False)
          )
          (1): T5LayerFF(
            (DenseReluDense): T5DenseActDense(
              (wi): Linear(in_features=512, out_features=2048, bias=False)
              (wo): Linear(in_features=2048, out_features=512, bias=False)
              (dropout): Drop

# Define Dataset Class

In [10]:
class TranslationDataset(Dataset):
    def __init__(self, texts, labels, tokenizer, max_length=128):
        self.texts = texts
        self.labels = labels
        self.tokenizer = tokenizer
        self.max_length = max_length

    def __len__(self):
        return len(self.texts)

    def __getitem__(self, idx):
        input_text = "Translate English to Vietnamese: " + self.texts.iloc[idx]
        target_text = self.labels.iloc[idx]

        # Tokenize input and target
        inputs = self.tokenizer(input_text, max_length=self.max_length, truncation=True, padding="max_length", return_tensors="pt")
        targets = self.tokenizer(target_text, max_length=self.max_length, truncation=True, padding="max_length", return_tensors="pt")

        return {
            "input_ids": inputs["input_ids"].squeeze(0),
            "attention_mask": inputs["attention_mask"].squeeze(0),
            "labels": targets["input_ids"].squeeze(0),
        }


# Create Dataset Instances

In [11]:
train_dataset = TranslationDataset(train_texts, train_labels, tokenizer)
val_dataset = TranslationDataset(val_texts, val_labels, tokenizer)

#  Define Training Arguments

In [12]:
training_args = TrainingArguments(
    output_dir="./results",
    evaluation_strategy="epoch",
    save_strategy="epoch",
    learning_rate=5e-5,
    per_device_train_batch_size=16,
    per_device_eval_batch_size=16,
    num_train_epochs=5,  # Increase epochs if dataset is small
    weight_decay=0.01,
    save_total_limit=2,
    load_best_model_at_end=True,
    logging_dir="./logs",
    logging_strategy="epoch",
    report_to="none"
)


# Initialize Trainer

In [13]:
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=train_dataset,
    eval_dataset=val_dataset,
    tokenizer=tokenizer
)


# Train the Model

In [14]:
trainer.train()

***** Running training *****
  Num examples = 203240
  Num Epochs = 5
  Instantaneous batch size per device = 16
  Total train batch size (w. parallel, distributed & accumulation) = 16
  Gradient Accumulation steps = 1
  Total optimization steps = 63515


Epoch,Training Loss,Validation Loss
1,0.2976,0.168725
2,0.1774,0.125455
3,0.1459,0.108495
4,0.1316,0.100811
5,0.1253,0.098394


***** Running Evaluation *****
  Num examples = 50810
  Batch size = 16
Saving model checkpoint to ./results/checkpoint-12703
Configuration saved in ./results/checkpoint-12703/config.json
Model weights saved in ./results/checkpoint-12703/pytorch_model.bin
tokenizer config file saved in ./results/checkpoint-12703/tokenizer_config.json
Special tokens file saved in ./results/checkpoint-12703/special_tokens_map.json
***** Running Evaluation *****
  Num examples = 50810
  Batch size = 16
Saving model checkpoint to ./results/checkpoint-25406
Configuration saved in ./results/checkpoint-25406/config.json
Model weights saved in ./results/checkpoint-25406/pytorch_model.bin
tokenizer config file saved in ./results/checkpoint-25406/tokenizer_config.json
Special tokens file saved in ./results/checkpoint-25406/special_tokens_map.json
***** Running Evaluation *****
  Num examples = 50810
  Batch size = 16
Saving model checkpoint to ./results/checkpoint-38109
Configuration saved in ./results/checkpoin

TrainOutput(global_step=63515, training_loss=0.17553624451409117, metrics={'train_runtime': 12026.4997, 'train_samples_per_second': 84.497, 'train_steps_per_second': 5.281, 'total_flos': 3.43835846639616e+16, 'train_loss': 0.17553624451409117, 'epoch': 5.0})

# Save Model and Tokenizer

In [15]:
model.save_pretrained("./t5_translation_model")
tokenizer.save_pretrained("./t5_translation_model")
print("Model and Tokenizer saved successfully.")


Configuration saved in ./t5_translation_model/config.json
Model weights saved in ./t5_translation_model/pytorch_model.bin
tokenizer config file saved in ./t5_translation_model/tokenizer_config.json
Special tokens file saved in ./t5_translation_model/special_tokens_map.json


Model and Tokenizer saved successfully.


# Define Translation Function

In [16]:
def translate(text, model, tokenizer, device, max_length=128):
    model.eval()
    input_text = "Translate English to Vietnamese: " + text
    input_ids = tokenizer(input_text, return_tensors="pt", padding="max_length", truncation=True, max_length=max_length).input_ids.to(device)
    output_ids = model.generate(input_ids, max_length=max_length, num_beams=4, early_stopping=True)
    return tokenizer.decode(output_ids[0], skip_special_tokens=True)


# Perform Inference

In [17]:
# Translate a test sentence
test_sentence = "I live in New York."
translated_sentence = translate(test_sentence, model, tokenizer, device)
print("Translated Sentence:", translated_sentence)

Translated Sentence: tôi sng  New York
