In [1]:
!pip install -q datasets evaluate nltk sacremoses

[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m84.0/84.0 kB[0m [31m2.6 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m897.5/897.5 kB[0m [31m16.4 MB/s[0m eta [36m0:00:00[0m00:01[0m
[?25h

In [2]:
import wandb
# Disable wandb logging
wandb.init(mode="disabled")

In [3]:
import pandas as pd
import numpy as np
import torch
from datasets import Dataset
from tqdm import tqdm
from nltk.translate.bleu_score import sentence_bleu
import nltk
from sklearn.model_selection import train_test_split
from transformers import (M2M100ForConditionalGeneration,
                          M2M100Tokenizer,
                          Seq2SeqTrainingArguments, Seq2SeqTrainer,
                          Trainer,
                          TrainingArguments,
                          pipeline,
                          EarlyStoppingCallback)


### Prepair custom metric

In [4]:
# Download the necessary NLTK data
nltk.download('punkt')

[nltk_data] Downloading package punkt to /usr/share/nltk_data...
[nltk_data]   Package punkt is already up-to-date!


True

In [5]:
# Define compute_metrics function for BLEU score
def compute_metrics(eval_preds):
    preds, labels = eval_preds
    if isinstance(preds, tuple):
        preds = preds[0]

    decoded_preds = tokenizer.batch_decode(preds, skip_special_tokens=True)
    labels = np.where(labels != -100, labels, tokenizer.pad_token_id)
    decoded_labels = tokenizer.batch_decode(labels, skip_special_tokens=True)

    # Compute BLEU scores
    bleu_scores = []
    for pred, label in zip(decoded_preds, decoded_labels):
        reference = nltk.word_tokenize(label)
        candidate = nltk.word_tokenize(pred)
        bleu_score = sentence_bleu([reference], candidate, weights=(0.5, 0.5))  # 2-gram BLEU
        bleu_scores.append(bleu_score)

    return {"bleu": sum(bleu_scores) / len(bleu_scores)}

### Prepair Model and Tokenizer

In [6]:
# Load the pre-trained model and tokenizer
model_name = "facebook/m2m100_418M"
model = M2M100ForConditionalGeneration.from_pretrained(model_name)
tokenizer = M2M100Tokenizer.from_pretrained(model_name)

config.json:   0%|          | 0.00/908 [00:00<?, ?B/s]

pytorch_model.bin:   0%|          | 0.00/1.94G [00:00<?, ?B/s]

generation_config.json:   0%|          | 0.00/233 [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/298 [00:00<?, ?B/s]

vocab.json:   0%|          | 0.00/3.71M [00:00<?, ?B/s]

sentencepiece.bpe.model:   0%|          | 0.00/2.42M [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/1.14k [00:00<?, ?B/s]

In [7]:
# Set the source and target languages
tokenizer.src_lang = "th"  # Thai
tokenizer.tgt_lang = "th"  # Boran Thai

In [8]:
# Tokenize the dataset
def tokenize_function(examples):
    inputs = tokenizer(examples["Modern Thai"], truncation=True, padding="max_length", max_length=128)
    with tokenizer.as_target_tokenizer():
        targets = tokenizer(examples["Thai Boran"], truncation=True, padding="max_length", max_length=128)
    return {
        "input_ids": inputs.input_ids,
        "attention_mask": inputs.attention_mask,
        "labels": targets.input_ids,
    }

### Load Data

In [9]:
import os
import pandas as pd

In [14]:
!git clone https://github.com/OREOSITY/MachineTranslation.git

Cloning into 'MachineTranslation'...
remote: Enumerating objects: 11, done.[K
remote: Counting objects: 100% (11/11), done.[K
remote: Compressing objects: 100% (10/10), done.[K
remote: Total 11 (delta 3), reused 0 (delta 0), pack-reused 0 (from 0)[K
Receiving objects: 100% (11/11), 26.50 KiB | 5.30 MiB/s, done.
Resolving deltas: 100% (3/3), done.


In [15]:
# Folder containing .txt files
folder_path = '/kaggle/working/MachineTranslation/THA-BORAN/'

# Initialize an empty list to store DataFrames
df_list = []

# Loop through all files in the folder
for file_name in os.listdir(folder_path):
    # Check if the file is a .txt file
    if file_name.endswith('.txt'):
        file_path = os.path.join(folder_path, file_name)
        try:
            # Read the .txt file into a DataFrame (adjust separator if needed)
            df = pd.read_csv(file_path, sep=',', header=0)  # Assuming tab-separated values
            df_list.append(df)
        except Exception as e:
            print(f"Error reading {file_name}: {e}")

# Concatenate all DataFrames into one
final_df = pd.concat(df_list, ignore_index=True)

In [16]:
# Split the dataset
dataset = Dataset.from_pandas(final_df)
train_test = dataset.train_test_split(test_size=0.2, seed=92)

In [17]:
tokenized_train = train_test['train'].map(tokenize_function, batched=True)
tokenized_test = train_test['test'].map(tokenize_function, batched=True)

Map:   0%|          | 0/761 [00:00<?, ? examples/s]



Map:   0%|          | 0/191 [00:00<?, ? examples/s]

### Train

In [36]:
# Set up training arguments
training_args = Seq2SeqTrainingArguments(
    output_dir="./results",
    eval_strategy= "epoch", #"steps",
#     eval_steps = 20,
    save_strategy = "epoch", #"steps",
#     save_steps = 20,
    logging_strategy="epoch",
    learning_rate=2e-5,
    per_device_train_batch_size=8,
    per_device_eval_batch_size=8,
    weight_decay=0.01,
    save_total_limit=1,
    num_train_epochs=2,
    predict_with_generate=True,
#     fp16=torch.cuda.is_available(),  # Enable mixed precision training if available
    metric_for_best_model="bleu",
    load_best_model_at_end=True,
)

# Define the EarlyStoppingCallback
early_stopping = EarlyStoppingCallback(
    early_stopping_patience=5, # Number of evaluations with no improvement to wait before stopping
    early_stopping_threshold=0.01 # Minimum change to qualify as an improvement
)

# Initialize the trainer
trainer = Seq2SeqTrainer(
    model=model,
    args=training_args,
    train_dataset=tokenized_train,
    eval_dataset=tokenized_test,
    processing_class=tokenizer,
    compute_metrics=compute_metrics,
    callbacks=[early_stopping],
)

In [37]:
# Train the model
trainer.train()



Epoch,Training Loss,Validation Loss,Bleu
1,2.8451,2.289082,0.903561
2,2.0546,1.904323,0.912503


Trainer.tokenizer is now deprecated. You should use Trainer.processing_class instead.
Trainer.tokenizer is now deprecated. You should use Trainer.processing_class instead.
Trainer.tokenizer is now deprecated. You should use Trainer.processing_class instead.
Trainer.tokenizer is now deprecated. You should use Trainer.processing_class instead.
Trainer.tokenizer is now deprecated. You should use Trainer.processing_class instead.
Trainer.tokenizer is now deprecated. You should use Trainer.processing_class instead.
Trainer.tokenizer is now deprecated. You should use Trainer.processing_class instead.
Trainer.tokenizer is now deprecated. You should use Trainer.processing_class instead.
Trainer.tokenizer is now deprecated. You should use Trainer.processing_class instead.
Trainer.tokenizer is now deprecated. You should use Trainer.processing_class instead.
Trainer.tokenizer is now deprecated. You should use Trainer.processing_class instead.
Trainer.tokenizer is now deprecated. You should use Tr

TrainOutput(global_step=96, training_loss=2.4498852094014487, metrics={'train_runtime': 269.5817, 'train_samples_per_second': 5.646, 'train_steps_per_second': 0.356, 'total_flos': 412291650551808.0, 'train_loss': 2.4498852094014487, 'epoch': 2.0})

### Inference

In [38]:
# Create a translation pipeline
translator = pipeline("translation",
                      model= model,
                      tokenizer=tokenizer)

Device set to use cuda:0


In [39]:
# Example usage of the pipeline
northern_thai_text = "พี่ฮ้อ เราจะไปนอนกันกี่โมงครับ เราจะร่ำรวยกันใช่ไหม"
result = translator(northern_thai_text, src_lang="th", tgt_lang="th")
print(f"Modern Thai: {northern_thai_text}")
print(f"Thai Boran: {result[0]['translation_text']}")

Modern Thai: พี่ฮ้อ เราจะไปนอนกันกี่โมงครับ เราจะร่ำรวยกันใช่ไหม
Thai Boran: พี่ฮ้อ เราจักไปนอนกันกี่โมงครับ เราจักร่ํารวยกันใช่ฤไม่
