# Library

In [1]:
!pip install transformers[torch] datasets sentencepiece sacrebleu evaluate rouge_score -U

import torch
from transformers import Seq2SeqTrainer, Seq2SeqTrainingArguments, EarlyStoppingCallback, get_linear_schedule_with_warmup, AdamW, BartTokenizer, BartForConditionalGeneration
from datasets import load_dataset, load_metric
import evaluate
import numpy as np
import pandas as pd
import zipfile
import requests
import os

Collecting datasets
  Downloading datasets-2.19.1-py3-none-any.whl.metadata (19 kB)
Collecting sacrebleu
  Downloading sacrebleu-2.4.2-py3-none-any.whl.metadata (58 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m58.0/58.0 kB[0m [31m2.5 MB/s[0m eta [36m0:00:00[0m
[?25hCollecting evaluate
  Downloading evaluate-0.4.2-py3-none-any.whl.metadata (9.3 kB)
Collecting rouge_score
  Downloading rouge_score-0.1.2.tar.gz (17 kB)
  Preparing metadata (setup.py) ... [?25ldone
Collecting transformers[torch]
  Downloading transformers-4.41.0-py3-none-any.whl.metadata (43 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m43.8/43.8 kB[0m [31m2.9 MB/s[0m eta [36m0:00:00[0m
Collecting huggingface-hub<1.0,>=0.23.0 (from transformers[torch])
  Downloading huggingface_hub-0.23.0-py3-none-any.whl.metadata (12 kB)
Collecting tokenizers<0.20,>=0.19 (from transformers[torch])
  Downloading tokenizers-0.19.1-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64

2024-05-21 11:57:32.878510: E external/local_xla/xla/stream_executor/cuda/cuda_dnn.cc:9261] Unable to register cuDNN factory: Attempting to register factory for plugin cuDNN when one has already been registered
2024-05-21 11:57:32.878640: E external/local_xla/xla/stream_executor/cuda/cuda_fft.cc:607] Unable to register cuFFT factory: Attempting to register factory for plugin cuFFT when one has already been registered
2024-05-21 11:57:33.017919: E external/local_xla/xla/stream_executor/cuda/cuda_blas.cc:1515] Unable to register cuBLAS factory: Attempting to register factory for plugin cuBLAS when one has already been registered


# verify that packages are install correctly

In [2]:
import torch
import transformers
import datasets
import sentencepiece
import sacrebleu
import evaluate

print(f"transformers version: {transformers.__version__}")
print(f"datasets version: {datasets.__version__}")
print(f"sentencepiece version: {sentencepiece.__version__}")
print(f"sacrebleu version: {sacrebleu.__version__}")
print(f"evaluate version: {evaluate.__version__}")
print("CUDA available:", torch.cuda.is_available())

transformers version: 4.41.0
datasets version: 2.19.1
sentencepiece version: 0.2.0
sacrebleu version: 2.4.2
evaluate version: 0.4.2
CUDA available: True


# Load the dataset 

In [81]:
# List of CSV files to load
csv_files = [
    '/kaggle/input/scb-mt-en-th-2020/scb-mt-en-th-2020/wikipedia.csv',
    '/kaggle/input/scb-mt-en-th-2020/scb-mt-en-th-2020/assorted_government.csv',
    '/kaggle/input/scb-mt-en-th-2020/scb-mt-en-th-2020/generated_reviews_crowd.csv',
    '/kaggle/input/scb-mt-en-th-2020/scb-mt-en-th-2020/generated_reviews_translator.csv',
    '/kaggle/input/scb-mt-en-th-2020/scb-mt-en-th-2020/generated_reviews_yn.csv',
    '/kaggle/input/scb-mt-en-th-2020/scb-mt-en-th-2020/mozilla_common_voice.csv',
    '/kaggle/input/scb-mt-en-th-2020/scb-mt-en-th-2020/msr_paraphrase.csv',
    '/kaggle/input/scb-mt-en-th-2020/scb-mt-en-th-2020/nus_sms.csv',
    '/kaggle/input/scb-mt-en-th-2020/scb-mt-en-th-2020/paracrawl.csv',
    '/kaggle/input/scb-mt-en-th-2020/scb-mt-en-th-2020/task_master_1.csv',
    '/kaggle/input/scb-mt-en-th-2020/scb-mt-en-th-2020/thai_websites.csv'
]

# Load the CSV files
dataframes = [pd.read_csv(file) for file in csv_files]
data_df = pd.concat(dataframes, ignore_index=True)

# Rename columns 
data_df = data_df.rename(columns={'en_text': 'source_text', 'th_text': 'target_text'})

# Check the size of the dataset
print(f"Total number of examples: {len(data_df)}")

Total number of examples: 988249


In [82]:
# Use only a subset of the dataset for training to reduce time
subset_fraction = 0.12  # Use 12% of the dataset
data_df = data_df.sample(frac=subset_fraction, random_state=42) 

# Split the dataset into train and test sets
train_df = data_df.sample(frac=0.8, random_state=42)
test_df = data_df.drop(train_df.index)


In [84]:
from datasets import Dataset

# Balance the dataset by duplicating and reversing sentence pairs
train_en_th = train_df.sample(frac=0.5, random_state=42)
train_th_en = train_en_th.rename(columns={'source_text': 'target_text', 'target_text': 'source_text'})
train_balanced_df = pd.concat([train_en_th, train_th_en], axis=0)

# Convert DataFrame to Dataset
train_dataset = Dataset.from_pandas(train_balanced_df)

# Use only a smaller subset of the test dataset
test_df = test_df.sample(frac=0.1, random_state=42)  # Use 10% of the test dataset
test_dataset = Dataset.from_pandas(test_df)


# Verify sizes after splitting
print(f"Number of training examples: {len(train_balanced_df)}")
print(f"Number of test examples: {len(test_df)}")



Number of training examples: 94872
Number of test examples: 2372


In [None]:
# Load pre-trained tokenizer and model
tokenizer = BartTokenizer.from_pretrained('facebook/bart-large')
model = BartForConditionalGeneration.from_pretrained('facebook/bart-large')

# Tokenization function
def tokenize_function(examples):
    source = examples['source_text']
    target = examples['target_text']
    model_inputs = tokenizer(source, max_length=64, truncation=True, padding='max_length')
    labels = tokenizer(text_target=target, max_length=64, truncation=True, padding='max_length')
    model_inputs["labels"] = labels["input_ids"]
    return model_inputs

# Tokenize datasets
train_dataset = train_dataset.map(tokenize_function, batched=True)
test_dataset = test_dataset.map(tokenize_function, batched=True)
train_dataset.set_format(type='torch', columns=['input_ids', 'attention_mask', 'labels'])
test_dataset.set_format(type='torch', columns=['input_ids', 'attention_mask', 'labels'])


# Verify the number of examples after tokenization
print(f"Number of tokenized training examples: {len(train_dataset)}")
print(f"Number of tokenized test examples: {len(test_dataset)}")


In [15]:
# Define training arguments
training_args = Seq2SeqTrainingArguments(
    output_dir='./results',
    num_train_epochs=1,  # Reduce number of epochs
    per_device_train_batch_size=1,  # Reduce batch size
    per_device_eval_batch_size=1,  # Reduce batch size
    warmup_steps=500,
    weight_decay=0.01,
    logging_dir='./logs',
    logging_steps=100,
    evaluation_strategy="steps",
    save_steps=1000,
    eval_steps=1000,
    learning_rate=5e-5,
    predict_with_generate=True,
    gradient_accumulation_steps=16,  # Increase gradient accumulation
    fp16=True if torch.cuda.is_available() else False,
    save_total_limit=3,
    report_to="none",
    load_best_model_at_end=True,
    metric_for_best_model="bleu",
    greater_is_better=True
)

# Define evaluation metrics
metric_bleu = evaluate.load('sacrebleu')
metric_rouge = evaluate.load('rouge')


def compute_metrics(eval_pred):
    predictions, labels = eval_pred
    decoded_preds = tokenizer.batch_decode(predictions, skip_special_tokens=True)
    decoded_labels = tokenizer.batch_decode(labels, skip_special_tokens=True)
    decoded_preds = [pred.replace("\n", " ") for pred in decoded_preds]
    decoded_labels = [label.replace("\n", " ") for label in decoded_labels]
    decoded_labels = [[label] for label in decoded_labels]
    bleu_result = metric_bleu.compute(predictions=decoded_preds, references=decoded_labels)
    rouge_result = metric_rouge.compute(predictions=decoded_preds, references=decoded_labels)
    rouge1_fmeasure = rouge_result['rouge1']
    rouge2_fmeasure = rouge_result['rouge2']
    rougeL_fmeasure = rouge_result['rougeL']
    result = {
        "bleu": bleu_result["score"],
        "rouge1": rouge1_fmeasure,
        "rouge2": rouge2_fmeasure,
        "rougeL": rougeL_fmeasure,
    }
    return result




In [16]:
# Initialize the trainer with early stopping
trainer = Seq2SeqTrainer(
    model=model,
    args=training_args,
    train_dataset=train_dataset,
    eval_dataset=test_dataset,
    tokenizer=tokenizer,
    compute_metrics=compute_metrics,
    callbacks=[EarlyStoppingCallback(early_stopping_patience=3)]  # Increase patience
)

# Train the model
trainer.train()

Step,Training Loss,Validation Loss,Bleu,Rouge1,Rouge2,Rougel
1000,1.3224,1.062462,6.470108,0.026844,0.014404,0.026401
2000,1.1485,0.881074,6.75184,0.033744,0.016658,0.033401
3000,1.0557,0.766987,9.012571,0.033445,0.014595,0.033081
4000,0.9893,0.709166,10.573017,0.043552,0.018963,0.043167
5000,0.9745,0.663262,10.775226,0.044642,0.017878,0.044312


Non-default generation parameters: {'early_stopping': True, 'num_beams': 4, 'no_repeat_ngram_size': 3, 'forced_bos_token_id': 0, 'forced_eos_token_id': 2}
Non-default generation parameters: {'early_stopping': True, 'num_beams': 4, 'no_repeat_ngram_size': 3, 'forced_bos_token_id': 0, 'forced_eos_token_id': 2}
Non-default generation parameters: {'early_stopping': True, 'num_beams': 4, 'no_repeat_ngram_size': 3, 'forced_bos_token_id': 0, 'forced_eos_token_id': 2}
Non-default generation parameters: {'early_stopping': True, 'num_beams': 4, 'no_repeat_ngram_size': 3, 'forced_bos_token_id': 0, 'forced_eos_token_id': 2}
Non-default generation parameters: {'early_stopping': True, 'num_beams': 4, 'no_repeat_ngram_size': 3, 'forced_bos_token_id': 0, 'forced_eos_token_id': 2}
There were missing keys in the checkpoint model loaded: ['model.encoder.embed_tokens.weight', 'model.decoder.embed_tokens.weight', 'lm_head.weight'].


TrainOutput(global_step=5929, training_loss=1.2391288993450698, metrics={'train_runtime': 15116.8448, 'train_samples_per_second': 6.276, 'train_steps_per_second': 0.392, 'total_flos': 1.2848763185922048e+16, 'train_loss': 1.2391288993450698, 'epoch': 0.9999156758579981})

In [17]:
# Evaluate the model
eval_results = trainer.evaluate()
print(eval_results)

# Save the model
model.save_pretrained('./eng-tha-translation-model')
tokenizer.save_pretrained('./eng-tha-translation-model')



Non-default generation parameters: {'early_stopping': True, 'num_beams': 4, 'no_repeat_ngram_size': 3, 'forced_bos_token_id': 0, 'forced_eos_token_id': 2}


{'eval_loss': 0.6632624268531799, 'eval_bleu': 10.775226257004267, 'eval_rouge1': 0.044642346309467645, 'eval_rouge2': 0.017877552932359, 'eval_rougeL': 0.04431212428248934, 'eval_runtime': 947.7877, 'eval_samples_per_second': 2.503, 'eval_steps_per_second': 2.503, 'epoch': 0.9999156758579981}


('./eng-tha-translation-model/tokenizer_config.json',
 './eng-tha-translation-model/special_tokens_map.json',
 './eng-tha-translation-model/vocab.json',
 './eng-tha-translation-model/merges.txt',
 './eng-tha-translation-model/added_tokens.json')

**end train1**

# Translation function

In [13]:
def translate(text, source_lang='en', target_lang='th'):
    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
    
    # Tokenize the input text
    inputs = tokenizer(text, return_tensors="pt", max_length=128, truncation=True, padding='max_length')
    inputs = {key: value.to(device) for key, value in inputs.items()}  # Move inputs to the same device as the model
    
    model.to(device)  # Move model to the correct device
    
    # Generate translation
    outputs = model.generate(input_ids=inputs['input_ids'], attention_mask=inputs['attention_mask'], max_length=128)
    
    # Decode the generated outputs
    translated_text = tokenizer.decode(outputs[0], skip_special_tokens=True)
    
    return translated_text

def test_translation():
    direction = input("Enter translation direction (1 for EN-TH, 2 for TH-EN): ")
    text = input("Enter the text to translate: ")
    if direction == '1':
        translated_text = translate(text, source_lang='en', target_lang='th')
    else:
        translated_text = translate(text, source_lang='th', target_lang='en')
    print(f"Translated text: {translated_text}")



train2 start and load model1 from zip with new data set

In [28]:
# Load all CSV files
dfs = [pd.read_csv(file) for file in csv_files]
data_df = pd.concat(dfs, axis=0)

# Use a smaller subset of the dataset for training
subset_fraction = 0.11  # Use 11% of the dataset for faster training
data_df = data_df.sample(frac=subset_fraction, random_state=50)

# Split the dataset into train and test sets
train_df = data_df.sample(frac=0.8, random_state=50) #80 percent of data set
test_df = data_df.drop(train_df.index) #20 percent of data set

# Use only a smaller subset of the test dataset
test_df = test_df.sample(frac=0.2, random_state=50)  # 20% of the test dataset
test_dataset = Dataset.from_pandas(test_df)

# Balance the dataset by duplicating and reversing sentence pairs
train_en_th = train_df.sample(frac=0.8, random_state=50)
train_th_en = train_en_th.rename(columns={'en_text': 'th_text', 'th_text': 'en_text'})
train_balanced_df = pd.concat([train_en_th, train_th_en], axis=0)

# Convert DataFrame to Dataset
train_dataset = Dataset.from_pandas(train_balanced_df)
test_dataset = Dataset.from_pandas(test_df)






In [29]:
# Verify sizes after splitting
print(f"Number of training examples: {len(train_balanced_df)}")
print(f"Number of test examples: {len(test_df)}")


Number of training examples: 139146
Number of test examples: 2981


In [30]:
# Load the pre-trained tokenizer and model from the first model
model_path = '/kaggle/input/model1/eng-tha-translation-model'
tokenizer = BartTokenizer.from_pretrained(model_path)
model = BartForConditionalGeneration.from_pretrained(model_path)

# Tokenization function
def tokenize_function(examples):
    source = examples['en_text']
    target = examples['th_text']
    model_inputs = tokenizer(source, max_length=64, truncation=True, padding='max_length')
    labels = tokenizer(text_target=target, max_length=64, truncation=True, padding='max_length')
    model_inputs['labels'] = labels['input_ids']
    return model_inputs

In [31]:
# Tokenize the datasets
train_dataset = train_dataset.map(tokenize_function, batched=True)
test_dataset = test_dataset.map(tokenize_function, batched=True)

train_dataset.set_format(type='torch', columns=['input_ids', 'attention_mask', 'labels'])
test_dataset.set_format(type='torch', columns=['input_ids', 'attention_mask', 'labels'])

Map:   0%|          | 0/139146 [00:00<?, ? examples/s]

Map:   0%|          | 0/2981 [00:00<?, ? examples/s]

In [32]:
print(f"Number of tokenized training examples: {len(train_dataset)}")
print(f"Number of tokenized test examples: {len(test_dataset)}")

Number of tokenized training examples: 139146
Number of tokenized test examples: 2981


In [34]:
def compute_metrics(eval_pred):
    predictions, labels = eval_pred
    decoded_preds = tokenizer.batch_decode(predictions, skip_special_tokens=True)
    decoded_labels = tokenizer.batch_decode(labels, skip_special_tokens=True)
    decoded_preds = [pred.replace("\n", " ") for pred in decoded_preds]
    decoded_labels = [label.replace("\n", " ") for label in decoded_labels]
    decoded_labels = [[label] for label in decoded_labels]
    bleu_result = metric_bleu.compute(predictions=decoded_preds, references=decoded_labels)
    rouge_result = metric_rouge.compute(predictions=decoded_preds, references=decoded_labels)
    rouge1_fmeasure = rouge_result['rouge1']
    rouge2_fmeasure = rouge_result['rouge2']
    rougeL_fmeasure = rouge_result['rougeL']
    result = {
        "bleu": bleu_result["score"],
        "rouge1": rouge1_fmeasure,
        "rouge2": rouge2_fmeasure,
        "rougeL": rougeL_fmeasure,
    }
    return result

In [40]:
# Define training arguments
training_args = Seq2SeqTrainingArguments(
    output_dir='./results',
    num_train_epochs=1,  
    per_device_train_batch_size=8,  
    per_device_eval_batch_size=8,  
    warmup_steps=500,
    weight_decay=0.01,
    logging_dir='./logs',
    logging_steps=1000,  
    evaluation_strategy="steps",
    save_steps=2000,  
    eval_steps=2000,  
    learning_rate=5e-5,  
    predict_with_generate=True,
    gradient_accumulation_steps=8,  # gradient accumulation 
    fp16=True if torch.cuda.is_available() else False,
    save_total_limit=3,
    report_to="none",
    load_best_model_at_end=True,
    metric_for_best_model="bleu",
    greater_is_better=True
)


trainer = Seq2SeqTrainer(
    model=model,
    args=training_args,
    train_dataset=train_dataset,
    eval_dataset=test_dataset,
    tokenizer=tokenizer,
    compute_metrics=compute_metrics,
    callbacks=[EarlyStoppingCallback(early_stopping_patience=3)]
)





In [None]:
trainer.train()

In [42]:
# save the second model  after second train
trainer.save_model('./eng-tha-translation-second-model')
tokenizer.save_pretrained('./eng-tha-translation-second-model')

Non-default generation parameters: {'early_stopping': True, 'num_beams': 4, 'no_repeat_ngram_size': 3, 'forced_bos_token_id': 0, 'forced_eos_token_id': 2}


('./eng-tha-translation-second-model/tokenizer_config.json',
 './eng-tha-translation-second-model/special_tokens_map.json',
 './eng-tha-translation-second-model/vocab.json',
 './eng-tha-translation-second-model/merges.txt',
 './eng-tha-translation-second-model/added_tokens.json')

# result after second train

In [44]:
test_translation()

Enter translation direction (1 for EN-TH, 2 for TH-EN):  1
Enter the text to translate:  กินข้าว


Translated text: Very disappointing.


# third train with new random data set

In [46]:
dfs = [pd.read_csv(file) for file in csv_files]
data_df = pd.concat(dfs, axis=0)


subset_fraction = 0.05  
data_df = data_df.sample(frac=subset_fraction, random_state=50)


train_df = data_df.sample(frac=0.8, random_state=25) #80 percent of data set
test_df = data_df.drop(train_df.index) #20 percent of data set


test_df = test_df.sample(frac=0.2, random_state=25)  # Use 10% of the test dataset
test_dataset = Dataset.from_pandas(test_df)


train_en_th = train_df.sample(frac=0.8, random_state=25)
train_th_en = train_en_th.rename(columns={'en_text': 'th_text', 'th_text': 'en_text'})
train_balanced_df = pd.concat([train_en_th, train_th_en], axis=0)


train_dataset = Dataset.from_pandas(train_balanced_df)
test_dataset = Dataset.from_pandas(test_df)


In [47]:
print(f"Number of training examples: {len(train_balanced_df)}")
print(f"Number of test examples: {len(test_df)}")


Number of training examples: 63248
Number of test examples: 1662


In [None]:
train_dataset = train_dataset.map(tokenize_function, batched=True)
test_dataset = test_dataset.map(tokenize_function, batched=True)
train_dataset.set_format(type='torch', columns=['input_ids', 'attention_mask', 'labels'])
test_dataset.set_format(type='torch', columns=['input_ids', 'attention_mask', 'labels'])

In [None]:
metric_bleu = evaluate.load('sacrebleu')
metric_rouge = evaluate.load('rouge')

tokenizer = BartTokenizer.from_pretrained('./eng-tha-translation-second-model')
model = BartForConditionalGeneration.from_pretrained('./eng-tha-translation-second-model')


In [58]:
# Define training arguments for further training
training_args = Seq2SeqTrainingArguments(
    output_dir='./results',
    num_train_epochs=1,  
    per_device_train_batch_size=8,  
    per_device_eval_batch_size=8,  
    warmup_steps=500,
    weight_decay=0.01,
    logging_dir='./logs',
    logging_steps=1000, 
    evaluation_strategy="steps",
    save_steps=500, 
    eval_steps=500, 
    learning_rate=1e-4,  
    predict_with_generate=True,
    gradient_accumulation_steps=8,
    fp16=True if torch.cuda.is_available() else False,
    save_total_limit=3,
    report_to="none",
    load_best_model_at_end=True,
    metric_for_best_model="bleu",
    greater_is_better=True
)

# Initialize the trainer with early stopping
trainer = Seq2SeqTrainer(
    model=model,
    args=training_args,
    train_dataset=train_dataset,
    eval_dataset=test_dataset,
    tokenizer=tokenizer,
    compute_metrics=compute_metrics,
    callbacks=[EarlyStoppingCallback(early_stopping_patience=2)]  # Reduce patience for quicker stopping
)




In [59]:
trainer.train()

Step,Training Loss,Validation Loss,Bleu,Rouge1,Rouge2,Rougel
500,No log,0.546934,10.175849,0.038889,0.010396,0.03762


Non-default generation parameters: {'early_stopping': True, 'num_beams': 4, 'no_repeat_ngram_size': 3, 'forced_bos_token_id': 0, 'forced_eos_token_id': 2}
There were missing keys in the checkpoint model loaded: ['model.encoder.embed_tokens.weight', 'model.decoder.embed_tokens.weight', 'lm_head.weight'].


TrainOutput(global_step=988, training_loss=0.8553760235126202, metrics={'train_runtime': 2331.1065, 'train_samples_per_second': 27.132, 'train_steps_per_second': 0.424, 'total_flos': 8564397387546624.0, 'train_loss': 0.8553760235126202, 'epoch': 0.9997470275739945})

In [60]:
# save the third trained model 
trainer.save_model('./eng-tha-translation-third-model')
tokenizer.save_pretrained('./eng-tha-translation-third-model')

Non-default generation parameters: {'early_stopping': True, 'num_beams': 4, 'no_repeat_ngram_size': 3, 'forced_bos_token_id': 0, 'forced_eos_token_id': 2}


('./eng-tha-translation-third-model/tokenizer_config.json',
 './eng-tha-translation-third-model/special_tokens_map.json',
 './eng-tha-translation-third-model/vocab.json',
 './eng-tha-translation-third-model/merges.txt',
 './eng-tha-translation-third-model/added_tokens.json')

In [51]:
#zip the second model and create download link
!zip -r ./eng-tha-translation-second-model.zip ./eng-tha-translation-second-model

from IPython.display import FileLink
FileLink(r'eng-tha-translation-second-model.zip')

  adding: eng-tha-translation-second-model/ (stored 0%)
  adding: eng-tha-translation-second-model/special_tokens_map.json (deflated 85%)
  adding: eng-tha-translation-second-model/generation_config.json (deflated 45%)
  adding: eng-tha-translation-second-model/training_args.bin (deflated 51%)
  adding: eng-tha-translation-second-model/model.safetensors (deflated 7%)
  adding: eng-tha-translation-second-model/tokenizer_config.json (deflated 76%)
  adding: eng-tha-translation-second-model/vocab.json (deflated 68%)
  adding: eng-tha-translation-second-model/config.json (deflated 63%)
  adding: eng-tha-translation-second-model/merges.txt (deflated 53%)


In [61]:
#zip the third model
!zip -r ./eng-tha-translation-third-model.zip ./eng-tha-translation-third-model

from IPython.display import FileLink
FileLink(r'eng-tha-translation-third-model.zip')

  adding: eng-tha-translation-third-model/ (stored 0%)
  adding: eng-tha-translation-third-model/special_tokens_map.json (deflated 85%)
  adding: eng-tha-translation-third-model/generation_config.json (deflated 45%)
  adding: eng-tha-translation-third-model/training_args.bin (deflated 51%)
  adding: eng-tha-translation-third-model/model.safetensors (deflated 7%)
  adding: eng-tha-translation-third-model/tokenizer_config.json (deflated 76%)
  adding: eng-tha-translation-third-model/vocab.json (deflated 68%)
  adding: eng-tha-translation-third-model/config.json (deflated 63%)
  adding: eng-tha-translation-third-model/merges.txt (deflated 53%)


# translation after thrid train

In [70]:
test_translation()

Enter translation direction (1 for EN-TH, 2 for TH-EN):  2
Enter the text to translate:  ฉันชอบรถ


Translated text: I love the car.


# fourth train

In [71]:
dfs = [pd.read_csv(file) for file in csv_files]
data_df = pd.concat(dfs, axis=0)


subset_fraction = 0.05  #5 percent
data_df = data_df.sample(frac=subset_fraction, random_state=80)


train_df = data_df.sample(frac=0.8, random_state=80) #80 percent of data set
test_df = data_df.drop(train_df.index) #20 percent of data set


test_df = test_df.sample(frac=0.2, random_state=80)  # Use 10% of the test dataset
test_dataset = Dataset.from_pandas(test_df)


train_en_th = train_df.sample(frac=0.8, random_state=80)
train_th_en = train_en_th.rename(columns={'en_text': 'th_text', 'th_text': 'en_text'})
train_balanced_df = pd.concat([train_en_th, train_th_en], axis=0)


train_dataset = Dataset.from_pandas(train_balanced_df)
test_dataset = Dataset.from_pandas(test_df)

train_dataset = train_dataset.map(tokenize_function, batched=True)
test_dataset = test_dataset.map(tokenize_function, batched=True)
train_dataset.set_format(type='torch', columns=['input_ids', 'attention_mask', 'labels'])
test_dataset.set_format(type='torch', columns=['input_ids', 'attention_mask', 'labels'])

Map:   0%|          | 0/63248 [00:00<?, ? examples/s]

Map:   0%|          | 0/1646 [00:00<?, ? examples/s]

In [72]:
tokenizer = BartTokenizer.from_pretrained('./eng-tha-translation-third-model')
model = BartForConditionalGeneration.from_pretrained('./eng-tha-translation-third-model')

In [73]:
training_args = Seq2SeqTrainingArguments(
    output_dir='./results',
    num_train_epochs=1,  
    per_device_train_batch_size=8,  
    per_device_eval_batch_size=8,  
    warmup_steps=500,
    weight_decay=0.01,
    logging_dir='./logs',
    logging_steps=1000, 
    evaluation_strategy="steps",
    save_steps=500, 
    eval_steps=500, 
    learning_rate=1e-4,  
    predict_with_generate=True,
    gradient_accumulation_steps=8,
    fp16=True if torch.cuda.is_available() else False,
    save_total_limit=3,
    report_to="none",
    load_best_model_at_end=True,
    metric_for_best_model="bleu",
    greater_is_better=True
)

trainer = Seq2SeqTrainer(
    model=model,
    args=training_args,
    train_dataset=train_dataset,
    eval_dataset=test_dataset,
    tokenizer=tokenizer,
    compute_metrics=compute_metrics,
    callbacks=[EarlyStoppingCallback(early_stopping_patience=2)]  
)




In [74]:
trainer.train()

Step,Training Loss,Validation Loss,Bleu,Rouge1,Rouge2,Rougel
500,No log,0.510781,10.624615,0.031829,0.011363,0.031941


Non-default generation parameters: {'early_stopping': True, 'num_beams': 4, 'no_repeat_ngram_size': 3, 'forced_bos_token_id': 0, 'forced_eos_token_id': 2}
There were missing keys in the checkpoint model loaded: ['model.encoder.embed_tokens.weight', 'model.decoder.embed_tokens.weight', 'lm_head.weight'].


TrainOutput(global_step=988, training_loss=0.8604749301184527, metrics={'train_runtime': 2348.6722, 'train_samples_per_second': 26.929, 'train_steps_per_second': 0.421, 'total_flos': 8564397387546624.0, 'train_loss': 0.8604749301184527, 'epoch': 0.9997470275739945})

In [75]:
# save the third trained model 
trainer.save_model('./eng-tha-translation-fourth-model')
tokenizer.save_pretrained('./eng-tha-translation-fourth-model')

Non-default generation parameters: {'early_stopping': True, 'num_beams': 4, 'no_repeat_ngram_size': 3, 'forced_bos_token_id': 0, 'forced_eos_token_id': 2}


('./eng-tha-translation-fourth-model/tokenizer_config.json',
 './eng-tha-translation-fourth-model/special_tokens_map.json',
 './eng-tha-translation-fourth-model/vocab.json',
 './eng-tha-translation-fourth-model/merges.txt',
 './eng-tha-translation-fourth-model/added_tokens.json')

In [76]:
#zip the fourth model 
!zip -r ./eng-tha-translation-fourth-model.zip ./eng-tha-translation-fourth-model

from IPython.display import FileLink
FileLink(r'eng-tha-translation-fourth-model.zip')

  adding: eng-tha-translation-fourth-model/ (stored 0%)
  adding: eng-tha-translation-fourth-model/special_tokens_map.json (deflated 85%)
  adding: eng-tha-translation-fourth-model/generation_config.json (deflated 45%)
  adding: eng-tha-translation-fourth-model/training_args.bin (deflated 51%)
  adding: eng-tha-translation-fourth-model/model.safetensors (deflated 7%)
  adding: eng-tha-translation-fourth-model/tokenizer_config.json (deflated 76%)
  adding: eng-tha-translation-fourth-model/vocab.json (deflated 68%)
  adding: eng-tha-translation-fourth-model/config.json (deflated 64%)
  adding: eng-tha-translation-fourth-model/merges.txt (deflated 53%)


# Example results after fourth train.

In [89]:
test_translation()

Enter translation direction (1 for EN-TH, 2 for TH-EN):  1
Enter the text to translate:  i love studying


Translated text: ฉันชอบการเรียน


In [103]:
test_translation()


Enter translation direction (1 for EN-TH, 2 for TH-EN):  2
Enter the text to translate:  ฉันสอบเสร็จเมื่อวันจันทร์


Translated text: I was so excited about it when it arrived.


In [104]:
test_translation()

Enter translation direction (1 for EN-TH, 2 for TH-EN):  1
Enter the text to translate:  I want more time.


Translated text: ฉันต้องการเวลา


In [105]:
test_translation()

Enter translation direction (1 for EN-TH, 2 for TH-EN):  1
Enter the text to translate:  How are you?


Translated text: คุณเป็นอย่างไร


In [106]:
test_translation()

Enter translation direction (1 for EN-TH, 2 for TH-EN):  1
Enter the text to translate:  I finished my exam last week


Translated text: ฉันจบการสอบได้เมื่อแล้ว


In [115]:
test_translation()

Enter translation direction (1 for EN-TH, 2 for TH-EN):  2
Enter the text to translate:  ฉันอ่านหนังสือ


Translated text: I read the book.
