<!DOCTYPE html>
<html lang="en">
    <head>
        <meta charset="UTF-8" />
        <meta http-equiv="X-UA-Compatible" content="IE=edge" />
        <meta name="viewport" content="width=device-width, initial-scale=1.0" />
        <title>UAS NLP</title>
        <style>
            body {
                font-family: Arial, sans-serif;
                background-color: #f0f0f0;
                margin: 0;
                padding: 0;
            }
            .container {
                text-align: center;
                margin-top: 50px;
            }
            h1 {
                color: #fff; /* Adjusted color for dark mode */
                font-size: 36px;
                margin-bottom: 10px;
            }
            h2 {
                color: #ccc; /* Adjusted color for dark mode */
                font-size: 24px;
                margin-bottom: 20px;
            }
            .members {
                font-size: 18px;
                line-height: 1.6;
                margin-top: 20px;
            }
            .member {
                display: block;
                margin-bottom: 10px;
            }
        </style>
    </head>
    <body>
        <div class="container">
            <h1>Ujian Akhir Semester</h1>
            <h2>IBDA3322 / Natural Language Processing</h2>
            <p class="members">
                <span class="member">Jennifer Atalya (202000208)</span>
                <span class="member">Renata Valencia (202001021)</span>
                <span class="member">Stefannus Christian (202000138)</span>
            </p>
        </div>
    </body>
</html>

# Import Libraries

In [12]:
# Import libraries
from transformers import (
    MBart50TokenizerFast,
    MBartForConditionalGeneration,
    Seq2SeqTrainingArguments,
    DataCollatorForSeq2Seq,
    Seq2SeqTrainer,
    EarlyStoppingCallback
)
import csv
import re
import os
import matplotlib.pyplot as plt
import pandas as pd
import numpy as np
import pickle
import evaluate
from datasets import load_dataset
import warnings
# Ignore warnings (not recommended for production)
warnings.filterwarnings('ignore')

# Define Global Variables

In [16]:
# Define training parameters
TRAIN: bool = True  # Set to True to train the model
SEED = 42  # Random seed for reproducibility

# Define model and data paths
MODEL_PATH = "model/mbart-large-50-one-to-many-mmt-finetuned-en-to-id"
NUMBER_OF_DATA_TO_FINETUNED = 2000  # Limit data used for fine-tuning
PREFIX = ""  # Prefix to add before source language text
SOURCE_LANG = "en"  # Source language
TARGET_LANG = "id"  # Target language
MAX_INPUT_LENGTH = 128  # Maximum length of input sequence
MAX_TARGET_LENGTH = 128  # Maximum length of target sequence

TRAINING_RESULTS_DIRECTORY_BASE_PATH = "./training_results/"
TRAINING_RESULTS_FILENAME = "training_results.csv"
LOG_RESULTS_FILENAME = "log.pkl"

TRAINING_RESULTS_PATH = os.path.join(TRAINING_RESULTS_DIRECTORY_BASE_PATH, TRAINING_RESULTS_FILENAME)
LOG_RESULTS_PATH = os.path.join(TRAINING_RESULTS_DIRECTORY_BASE_PATH, LOG_RESULTS_FILENAME)

print(f'Will save training results csv to {TRAINING_RESULTS_PATH}')
print(f'Will save log results (log_history, bleu_log, meteor_log, eval_log) to {LOG_RESULTS_PATH}')

Will save training results csv to ./training_results/training_results.csv
Will save log results (log_history, bleu_log, meteor_log, eval_log) to ./training_results/log.pkl


# Load Raw Datasets

In [15]:
# Load dataset
raw_datasets = load_dataset("Helsinki-NLP/opus-100", "en-id")
model_mbart = 'facebook/mbart-large-50-one-to-many-mmt'
display(raw_datasets)  # View dataset information

DatasetDict({
    test: Dataset({
        features: ['translation'],
        num_rows: 2000
    })
    train: Dataset({
        features: ['translation'],
        num_rows: 1000000
    })
    validation: Dataset({
        features: ['translation'],
        num_rows: 2000
    })
})

In [16]:
# Load tokenizer for the pre-trained model
tokenizer = MBart50TokenizerFast.from_pretrained(
    model_mbart.replace('one-to-many-mmt',
                        'many-to-many-mmt'),  # Fix model name
    src_lang="en_XX",
    tgt_lang="id_ID"
)

tokenizer_config.json:   0%|          | 0.00/529 [00:00<?, ?B/s]

sentencepiece.bpe.model:   0%|          | 0.00/5.07M [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/649 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/1.43k [00:00<?, ?B/s]

In [7]:
def lowercase_text(text: str) -> str:
    text = text.lower()
    return text

def remove_extra_spaces(text :str) -> str:
    """
    Removes extra spaces from a string using regular expression.

    Args:
    - text (str): The input text with extra spaces.

    Returns:
    - str: The text with extra spaces removed.
    """
    # Use regular expression to substitute multiple spaces with a single space
    cleaned_text = re.sub(' +', ' ', text)
    return cleaned_text.strip()

def preprocess_function(examples):
    """
    Preprocess data examples:
        - Add prefix to source language text
        - Lowercase source language text
        - Remove extra spaces from source and target language text
        - Encode text with tokenizer
    """
    # Lowercase source language text and remove extra spaces
    inputs = [PREFIX + lowercase_text(remove_extra_spaces(ex[SOURCE_LANG])) for ex in examples["translation"]]
    # Lowercase target language text and remove extra spaces
    targets = [lowercase_text(remove_extra_spaces(ex[TARGET_LANG])) for ex in examples["translation"]]

    model_inputs = tokenizer(
        inputs, max_length=MAX_INPUT_LENGTH, truncation=True)

    # Set tokenizer for target language
    with tokenizer.as_target_tokenizer():
        labels = tokenizer(
            targets, max_length=MAX_TARGET_LENGTH, truncation=True)
    model_inputs["labels"] = labels["input_ids"]
    return model_inputs

In [11]:
lowercase_text(remove_extra_spaces("  The     quICk brown    FOX "))

'the quick brown fox'

In [19]:
# Preprocess dataset
tokenized_datasets = raw_datasets.map(preprocess_function, batched=True)

# Split datasets into train and validation sets
train_tokenized_dataset = tokenized_datasets["train"]
eval_tokenized_dataset = tokenized_datasets["validation"]
print(f"Number of train tokenized data: {len(train_tokenized_dataset)}")
print(f"Number of eval tokenized data: {len(eval_tokenized_dataset)}")

# Select a small subset of data for fine-tuning (for faster training)
small_train_dataset = train_tokenized_dataset.shuffle(
    seed=SEED).select(range(NUMBER_OF_DATA_TO_FINETUNED))
small_eval_dataset = eval_tokenized_dataset.shuffle(
    seed=SEED).select(range(NUMBER_OF_DATA_TO_FINETUNED))

print(f"Number of small_train_dataset {len(small_train_dataset)}")
print(f"Number of small_eval_dataset {len(small_eval_dataset)}")

Map:   0%|          | 0/2000 [00:00<?, ? examples/s]

Number of train tokenized data: 1000000
Number of eval tokenized data: 2000
Number of small_train_dataset 2000
Number of small_eval_dataset 2000


In [20]:
# Load pre-trained model (make sure to use 'cuda' for GPU training)
model = MBartForConditionalGeneration.from_pretrained(model_mbart).cuda()

In [42]:
# Define training hyperparameters
hyperparameters = {
    'learning_rate': 2e-5,
    'batch_size': 16,
    'num_epochs': 10
}

# Define training arguments (`Seq2SeqTrainingArguments`)
args = Seq2SeqTrainingArguments(
    # Create a name for your fine-tuned model
    f"{MODEL_NAME}-finetuned-{SOURCE_LANG}-to-{TARGET_LANG}",
    # Evaluate the model after every epoch
    evaluation_strategy="epoch",
    # Save the model after every epoch
    save_strategy="epoch",
    # Set learning rate from hyperparameters
    learning_rate=hyperparameters["learning_rate"],
    # Batch size for training on a single device
    per_device_train_batch_size=hyperparameters["batch_size"],
    # Batch size for evaluation on a single device
    per_device_eval_batch_size=hyperparameters["batch_size"],
    # Weight decay to avoid overfitting
    weight_decay=0.01,
    # Limit number of saved models during training
    save_total_limit=hyperparameters["num_epochs"],
    # Set the total number of training epochs
    num_train_epochs=hyperparameters["num_epochs"],
    # Generate text with beam search during evaluation
    predict_with_generate=True,
    # Load the best model from the training process for evaluation
    load_best_model_at_end=True,
)

In [21]:
# Create a data collator to prepare batches for training and evaluation
data_collator = DataCollatorForSeq2Seq(tokenizer, model=model)

In [44]:
# Load evaluation metrics: BLEU score and METEOR
metric = evaluate.load("sacrebleu")
meteor = evaluate.load('meteor')

def postprocess_text(preds, labels):
    preds = [pred.strip() for pred in preds]
    labels = [[label.strip()] for label in labels]
    return preds, labels

def compute_metrics(eval_preds):
    preds, labels = eval_preds
    if isinstance(preds, tuple):
        preds = preds[0]
    decoded_preds = tokenizer.batch_decode(preds, skip_special_tokens=True)
    # Replace -100 in the labels as we can't decode them.
    labels = np.where(labels != -100, labels, tokenizer.pad_token_id)
    decoded_labels = tokenizer.batch_decode(labels, skip_special_tokens=True)
    # Some simple post-processing
    decoded_preds, decoded_labels = postprocess_text(
        decoded_preds, decoded_labels)
    result = metric.compute(predictions=decoded_preds,
                            references=decoded_labels)
    meteor_result = meteor.compute(
        predictions=decoded_preds, references=decoded_labels)
    prediction_lens = [np.count_nonzero(
        pred != tokenizer.pad_token_id) for pred in preds]
    result = {'bleu': result['score']}
    result["gen_len"] = np.mean(prediction_lens)
    result["meteor"] = meteor_result["meteor"]
    result = {k: round(v, 4) for k, v in result.items()}
    return result

[nltk_data] Downloading package wordnet to
[nltk_data]     C:\Users\user\AppData\Roaming\nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\user\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package omw-1.4 to
[nltk_data]     C:\Users\user\AppData\Roaming\nltk_data...
[nltk_data]   Package omw-1.4 is already up-to-date!


In [45]:
if TRAIN:
    trainer = Seq2SeqTrainer(
        model,
        args,
        train_dataset=small_train_dataset,
        eval_dataset=small_eval_dataset,
        data_collator=data_collator,
        tokenizer=tokenizer,
        compute_metrics=compute_metrics,
        callbacks=[EarlyStoppingCallback(early_stopping_patience=2)]
    )
    trainer.train()
    trainer.save_model(MODEL_NAME)

    log_history = pd.DataFrame(trainer.state.log_history)
    bleu_log = log_history["eval_bleu"].dropna().to_list()
    meteor_log = log_history["eval_meteor"].dropna().to_list()
    eval_log = log_history["eval_loss"].dropna().to_list()

    if not os.path.exists(LOG_RESULTS_PATH):
        os.makedirs(LOG_RESULTS_PATH)

    # Save the variables using pickle
    with open(LOG_RESULTS_PATH, "wb") as f:
        pickle.dump((log_history, bleu_log, meteor_log, eval_log), f)
else:
    # Load the saved variables using pickle
    with open(LOG_RESULTS_PATH, "rb") as f:
        log_history, bleu_log, meteor_log, eval_log = pickle.load(f)

In [None]:
fig, ax = plt.subplots(2, 2, figsize=(20, 10))
plt.subplot(2, 2, 1)
plt.plot(range(len(bleu_log)), bleu_log, label='BLEU Score')
plt.title("BLEU Score")
plt.xlabel('Epoch')
plt.ylabel('Score')

plt.subplot(2, 2, 2)
plt.plot(range(len(meteor_log)), meteor_log, label='METEOR Score')
plt.title("METEOR Score")
plt.xlabel('Epoch')
plt.ylabel('Score')

plt.subplot(2, 2, 3)
plt.plot(range(len(eval_log)), eval_log, label='Generation Length')
plt.title("Validation Loss")
plt.xlabel('Epoch')
plt.ylabel('Score')

plt.legend()
plt.show()

In [None]:
rows = []
for i in range(len(bleu_log)):
    row = {
        'Epoch': i+1,
        'BLEU': bleu_log[i],
        'METEOR': meteor_log[i],
        'validation_loss': eval_log[i],
    }
    rows.append(row)

# Define the fieldnames for the CSV file
fieldnames = ['Epoch', 'BLEU', 'METEOR', 'validation_loss']

if not os.path.exists(TRAINING_RESULTS_PATH):
    os.makedirs(TRAINING_RESULTS_PATH)

# Write the rows to a CSV file
with open(TRAINING_RESULTS_PATH, 'w', newline='') as csvfile:
    writer = csv.DictWriter(csvfile, fieldnames=fieldnames)
    writer.writeheader()
    for row in rows:
        writer.writerow(row)

In [46]:
tokenizer = MBart50TokenizerFast.from_pretrained(MODEL_PATH, src_lang="en_XX")
model = MBartForConditionalGeneration.from_pretrained(MODEL_PATH).cuda()

src_text =  "This model is a fine-tuned checkpoint of mBART-large-50. mbart-large-50-many-to-many-mmt is fine-tuned for multilingual machine translation. It was introduced in Multilingual Translation with Extensible Multilingual Pretraining and Finetuning paper."

src_text = src_text.replace("!",".")
src_text = lowercase_text(remove_extra_spaces(src_text))
sentences = [sentence+". " for sentence in src_text.split(".") if len(sentence) > 0]
print(sentences)

translation_results = ""
for sentence in sentences:
    model_inputs = tokenizer(sentence, return_tensors="pt")

    generated_tokens = model.generate(
        **model_inputs,
        forced_bos_token_id=tokenizer.lang_code_to_id["id_ID"],
    )

    translation = tokenizer.batch_decode(
        generated_tokens, skip_special_tokens=True)
    translation_results += translation[0].strip()

print()
print(translation_results)

['My Wonderful Family. ', ' I live in a house near the mountains. ', ' I have two brothers and one sister, and I was born last. ', ' My father teaches mathematics, and my mother is a nurse at a big hospital. ', ' My brothers are very smart and work hard in school. ', ' My sister is a nervous girl, but she is very kind. ', ' My grandmother also lives with us. ', ' She came from Italy when I was two years old. ', ' She has grown old, but she is still very strong. ', ' She cooks the best food. ', ' My family is very important to me. ', ' We do lots of things together. ', ' My brothers and I like to go on long walks in the mountains. ', ' My sister likes to cook with my grandmother. ', ' On the weekends we all play board games together. ', ' We laugh and always have a good time. ', ' I love my family very much. ']


RuntimeError: Expected all tensors to be on the same device, but found at least two devices, cuda:0 and cpu! (when checking argument for argument index in method wrapper_CUDA__index_select)