In [None]:
!pip install optuna

In [None]:
!pip install rouge_score

!pip install nltk
!pip install transformers
!pip install datasets

In [None]:
# -*- coding: utf-8 -*-
"""Tshiluba_English_Dataset.ipynb

Automatically generated by Colab.

Original file is located at
    https://colab.research.google.com/drive/105GRn0pQldcwPb_3C80TL0JXChjZq9h_
"""

import pandas as pd
from google.colab import drive
from transformers import MarianMTModel, MarianTokenizer, Trainer, TrainingArguments
from torch.utils.data import Dataset

# Mount Google Drive at /content/drive so that the CSV inputs read below and
# the model/tokenizer directories written at the end are reachable as paths
# under '/content/drive/My Drive'.
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


# Load and preprocess English data

In [None]:
# Load the English verses; the 'Verse Text' column holds one verse per row.
english_data = pd.read_csv('/content/drive/My Drive/English_Mathew_Data.csv')

# Split 'Verse Text' into the leading digits ('Number') and the rest ('Text').
# The \s* parts keep the whitespace that separates the number from the verse
# out of 'Text'; rows that do not start with digits yield NaN in both columns.
english_data[['Number', 'Text']] = english_data['Verse Text'].str.extract(r'^\s*(\d+)\s*(.*)')
english_data['Text'] = english_data['Text'].str.strip()

# Drop the original 'Verse Text' column now that it has been split
english_data = english_data.drop(columns=['Verse Text'])

# Load and preprocess Tshiluba data

In [None]:
# Read the Tshiluba CSV (decoded as Latin-1) and clean it in one chained
# expression instead of mutating the frame in place.
file_path = '/content/drive/My Drive/Tshiluba_Matthew_Data.csv'
tshiluba_data = (
    pd.read_csv(file_path, encoding="latin-1")
    # Discard the 'Unnamed: 2' column
    .drop(columns="Unnamed: 2")
    # NOTE(review): pattern and replacement are both empty strings here, so
    # this call leaves every value unchanged. A non-printable or mis-encoded
    # character may have been lost from the pattern -- confirm the intent.
    .replace('', '', regex=True)
)

# Combine Data

In [None]:
# Put the Tshiluba and English frames side by side (pd.concat with axis=1
# pairs rows by index label), then discard the 'Number' column.
parallel_data = pd.concat(
    [tshiluba_data, english_data],
    axis=1,
).drop(columns="Number")

# Persist the aligned corpus to Drive without the index column
parallel_data.to_csv("/content/drive/My Drive/Cleaned_Parallel_Data.csv", index=False)

# Import the necessary libraries

In [None]:
import numpy as np
import optuna
import pandas as pd
import torch
from datasets import Dataset
from datasets import load_metric
from transformers import Seq2SeqTrainer, Seq2SeqTrainingArguments, AutoModelForSeq2SeqLM, AutoTokenizer, TrainerCallback

# Define Functions

In [None]:
def preprocess_function(examples):
    """
    Tokenizes a batch of source/target sentence pairs for the translation task.

    Relies on the module-level `tokenizer`.

    Args:
        examples (dict): A batch with the keys 'Verse Text' (source sentences)
                         and 'Text' (target sentences).

    Returns:
        dict-like: The tokenizer output, containing
              - 'input_ids': Source token IDs, padded/truncated to 128 tokens.
              - 'attention_mask': Attention mask for the source tokens.
              - 'labels': Target token IDs, padded/truncated to 128 tokens,
                          with every padding position replaced by -100.
    """
    inputs = examples['Verse Text']
    targets = examples['Text']

    # Tokenize source and target in a single call; `text_target` routes the
    # targets through the target-side tokenization and stores the resulting
    # IDs under "labels" (this replaces the deprecated as_target_tokenizer()).
    model_inputs = tokenizer(
        inputs,
        text_target=targets,
        max_length=128,
        truncation=True,
        padding="max_length",
    )

    # Labels are padded to a fixed length, so without masking the loss would
    # also be computed on padding. -100 is the label value that is excluded
    # from the loss and that compute_metrics maps back to the pad token ID.
    pad_id = tokenizer.pad_token_id
    model_inputs["labels"] = [
        [(token_id if token_id != pad_id else -100) for token_id in label_ids]
        for label_ids in model_inputs["labels"]
    ]

    return model_inputs


In [None]:
def compute_metrics(pred):
    """
    Computes the BLEU score for the predictions.

    Relies on the module-level `tokenizer` and `bleu_metric`.

    Args:
        pred (EvalPrediction): An object exposing `predictions` and `label_ids`.
            `predictions` may be a tuple (its first element is used), a 3-D
            array of per-token vocabulary scores (reduced with argmax over the
            last axis), or a 2-D array of token IDs (used as-is).

    Returns:
        dict: A dictionary with the BLEU score for the predictions.
              The dictionary includes:
              - 'bleu': The BLEU score as a float.
    """
    predictions = pred.predictions
    # Only unwrap when the predictions really are a tuple; indexing a plain
    # array with [0] would silently select the first example instead.
    if isinstance(predictions, tuple):
        predictions = predictions[0]
    predictions = np.asarray(predictions)

    # Scores over the vocabulary -> most likely token per position;
    # anything else is treated as token IDs already.
    if predictions.ndim == 3:
        pred_ids = predictions.argmax(axis=-1)
    else:
        pred_ids = predictions

    # -100 is not a valid token ID, so map it to the pad token before decoding.
    # np.where builds new arrays, leaving pred.label_ids untouched for callers.
    pad_id = tokenizer.pad_token_id
    pred_ids = np.where(pred_ids == -100, pad_id, pred_ids)
    labels_ids = np.where(pred.label_ids == -100, pad_id, pred.label_ids)

    # Decode both sides to text
    pred_texts = tokenizer.batch_decode(pred_ids, skip_special_tokens=True)
    labels_texts = tokenizer.batch_decode(labels_ids, skip_special_tokens=True)

    # BLEU is computed on whitespace tokens: one hypothesis token list per
    # example and a single-reference list per example.
    hypotheses = [text.strip().split() for text in pred_texts]
    references = [[text.strip().split()] for text in labels_texts]
    bleu_score = bleu_metric.compute(predictions=hypotheses, references=references)

    return {"bleu": bleu_score["bleu"]}


In [None]:
def mock_objective(trial):
    """
    A mock objective function for Optuna optimization.

    No model is trained here: the score is a closed-form expression of the
    suggested hyperparameters plus a suggested 'noise' term.

    Args:
        trial (optuna.trial.Trial): An Optuna trial object used to suggest hyperparameters.

    Returns:
        float: (num_train_epochs * learning_rate)
               / (per_device_train_batch_size * gradient_accumulation_steps)
               - weight_decay + noise.
    """
    # Suggest hyperparameters for the trial. suggest_float(..., log=True)
    # replaces the deprecated suggest_loguniform; the parameter names are
    # unchanged, so study.best_params keeps the same keys.
    num_train_epochs = trial.suggest_int('num_train_epochs', 1, 10)
    batch_size = trial.suggest_categorical('per_device_train_batch_size', [1, 2])
    learning_rate = trial.suggest_float('learning_rate', 1e-6, 1e-4, log=True)
    gradient_accumulation_steps = trial.suggest_int('gradient_accumulation_steps', 1, 16)
    weight_decay = trial.suggest_float('weight_decay', 0.0, 0.3)

    # Compute a mock score based on the suggested hyperparameters
    mock_score = (
        (num_train_epochs * learning_rate) / (batch_size * gradient_accumulation_steps)
        - weight_decay
    )

    # 'noise' is itself a suggested parameter (suggest_float replaces the
    # deprecated suggest_uniform), so it is recorded in the trial's params.
    noise = trial.suggest_float('noise', 0.0, 0.1)
    mock_score += noise

    return mock_score


In [None]:
from transformers import TrainerCallback

class PrintLossCallback(TrainerCallback):
    """
    Trainer callback that echoes loss values to stdout whenever logs arrive.

    Subclasses the Hugging Face `transformers` `TrainerCallback` and overrides
    only `on_log`.
    """

    def on_log(self, args, state, control, logs=None, **kwargs):
        """
        Prints the 'loss' and/or 'eval_loss' entries of a log record.

        Args:
            args (TrainingArguments): The training arguments (unused).
            state (TrainerState): The current state of the trainer (unused).
            control (TrainerControl): The trainer control object (unused).
            logs (dict, optional): The log record; nothing is printed when it
                is None or empty, or when it holds neither key.
            **kwargs: Additional keyword arguments (ignored).
        """
        # Nothing to report for a missing or empty log record
        if not logs:
            return

        # The training loss is reported first, then the evaluation loss
        if 'loss' in logs:
            print(f"Training loss: {logs['loss']}")
        if 'eval_loss' in logs:
            print(f"Evaluation loss: {logs['eval_loss']}")


In [None]:
# Load the tokenizer of the pretrained multilingual-to-English checkpoint
# (the model itself is loaded later, right before training)
model_name = "Helsinki-NLP/opus-mt-mul-en"
tokenizer = AutoTokenizer.from_pretrained(model_name)

# Convert DataFrame to Dataset and tokenize every example
dataset = Dataset.from_pandas(parallel_data)
dataset = dataset.map(preprocess_function, batched=True)

# Split dataset into train and eval. The fixed seed makes the 90/10 split
# identical on every run, so evaluation numbers are comparable across runs.
dataset_dict = dataset.train_test_split(test_size=0.1, seed=42)
train_dataset = dataset_dict['train']
eval_dataset = dataset_dict['test']

# Load the BLEU metric used by compute_metrics
# NOTE(review): `load_metric` is deprecated in recent `datasets` releases in
# favour of the separate `evaluate` package -- confirm against the installed version.
bleu_metric = load_metric("bleu")




# Optimize hyperparameters using Optuna.
# mock_objective scores a closed-form expression rather than a training run,
# so the "best" parameters are only as meaningful as that formula.
# The seeded sampler makes the five trials, and therefore best_params,
# reproducible across runs.
study = optuna.create_study(
    direction='maximize',
    sampler=optuna.samplers.TPESampler(seed=42),
)
study.optimize(mock_objective, n_trials=5)

# Extract best hyperparameters
best_params = study.best_params
print(f"Best hyperparameters: {best_params}")


# Load model for final training
final_model = AutoModelForSeq2SeqLM.from_pretrained(model_name)
final_model.gradient_checkpointing_enable()

# Set up final training arguments from the selected hyperparameters.
# The 'noise' entry of best_params is deliberately not used here.
final_training_args = Seq2SeqTrainingArguments(
    output_dir='./final_model',
    num_train_epochs=best_params['num_train_epochs'],
    per_device_train_batch_size=best_params['per_device_train_batch_size'],
    # Evaluation reuses the training batch size
    per_device_eval_batch_size=best_params['per_device_train_batch_size'],
    learning_rate=best_params['learning_rate'],
    # NOTE(review): newer transformers releases rename this argument to
    # `eval_strategy` -- confirm against the installed version.
    evaluation_strategy="epoch",
    save_strategy="epoch",
    logging_dir='./final_logs',
    logging_steps=10,
    load_best_model_at_end=True,
    gradient_accumulation_steps=best_params.get('gradient_accumulation_steps', 1),
    # Mixed precision needs a CUDA device; fall back to full precision on a
    # CPU-only runtime instead of failing when the arguments are built.
    fp16=torch.cuda.is_available(),
    save_total_limit=1,
    weight_decay=best_params.get('weight_decay', 0.0),
    report_to="none"  # Disable external logging integrations; rely on print output
)

# Destinations on Google Drive for the trained model and its tokenizer
model_save_path = '/content/drive/My Drive/New_best_model'
tokenizer_save_path = '/content/drive/My Drive/New_best_tokenizer'

# Assemble the trainer: model, arguments, both dataset splits, the tokenizer,
# the BLEU metric function and the loss-printing callback
final_trainer = Seq2SeqTrainer(
    model=final_model,
    args=final_training_args,
    train_dataset=train_dataset,
    eval_dataset=eval_dataset,
    tokenizer=tokenizer,
    compute_metrics=compute_metrics,
    callbacks=[PrintLossCallback()],
)

# Run training with the selected hyperparameters
final_trainer.train()

# Write the model and the tokenizer to their separate Drive directories
final_model.save_pretrained(model_save_path)
tokenizer.save_pretrained(tokenizer_save_path)




Map:   0%|          | 0/1071 [00:00<?, ? examples/s]

[I 2024-08-05 22:15:22,365] A new study created in memory with name: no-name-5ea9bb4a-ee62-4407-b57e-916848fae498
  learning_rate = trial.suggest_loguniform('learning_rate', 1e-6, 1e-4)
  noise = trial.suggest_uniform('noise', 0.0, 0.1)
[I 2024-08-05 22:15:22,369] Trial 0 finished with value: -0.19699082287325884 and parameters: {'num_train_epochs': 5, 'per_device_train_batch_size': 1, 'learning_rate': 3.6393434907557593e-06, 'gradient_accumulation_steps': 13, 'weight_decay': 0.29643952184597544, 'noise': 0.09944729922522014}. Best is trial 0 with value: -0.19699082287325884.
[I 2024-08-05 22:15:22,371] Trial 1 finished with value: 0.04279440304569726 and parameters: {'num_train_epochs': 6, 'per_device_train_batch_size': 1, 'learning_rate': 4.3410134215112175e-05, 'gradient_accumulation_steps': 16, 'weight_decay': 0.002846863409696565, 'noise': 0.045624987655063155}. Best is trial 1 with value: 0.04279440304569726.
[I 2024-08-05 22:15:22,374] Trial 2 finished with value: -0.04603045212

Best hyperparameters: {'num_train_epochs': 6, 'per_device_train_batch_size': 1, 'learning_rate': 4.3410134215112175e-05, 'gradient_accumulation_steps': 16, 'weight_decay': 0.002846863409696565, 'noise': 0.045624987655063155}




Epoch,Training Loss,Validation Loss,Bleu
0,0.2329,0.132436,0.636043
1,0.1572,0.110929,0.679657
2,0.0975,0.102456,0.71455
3,0.0757,0.099765,0.717662
4,0.065,0.098521,0.719461
5,0.057,0.097801,0.719773


Training loss: 1.7224
Training loss: 0.4735
Training loss: 0.3315
Training loss: 0.2498
Training loss: 0.2352
Training loss: 0.2329
Evaluation loss: 0.13243593275547028


Non-default generation parameters: {'max_length': 512, 'num_beams': 6, 'bad_words_ids': [[64171]], 'forced_eos_token_id': 0}


Training loss: 0.1654
Training loss: 0.1606
Training loss: 0.1502
Training loss: 0.1592
Training loss: 0.1503
Training loss: 0.1572
Evaluation loss: 0.11092866212129593


Non-default generation parameters: {'max_length': 512, 'num_beams': 6, 'bad_words_ids': [[64171]], 'forced_eos_token_id': 0}


Training loss: 0.1126
Training loss: 0.0932
Training loss: 0.1052
Training loss: 0.1096
Training loss: 0.105
Training loss: 0.0975
Evaluation loss: 0.10245589911937714


Non-default generation parameters: {'max_length': 512, 'num_beams': 6, 'bad_words_ids': [[64171]], 'forced_eos_token_id': 0}


Training loss: 0.0798
Training loss: 0.0816
Training loss: 0.0823
Training loss: 0.0794
Training loss: 0.0752
Training loss: 0.0757
Evaluation loss: 0.09976517409086227


Non-default generation parameters: {'max_length': 512, 'num_beams': 6, 'bad_words_ids': [[64171]], 'forced_eos_token_id': 0}


Training loss: 0.0604
Training loss: 0.0626
Training loss: 0.0683
Training loss: 0.064
Training loss: 0.0701
Training loss: 0.065
Evaluation loss: 0.09852129966020584


Non-default generation parameters: {'max_length': 512, 'num_beams': 6, 'bad_words_ids': [[64171]], 'forced_eos_token_id': 0}


Training loss: 0.0557
Training loss: 0.0608
Training loss: 0.0554
Training loss: 0.0534
Training loss: 0.0617


Non-default generation parameters: {'max_length': 512, 'num_beams': 6, 'bad_words_ids': [[64171]], 'forced_eos_token_id': 0}


Training loss: 0.057
Evaluation loss: 0.09780053794384003


Non-default generation parameters: {'max_length': 512, 'num_beams': 6, 'bad_words_ids': [[64171]], 'forced_eos_token_id': 0}
There were missing keys in the checkpoint model loaded: ['model.encoder.embed_tokens.weight', 'model.encoder.embed_positions.weight', 'model.decoder.embed_tokens.weight', 'model.decoder.embed_positions.weight', 'lm_head.weight'].
Non-default generation parameters: {'max_length': 512, 'num_beams': 6, 'bad_words_ids': [[64171]], 'forced_eos_token_id': 0}


('/content/drive/My Drive/New_best_tokenizer/tokenizer_config.json',
 '/content/drive/My Drive/New_best_tokenizer/special_tokens_map.json',
 '/content/drive/My Drive/New_best_tokenizer/vocab.json',
 '/content/drive/My Drive/New_best_tokenizer/source.spm',
 '/content/drive/My Drive/New_best_tokenizer/target.spm',
 '/content/drive/My Drive/New_best_tokenizer/added_tokens.json')