In [2]:
%%capture
!pip install --upgrade datasets evaluate sacrebleu sentencepiece git+https://github.com/csebuetnlp/normalizer
!pip install transformers

In [3]:
import pandas as pd
import torch
import unicodedata
from datasets import Dataset
from transformers import T5ForConditionalGeneration, T5Tokenizer, Seq2SeqTrainingArguments, Seq2SeqTrainer, DataCollatorForSeq2Seq, AutoTokenizer
import evaluate
import os
from typing import Tuple


In [4]:
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

In [5]:
# Preprocessing functions
def standardize_to_NFC(text_list):
    """Normalize the text to NFC form for consistent diacritic handling."""
    return [unicodedata.normalize('NFC', text) for text in text_list]

def filter_single_word_sentence(eng_sents, yor_sents):
    """Filter out sentences that are single words in either language."""
    eng_inds = set([i for i, sent in enumerate(eng_sents) if len(sent.split()) > 1])
    yor_inds = set([i for i, sent in enumerate(yor_sents) if len(sent.split()) > 1])
    common_inds = sorted(list(eng_inds & yor_inds))

    eng_filtered = [eng_sents[i] for i in common_inds]
    yor_filtered = [yor_sents[i] for i in common_inds]

    return eng_filtered, yor_filtered

In [6]:
def load(input_dir: str):
    """Load data from CSV, normalize and filter it."""
    # Example dataset paths
    train_file = os.path.join(input_dir, 'train.tsv')  # Adjust the path to your training data
    val_file = os.path.join(input_dir, 'dev.tsv')      # Adjust the path to your validation data
    test_file = os.path.join(input_dir, 'test.tsv')

    # Load the training data
    train_df = pd.read_csv(train_file, sep='\t', names=['English', 'Yoruba'])
    val_df = pd.read_csv(val_file, sep='\t', names=['English', 'Yoruba'])
    test_df = pd.read_csv(test_file, sep='\t', names=['English', 'Yoruba'])

    return train_df, val_df, test_df

def preprocess(train_df, val_df, test_df) -> Tuple[Dataset, Dataset, pd.DataFrame]:
    # Normalize Yorùbá sentences to NFC
    train_df['Yoruba'] = standardize_to_NFC(train_df['Yoruba'])
    val_df['Yoruba'] = standardize_to_NFC(val_df['Yoruba'])
    test_df['Yoruba'] = standardize_to_NFC(test_df['Yoruba'])  # Normalize test data

    # Filter out single-word sentences from training and validation data only
    train_en, train_yo = filter_single_word_sentence(train_df['English'], train_df['Yoruba'])
    val_en, val_yo = filter_single_word_sentence(val_df['English'], val_df['Yoruba'])

    # Create Hugging Face dataset from pandas DataFrame
    train_dataset = Dataset.from_pandas(pd.DataFrame({'en': train_en, 'yo': train_yo}))
    val_dataset = Dataset.from_pandas(pd.DataFrame({'en': val_en, 'yo': val_yo}))
    test_dataset = Dataset.from_pandas(test_df)  # No filtering for test data

    return train_dataset, val_dataset, test_dataset  # Return the unfiltered test dataset


In [9]:
input_dir = '/kaggle/input/english-to-yoruba/'
train_df, val_df, test_df = load(input_dir)

In [10]:
display(train_df.head(3))
display(val_df.head(3))
display(test_df.head(3))

Unnamed: 0,English,Yoruba
0,English,Yoruba
1,Unit 1: What is Creative Commons?,﻿Ìdá 1: Kín ni Creative Commons?
2,This work is licensed under a Creative Commons...,Iṣẹ́ yìí wà lábẹ́ àṣẹ Creative Commons Attribu...


Unnamed: 0,English,Yoruba
0,English,Yoruba
1,"We prepare the saddle, and the goat presents i...",A di gàárì sílẹ̀ ewúrẹ́ ń yọjú; ẹrù ìran rẹ̀ ni?
2,"You have been crowned a king, and yet you make...",A fi ọ́ jọba ò ń ṣàwúre o fẹ́ jẹ Ọlọ́run ni?


Unnamed: 0,English,Yoruba
0,English,Yoruba
1,Pending the time she would finally pack and go...,Títí di ìgbà tí ó máa fi kó ẹrù rẹ̀ lọ pátápát...
2,She knew how best she was going to take care o...,Ó mọ bí ó ṣe má a tọ́jú ara rẹ̀ àti Tinú.


In [11]:
display(train_df.head(3))
display(val_df.head(3))
display(test_df.head(3))

Unnamed: 0,English,Yoruba
0,English,Yoruba
1,Unit 1: What is Creative Commons?,﻿Ìdá 1: Kín ni Creative Commons?
2,This work is licensed under a Creative Commons...,Iṣẹ́ yìí wà lábẹ́ àṣẹ Creative Commons Attribu...


Unnamed: 0,English,Yoruba
0,English,Yoruba
1,"We prepare the saddle, and the goat presents i...",A di gàárì sílẹ̀ ewúrẹ́ ń yọjú; ẹrù ìran rẹ̀ ni?
2,"You have been crowned a king, and yet you make...",A fi ọ́ jọba ò ń ṣàwúre o fẹ́ jẹ Ọlọ́run ni?


Unnamed: 0,English,Yoruba
0,English,Yoruba
1,Pending the time she would finally pack and go...,Títí di ìgbà tí ó máa fi kó ẹrù rẹ̀ lọ pátápát...
2,She knew how best she was going to take care o...,Ó mọ bí ó ṣe má a tọ́jú ara rẹ̀ àti Tinú.


In [12]:
train_dataset, val_dataset, test_dataset = preprocess(train_df, val_df, test_df)

In [13]:
# Rename columns in the Hugging Face datasets
train_dataset = train_dataset.rename_column('en', 'input_text')
train_dataset = train_dataset.rename_column('yo', 'labels')

val_dataset = val_dataset.rename_column('en', 'input_text')
val_dataset = val_dataset.rename_column('yo', 'labels')

test_dataset = test_dataset.rename_column('English', 'input_text')
test_dataset = test_dataset.rename_column('Yoruba', 'labels')



In [14]:
from transformers import pipeline, VitsModel, AutoTokenizer, AutoModelForSeq2SeqLM
model_name = "Davlan/m2m100_418M-eng-yor-mt"

tokenizer = AutoTokenizer.from_pretrained(model_name)
model = AutoModelForSeq2SeqLM.from_pretrained(model_name)

tokenizer_config.json:   0%|          | 0.00/1.40k [00:00<?, ?B/s]

vocab.json:   0%|          | 0.00/3.71M [00:00<?, ?B/s]

sentencepiece.bpe.model:   0%|          | 0.00/2.42M [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/1.14k [00:00<?, ?B/s]



config.json:   0%|          | 0.00/964 [00:00<?, ?B/s]

pytorch_model.bin:   0%|          | 0.00/1.94G [00:00<?, ?B/s]

In [15]:
# model_name = 'google-t5/t5-base'
# tokenizer = AutoTokenizer.from_pretrained(model_name)
# model = T5ForConditionalGeneration.from_pretrained(model_name).to(device)

config.json:   0%|          | 0.00/1.21k [00:00<?, ?B/s]

spiece.model:   0%|          | 0.00/792k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/1.39M [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/892M [00:00<?, ?B/s]

generation_config.json:   0%|          | 0.00/147 [00:00<?, ?B/s]

In [16]:
from normalizer import normalize
from torch.utils.data import Dataset, DataLoader
class Seq2SeqDataset(Dataset):
    def __init__(self, data, tokenizer, max_length=128):
        self.input_text = pd.Series(data['input_text']).apply(normalize).tolist()
        self.labels = pd.Series(data['labels']).apply(normalize).tolist()
        self.tokenizer = tokenizer
        self.max_length = max_length

    def __len__(self):
        return len(self.input_text)

    def __getitem__(self, idx):
        input_text = self.input_text[idx]
        label_text = self.labels[idx]

        # Tokenize the input text
        input_encodings = self.tokenizer(
            input_text,
            truncation=True,
            padding='max_length',
            max_length=self.max_length,
            return_tensors='pt'
        )

        # Tokenize the label text to get its 'input_ids' and 'attention_mask'
        label_encodings = self.tokenizer(
            label_text,
            truncation=True,
            padding='max_length',
            max_length=self.max_length,
            return_tensors='pt'
        )

        return {
            'input_ids': input_encodings['input_ids'].squeeze(),
            'attention_mask': input_encodings['attention_mask'].squeeze(),
            'labels': label_encodings['input_ids'].squeeze(),
        }

In [17]:
# Modify the data collation process to handle PyTorch tensors correctly
class MyDataCollatorForSeq2Seq(DataCollatorForSeq2Seq):
    def __call__(self, features):
        batch = {}
        batch["input_ids"] = torch.stack([feature["input_ids"] for feature in features])
        batch["attention_mask"] = torch.stack([feature["attention_mask"] for feature in features])

        # Labels should be processed differently for PyTorch tensors
        if isinstance(features[0]["labels"], torch.Tensor):
            batch["labels"] = torch.stack([feature["labels"] for feature in features])
        else:
            # Convert the list of lists to a PyTorch tensor
            batch["labels"] = torch.tensor([feature["labels"] for feature in features])

        return batch

In [18]:
from transformers import Trainer, TrainingArguments

# Custom Trainer class to ensure tensors are contiguous during training
class CustomSeq2SeqTrainer(Trainer):
    def save_model(self, output_dir=None, **kwargs):
        """Override save_model to ensure all model tensors are contiguous before saving."""
        if output_dir is None:
            output_dir = self.args.output_dir
        for param in self.model.parameters():
            if not param.is_contiguous():
                param.data = param.contiguous()
        super().save_model(output_dir, **kwargs)

    def training_step(self, model, inputs):
        """Override training_step to ensure tensors are contiguous during gradient updates."""
        for param in model.parameters():
            if not param.is_contiguous():
                param.data = param.contiguous()

        return super().training_step(model, inputs)

In [19]:
# Create train , test and validation datasets
train_dataset = Seq2SeqDataset(train_dataset, tokenizer)
val_dataset = Seq2SeqDataset(val_dataset, tokenizer)
test_dataset = Seq2SeqDataset(test_dataset, tokenizer)
# validation_dataset = Seq2SeqDataset(validation_data, tokenizer)

# Create train , test and validation dataloaders
train_dataloader = DataLoader(train_dataset, batch_size=16, shuffle=True)  #batch_size=32
val_dataloader = DataLoader(val_dataset, batch_size=32) #batch_size=32
test_dataloader = DataLoader(test_dataset, batch_size=32) #batch_size=32
# validation_dataloader = DataLoader(validation_dataset, batch_size=32) #batch_size=32

In [20]:
# Create a custom optimizer using torch.optim.AdamW
custom_optimizer = torch.optim.AdamW(
    model.parameters(),
    lr=1e-3,
    eps=1e-8,
    weight_decay=0.01,
)

In [25]:
# Define the TrainingArguments for fine-tuning
training_args = TrainingArguments(
    output_dir='./results',
    num_train_epochs=5,
    per_device_train_batch_size=5,
    gradient_accumulation_steps=8,
    evaluation_strategy="steps",
    save_total_limit=0,
    eval_steps=50,
    save_steps=15000,
    learning_rate=1e-3,
    do_train=True,
    do_eval=True,
    remove_unused_columns=False,
    push_to_hub=False,
    report_to="none",
    load_best_model_at_end=False,
    lr_scheduler_type="cosine_with_restarts",
    warmup_steps=100,
    weight_decay=0.01,
    #logging_dir='D:\\Datasets\\Thesis Data Test',
    logging_steps=20,

)



In [26]:

# Create a data collator for sequence-to-sequence tasks
data_collator = MyDataCollatorForSeq2Seq(
    tokenizer=tokenizer2,
    model=model2,
    padding=False,
    max_length=80,
    label_pad_token_id=tokenizer2.pad_token_id,
)

In [27]:
# Create Trainer
trainer = CustomSeq2SeqTrainer(
    model=model,
    args=training_args,
    data_collator=data_collator,
    train_dataset=train_dataset,
    eval_dataset=val_dataset,
    optimizers=(custom_optimizer, None),
)

In [28]:
trainer.train()

  with torch.cuda.device(device), torch.cuda.stream(stream), autocast(enabled=autocast_enabled):


Step,Training Loss,Validation Loss
50,1.279,1.282183
100,1.0759,1.163745
150,0.9698,1.074869
200,0.8456,1.013761
250,0.7942,0.967038


KeyboardInterrupt: 

In [None]:
# Correct directory paths
model_output_dir = "/content/model/"
tokenizer_output_dir = "/content/model/"

# Save the model to the specified directory
model.save_pretrained(model_output_dir)

# Save the tokenizer to the specified directory
tokenizer.save_pretrained(tokenizer_output_dir)

print(f"Model saved to {model_output_dir}")
print(f"Tokenizer saved to {tokenizer_output_dir}")


In [29]:
from tqdm import tqdm
import torch

def evaluate_model(model, tokenizer, eval_dataloader, device, max_samples=5):
    model.eval()  # Set model to evaluation mode
    model.to(device)

    predictions = []
    references = []
    num_samples = 0  # Initialize a counter

    with torch.no_grad():
        for batch in tqdm(eval_dataloader, desc="Evaluating"):
            # Move batch to device
            input_ids = batch["input_ids"].to(device)
            attention_mask = batch["attention_mask"].to(device)

            # Generate translations
            outputs = model.generate(
                input_ids=input_ids,
                attention_mask=attention_mask,
                max_length=80,  # Adjust max_length according to your data
                num_beams=5,    # Beam search for better results
                early_stopping=True
            )

            # Decode predictions and references
            decoded_preds = tokenizer.batch_decode(outputs, skip_special_tokens=True)
            decoded_labels = tokenizer.batch_decode(batch["labels"], skip_special_tokens=True)

            # Calculate how many new samples to add
            remaining_samples = max_samples - num_samples

            # Store results, ensuring not to exceed the max_samples limit
            predictions.extend(decoded_preds[:remaining_samples])
            references.extend(decoded_labels[:remaining_samples])

            # Update the count of total samples collected
            num_samples += len(decoded_preds)

            # Stop when we have evaluated the specified number of samples
            if num_samples >= max_samples:
                break

    return predictions[:max_samples], references[:max_samples]



In [30]:
preds, references = evaluate_model(model, tokenizer, val_dataloader, device)


Evaluating:   0%|          | 0/107 [00:05<?, ?it/s]


In [31]:
preds

['A n gba, omo n gba; omo ni n gba omo ni?',
 'O ti jé omo e, o ti n gba; o ti jé Olórun?',
 'Bi a ba n ba n ba n ba Awa; bi a ba n ba n ba n ba n ba Awa; bi a o ba n ba n ba n ba n ba n ba n ba Awa, sugbón ba n ba Awa?',
 'A n gba, omo n gba; omo ni n gba.',
 'A ki i gbe agbe agbe agbe ki i gbe agbe agbe agbe.']

In [34]:
i = 4

test = val_dataset[i]
print('English: ', tokenizer.decode(test['input_ids']))
print('Correct yoruba: ', tokenizer.decode(test['labels']))
print('Predicted yoruba: ', preds[i])

English:  One does not share a farm boundary with a king without getting one's feet gashed by the king's hoe.</s><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad>
Correct yoruba:  A ki i ba oba pala ki okó oba ma sanni lésè.</s><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><

In [41]:
# Load the BLEU metric for evaluation using the new library
bleu_metric = evaluate.load("bleu")

# Tokenize predictions and references
tokenized_predictions = [pred.split() for pred in preds]  # Tokenize each prediction
tokenized_references = [ref for ref in references]  # Tokenize each reference and wrap it in a list

# Format predictions and references for BLEU metric calculation
bleu_metric.add_batch(
    predictions=preds,
    references=tokenized_references
)

# Calculate BLEU score
bleu_score = bleu_metric.compute()
print(f"BLEU Score: {bleu_score['bleu'] * 100:.2f}")


BLEU Score: 9.55


In [42]:
preds

['A n gba, omo n gba; omo ni n gba omo ni?',
 'O ti jé omo e, o ti n gba; o ti jé Olórun?',
 'Bi a ba n ba n ba n ba Awa; bi a ba n ba n ba n ba n ba Awa; bi a o ba n ba n ba n ba n ba n ba n ba Awa, sugbón ba n ba Awa?',
 'A n gba, omo n gba; omo ni n gba.',
 'A ki i gbe agbe agbe agbe ki i gbe agbe agbe agbe.']

In [43]:
print(preds)
print(tokenized_references)

['A n gba, omo n gba; omo ni n gba omo ni?', 'O ti jé omo e, o ti n gba; o ti jé Olórun?', 'Bi a ba n ba n ba n ba Awa; bi a ba n ba n ba n ba n ba Awa; bi a o ba n ba n ba n ba n ba n ba n ba Awa, sugbón ba n ba Awa?', 'A n gba, omo n gba; omo ni n gba.', 'A ki i gbe agbe agbe agbe ki i gbe agbe agbe agbe.']
['A di gaari silè ewuré n yoju; eru iran rè ni?', 'A fi ó joba o n sawure o fé je Olórun ni?', 'A fijo gba Awa; a fija gba Awa; bi a o ba jo, bi a o ba ja, bi a ba ti gba Awa, ko tan bi?', 'A gbe gaari omo ewuré n roju; ki i se eru aguntan.', 'A ki i ba oba pala ki okó oba ma sanni lésè.']


In [46]:
from tqdm import tqdm
import evaluate

def evaluate_model(model, tokenizer, eval_dataloader, device):
    model.eval()  # Set model to evaluation mode
    model.to(device)

    predictions = []
    references = []

    with torch.no_grad():
        for batch in tqdm(eval_dataloader, desc="Evaluating"):
            # Move batch to device
            input_ids = batch["input_ids"].to(device)
            attention_mask = batch["attention_mask"].to(device)

            # Generate translations
            outputs = model.generate(
                input_ids=input_ids,
                attention_mask=attention_mask,
                max_length=80,  # Adjust max_length according to your data
                num_beams=5,  # Beam search for better results
                early_stopping=True
            )

            # Decode predictions
            decoded_preds = tokenizer.batch_decode(outputs, skip_special_tokens=True)
            decoded_labels = tokenizer.batch_decode(batch["labels"], skip_special_tokens=True)

            # Store results
            predictions.extend(decoded_preds)
            references.extend(decoded_labels)

    return predictions, references

In [48]:
predictions, references =  evaluate_model(model, tokenizer, val_dataloader, device)

Evaluating: 100%|██████████| 107/107 [10:07<00:00,  5.68s/it]


In [49]:
# Load the BLEU metric for evaluation using the new library
bleu_metric = evaluate.load("bleu")

# Format predictions and references for BLEU metric calculation
bleu_metric.add_batch(
    predictions=predictions,
    references=references
)

# Calculate BLEU score
bleu_score = bleu_metric.compute()
print(f"BLEU Score: {bleu_score['bleu'] * 100:.2f}")


BLEU Score: 5.53


In [51]:
predictions, references =  evaluate_model(model, tokenizer, test_dataloader, device)

Evaluating: 100%|██████████| 208/208 [20:04<00:00,  5.79s/it]


In [50]:
# Display some sample results
for i in range(5):  # Display first 5 samples
    print(f"Input: {val_dataset.input_text[i]}")
    print(f"Prediction: {predictions[i]}")
    print(f"Reference: {references[i]}")
    print("-" * 30)


Input: We prepare the saddle, and the goat presents itself; is it a burden for the lineage of goats?
Prediction: A n gba, omo n gba; omo ni n gba omo ni?
Reference: A di gaari silè ewuré n yoju; eru iran rè ni?
------------------------------
Input: You have been crowned a king, and yet you make good-luck charms; would you be crowned God?
Prediction: O ti jé omo e, o ti n gba; o ti jé Olórun?
Reference: A fi ó joba o n sawure o fé je Olórun ni?
------------------------------
Input: By dancing we take possession of Awa; through fighting we take possession of Awa; if we neither dance nor fight, but take possession of Awa anyway, is the result not the same?
Prediction: Bi a ba n ba n ba n ba Awa; bi a ba n ba n ba n ba n ba Awa; bi a o ba n ba n ba n ba n ba n ba n ba Awa, sugbón ba n ba Awa?
Reference: A fijo gba Awa; a fija gba Awa; bi a o ba jo, bi a o ba ja, bi a ba ti gba Awa, ko tan bi?
------------------------------
Input: We lift a saddle and the goat (kin) scowls; it is no burden 

In [54]:
pd.Series(predictions).to_csv('first_submission.csv')