In [1]:
# Packages that you will need to install
!pip install transformers
!pip install torch
!pip install sentencepiece
!pip install sacremoses



In [2]:
import pandas as pd
from transformers import MarianMTModel, MarianTokenizer
from torch.utils.data import DataLoader
import torch
from torch.utils.data import DataLoader, Dataset

  from .autonotebook import tqdm as notebook_tqdm


In [3]:
df = pd.read_csv('Luo-Eng.csv')
display(df)

Unnamed: 0,Luo Content,English Content
0,"Jaode niluongo ni Maaka,",And the name of his wife [was] Maacah.
1,Giil odiechiengʼ duto nikech nyingi; gidhialo ...,"In thy name shall they rejoice all the day, an..."
2,Anakech ngʼama adwaro kecho kendo anangʼwon-ne...,"Truly, I am the Oft-Forgiving, the Most Mercif..."
3,"Negie kaka luoro dwaro negogi, nikech Nyasaye ...","But they will be overwhelmed with dread, for G..."
4,kama gichokoree (Jonjore).,and gathered at one place.
...,...,...
2825276,Tachi waza kata dori dai ikkyo omote,baking a pie or loaf of bread
2825277,kore kuom weche modonjnego.,be set on the response.
2825278,giluwe ka giwuotho gi tiendegi.,by following in His footsteps.
2825279,Kik Me - Legs Pics Kik User Me Date Ing,don't let me down - the chainsmokers ft. daya


In [7]:
total_rows = len(df)
chunk_size = total_rows // 1400

chunks = [df.iloc[i * chunk_size:(i + 1) * chunk_size] for i in range(100)]

# Name of the chunk we are to use
working = chunks[0]
display(working)

Unnamed: 0,Luo Content,English Content
0,"Jaode niluongo ni Maaka,",And the name of his wife [was] Maacah.
1,Giil odiechiengʼ duto nikech nyingi; gidhialo ...,"In thy name shall they rejoice all the day, an..."
2,Anakech ngʼama adwaro kecho kendo anangʼwon-ne...,"Truly, I am the Oft-Forgiving, the Most Mercif..."
3,"Negie kaka luoro dwaro negogi, nikech Nyasaye ...","But they will be overwhelmed with dread, for G..."
4,kama gichokoree (Jonjore).,and gathered at one place.
...,...,...
2013,(rwako) e chunygi.,Inside their hearts.
2014,God Be Great and God Be Wise,God is so good and so wise.
2015,"Kadi bed gin mo keken otimme i kwowa, pud watw...",No matter what happens in our lives we can be ...
2016,Mano e chier mokwongo.,this is the First Resurrection.


In [8]:
tokenizer = MarianTokenizer.from_pretrained("Helsinki-NLP/opus-mt-luo-en")

# Tokenize the data with padding and truncation
tokenized_data = tokenizer(
    working['Luo Content'].tolist(),
    text_target=working['English Content'].tolist(),
    padding=True,
    truncation=True,
    return_tensors="pt"
)
display(tokenized_data)

{'input_ids': tensor([[19414,    16,  6288,  ..., 52235, 52235, 52235],
        [ 1701,  6071,   826,  ..., 52235, 52235, 52235],
        [39655,    38, 18178,  ..., 52235, 52235, 52235],
        ...,
        [  101,  9030,   362,  ..., 52235, 52235, 52235],
        [  223,     9,  1210,  ..., 52235, 52235, 52235],
        [  382, 19237,  8636,  ..., 52235, 52235, 52235]]), 'attention_mask': tensor([[1, 1, 1,  ..., 0, 0, 0],
        [1, 1, 1,  ..., 0, 0, 0],
        [1, 1, 1,  ..., 0, 0, 0],
        ...,
        [1, 1, 1,  ..., 0, 0, 0],
        [1, 1, 1,  ..., 0, 0, 0],
        [1, 1, 1,  ..., 0, 0, 0]]), 'labels': tensor([[  414,     5,   420,  ..., 52235, 52235, 52235],
        [  134,  6347,   650,  ..., 52235, 52235, 52235],
        [ 7400,     3,    47,  ..., 52235, 52235, 52235],
        ...,
        [  538,   965,    86,  ..., 52235, 52235, 52235],
        [  108,    29,     5,  ..., 52235, 52235, 52235],
        [   47,    56,    50,  ..., 52235, 52235, 52235]])}

In [9]:
class CustomDataset(Dataset):
    def __init__(self, tokenized_data):
        self.tokenized_data = tokenized_data

    def __len__(self):
        return len(self.tokenized_data["input_ids"])

    def __getitem__(self, idx):
        return {
            "input_ids": self.tokenized_data["input_ids"][idx],
            "attention_mask": self.tokenized_data["attention_mask"][idx],
            "labels": self.tokenized_data["labels"][idx],
        }

In [10]:
dataset = CustomDataset(tokenized_data)

model = MarianMTModel.from_pretrained("Helsinki-NLP/opus-mt-luo-en")

# Training parameters
learning_rate = 1e-4
optimizer = torch.optim.AdamW(model.parameters(), lr=learning_rate)
epochs = 5

# DataLoader for training
train_loader = DataLoader(dataset, batch_size=8, shuffle=True)

# Define the training loop
for epoch in range(epochs):
    model.train()
    total_loss = 0
    for batch in train_loader:
        optimizer.zero_grad()
        outputs = model(**batch)
        loss = outputs.loss
        loss.backward()
        optimizer.step()
        total_loss += loss.item()
    average_loss = total_loss / len(train_loader)
    print(f"Epoch {epoch+1}: Average Loss - {average_loss}")


Epoch 1: Average Loss - 1.0532041636144691
Epoch 2: Average Loss - 0.49017216606809216
Epoch 3: Average Loss - 0.3463274550885551
Epoch 4: Average Loss - 0.25121456811668375
Epoch 5: Average Loss - 0.18753307881091424


In [18]:
# Check the model's translation
input_sentence = "mokwongo"  
tokenized_input = tokenizer(input_sentence, return_tensors="pt")
with torch.no_grad():
    model.eval()
    output = model.generate(**tokenized_input, max_length=50)
translated_sentence = tokenizer.decode(output[0], skip_special_tokens=True)
print(f"Input Sentence (Luo): {input_sentence}")
print(f"Translated Sentence (English): {translated_sentence}")

Input Sentence (Luo): mokwongo
Translated Sentence (English): First of all,


In [19]:
import os

# Get the current working directory
current_directory = os.getcwd()

# Define the path to save the model
save_path = os.path.join(current_directory, "Luo2Eng-model")

# Save the trained model
model.save_pretrained(save_path)


Non-default generation parameters: {'max_length': 512, 'num_beams': 6, 'bad_words_ids': [[52235]], 'forced_eos_token_id': 0}
