#### English to Swahili Translation Model

#### 1.0 Importing necessary libraries

In [1]:
!pip install transformers
!pip install datasets
!pip install sentencepiece
!pip install transformers[torch]
!pip install accelerate -U



In [2]:
import matplotlib.pyplot as plt
from datasets import load_dataset
from transformers import pipeline, AutoTokenizer, AutoModelForSeq2SeqLM, Seq2SeqTrainingArguments, Seq2SeqTrainer
import sentencepiece
import warnings
warnings.filterwarnings("ignore")

#### 1.1 loading the datasets

In [None]:
# Load the CSV dataset
df = load_dataset("csv", data_files="ensw_v2.csv")

Downloading data files:   0%|          | 0/1 [00:00<?, ?it/s]

Extracting data files:   0%|          | 0/1 [00:00<?, ?it/s]

Generating train split: 0 examples [00:00, ? examples/s]

#### 1.2 Viewing the dataset

In [None]:
print("Dataset object:\n\n", df)

Dataset object:

 DatasetDict({
    train: Dataset({
        features: ['English sentence', 'Swahili Translation', 'Unnamed: 2'],
        num_rows: 193521
    })
})


#### 1.3 Viewing the information about the dataset

In [None]:
train_dataset = df['train']
print("Train dataset information:\n\n", train_dataset.features)

Train dataset information:

 {'English sentence': Value(dtype='string', id=None), 'Swahili Translation': Value(dtype='string', id=None), 'Unnamed: 2': Value(dtype='float64', id=None)}


#### 1.4  Viewing the split dataset information


In [None]:
split_df = df['train'].train_test_split(train_size=0.9, seed=20)
print("\nSplit datasets:\n\n", split_df)


Split datasets:

 DatasetDict({
    train: Dataset({
        features: ['English sentence', 'Swahili Translation', 'Unnamed: 2'],
        num_rows: 174168
    })
    test: Dataset({
        features: ['English sentence', 'Swahili Translation', 'Unnamed: 2'],
        num_rows: 19353
    })
})



#### 1.5 Loading the tokenizer and model

In [None]:
model_checkpoint = "Helsinki-NLP/opus-mt-en-swc"
tokenizer = AutoTokenizer.from_pretrained(model_checkpoint, return_tensors="pt")
model = AutoModelForSeq2SeqLM.from_pretrained(model_checkpoint)

# 1.5 Set the maximum sequence length and define the preprocessing function

In [None]:
max_length = 128

In [None]:
def preprocess_function(examples):
    inputs = str(examples['English sentence'])
    targets = str(examples['Swahili Translation'])

    model_inputs = tokenizer(
        inputs, text_target=targets, max_length=max_length, padding="max_length", truncation=True, return_tensors="pt"
    )
    model_inputs['decoder_input_ids'] = model_inputs['input_ids'].clone()

    return model_inputs

#### 1.6 Preprocess the training and validation sets

In [None]:
# # loading the tokenizer and model
# model_checkpoint = "Helsinki-NLP/opus-mt-en-swc"
# tokenizer = AutoTokenizer.from_pretrained(model_checkpoint, return_tensors="pt")
# model = AutoModelForSeq2SeqLM.from_pretrained(model_checkpoint)
# max_length = 128
# def preprocess_function(examples):
#     inputs = str(examples['English sentence'])
#     targets = str(examples['Swahili Translation'])

#     model_inputs = tokenizer(
#         inputs, text_target=targets, max_length=max_length, padding="max_length", truncation=True, return_tensors="pt"
#     )
#     model_inputs['decoder_input_ids'] = model_inputs['input_ids'].clone()

#     return model_inputs
train_dataset = split_df['train'].map(
    preprocess_function, batched=True, num_proc=4, remove_columns=["English sentence", "Swahili Translation"]
)


In [None]:
eval_dataset = split_df['test'].map(
    preprocess_function, batched=True, remove_columns=["English sentence", "Swahili Translation"]
)

#### 1.7 Define the training arguments and create the trainer

In [None]:

training_args = Seq2SeqTrainingArguments(
    output_dir='../model/',
    predict_with_generate=True,
    evaluation_strategy="steps",
    eval_steps=1000,
    save_steps=1000,
    learning_rate=3e-5,
    per_device_train_batch_size=16,
    per_device_eval_batch_size=16,
    num_train_epochs=50,
    weight_decay=0.01,
    push_to_hub=False,
    logging_dir="../logs/",
    logging_steps=500,
)

In [None]:
trainer = Seq2SeqTrainer(
    model=model,
    args=training_args,
    train_dataset=train_dataset,
    eval_dataset=eval_dataset,
)

#### 1.8 Train the model

In [None]:
try:
    trainer.train()
except ValueError as e:
    print("Error during training:", e)

Step,Training Loss,Validation Loss


#### 1.9 Evaluate the model on the validation set

In [None]:
result = trainer.evaluate()
print(result)

{'eval_loss': 6.103726387023926, 'eval_runtime': 0.169, 'eval_samples_per_second': 124.252, 'eval_steps_per_second': 11.834, 'epoch': 30.0}


#### 2.0 Export the trained model

In [None]:
# saving the model as a pkl  file
import pickle
pickle_out = open("model.pkl", mode = "wb")
pickle.dump(model, pickle_out)
pickle_out.close()



In [None]:
model.save_pretrained("../model/")
tokenizer.save_pretrained("../model/")

('../model/tokenizer_config.json',
 '../model/special_tokens_map.json',
 '../model/vocab.json',
 '../model/source.spm',
 '../model/target.spm',
 '../model/added_tokens.json')

#### 2.1 Creating a pipeline for translation


In [None]:
translator = pipeline(
    "text2text-generation",
    model="../model/",
    tokenizer="../model/",
)

#### 2.2 Prompt the user to enter a sentence for translation


In [None]:
while True:
    text = input("Andika sentensi Unayotaka itafsiriwe kwa Kiingereza (andika 'exit' ndio utoke): ")
    if text == "exit":
        break
    translated_text = translator(text, max_length=max_length, num_beams=5)[0]['generated_text']
    print(f"Translated text: {translated_text}")

# To enforce The biderectional model,
we have to have two models, One that translates from English to Swahili and another which translates from Swahili to English
# 3.0 Load the swaeng_v2 Dataset

In [3]:
# Load the Swa-Eng CSV dataset
df = load_dataset("csv", data_files="sweng_v2.csv")

# 3.1 Viewing the dataset



In [4]:

print("Dataset object:\n\n", df)

Dataset object:

 DatasetDict({
    train: Dataset({
        features: ['Swahili sentence', 'English Translation'],
        num_rows: 209498
    })
})


In [5]:
# Viewing more info about the dataset

train_dataset = df['train']
print("Train dataset information:\n\n", train_dataset.features)

Train dataset information:

 {'Swahili sentence': Value(dtype='string', id=None), 'English Translation': Value(dtype='string', id=None)}


# 3.2 Viewing and Spliting the dataset

In [6]:
split_df = df['train'].train_test_split(train_size=0.9, seed=20)
print("\nSplit datasets:\n\n", split_df)


Split datasets:

 DatasetDict({
    train: Dataset({
        features: ['Swahili sentence', 'English Translation'],
        num_rows: 188548
    })
    test: Dataset({
        features: ['Swahili sentence', 'English Translation'],
        num_rows: 20950
    })
})


# 3.3 Loading Tokenizer and Model

In [7]:
model_checkpoint = "Helsinki-NLP/opus-mt-swc-en"
tokenizer = AutoTokenizer.from_pretrained(model_checkpoint, return_tensors="pt")
model = AutoModelForSeq2SeqLM.from_pretrained(model_checkpoint)

# 3.4 Set the maximum sequence length and define the preprocessing function

In [8]:
max_length = 128

In [9]:
def preprocess_function(examples):
    inputs = str(examples['English Translation'])
    targets = str(examples['Swahili sentence'])

    model_inputs = tokenizer(
        inputs, text_target=targets, max_length=max_length, padding="max_length", truncation=True, return_tensors="pt"
    )
    model_inputs['decoder_input_ids'] = model_inputs['input_ids'].clone()

    return model_inputs

# 3.5 Preprocess the training and validation sets

In [10]:
train_dataset = split_df['train'].map(
    preprocess_function, batched=True, num_proc=4, remove_columns=["Swahili sentence", "English Translation"]
)


In [11]:
eval_dataset = split_df['test'].map(
    preprocess_function, batched=True, remove_columns=["Swahili sentence", "English Translation"]
)

# 3.6 Define the training arguments and create the trainer

In [12]:

training_args = Seq2SeqTrainingArguments(
    output_dir='../model/',
    predict_with_generate=True,
    evaluation_strategy="steps",
    eval_steps=1000,
    save_steps=1000,
    learning_rate=3e-5,
    per_device_train_batch_size=16,
    per_device_eval_batch_size=16,
    num_train_epochs=20,
    weight_decay=0.01,
    push_to_hub=False,
    logging_dir="../logs/",
    logging_steps=500,
)

In [13]:
trainer = Seq2SeqTrainer(
    model=model,
    args=training_args,
    train_dataset=train_dataset,
    eval_dataset=eval_dataset,
)

# 3.7 Training the model

In [14]:
try:
    trainer.train()
except ValueError as e:
    print("Error during training:", e)

Step,Training Loss,Validation Loss


# 3.8 Model Evaluation

In [15]:
result = trainer.evaluate()
print(result)

{'eval_loss': 5.32317590713501, 'eval_runtime': 0.2101, 'eval_samples_per_second': 99.935, 'eval_steps_per_second': 9.518, 'epoch': 20.0}


In [16]:

# saving the model as a pkl  file
import pickle
pickle_out = open("swa_eng_model.pkl", mode = "wb")
pickle.dump(model, pickle_out)
pickle_out.close()



In [19]:
model.save_pretrained("./")
tokenizer.save_pretrained("./")

('./tokenizer_config.json',
 './special_tokens_map.json',
 './vocab.json',
 './source.spm',
 './target.spm',
 './added_tokens.json')

# 4.1 Creating pipeline for Translation

In [20]:
translator = pipeline(
    "text2text-generation",
    model="./",
    tokenizer="./",
)

# 4.2 Prompt the user to enter a sentence for translation





In [None]:
while True:
    text = input("Andika Sentensi yako itafsiriwe (andika 'exit' to quit): ")
    if text == "exit":
        break

    translated_text = translator(text, max_length=max_length, num_beams=5)[0]['generated_text']
    print(f"Translated text: {translated_text}")

Andika Sentensi yako itafsiriwe (andika 'exit' to quit): bwana mkubwa
Translated text: The master
Andika Sentensi yako itafsiriwe (andika 'exit' to quit): habari yako mzee?
Translated text: about your old age?
Andika Sentensi yako itafsiriwe (andika 'exit' to quit): chakula ki tayari
Translated text: food ready
Andika Sentensi yako itafsiriwe (andika 'exit' to quit): niliufanya mtihani 
Translated text: I made a test
Andika Sentensi yako itafsiriwe (andika 'exit' to quit): chajio
Translated text: mjiji
Andika Sentensi yako itafsiriwe (andika 'exit' to quit): mjini
Translated text: town
Andika Sentensi yako itafsiriwe (andika 'exit' to quit): mji
Translated text: town
Andika Sentensi yako itafsiriwe (andika 'exit' to quit): kijiji
Translated text: village
Andika Sentensi yako itafsiriwe (andika 'exit' to quit): hey boy
Translated text: hey boy
Andika Sentensi yako itafsiriwe (andika 'exit' to quit): wewe kijana
Translated text: You young
Andika Sentensi yako itafsiriwe (andika 'exit' to