#### English to Swahili Translation Model

#### 1.0 Importing necessary libraries

In [1]:
import matplotlib.pyplot as plt
from datasets import load_dataset
from transformers import pipeline, AutoTokenizer, AutoModelForSeq2SeqLM, Seq2SeqTrainingArguments, Seq2SeqTrainer
import warnings
warnings.filterwarnings("ignore")

2023-11-02 13:47:31.966244: I tensorflow/core/platform/cpu_feature_guard.cc:182] This TensorFlow binary is optimized to use available CPU instructions in performance-critical operations.
To enable the following instructions: SSE4.1 SSE4.2 AVX AVX2 FMA, in other operations, rebuild TensorFlow with the appropriate compiler flags.


#### 1.1 loading the datasets

In [2]:
# Load the CSV dataset
df = load_dataset("csv", data_files="../dataset/ensw.csv")

Downloading data files:   0%|          | 0/1 [00:00<?, ?it/s]

Extracting data files:   0%|          | 0/1 [00:00<?, ?it/s]

Generating train split: 0 examples [00:00, ? examples/s]

#### 1.2 Viewing the dataset

In [3]:
print("Dataset object:\n\n", df)

Dataset object:

 DatasetDict({
    train: Dataset({
        features: ['English sentence', 'Swahili Translation'],
        num_rows: 209155
    })
})


#### 1.3 Viewing the information about the dataset

In [4]:
train_dataset = df['train']
print("Train dataset information:\n\n", train_dataset.features)

Train dataset information:

 {'English sentence': Value(dtype='string', id=None), 'Swahili Translation': Value(dtype='string', id=None)}


#### 1.4  Viewing the split dataset information


In [5]:
split_df = df['train'].train_test_split(train_size=0.9, seed=20)
print("\nSplit datasets:\n\n", split_df)


Split datasets:

 DatasetDict({
    train: Dataset({
        features: ['English sentence', 'Swahili Translation'],
        num_rows: 188239
    })
    test: Dataset({
        features: ['English sentence', 'Swahili Translation'],
        num_rows: 20916
    })
})



#### 1.5 Loading the tokenizer and model

In [6]:
model_checkpoint = "Helsinki-NLP/opus-mt-en-swc"
tokenizer = AutoTokenizer.from_pretrained(model_checkpoint, return_tensors="pt")
model = AutoModelForSeq2SeqLM.from_pretrained(model_checkpoint)

#### 1.5 Set the maximum sequence length and define the preprocessing function

In [7]:
max_length = 128

In [8]:
def preprocess_function(examples):
    inputs = str(examples['English sentence'])
    targets = str(examples['Swahili Translation'])
    
    model_inputs = tokenizer(
        inputs, text_target=targets, max_length=max_length, padding="max_length", truncation=True, return_tensors="pt"
    )
    model_inputs['decoder_input_ids'] = model_inputs['input_ids'].clone()

    return model_inputs

#### 1.6 Preprocess the training and validation sets

In [9]:
train_dataset = split_df['train'].map(
    preprocess_function, batched=True, num_proc=4, remove_columns=["English sentence", "Swahili Translation"]
)


Map (num_proc=4):   0%|          | 0/188239 [00:00<?, ? examples/s]

In [10]:
eval_dataset = split_df['test'].map(
    preprocess_function, batched=True, remove_columns=["English sentence", "Swahili Translation"]
)

Map:   0%|          | 0/20916 [00:00<?, ? examples/s]

#### 1.7 Define the training arguments and create the trainer

In [11]:
training_args = Seq2SeqTrainingArguments(
    output_dir='../model/',
    predict_with_generate=True,
    evaluation_strategy="steps",
    eval_steps=1000,
    save_steps=1000,
    learning_rate=3e-5,
    per_device_train_batch_size=16,
    per_device_eval_batch_size=16,
    num_train_epochs=20,
    weight_decay=0.01,
    push_to_hub=False,
    logging_dir="../logs/",
    logging_steps=500,
)

In [12]:
trainer = Seq2SeqTrainer(
    model=model,
    args=training_args,
    train_dataset=train_dataset,
    eval_dataset=eval_dataset,
)

#### 1.8 Train the model

In [13]:
try:
    trainer.train()
except ValueError as e:
    print("Error during training:", e)

Step,Training Loss,Validation Loss


#### 1.9 Evaluate the model on the validation set

In [14]:
result = trainer.evaluate()
print(result)

{'eval_loss': 6.145776748657227, 'eval_runtime': 6.9935, 'eval_samples_per_second': 3.003, 'eval_steps_per_second': 0.286, 'epoch': 20.0}


#### 2.0 Export the trained model

In [15]:
model.save_pretrained("../model/")
tokenizer.save_pretrained("../model/")

('../model/tokenizer_config.json',
 '../model/special_tokens_map.json',
 '../model/vocab.json',
 '../model/source.spm',
 '../model/target.spm',
 '../model/added_tokens.json')

#### 2.1 Creating a pipeline for translation


In [16]:
translator = pipeline(
    "text2text-generation",
    model="../model/",
    tokenizer="../model/",
)

#### 2.2 Prompt the user to enter a sentence for translation


In [None]:
while True:
    text = input("Enter an English sentence for translation to Swahili (type 'exit' to quit): ")
    if text == "exit":
        break
    translated_text = translator(text, max_length=max_length, num_beams=5)[0]['generated_text']
    print(f"Translated text: {translated_text}")

Enter an English sentence for translation to Swahili (type 'exit' to quit):  baby


Translated text: Mtoto


Enter an English sentence for translation to Swahili (type 'exit' to quit):  Human immunodeficiency virus (HIV) is an infection that attacks the bodyâ€™s immune system. Acquired immunodeficiency syndrome (AIDS) is the most advanced stage of the disease.


Translated text: Virusi vya mfumo wa kinga wa binadamu (ADDS) ni virusi vinavyoshambulia mfumo wa kinga wa mwili.


Enter an English sentence for translation to Swahili (type 'exit' to quit):  ONLY candidates with the above documents will be interviewed. Candidates who present false documents or information will be disqualified.


Translated text: WATU wanaotaka kujua kusoma hati hizo watahojiwa.


Enter an English sentence for translation to Swahili (type 'exit' to quit):  Candidates who present false documents or information will be disqualified


Translated text: Watendaji wanaoandika hati au habari za uwongo wataharibiwa


Enter an English sentence for translation to Swahili (type 'exit' to quit):  Lily, Hopper, and Gus spent their days exploring the enchanting woods, discovering hidden glades, and sharing secrets only they could understand. Hopper would guide them through the thick underbrush with his swift leaps, while Gus would nibble on the sweetest grass, offering bites to his friends


Translated text: Kwa kawaida, Hopper, na Gus walitumia siku nyingi wakichunguza misitu yenye kuvutia, kugundua maua yaliyofichwa, na kueleza siri ambazo wangeweza kuelewa tu, na hivyo kuwaongoza kutokana na kelele zake za kuruka kwa kasi, huku Gus akiuma kwenye nyasi tamu zaidi na kuwauma rafiki zake


## THANK YOU

In [23]:
from transformers import pipeline, AutoTokenizer, AutoModelForSeq2SeqLM

# translator pipeline for english to swahili translations
eng_swa_model_checkpoint = "Helsinki-NLP/opus-mt-en-swc"
eng_swa_tokenizer = AutoTokenizer.from_pretrained("../model/eng_swa_model/")
eng_swa_model = AutoModelForSeq2SeqLM.from_pretrained("../model/eng_swa_model/")

eng_swa_translator = pipeline(
    "text2text-generation",
    model=eng_swa_model,
    tokenizer=eng_swa_tokenizer,
)

def translate_text_eng_swa(text):
    translated_text = eng_swa_translator(text, max_length=128, num_beams=5)[0]['generated_text']
    return translated_text

# translator pipeline for swahili to english translations
swa_eng_model_checkpoint = "Helsinki-NLP/opus-mt-swc-en"
swa_eng_tokenizer = AutoTokenizer.from_pretrained("../model/swa_eng_model/")
swa_eng_model = AutoModelForSeq2SeqLM.from_pretrained("../model/swa_eng_model/")

swa_eng_translator = pipeline(
    "text2text-generation",
    model=swa_eng_model,
    tokenizer=swa_eng_tokenizer,
)

def translate_text_swa_eng(text):
    translated_text = swa_eng_translator(text, max_length=128, num_beams=5)[0]['generated_text']
    return translated_text


  from .autonotebook import tqdm as notebook_tqdm
2023-12-17 17:14:08.670451: E external/local_xla/xla/stream_executor/cuda/cuda_dnn.cc:9261] Unable to register cuDNN factory: Attempting to register factory for plugin cuDNN when one has already been registered
2023-12-17 17:14:08.670720: E external/local_xla/xla/stream_executor/cuda/cuda_fft.cc:607] Unable to register cuFFT factory: Attempting to register factory for plugin cuFFT when one has already been registered
2023-12-17 17:14:08.772513: E external/local_xla/xla/stream_executor/cuda/cuda_blas.cc:1515] Unable to register cuBLAS factory: Attempting to register factory for plugin cuBLAS when one has already been registered
2023-12-17 17:14:09.070647: I tensorflow/core/platform/cpu_feature_guard.cc:182] This TensorFlow binary is optimized to use available CPU instructions in performance-critical operations.
To enable the following instructions: AVX2 FMA, in other operations, rebuild TensorFlow with the appropriate compiler flags.


In [22]:
import spacy
from spacy.language import Language
from spacy_langdetect import LanguageDetector


# Language detector
def get_lang_detector(nlp, name):
    return LanguageDetector()

# # Load the English language model
nlp = spacy.load("en_core_web_sm")

# # Register the language detection factory
Language.factory("language_detector", func=get_lang_detector)

# # Add the language detection component to the pipeline
nlp.add_pipe('language_detector', last=True)

# res="We went to the club for drinks. We were so drunk"


<spacy_langdetect.spacy_langdetect.LanguageDetector at 0x7f4fffcfffd0>

In [46]:

import csv
import os
import pandas as pd

def write_to_csv(user_input,translated_text, correct_translation):
    # Create a CSV file
    with open('../incorrect_translations/translations.csv', 'a', newline='') as file:
        # Create a CSV writer
        writer = csv.writer(file)

        # Check if the file is empty, i.e., it does not have a header row yet
        if os.path.getsize('translations.csv') == 0:
            # Write the column header to the file
            writer.writerow(['User Input', 'Translated Text', 'Correct Translation'])


        # Write the data to the CSV file
        writer.writerow([ user_input, translated_text, correct_translation])



# user_text="Once upon a time in the lush grasslands of East Africa, a wise old tortoise named Kiume lived a quiet, slow life. One sunny day, he stumbled upon a mischievous hare named Njeru. "
# user_text="Njeru, a master of deception, was known for tricking other animals into thinking he was a slower creature. This made him an outcast in the animal kingdom, and he often felt alone."
# user_text="Njeru eventually decided to join forces with Kiume. The two formed an unbreakable bond, with Njeru vowing to change his ways and start treating others with respect."
# user_text="Kiume, sensing Njeru's loneliness, offered his friendship. At first, Njeru hesitated, remembering the tricks he had played on others. But as time went by, he found himself growing fonder of Kiume."
user_text="You can access its rows, columns, and data by using standard indexing or by using column headers"
# language detector here

# language detection
doc=nlp(user_text)
detected_language = doc._.language['language']
print(f"Detected language: {detected_language}")   

if detected_language=="en":   
    translated_text=translate_text_eng_swa(user_text)
    print(translate_text_eng_swa(user_text))
elif detected_language=='sw':
    translated_text=translate_text_swa_eng(user_text)
    print(translate_text_swa_eng(user_text))





is_translation_correct = input("Is translation correct? ")
# is_translation_correct = bool(is_translation_correct)
if is_translation_correct.lower()=='yes': # If is_translation_correct is True
    print("correct translation")
    
elif  is_translation_correct.lower()=='no': # If is_translation_correct is False
    corrected_text = input("Enter the corrected text: ")
    
    print("correct translation " + corrected_text)
    write_to_csv(user_text, translated_text, corrected_text)



Detected language: en
Unaweza kupata safu zake, safu za nguzo, na habari kwa kuandika orodha ya kawaida au kwa kutumia safu za safu
correct translation


In [52]:
# # proprocess the saved csv file 
# def preprocess_csv(file):
#     import pandas as pd
#     df = pd.read_csv(file)
#     # remove rows with missing values (NaN or None)
#     df = df.dropna()
#     # df = df.
#     return df

# data = open('../incorrect_translations/translations.csv')
# preprocess_csv(data)    

In [51]:
import pandas as pd
df = pd.read_csv("../incorrect_translations/translations.csv")
# remove rows with missing values (NaN or None)
df = df.dropna()
df 


Unnamed: 0,User Input,Translated Text,Correct Translation
0,what was the original crime,Uhalifu wa awali ulisababishwa na nini?,uhalifu wa kwanza ulikuwa?
1,"Kiume, sensing Njeru's loneliness, offered his...","Kiume, ambaye alitambua upweke wa Njeru, alito...","Kiume ambaye alitambua upweke wa Njeru, alijit..."
2,"Kiume, sensing Njeru's loneliness, offered his...","Kiume, ambaye alitambua upweke wa Njeru, alito...","Kiume, ambaye alitambua upweke wa Njeru, aliji..."
3,Njeru eventually decided to join forces with K...,Mwishowe Njeru aliamua kujiunga na jeshi la Kiume,Mwishowe Njeru aliamua kujiunga na Kiume
4,Njeru eventually decided to join forces with K...,Mwishowe Njeru aliamua kujiunga na majeshi pam...,Mwishowe Njeru aliamua kujiunga pamoja na Kium...
