#### English to Swahili Translation Model

#### 1.0 Importing necessary libraries

In [35]:
import matplotlib.pyplot as plt
from datasets import load_dataset
from transformers import pipeline, AutoTokenizer, AutoModelForSeq2SeqLM, Seq2SeqTrainingArguments, Seq2SeqTrainer
import warnings
warnings.filterwarnings("ignore")

#### 1.1 loading the datasets

In [36]:
# Load the CSV dataset
df = load_dataset("csv", data_files="../dataset/ensw.csv")

#### 1.2 Viewing the dataset

In [37]:
print("Dataset object:\n\n", df)

Dataset object:

 DatasetDict({
    train: Dataset({
        features: ['English sentence', 'Swahili Translation'],
        num_rows: 209160
    })
})


#### 1.3 Viewing the information about the dataset

In [38]:
train_dataset = df['train']
print("Train dataset information:\n\n", train_dataset.features)

Train dataset information:

 {'English sentence': Value(dtype='string', id=None), 'Swahili Translation': Value(dtype='string', id=None)}


#### 1.4  Viewing the split dataset information


In [39]:
split_df = df['train'].train_test_split(train_size=0.9, seed=20)
print("\nSplit datasets:\n\n", split_df)


Split datasets:

 DatasetDict({
    train: Dataset({
        features: ['English sentence', 'Swahili Translation'],
        num_rows: 188244
    })
    test: Dataset({
        features: ['English sentence', 'Swahili Translation'],
        num_rows: 20916
    })
})



#### 1.5 Loading the tokenizer and model

In [40]:
model_checkpoint = "Helsinki-NLP/opus-mt-en-swc"
tokenizer = AutoTokenizer.from_pretrained(model_checkpoint, return_tensors="pt")
model = AutoModelForSeq2SeqLM.from_pretrained(model_checkpoint)

#### 1.5 Set the maximum sequence length and define the preprocessing function

In [41]:
max_length = 128

In [42]:
def preprocess_function(examples):
    inputs = str(examples['English sentence'])
    targets = str(examples['Swahili Translation'])
    
    model_inputs = tokenizer(
        inputs, text_target=targets, max_length=max_length, padding="max_length", truncation=True, return_tensors="pt"
    )
    model_inputs['decoder_input_ids'] = model_inputs['input_ids'].clone()

    return model_inputs

#### 1.6 Preprocess the training and validation sets

In [43]:
train_dataset = split_df['train'].map(
    preprocess_function, batched=True, num_proc=4, remove_columns=["English sentence", "Swahili Translation"]
)


In [44]:
eval_dataset = split_df['test'].map(
    preprocess_function, batched=True, remove_columns=["English sentence", "Swahili Translation"]
)

#### 1.7 Define the training arguments and create the trainer

In [45]:
training_args = Seq2SeqTrainingArguments(
    output_dir='../model/',
    predict_with_generate=True,
    evaluation_strategy="steps",
    eval_steps=1000,
    save_steps=1000,
    learning_rate=3e-5,
    per_device_train_batch_size=16,
    per_device_eval_batch_size=16,
    num_train_epochs=20,
    weight_decay=0.01,
    push_to_hub=False,
    logging_dir="../logs/",
    logging_steps=500,
)

In [46]:
trainer = Seq2SeqTrainer(
    model=model,
    args=training_args,
    train_dataset=train_dataset,
    eval_dataset=eval_dataset,
)

#### 1.8 Train the model

In [47]:
try:
    trainer.train()
except ValueError as e:
    print("Error during training:", e)

Step,Training Loss,Validation Loss


#### 1.9 Evaluate the model on the validation set

In [48]:
result = trainer.evaluate()
print(result)

{'eval_loss': 6.455812931060791, 'eval_runtime': 5.2599, 'eval_samples_per_second': 3.992, 'eval_steps_per_second': 0.38, 'epoch': 20.0}


#### 2.0 Export the trained model

In [49]:
model.save_pretrained("../model/")
tokenizer.save_pretrained("../model/")

('../model/tokenizer_config.json',
 '../model/special_tokens_map.json',
 '../model/vocab.json',
 '../model/source.spm',
 '../model/target.spm',
 '../model/added_tokens.json')

#### 2.1 Creating a pipeline for translation


In [50]:
translator = pipeline(
    "text2text-generation",
    model="../model/",
    tokenizer="../model/",
)

#### 2.2 Prompt the user to enter a sentence for translation


In [None]:
while True:
    text = input("Enter an English sentence for translation to Swahili (type 'exit' to quit): ")
    if text == "exit":
        break
    translated_text = translator(text, max_length=max_length, num_beams=5)[0]['generated_text']
    print(f"Translated text: {translated_text}")

Enter an English sentence for translation to Swahili (type 'exit' to quit):  i


Translated text: i


Enter an English sentence for translation to Swahili (type 'exit' to quit):  leave me alone


Translated text: Uniache peke yangu


Enter an English sentence for translation to Swahili (type 'exit' to quit):  ove you daughter


Translated text: kujua binti yako


Enter an English sentence for translation to Swahili (type 'exit' to quit):  love you daughter


Translated text: anakupenda binti yako


Enter an English sentence for translation to Swahili (type 'exit' to quit):  i love you daughter


Translated text: Upendo wako kwa binti yako


Enter an English sentence for translation to Swahili (type 'exit' to quit):  This command will create an environment.yml file 


Translated text: Amri hiyo itatokeza mfumo wa mazingira


Enter an English sentence for translation to Swahili (type 'exit' to quit):   Once upon a time in the heart of the African savanna, a hare and a hyena lived in the same neighborhood. They were neighbors, but they couldn't have been more different in nature


Translated text: Wakati mmoja katika maeneo ya savana barani Afrika, sungura mmoja na mtu mwingine mwenye ugonjwa wa akili waliishi katika eneo hilohilo, walikuwa majirani, lakini hawakuwa na tofauti yoyote katika mazingira ya asili


## THANK YOU ANY QUESTION