#### English to Swahili Translation Model

#### 1.0 Importing necessary libraries

In [52]:
import matplotlib.pyplot as plt
import seaborn as sns
from datasets import load_dataset
from transformers import pipeline, AutoTokenizer, AutoModelForSeq2SeqLM, Seq2SeqTrainingArguments, Seq2SeqTrainer
import warnings
warnings.filterwarnings("ignore")

#### 1.1 Loading the dataset

In [53]:
# Read the English/Swahili sentence pairs from the local CSV file; the
# generic "csv" builder places every row in a single "train" split.
df = load_dataset(
    "csv",
    data_files="../dataset/ensw.csv",
)

Downloading data files:   0%|          | 0/1 [00:00<?, ?it/s]

Extracting data files:   0%|          | 0/1 [00:00<?, ?it/s]

Generating train split: 0 examples [00:00, ? examples/s]

#### 1.2 Viewing the dataset

In [54]:
# Show the loaded DatasetDict summary (splits, column names, row counts).
print("Dataset object:\n\n", df)

Dataset object:

 DatasetDict({
    train: Dataset({
        features: ['English sentence', 'Swahili Translation'],
        num_rows: 209160
    })
})


#### 1.3 Viewing the information about the dataset

In [55]:
# Pull out the "train" split and display the name and dtype of each column.
train_dataset = df["train"]
print("Train dataset information:\n\n", train_dataset.features)

Train dataset information:

 {'English sentence': Value(dtype='string', id=None), 'Swahili Translation': Value(dtype='string', id=None)}


#### 1.4 Splitting the dataset into train and test sets


In [56]:
# Hold out 10% of the rows for evaluation; the fixed seed keeps the split
# identical across re-runs.
split_df = df["train"].train_test_split(
    train_size=0.9,
    seed=20,
)
print("\nSplit datasets:\n\n", split_df)


Split datasets:

 DatasetDict({
    train: Dataset({
        features: ['English sentence', 'Swahili Translation'],
        num_rows: 188244
    })
    test: Dataset({
        features: ['English sentence', 'Swahili Translation'],
        num_rows: 20916
    })
})



#### 1.5 Loading the tokenizer and model

In [57]:
# Pretrained translation checkpoint from the Hugging Face Hub that is
# fine-tuned in the cells below.
model_checkpoint = "Helsinki-NLP/opus-mt-en-swc"
# `return_tensors` is an option of the tokenizer *call* (encoding time), not of
# `from_pretrained`, so it is not passed when loading.
tokenizer = AutoTokenizer.from_pretrained(model_checkpoint)
model = AutoModelForSeq2SeqLM.from_pretrained(model_checkpoint)

#### 1.5.1 Set the maximum sequence length and define the preprocessing function

In [58]:
# Token limit passed as `max_length` to the tokenizer in preprocess_function
# and to the translator call at inference time.
max_length = 128

In [59]:
def preprocess_function(examples):
    """Tokenize a batch of English/Swahili sentence pairs for seq2seq fine-tuning.

    Intended for ``Dataset.map(..., batched=True)``, where every column of
    ``examples`` is a list holding one value per row.

    Args:
        examples: Mapping with the columns 'English sentence' and
            'Swahili Translation', each a list of raw cell values.

    Returns:
        The tokenizer's encoding with one entry per input row. Source
        sentences are encoded as the model inputs and target sentences as
        ``labels``; padding positions inside ``labels`` are replaced by -100.
    """
    # Convert row by row. Calling str() on the whole column would turn the
    # list of sentences into a single string and collapse the batch into one
    # example. Missing cells (None) become empty strings rather than "None".
    inputs = ["" if text is None else str(text) for text in examples['English sentence']]
    targets = ["" if text is None else str(text) for text in examples['Swahili Translation']]

    # No return_tensors here: the mapped dataset stores plain lists, and
    # tensors are created later when batches are collated.
    model_inputs = tokenizer(
        inputs, text_target=targets, max_length=max_length, padding="max_length", truncation=True
    )

    # Mask padding in the labels with -100 so padded positions are excluded
    # from the cross-entropy loss.
    pad_id = tokenizer.pad_token_id
    model_inputs['labels'] = [
        [token if token != pad_id else -100 for token in label_ids]
        for label_ids in model_inputs['labels']
    ]

    # decoder_input_ids are deliberately not set: copying the *source*
    # input_ids there would feed English tokens to the decoder. The model is
    # expected to build its decoder inputs from `labels` during training.
    return model_inputs

#### 1.6 Preprocess the training and validation sets

In [60]:
# Tokenize the training split in batches across four worker processes; the
# raw text columns are dropped so only the tokenizer's fields remain.
train_dataset = split_df["train"].map(
    preprocess_function,
    batched=True,
    num_proc=4,
    remove_columns=["English sentence", "Swahili Translation"],
)


Map (num_proc=4):   0%|          | 0/188244 [00:00<?, ? examples/s]

In [61]:
# Tokenize the held-out split in batches (single process) and drop the raw
# text columns.
eval_dataset = split_df["test"].map(
    preprocess_function,
    batched=True,
    remove_columns=["English sentence", "Swahili Translation"],
)

Map:   0%|          | 0/20916 [00:00<?, ? examples/s]

#### 1.7 Define the training arguments and create the trainer

In [65]:
training_args = Seq2SeqTrainingArguments(
    # Where checkpoints and logs are written.
    output_dir="./model/",
    logging_dir="./logs/",
    push_to_hub=False,
    # Optimisation settings.
    num_train_epochs=10,
    learning_rate=3e-5,
    weight_decay=0.01,
    per_device_train_batch_size=16,
    per_device_eval_batch_size=16,
    # Evaluate and checkpoint every 1000 steps, log every 500 steps.
    evaluation_strategy="steps",
    eval_steps=1000,
    save_steps=1000,
    logging_steps=500,
    predict_with_generate=True,
)

In [66]:
# Bundle the model, its hyper-parameters and both tokenized splits into a
# single trainer object.
trainer = Seq2SeqTrainer(
    args=training_args,
    model=model,
    eval_dataset=eval_dataset,
    train_dataset=train_dataset,
)

#### 1.8 Train the model

In [None]:
# Run fine-tuning. A ValueError is reported and then re-raised so that a
# failed run stops "Run All" here, instead of letting the following cells
# evaluate and export a model that was never trained.
try:
    trainer.train()
except ValueError as e:
    print("Error during training:", e)
    raise

Step,Training Loss,Validation Loss


#### 1.9 Evaluate the model on the validation set

In [None]:
# Run evaluation on the eval_dataset given to the trainer and print the
# metrics it returns.
result = trainer.evaluate()
print(result)

#### 2.0 Export the trained model

In [None]:
# Write the model and the tokenizer to the same directory, which the
# translation pipeline below loads both from.
model.save_pretrained("./model/")
tokenizer.save_pretrained("./model/")

In [None]:
# Reload the exported model from disk to confirm the saved files are loadable.
# This replaces a leftover Keras-style `Model([...], ...)` call whose names
# (`Model`, `encoder_input`, `decoder_input`, `decoder_output`) are not defined
# or imported anywhere in the visible notebook, so it raised NameError.
model = AutoModelForSeq2SeqLM.from_pretrained("./model/")

#### 2.1 Creating a pipeline for translation


In [None]:
# Build an inference pipeline that loads both the model and the tokenizer
# from the export directory.
translator = pipeline("text2text-generation", model="./model/", tokenizer="./model/")

#### 2.2 Prompt the user to enter a sentence for translation


In [None]:
# Interactive loop: read a sentence, translate it, print the result.
while True:
    # Strip surrounding whitespace so inputs such as " exit " are recognised.
    text = input("Enter an English sentence for translation to Swahili (type 'exit' to quit): ").strip()
    if text.lower() == "exit":
        break
    if not text:
        # Nothing to translate; prompt again instead of sending an empty
        # string to the model.
        continue
    # Beam search with 5 beams; the output length is capped at max_length.
    translated_text = translator(text, max_length=max_length, num_beams=5)[0]['generated_text']
    print(f"Translated text: {translated_text}")