#### French to Swahili Translation Model

In [1]:
import matplotlib.pyplot as plt
from datasets import load_dataset
from transformers import pipeline, AutoTokenizer, AutoModelForSeq2SeqLM, Seq2SeqTrainingArguments, Seq2SeqTrainer
import warnings
warnings.filterwarnings("ignore")

2024-01-20 09:37:38.694305: E external/local_xla/xla/stream_executor/cuda/cuda_dnn.cc:9261] Unable to register cuDNN factory: Attempting to register factory for plugin cuDNN when one has already been registered
2024-01-20 09:37:38.694514: E external/local_xla/xla/stream_executor/cuda/cuda_fft.cc:607] Unable to register cuFFT factory: Attempting to register factory for plugin cuFFT when one has already been registered
2024-01-20 09:37:38.790957: E external/local_xla/xla/stream_executor/cuda/cuda_blas.cc:1515] Unable to register cuBLAS factory: Attempting to register factory for plugin cuBLAS when one has already been registered
2024-01-20 09:37:39.033689: I tensorflow/core/platform/cpu_feature_guard.cc:182] This TensorFlow binary is optimized to use available CPU instructions in performance-critical operations.
To enable the following instructions: AVX2 FMA, in other operations, rebuild TensorFlow with the appropriate compiler flags.


#### 1.1 loading the datasets

In [2]:
# Load the CSV dataset
df = load_dataset("csv", data_files="../dataset/en-fr.csv")

#### 1.2 Viewing the dataset

In [3]:
print("Dataset object:\n\n", df)


Dataset object:

 DatasetDict({
    train: Dataset({
        features: ['en', 'fr'],
        num_rows: 22520376
    })
})


#### 1.3 Viewing the information about the dataset

In [4]:
train_dataset = df['train']
print("Train dataset information:\n\n", train_dataset.features)

Train dataset information:

 {'en': Value(dtype='string', id=None), 'fr': Value(dtype='string', id=None)}


#### 1.4  Viewing the split dataset information


In [5]:
split_df = df['train'].train_test_split(train_size=0.9, seed=20)
print("\nSplit datasets:\n\n", split_df)


Split datasets:

 DatasetDict({
    train: Dataset({
        features: ['en', 'fr'],
        num_rows: 20268338
    })
    test: Dataset({
        features: ['en', 'fr'],
        num_rows: 2252038
    })
})



#### 1.5 Loading the tokenizer and model

In [6]:
model_checkpoint = "Helsinki-NLP/opus-mt-en-fr"
tokenizer = AutoTokenizer.from_pretrained(model_checkpoint, return_tensors="pt")
model = AutoModelForSeq2SeqLM.from_pretrained(model_checkpoint)

#### 1.5 Set the maximum sequence length and define the preprocessing function

In [7]:
max_length = 128

In [8]:
def preprocess_function(examples):
    inputs = str(examples['en'])
    targets = str(examples['fr'])
    
    model_inputs = tokenizer(
        inputs, text_target=targets, max_length=max_length, padding="max_length", truncation=True, return_tensors="pt"
    )
    model_inputs['decoder_input_ids'] = model_inputs['input_ids'].clone()

    return model_inputs

#### 1.6 Preprocess the training and validation sets

In [9]:
train_dataset = split_df['train'].map(
    preprocess_function, batched=True, num_proc=4, remove_columns=["en", "fr"]
)


In [10]:
eval_dataset = split_df['test'].map(
    preprocess_function, batched=True, remove_columns=["en", "fr"]
)

#### 1.7 Define the training arguments and create the trainer

In [11]:
training_args = Seq2SeqTrainingArguments(
    output_dir='../en_fr_model/',
    predict_with_generate=True,
    evaluation_strategy="steps",
    eval_steps=1000,
    save_steps=1000,
    learning_rate=3e-5,
    per_device_train_batch_size=16,
    per_device_eval_batch_size=16,
    num_train_epochs=20,
    weight_decay=0.01,
    push_to_hub=False,
    logging_dir="../eng_fr_logs/",
    logging_steps=500,
)

In [12]:
trainer = Seq2SeqTrainer(
    model=model,
    args=training_args,
    train_dataset=train_dataset,
    eval_dataset=eval_dataset,
)

### 1.8 Train the Model

In [13]:
try:
    trainer.train()
except ValueError as e:
    print("Error during training:", e)

  0%|          | 0/25340 [00:00<?, ?it/s]

#### 1.9 Evaluate the model on the validation set

In [None]:
result = trainer.evaluate()
print(result)

#### 2.0 Export the trained model

In [None]:
model.save_pretrained("../en_fr_model/")
tokenizer.save_pretrained("../en_fr_model/")

#### 2.1 Creating a pipeline for translation


In [None]:
translator = pipeline(
    "text2text-generation",
    model="../en_fr_model/",
    tokenizer="../en_fr_model/",
)

#### 2.2 Prompt the user to enter a sentence for translation


In [None]:
while True:
    text = input("Enter an English sentence for translation to French (type 'exit' to quit): ")
    if text == "exit":
        break
    translated_text = translator(text, max_length=max_length, num_beams=5)[0]['generated_text']
    print(f"Translated text: {translated_text}")