## English-French Translation

In [1]:
# !pip install datasets
# !pip install --upgrade transformers
# !pip install sentencepiece

In [1]:
from datasets import load_dataset

raw_datasets = load_dataset("kde4", lang1="en", lang2="fr")

In [2]:
raw_datasets


DatasetDict({
    train: Dataset({
        features: ['id', 'translation'],
        num_rows: 210173
    })
})

In [3]:
split_datasets = raw_datasets["train"].train_test_split(train_size=0.9, seed=20)
split_datasets

DatasetDict({
    train: Dataset({
        features: ['id', 'translation'],
        num_rows: 189155
    })
    test: Dataset({
        features: ['id', 'translation'],
        num_rows: 21018
    })
})

In [4]:
split_datasets["validation"] = split_datasets.pop("test")

In [5]:
split_datasets["train"][1]["translation"]

{'en': 'Default to expanded threads',
 'fr': 'Par défaut, développer les fils de discussion'}

In [9]:
from transformers import AutoTokenizer
from transformers import pipeline, AutoTokenizer, AutoModelForSeq2SeqLM, Seq2SeqTrainingArguments, Seq2SeqTrainer

model_checkpoint = "Helsinki-NLP/opus-mt-en-fr"

tokenizer = AutoTokenizer.from_pretrained(model_checkpoint)
model = AutoModelForSeq2SeqLM.from_pretrained(model_checkpoint)

#### Set the maximum sequence length and define the preprocessing function
max_length = 128

# translator = pipeline("translation", model="Helsinki-NLP/opus-mt-en-fr", tokenizer=tokenizer)


In [13]:
def preprocess_function(examples):
    inputs = str(examples['id'])
    targets = str(examples['translation'])
    
    model_inputs = tokenizer(
        inputs, text_target=targets, max_length=max_length, padding="max_length", truncation=True, return_tensors="pt"
    )
    model_inputs['decoder_input_ids'] = model_inputs['input_ids'].clone()

    return model_inputs

In [None]:
####  Preprocess the training and validation sets
train_dataset = split_datasets['train'].map(
    preprocess_function, batched=True, num_proc=4, remove_columns=["id", "translation"]
)


In [17]:
eval_dataset = split_datasets['validation'].map(
    preprocess_function, batched=True, remove_columns=["id", "translation"]
)

Map:   0%|          | 0/21018 [00:00<?, ? examples/s]

In [18]:
training_args = Seq2SeqTrainingArguments(
    output_dir='../model/en_fr_model/',
    predict_with_generate=True,
    evaluation_strategy="steps",
    eval_steps=1000,
    save_steps=1000,
    learning_rate=3e-5,
    per_device_train_batch_size=16,
    per_device_eval_batch_size=16,
    num_train_epochs=20,
    weight_decay=0.01,
    push_to_hub=False,
    logging_dir="../logs/",
    logging_steps=500,
)

In [19]:
trainer = Seq2SeqTrainer(
    model=model,
    args=training_args,
    train_dataset=train_dataset,
    eval_dataset=eval_dataset,
)

In [20]:
try:
    trainer.train()
except ValueError as e:
    print("Error during training:", e)

  0%|          | 0/240 [00:00<?, ?it/s]

{'train_runtime': 3705.4557, 'train_samples_per_second': 1.036, 'train_steps_per_second': 0.065, 'train_loss': 5.353313191731771, 'epoch': 20.0}


In [21]:
result = trainer.evaluate()
print(result)

  0%|          | 0/2 [00:00<?, ?it/s]

{'eval_loss': 5.731449127197266, 'eval_runtime': 8.376, 'eval_samples_per_second': 2.627, 'eval_steps_per_second': 0.239, 'epoch': 20.0}


In [22]:
model.save_pretrained("../model/en_fr_model/")
tokenizer.save_pretrained("../model/en_fr_model/")

('../model/en_fr_model/tokenizer_config.json',
 '../model/en_fr_model/special_tokens_map.json',
 '../model/en_fr_model/vocab.json',
 '../model/en_fr_model/source.spm',
 '../model/en_fr_model/target.spm',
 '../model/en_fr_model/added_tokens.json')

In [23]:
translator = pipeline(
    "text2text-generation",
    model="../model/en_fr_model/",
    tokenizer="../model/en_fr_model/",
)

In [24]:
while True:
    text = input("Enter an English sentence for translation to French (type 'exit' to quit): ")
    if text == "exit":
        break
    translated_text = translator(text, max_length=max_length, num_beams=5)[0]['generated_text']
    print(f"Translated text: {translated_text}")

Translated text: Mon nom est Peter.
Translated text: Quel est votre nom?
Translated text: A sa grande surprise, la porte s'ouvrit avec un creak hantant. A l'intérieur, l'air était épais avec de la poussière, et les toiles d'écran ornaient les coins. Les yeux de Lila s'élargissaient alors qu'elle remarquait un vieux livre particulier allongé sur une table. Ses pages étaient remplies de symboles et de dessins étranges qui semblaient s'animer à chaque tour. Lila plongeant dans le livre, elle découvrit qu'elle tenait la clé du déverrouillage de la magie depuis longtemps oubliée de la
Translated text: le nom de l'utilisateur;
Translated text: le nom de l'utilisateur;
Translated text: le nom de l'utilisateur;


# French to English

In [8]:
from transformers import AutoTokenizer
from transformers import pipeline, AutoTokenizer, AutoModelForSeq2SeqLM, Seq2SeqTrainingArguments, Seq2SeqTrainer

model_checkpoint = "Helsinki-NLP/opus-mt-fr-en"

tokenizer = AutoTokenizer.from_pretrained(model_checkpoint)
model = AutoModelForSeq2SeqLM.from_pretrained(model_checkpoint)

#### Set the maximum sequence length and define the preprocessing function
max_length = 128

# translator = pipeline("translation", model="Helsinki-NLP/opus-mt-en-fr", tokenizer=tokenizer)


pytorch_model.bin:   0%|          | 0.00/301M [00:00<?, ?B/s]

Error while downloading from https://cdn-lfs.huggingface.co/Helsinki-NLP/opus-mt-fr-en/599b819e3488f0fb888fef09511370ce4c0388b6f0f6beeb49a1f4b19043bebc?response-content-disposition=attachment%3B+filename*%3DUTF-8%27%27pytorch_model.bin%3B+filename%3D%22pytorch_model.bin%22%3B&response-content-type=application%2Foctet-stream&Expires=1706335611&Policy=eyJTdGF0ZW1lbnQiOlt7IkNvbmRpdGlvbiI6eyJEYXRlTGVzc1RoYW4iOnsiQVdTOkVwb2NoVGltZSI6MTcwNjMzNTYxMX19LCJSZXNvdXJjZSI6Imh0dHBzOi8vY2RuLWxmcy5odWdnaW5nZmFjZS5jby9IZWxzaW5raS1OTFAvb3B1cy1tdC1mci1lbi81OTliODE5ZTM0ODhmMGZiODg4ZmVmMDk1MTEzNzBjZTRjMDM4OGI2ZjBmNmJlZWI0OWExZjRiMTkwNDNiZWJjP3Jlc3BvbnNlLWNvbnRlbnQtZGlzcG9zaXRpb249KiZyZXNwb25zZS1jb250ZW50LXR5cGU9KiJ9XX0_&Signature=TowwA-DRoMoRmDKMt2%7EjKzoLzXlrZtvOyGXw60K-rpzXGsWiGV%7EinSDQJGaWilZZCuWPr6ASBVD9JlYtBOEPh4n-URTFJ%7EQsP9Vw4dpBzRq%7Eg0VEBwAsCGgRXdzKF2zs2W%7EBpA5aoQZSZkRjzMvWLzsf8Sz5leW0JMJ9lISSsuVCoXYyDZ6p1BPPRiLJD4eYyG0zRP5H4VZyemVYEw1JhqVMgMCFgsgmszgp5P%7EwK7AVkKPyHksGCNYT39ID1%7EaP0M4DVTvPZZ8

pytorch_model.bin:  10%|#         | 31.5M/301M [00:00<?, ?B/s]

Error while downloading from https://cdn-lfs.huggingface.co/Helsinki-NLP/opus-mt-fr-en/599b819e3488f0fb888fef09511370ce4c0388b6f0f6beeb49a1f4b19043bebc?response-content-disposition=attachment%3B+filename*%3DUTF-8%27%27pytorch_model.bin%3B+filename%3D%22pytorch_model.bin%22%3B&response-content-type=application%2Foctet-stream&Expires=1706335611&Policy=eyJTdGF0ZW1lbnQiOlt7IkNvbmRpdGlvbiI6eyJEYXRlTGVzc1RoYW4iOnsiQVdTOkVwb2NoVGltZSI6MTcwNjMzNTYxMX19LCJSZXNvdXJjZSI6Imh0dHBzOi8vY2RuLWxmcy5odWdnaW5nZmFjZS5jby9IZWxzaW5raS1OTFAvb3B1cy1tdC1mci1lbi81OTliODE5ZTM0ODhmMGZiODg4ZmVmMDk1MTEzNzBjZTRjMDM4OGI2ZjBmNmJlZWI0OWExZjRiMTkwNDNiZWJjP3Jlc3BvbnNlLWNvbnRlbnQtZGlzcG9zaXRpb249KiZyZXNwb25zZS1jb250ZW50LXR5cGU9KiJ9XX0_&Signature=TowwA-DRoMoRmDKMt2%7EjKzoLzXlrZtvOyGXw60K-rpzXGsWiGV%7EinSDQJGaWilZZCuWPr6ASBVD9JlYtBOEPh4n-URTFJ%7EQsP9Vw4dpBzRq%7Eg0VEBwAsCGgRXdzKF2zs2W%7EBpA5aoQZSZkRjzMvWLzsf8Sz5leW0JMJ9lISSsuVCoXYyDZ6p1BPPRiLJD4eYyG0zRP5H4VZyemVYEw1JhqVMgMCFgsgmszgp5P%7EwK7AVkKPyHksGCNYT39ID1%7EaP0M4DVTvPZZ8

pytorch_model.bin:  38%|###8      | 115M/301M [00:00<?, ?B/s]

Error while downloading from https://cdn-lfs.huggingface.co/Helsinki-NLP/opus-mt-fr-en/599b819e3488f0fb888fef09511370ce4c0388b6f0f6beeb49a1f4b19043bebc?response-content-disposition=attachment%3B+filename*%3DUTF-8%27%27pytorch_model.bin%3B+filename%3D%22pytorch_model.bin%22%3B&response-content-type=application%2Foctet-stream&Expires=1706335611&Policy=eyJTdGF0ZW1lbnQiOlt7IkNvbmRpdGlvbiI6eyJEYXRlTGVzc1RoYW4iOnsiQVdTOkVwb2NoVGltZSI6MTcwNjMzNTYxMX19LCJSZXNvdXJjZSI6Imh0dHBzOi8vY2RuLWxmcy5odWdnaW5nZmFjZS5jby9IZWxzaW5raS1OTFAvb3B1cy1tdC1mci1lbi81OTliODE5ZTM0ODhmMGZiODg4ZmVmMDk1MTEzNzBjZTRjMDM4OGI2ZjBmNmJlZWI0OWExZjRiMTkwNDNiZWJjP3Jlc3BvbnNlLWNvbnRlbnQtZGlzcG9zaXRpb249KiZyZXNwb25zZS1jb250ZW50LXR5cGU9KiJ9XX0_&Signature=TowwA-DRoMoRmDKMt2%7EjKzoLzXlrZtvOyGXw60K-rpzXGsWiGV%7EinSDQJGaWilZZCuWPr6ASBVD9JlYtBOEPh4n-URTFJ%7EQsP9Vw4dpBzRq%7Eg0VEBwAsCGgRXdzKF2zs2W%7EBpA5aoQZSZkRjzMvWLzsf8Sz5leW0JMJ9lISSsuVCoXYyDZ6p1BPPRiLJD4eYyG0zRP5H4VZyemVYEw1JhqVMgMCFgsgmszgp5P%7EwK7AVkKPyHksGCNYT39ID1%7EaP0M4DVTvPZZ8

pytorch_model.bin:  59%|#####9    | 178M/301M [00:00<?, ?B/s]

generation_config.json:   0%|          | 0.00/293 [00:00<?, ?B/s]

In [9]:
def preprocess_function_fr_en(examples):
    inputs = str(examples['translation'])
    targets = str(examples['id'])
    
    model_inputs = tokenizer(
        inputs, text_target=targets, max_length=max_length, padding="max_length", truncation=True, return_tensors="pt"
    )
    model_inputs['decoder_input_ids'] = model_inputs['input_ids'].clone()

    return model_inputs

In [10]:
####  Preprocess the training and validation sets
train_dataset = split_datasets['train'].map(
    preprocess_function_fr_en, batched=True, num_proc=4, remove_columns=["id", "translation"]
)


Map (num_proc=4):   0%|          | 0/189155 [00:00<?, ? examples/s]

In [11]:
eval_dataset = split_datasets['validation'].map(
    preprocess_function_fr_en, batched=True, remove_columns=["id", "translation"]
)

Map:   0%|          | 0/21018 [00:00<?, ? examples/s]

In [12]:
training_args = Seq2SeqTrainingArguments(
    output_dir='../model/fr_en_model/',
    predict_with_generate=True,
    evaluation_strategy="steps",
    eval_steps=1000,
    save_steps=1000,
    learning_rate=3e-5,
    per_device_train_batch_size=16,
    per_device_eval_batch_size=16,
    num_train_epochs=20,
    weight_decay=0.01,
    push_to_hub=False,
    logging_dir="../logs/",
    logging_steps=500,
)

In [13]:
trainer = Seq2SeqTrainer(
    model=model,
    args=training_args,
    train_dataset=train_dataset,
    eval_dataset=eval_dataset,
)

In [14]:
try:
    trainer.train()
except ValueError as e:
    print("Error during training:", e)

  0%|          | 0/240 [00:00<?, ?it/s]

{'train_runtime': 3483.0497, 'train_samples_per_second': 1.102, 'train_steps_per_second': 0.069, 'train_loss': 3.638966369628906, 'epoch': 20.0}


In [15]:
result = trainer.evaluate()
print(result)

  0%|          | 0/2 [00:00<?, ?it/s]

{'eval_loss': 3.6643800735473633, 'eval_runtime': 6.822, 'eval_samples_per_second': 3.225, 'eval_steps_per_second': 0.293, 'epoch': 20.0}


In [16]:
model.save_pretrained("../model/fr_en_model/")
tokenizer.save_pretrained("../model/fr_en_model/")

('../model/fr_en_model/tokenizer_config.json',
 '../model/fr_en_model/special_tokens_map.json',
 '../model/fr_en_model/vocab.json',
 '../model/fr_en_model/source.spm',
 '../model/fr_en_model/target.spm',
 '../model/fr_en_model/added_tokens.json')

In [17]:
translator = pipeline(
    "text2text-generation",
    model="../model/fr_en_model/",
    tokenizer="../model/fr_en_model/",
)

In [18]:
while True:
    text = input("Enter an English sentence for translation to French (type 'exit' to quit): ")
    if text == "exit":
        break
    translated_text = translator(text, max_length=max_length, num_beams=5)[0]['generated_text']
    print(f"Translated text: {translated_text}")

Translated text: I'm the pendulum
Translated text: It's just, uh, it's, uh, it's, uh, it's, uh, it's, uh, it's, uh, it's, it's, it's, it's, it's, it's, it's, it's, it's, it's, it's, it's, it's, it's, it's, it's, it's, it's, it's, it's, it's,
Translated text: My name's Rogendo
Translated text: I'm going to Lecole
Translated text: I'm going to a school.


In [21]:


def translate_fr_en(text):
   

    translator = pipeline(
        "text2text-generation",
        model="../model/fr_en_model/",
        tokenizer="../model/fr_en_model/",
    )  
    translated_text = translator(text, max_length=max_length, num_beams=5)[0]['generated_text']
    return translated_text

In [22]:
translate_fr_en("Je suis la pendule")

"I'm the pendulum"

In [39]:
from datasets import load_dataset

raw_datasets = load_dataset("kde4", lang1="en", lang2="fr")

In [48]:
raw_datasets['train'][6]

{'id': '6', 'translation': {'en': 'kdeaddons', 'fr': 'kdeaddons'}}

In [71]:
raw_datasets.column_names

{'train': ['id', 'translation']}

In [46]:
split_datasets = raw_datasets["train"].train_test_split(train_size=0.9, seed=20)
split_datasets

DatasetDict({
    train: Dataset({
        features: ['id', 'translation'],
        num_rows: 189155
    })
    test: Dataset({
        features: ['id', 'translation'],
        num_rows: 21018
    })
})

In [54]:
split_datasets["validation"] = split_datasets.pop("test")

In [62]:
split_datasets["train"][415]["translation"]

{'en': 'Crop picture to edges', 'fr': 'Changer la flèche de fin'}

In [63]:
from transformers import AutoTokenizer
from transformers import pipeline, AutoTokenizer, AutoModelForSeq2SeqLM, Seq2SeqTrainingArguments, Seq2SeqTrainer

model_checkpoint = "Helsinki-NLP/opus-mt-en-fr"

tokenizer = AutoTokenizer.from_pretrained(model_checkpoint)
model = AutoModelForSeq2SeqLM.from_pretrained(model_checkpoint)

#### Set the maximum sequence length and define the preprocessing function
max_length = 128

# translator = pipeline("translation", model="Helsinki-NLP/opus-mt-en-fr", tokenizer=tokenizer)


In [72]:
def preprocess_function(examples):
    inputs = str(examples['translation'])
    targets = str(examples['id'])
    
    model_inputs = tokenizer(
        inputs, text_target=targets, max_length=max_length, padding="max_length", truncation=True, return_tensors="pt"
    )
    model_inputs['decoder_input_ids'] = model_inputs['input_ids'].clone()

    return model_inputs

In [73]:
####  Preprocess the training and validation sets
train_dataset = split_datasets['train'].map(
    preprocess_function, batched=True, num_proc=4, remove_columns=["translation", "id"]
)


Map (num_proc=4):   0%|          | 0/189155 [00:00<?, ? examples/s]

In [75]:
eval_dataset = split_datasets['validation'].map(
    preprocess_function, batched=True, remove_columns=["translation", "id"]
)

Map:   0%|          | 0/21018 [00:00<?, ? examples/s]

In [76]:
training_args = Seq2SeqTrainingArguments(
    output_dir='../model/fr_en_model/',
    predict_with_generate=True,
    evaluation_strategy="steps",
    eval_steps=1000,
    save_steps=1000,
    learning_rate=3e-5,
    per_device_train_batch_size=16,
    per_device_eval_batch_size=16,
    num_train_epochs=20,
    weight_decay=0.01,
    push_to_hub=False,
    logging_dir="../logs/",
    logging_steps=500,
)

In [77]:
trainer = Seq2SeqTrainer(
    model=model,
    args=training_args,
    train_dataset=train_dataset,
    eval_dataset=eval_dataset,
)

In [78]:
try:
    trainer.train()
except ValueError as e:
    print("Error during training:", e)

Exception ignored in: <function tqdm.__del__ at 0x7f902a1345e0>
Traceback (most recent call last):
  File "/home/rogendo/.local/lib/python3.10/site-packages/tqdm/std.py", line 1148, in __del__
    def __del__(self):
KeyboardInterrupt: 


  0%|          | 0/240 [00:00<?, ?it/s]