In [None]:
!pip install transformers datasets sentencepiece -q

In [None]:
import ast

import pandas as pd
from sklearn.model_selection import train_test_split

# Mount Google Drive
# from google.colab import drive
# drive.mount('/content/drive')

# Load the dataset
file_path = '/content/drive/MyDrive/archive/en-es_train.csv'  # Update path as needed
df = pd.read_csv(file_path)

# Extract the 'en' and 'es' texts from the 'translation' column.
# Each cell is expected to hold the text of a dict literal with 'en' and 'es'
# keys. It is parsed once per row with ast.literal_eval, which only accepts
# Python literals, instead of eval(), which would execute whatever code the
# CSV happens to contain.
translation_pairs = df['translation'].apply(ast.literal_eval)
df['en'] = translation_pairs.map(lambda pair: pair['en'])
df['es'] = translation_pairs.map(lambda pair: pair['es'])

# Split the data into training, validation, and test sets:
# 10% is held out first, then that hold-out is halved into validation and
# test, giving a 90/5/5 split. random_state is fixed so the split is
# reproducible.
train_data, test_data = train_test_split(df, test_size=0.1, random_state=42)
val_data, test_data = train_test_split(test_data, test_size=0.5, random_state=42)

# Save data to CSVs for Hugging Face datasets
train_data.to_csv('train.csv', index=False)
val_data.to_csv('val.csv', index=False)
test_data.to_csv('test.csv', index=False)


In [None]:
from datasets import load_dataset

# Map each split name to its CSV file in the working directory, then build
# the splits with the Hugging Face 'csv' loader.
data_files = dict(train="train.csv", validation="val.csv", test="test.csv")
dataset = load_dataset('csv', data_files=data_files)

Generating train split: 0 examples [00:00, ? examples/s]

Generating validation split: 0 examples [00:00, ? examples/s]

Generating test split: 0 examples [00:00, ? examples/s]

In [None]:
from transformers import AutoTokenizer

# Use a pretrained English-Spanish translation checkpoint (the "en-es" suffix
# of the name below); the tokenizer is loaded from that same checkpoint.
model_name = "Helsinki-NLP/opus-mt-en-es"
tokenizer = AutoTokenizer.from_pretrained(model_name)

# Define the preprocessing function
def preprocess_function(examples, max_length=128):
    """Tokenize a batch of English sources and Spanish targets.

    Args:
        examples: Batch mapping with an 'en' entry (source texts) and an
            'es' entry (target texts).
        max_length: Longest token sequence kept for sources and targets;
            longer sequences are truncated.

    Returns:
        The tokenizer output for the sources, with the tokenized targets
        stored under "labels".
    """
    # No padding here on purpose. Padding to max_length would put pad-token
    # ids into "labels", and the loss would then be computed on padding.
    # Sequences stay at their natural length so DataCollatorForSeq2Seq can
    # pad each batch and fill label padding with the ignore index.
    #
    # `text_target=` tokenizes the targets in the same call and replaces the
    # deprecated `as_target_tokenizer()` context manager.
    model_inputs = tokenizer(
        examples['en'],
        text_target=examples['es'],
        max_length=max_length,
        truncation=True,
    )
    return model_inputs

# Apply preprocessing to the dataset. batched=True is requested so that
# preprocess_function receives a batch of rows per call, which is what its
# `examples` argument expects.
tokenized_datasets = dataset.map(preprocess_function, batched=True)


The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


tokenizer_config.json:   0%|          | 0.00/44.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/1.47k [00:00<?, ?B/s]

source.spm:   0%|          | 0.00/802k [00:00<?, ?B/s]

target.spm:   0%|          | 0.00/826k [00:00<?, ?B/s]

vocab.json:   0%|          | 0.00/1.59M [00:00<?, ?B/s]



Map:   0%|          | 0/84123 [00:00<?, ? examples/s]



Map:   0%|          | 0/4673 [00:00<?, ? examples/s]

Map:   0%|          | 0/4674 [00:00<?, ? examples/s]

In [None]:
from transformers import AutoModelForSeq2SeqLM

# Load the pretrained translation model from the same checkpoint name that
# the tokenizer was loaded from (`model_name`), so the two stay matched.
model = AutoModelForSeq2SeqLM.from_pretrained(model_name)

pytorch_model.bin:   0%|          | 0.00/312M [00:00<?, ?B/s]

generation_config.json:   0%|          | 0.00/293 [00:00<?, ?B/s]

In [None]:
from transformers import DataCollatorForSeq2Seq

# Define a data collator to handle padding during batching.
# It is given both the tokenizer and the model.
# NOTE(review): the model is presumably used to derive decoder inputs from
# the labels — confirm against the installed transformers version.
data_collator = DataCollatorForSeq2Seq(tokenizer=tokenizer, model=model)

In [None]:
import torch
from transformers import Seq2SeqTrainingArguments, Seq2SeqTrainer

# Define training arguments
training_args = Seq2SeqTrainingArguments(
    output_dir="./results",
    eval_strategy="epoch",  # current name of the deprecated `evaluation_strategy`
    learning_rate=2e-5,
    per_device_train_batch_size=16,
    per_device_eval_batch_size=16,
    weight_decay=0.01,
    save_total_limit=3,
    num_train_epochs=3,
    predict_with_generate=True,
    # Mixed precision needs a CUDA device. Enable it only when one is present
    # so the cell also runs, more slowly, on a CPU-only runtime.
    fp16=torch.cuda.is_available(),
)

# Define the trainer.
# `processing_class` is the current name of the deprecated `tokenizer`
# argument; it is still the tokenizer that gets passed.
trainer = Seq2SeqTrainer(
    model=model,
    args=training_args,
    train_dataset=tokenized_datasets["train"],
    eval_dataset=tokenized_datasets["validation"],
    processing_class=tokenizer,
    data_collator=data_collator,
)

# Train the model.
# NOTE(review): when wandb is installed, this run may stop and ask for a
# W&B API key. Set report_to="none" in the arguments above if that logging
# is not wanted.
trainer.train()

model.safetensors:   0%|          | 0.00/312M [00:00<?, ?B/s]

  trainer = Seq2SeqTrainer(
[34m[1mwandb[0m: Using wandb-core as the SDK backend.  Please refer to https://wandb.me/wandb-core for more information.


<IPython.core.display.Javascript object>

[34m[1mwandb[0m: Logging into wandb.ai. (Learn how to deploy a W&B server locally: https://wandb.me/wandb-server)
[34m[1mwandb[0m: You can find your API key in your browser here: https://wandb.ai/authorize
wandb: Paste an API key from your profile and hit enter, or press ctrl+c to quit:

 ··········


[34m[1mwandb[0m: Appending key for api.wandb.ai to your netrc file: /root/.netrc


Epoch,Training Loss,Validation Loss
1,0.4926,0.465939
2,0.4637,0.455902
3,0.4376,0.453218




TrainOutput(global_step=15774, training_loss=0.47089066039296146, metrics={'train_runtime': 2965.8835, 'train_samples_per_second': 85.091, 'train_steps_per_second': 5.318, 'total_flos': 8554896458514432.0, 'train_loss': 0.47089066039296146, 'epoch': 3.0})

In [None]:
# Score the trained model on the held-out test split and show the metrics
results = trainer.evaluate(eval_dataset=tokenized_datasets["test"])
print("Test Results:", results)

Test Results: {'eval_loss': 0.45739755034446716, 'eval_runtime': 16.1989, 'eval_samples_per_second': 288.538, 'eval_steps_per_second': 18.088, 'epoch': 3.0}


In [None]:
# Save the model and tokenizer to your Google Drive.
# Both are written to the same directory so they can be loaded back together
# from a single path. The tokenizer call is the cell's last expression, so
# its return value is what the cell displays.
model.save_pretrained("/content/drive/MyDrive/models/en-es-translation_model")
tokenizer.save_pretrained("/content/drive/MyDrive/models/en-es-translation_model")

('/content/drive/MyDrive/models/en-es-translation_model/tokenizer_config.json',
 '/content/drive/MyDrive/models/en-es-translation_model/special_tokens_map.json',
 '/content/drive/MyDrive/models/en-es-translation_model/vocab.json',
 '/content/drive/MyDrive/models/en-es-translation_model/source.spm',
 '/content/drive/MyDrive/models/en-es-translation_model/target.spm',
 '/content/drive/MyDrive/models/en-es-translation_model/added_tokens.json')

In [None]:
from transformers import AutoModelForSeq2SeqLM, AutoTokenizer

# Load the model and tokenizer from your saved path.
# This rebinds the notebook-level `model` and `tokenizer` names to the copies
# read back from disk, so later cells use the saved artifacts.
model_path = "/content/drive/MyDrive/models/en-es-translation_model"
model = AutoModelForSeq2SeqLM.from_pretrained(model_path)
tokenizer = AutoTokenizer.from_pretrained(model_path)




In [None]:
def translate(sentence, model, tokenizer, max_length=50):
    """Translate one sentence and return the decoded text.

    Args:
        sentence: Source text to translate.
        model: Sequence-to-sequence model exposing ``generate``. If it has a
            ``device`` attribute, the encoded inputs are moved there first.
        tokenizer: Tokenizer paired with ``model``; it is called to encode
            the sentence and its ``decode`` turns generated ids into text.
        max_length: Forwarded to ``generate`` as the output length limit.

    Returns:
        The first generated sequence, decoded with special tokens skipped.
    """
    # Tokenize the input sentence
    inputs = tokenizer(sentence, return_tensors="pt", truncation=True)

    # Keep the inputs on the same device as the model. Without this, a model
    # on GPU fails on the CPU tensors the tokenizer produces.
    device = getattr(model, "device", None)
    if device is not None:
        inputs = inputs.to(device)

    # Generate translation using the model (beam search over 5 beams)
    outputs = model.generate(**inputs, max_length=max_length, num_beams=5, early_stopping=True)

    # Decode the output tokens to text
    return tokenizer.decode(outputs[0], skip_special_tokens=True)


In [None]:
# A handful of English sentences for a quick smoke test of the model
test_sentences = [
    "Hello, how are you?",
    "I love programming in Python.",
    "The weather is nice today.",
    "Can you help me with this task?",
    "This is a wonderful opportunity!",
]

# Translate each sentence, then print the source and its translation as a
# pair followed by a blank line
for source_text in test_sentences:
    rendered = translate(source_text, model, tokenizer)
    print(f"English: {source_text}", f"Spanish: {rendered}\n", sep="\n")


English: Hello, how are you?
Spanish: ¿Cómo estás?

English: I love programming in Python.
Spanish: Me encanta programar en Python.

English: The weather is nice today.
Spanish: Hoy hace buen tiempo.

English: Can you help me with this task?
Spanish: ¿Puede ayudarme en esta tarea?

English: This is a wonderful opportunity!
Spanish: ¡Es una magnífica ocasión!

