In [None]:
!pip install transformers datasets sentencepiece -q
!pip install sacremoses

In [None]:
import ast

import pandas as pd
from sklearn.model_selection import train_test_split

# Mount Google Drive (uncomment when running on Colab; the dataset path below is on Drive)
# from google.colab import drive
# drive.mount('/content/drive')

# Load the dataset
file_path = '/content/drive/MyDrive/archive/en-it_train.csv'  # Update path as needed
df = pd.read_csv(file_path)

# Each 'translation' cell is read as text and is expected to hold a dict literal
# with one entry per language. Parse it once per row with ast.literal_eval, which
# only accepts Python literals: unlike eval(), it cannot execute code that might
# be embedded in the CSV.
translation_pairs = df['translation'].apply(ast.literal_eval)

# English side of each pair
df['en'] = translation_pairs.apply(lambda pair: pair['en'])
# Italian side of each pair. Prefer the 'it' key; fall back to 'fr', which is the
# key this notebook originally read (presumably a leftover from an English-French
# version -- TODO confirm against the dataset).
df['it'] = translation_pairs.apply(
    lambda pair: pair['it'] if 'it' in pair else pair['fr']
)

# Split the data 90% / 5% / 5% into training, validation, and test sets:
# first hold out 10% of the rows, then cut that hold-out in half.
train_data, holdout_data = train_test_split(df, test_size=0.1, random_state=42)
val_data, test_data = train_test_split(holdout_data, test_size=0.5, random_state=42)

# Save data to CSVs for Hugging Face datasets
train_data.to_csv('train.csv', index=False)
val_data.to_csv('val.csv', index=False)
test_data.to_csv('test.csv', index=False)


In [None]:
from datasets import load_dataset

# Map each split name to the CSV file written for it, then let the Hugging Face
# CSV loader read all three splits in one call.
data_files = dict(
    train="train.csv",
    validation="val.csv",
    test="test.csv",
)
dataset = load_dataset(path='csv', data_files=data_files)

In [None]:
from transformers import AutoTokenizer

# Checkpoint of the pretrained Italian-to-English (it-en) translation model
model_name = "Helsinki-NLP/opus-mt-it-en"
# Tokenizer that belongs to that checkpoint
tokenizer = AutoTokenizer.from_pretrained(pretrained_model_name_or_path=model_name)

# Define the preprocessing function
def preprocess_function(examples, max_length=128):
    """Tokenize a batch of Italian sources and their English targets.

    Args:
        examples: Batch mapping with an 'it' entry (source sentences) and an
            'en' entry (target sentences), as passed by a batched
            ``dataset.map`` call.
        max_length: Maximum number of tokens kept for each source and each
            target; longer sequences are truncated.

    Returns:
        The tokenizer output for the sources, with the tokenized targets
        stored under ``"labels"``.
    """
    # `text_target` tokenizes the targets as labels in the same call; it replaces
    # the deprecated `as_target_tokenizer()` context manager.
    #
    # No padding is applied here on purpose: padding the labels to a fixed length
    # would fill them with the pad token id, which the loss does not ignore.
    # DataCollatorForSeq2Seq pads each batch dynamically and pads labels with -100
    # so that padded positions are excluded from the loss.
    model_inputs = tokenizer(
        examples['it'],
        text_target=examples['en'],
        max_length=max_length,
        truncation=True,
    )
    return model_inputs

# Tokenize every split, handing the examples to the preprocessing function in batches
tokenized_datasets = dataset.map(
    function=preprocess_function,
    batched=True,
)


In [None]:
from transformers import AutoModelForSeq2SeqLM

# Instantiate the pretrained sequence-to-sequence model for the checkpoint in `model_name`
model = AutoModelForSeq2SeqLM.from_pretrained(pretrained_model_name_or_path=model_name)

In [None]:
from transformers import DataCollatorForSeq2Seq

# Collator that assembles tokenized examples into padded batches for the seq2seq model
data_collator = DataCollatorForSeq2Seq(
    model=model,
    tokenizer=tokenizer,
)

In [None]:
import inspect

import numpy as np
import torch
from transformers import Seq2SeqTrainingArguments, Seq2SeqTrainer

# The transformers install in this notebook is not pinned, and two keyword names
# used below were renamed across releases. Pick whichever name the installed
# version actually accepts so the cell runs on both older and newer releases.
_training_args_params = inspect.signature(Seq2SeqTrainingArguments.__init__).parameters
# `evaluation_strategy` was renamed to `eval_strategy`
eval_strategy_kwarg = (
    "eval_strategy" if "eval_strategy" in _training_args_params else "evaluation_strategy"
)
_trainer_params = inspect.signature(Seq2SeqTrainer.__init__).parameters
# the trainer's `tokenizer` argument was replaced by `processing_class`
tokenizer_kwarg = "processing_class" if "processing_class" in _trainer_params else "tokenizer"

# Define training arguments
training_args = Seq2SeqTrainingArguments(
    output_dir="./results",
    learning_rate=2e-5,
    per_device_train_batch_size=16,
    per_device_eval_batch_size=16,
    weight_decay=0.01,
    save_total_limit=3,
    num_train_epochs=3,
    predict_with_generate=True,
    # Mixed precision needs a CUDA device; requesting it on a CPU-only runtime
    # raises an error, so enable it only when a GPU is present.
    fp16=torch.cuda.is_available(),
    # Evaluate on the validation split at the end of every epoch
    **{eval_strategy_kwarg: "epoch"},
)

# Define the trainer
trainer = Seq2SeqTrainer(
    model=model,
    args=training_args,
    train_dataset=tokenized_datasets["train"],
    eval_dataset=tokenized_datasets["validation"],
    data_collator=data_collator,
    **{tokenizer_kwarg: tokenizer},
)

# Train the model
trainer.train()

In [None]:
# Run the trainer's evaluation loop on the held-out test split and show the metrics
test_split = tokenized_datasets["test"]
results = trainer.evaluate(eval_dataset=test_split)
print("Test Results:", results)

In [None]:
# Write the model first and then the tokenizer into the same Google Drive directory
save_dir = "/content/drive/MyDrive/models/it-en-translation_model"
for artifact in (model, tokenizer):
    artifact.save_pretrained(save_dir)

In [None]:
from transformers import AutoModelForSeq2SeqLM, AutoTokenizer

# Directory the model and tokenizer were saved to
model_path = "/content/drive/MyDrive/models/it-en-translation_model"

# Restore both objects from that directory
tokenizer = AutoTokenizer.from_pretrained(model_path)
model = AutoModelForSeq2SeqLM.from_pretrained(model_path)


In [None]:
def translate(sentence, model, tokenizer, max_length=50):
    """Translate one sentence with a seq2seq model using beam search.

    Args:
        sentence: Text to translate.
        model: Model exposing ``generate(**inputs, ...)``.
        tokenizer: Tokenizer matching ``model``; called to encode ``sentence``
            and used to decode the generated ids.
        max_length: Maximum length passed to ``model.generate``.

    Returns:
        The decoded text of the first generated sequence, with special tokens
        removed.
    """
    # Tokenize the input sentence
    inputs = tokenizer(sentence, return_tensors="pt", truncation=True)

    # Put the inputs on the same device as the model. A model that has just been
    # trained on a GPU stays there, and generating from CPU inputs would fail
    # with a device mismatch.
    device = getattr(model, "device", None)
    if device is not None:
        inputs = inputs.to(device)

    # Generate translation using the model (beam search with 5 beams)
    outputs = model.generate(**inputs, max_length=max_length, num_beams=5, early_stopping=True)

    # Decode the output tokens to text
    translation = tokenizer.decode(outputs[0], skip_special_tokens=True)
    return translation


In [None]:
# Italian sentences to try out; the list is empty, so add sentences here to see output
test_sentences = []

# Translate each sentence and print it next to its English translation
for sentence in test_sentences:
    english_translation = translate(sentence, model, tokenizer)
    print(f"italian: {sentence}", f"English: {english_translation}\n", sep="\n")
