In [11]:
!pip install datasets scikit-learn sacrebleu
!pip install -U transformers





In [12]:
import pandas as pd
import warnings
import logging
from datasets import Dataset, DatasetDict
from transformers import MarianTokenizer, MarianMTModel, DataCollatorForSeq2Seq, Seq2SeqTrainingArguments, Seq2SeqTrainer
from sklearn.metrics import accuracy_score, precision_recall_fscore_support
import torch
from sacrebleu import corpus_bleu

# Suppress specific warning related to 'Trainer.tokenizer'
logging.getLogger("transformers").setLevel(logging.ERROR)

In [4]:
from google.colab import files
uploaded = files.upload()

Saving english_french.csv to english_french.csv


In [14]:
file_path = "english_french.csv"
data = pd.read_csv(file_path)
data = data.dropna()

sampled_data = data.sample(frac=0.01, random_state=42)

In [6]:
data.head()

Unnamed: 0,English,French
0,Go.,Va !
1,Go.,Marche.
2,Go.,En route !
3,Go.,Bouge !
4,Hi.,Salut !


In [15]:
#Split into 80% training and 20% evaluation
train_size = int(0.8 * len(sampled_data))
train_data = sampled_data[:train_size]
eval_data = sampled_data[train_size:]

#Create Dataset objects for hugging face
train_dict = {"English": train_data["English"].tolist(), "French": train_data["French"].tolist()}
eval_dict = {"English": eval_data["English"].tolist(), "French": eval_data["French"].tolist()}
datasets = DatasetDict({"train": Dataset.from_dict(train_dict), "eval": Dataset.from_dict(eval_dict)})


In [16]:
model_name = "Helsinki-NLP/opus-mt-en-fr"
tokenizer = MarianTokenizer.from_pretrained(model_name)
model = MarianMTModel.from_pretrained(model_name)

device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
model.to(device)



MarianMTModel(
  (model): MarianModel(
    (shared): Embedding(59514, 512, padding_idx=59513)
    (encoder): MarianEncoder(
      (embed_tokens): Embedding(59514, 512, padding_idx=59513)
      (embed_positions): MarianSinusoidalPositionalEmbedding(512, 512)
      (layers): ModuleList(
        (0-5): 6 x MarianEncoderLayer(
          (self_attn): MarianAttention(
            (k_proj): Linear(in_features=512, out_features=512, bias=True)
            (v_proj): Linear(in_features=512, out_features=512, bias=True)
            (q_proj): Linear(in_features=512, out_features=512, bias=True)
            (out_proj): Linear(in_features=512, out_features=512, bias=True)
          )
          (self_attn_layer_norm): LayerNorm((512,), eps=1e-05, elementwise_affine=True)
          (activation_fn): SiLU()
          (fc1): Linear(in_features=512, out_features=2048, bias=True)
          (fc2): Linear(in_features=2048, out_features=512, bias=True)
          (final_layer_norm): LayerNorm((512,), eps=1e-05

In [9]:
# Tokenize data
def tokenize_function(examples):
  return tokenizer(examples["English"], text_target=examples["French"], padding=True, truncation=True)

#Map tokenization over dataset
tokenized_datasets = datasets.map(tokenize_function, batched=True)

Map:   0%|          | 0/1838 [00:00<?, ? examples/s]

Map:   0%|          | 0/460 [00:00<?, ? examples/s]

In [20]:
# Data collator for dynamic padding
collator = DataCollatorForSeq2Seq(tokenizer, model=model)

# Set training arguments
training_args = Seq2SeqTrainingArguments(
    output_dir="./results",
    per_device_train_batch_size=8,
    num_train_epochs=1,
    save_total_limit=1,
    eval_strategy="epoch",
    predict_with_generate=True,
    logging_dir="./logs",
    logging_steps=10,
    save_strategy="no",
    report_to="none",
)

In [21]:
def compute_metrics(eval_preds):
    predictions, labels = eval_preds

    # Convert tensors to numpy arrays if necessary
    if isinstance(predictions, torch.Tensor):
        predictions = predictions.cpu().numpy()
    if isinstance(labels, torch.Tensor):
        labels = labels.cpu().numpy()

    # Decode predictions
    decoded_preds = tokenizer.batch_decode(predictions, skip_special_tokens=True)

    # Decode labels, handling -100 masking for tokenizers
    labels = [[label for label in batch if label != -100] for batch in labels]
    decoded_labels = tokenizer.batch_decode(labels, skip_special_tokens=True)

    # Compute metrics
    precision, recall, f1, _ = precision_recall_fscore_support(
        decoded_labels, decoded_preds, average="weighted", zero_division=1
    )
    acc = accuracy_score(decoded_labels, decoded_preds)

    # Calculate BLEU score
    bleu = corpus_bleu(decoded_preds, [decoded_labels]).score

    return {"accuracy": acc, "f1": f1, "precision": precision, "recall": recall, "bleu": bleu}


In [22]:
# Initialize Trainer
trainer = Seq2SeqTrainer(
    model=model,
    args=training_args,
    train_dataset=tokenized_datasets["train"],
    eval_dataset=tokenized_datasets["eval"],
    data_collator=collator,
    tokenizer=tokenizer,
    compute_metrics=compute_metrics,
)

# Train the model
trainer.train()

  trainer = Seq2SeqTrainer(


{'loss': 1.3233, 'grad_norm': 4.788868427276611, 'learning_rate': 4.804347826086957e-05, 'epoch': 0.043478260869565216}
{'loss': 0.1844, 'grad_norm': 1.1199538707733154, 'learning_rate': 4.586956521739131e-05, 'epoch': 0.08695652173913043}
{'loss': 0.1815, 'grad_norm': 3.042302131652832, 'learning_rate': 4.3695652173913046e-05, 'epoch': 0.13043478260869565}
{'loss': 0.2033, 'grad_norm': 1.426413893699646, 'learning_rate': 4.1521739130434786e-05, 'epoch': 0.17391304347826086}
{'loss': 0.196, 'grad_norm': 1.8878957033157349, 'learning_rate': 3.9347826086956525e-05, 'epoch': 0.21739130434782608}
{'loss': 0.1707, 'grad_norm': 1.6308917999267578, 'learning_rate': 3.7173913043478264e-05, 'epoch': 0.2608695652173913}
{'loss': 0.1605, 'grad_norm': 1.6243313550949097, 'learning_rate': 3.5e-05, 'epoch': 0.30434782608695654}
{'loss': 0.1331, 'grad_norm': 1.6404900550842285, 'learning_rate': 3.282608695652174e-05, 'epoch': 0.34782608695652173}
{'loss': 0.1661, 'grad_norm': 1.8007391691207886, 'lea

TrainOutput(global_step=230, training_loss=0.21841997685639755, metrics={'train_runtime': 1193.2973, 'train_samples_per_second': 1.54, 'train_steps_per_second': 0.193, 'train_loss': 0.21841997685639755, 'epoch': 1.0})

In [23]:
# Evaluate the model and print the results
eval_results = trainer.evaluate()
print(f"Evaluation Results: {eval_results}")

{'eval_loss': 0.19931964576244354, 'eval_accuracy': 0.3217391304347826, 'eval_f1': 0.3217391304347826, 'eval_precision': 1.0, 'eval_recall': 0.3217391304347826, 'eval_bleu': 56.35052131690946, 'eval_runtime': 193.0632, 'eval_samples_per_second': 2.383, 'eval_steps_per_second': 0.3, 'epoch': 1.0}
Evaluation Results: {'eval_loss': 0.19931964576244354, 'eval_accuracy': 0.3217391304347826, 'eval_f1': 0.3217391304347826, 'eval_precision': 1.0, 'eval_recall': 0.3217391304347826, 'eval_bleu': 56.35052131690946, 'eval_runtime': 193.0632, 'eval_samples_per_second': 2.383, 'eval_steps_per_second': 0.3, 'epoch': 1.0}


In [25]:
# Generate sample translations
sample_texts = ["The patient was diagnosed with hypertension and prescribed beta-blockers.", "Symptoms include fever, fatigue, and persistent coughing.", "Take two tablets of paracetamol every six hours after meals.", "The surgery was successful, and the patient is now in stable condition.", "Please report any side effects such as dizziness or nausea.", "The blood test results indicate a low level of hemoglobin", "The child was vaccinated against measles, mumps, and rubella.", "This medication must be taken on an empty stomach.", "The MRI scan showed no signs of internal bleeding."]
inputs = tokenizer(sample_texts, return_tensors="pt", padding=True).to(device)
translated = model.generate(**inputs)
translations = tokenizer.batch_decode(translated, skip_special_tokens=True)

# Display translations
for i, (src, tgt) in enumerate(zip(sample_texts, translations)):
    print(f"Source: {src}\nTranslation: {tgt}\n")

Source: The patient was diagnosed with hypertension and prescribed beta-blockers.
Translation: On a diagnostiqué l'hypertension chez le patient et prescrit des bêtabloquants.

Source: Symptoms include fever, fatigue, and persistent coughing.
Translation: Les symptômes sont la fièvre, la fatigue et la toux persistante.

Source: Take two tablets of paracetamol every six hours after meals.
Translation: Prenez deux comprimés de paracétamol toutes les six heures après les repas.

Source: The surgery was successful, and the patient is now in stable condition.
Translation: L'opération a été couronnée de succès, et le patient est maintenant dans un état stable.

Source: Please report any side effects such as dizziness or nausea.
Translation: Veuillez signaler tout effet indésirable comme les vertiges ou les nausées.

Source: The blood test results indicate a low level of hemoglobin
Translation: Les résultats des analyses sanguines indiquent un faible taux d'hémoglobine.

Source: The child was 