In [51]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

## Data Loading

In [52]:
with open(r'D:\Personal\code\ML comp\New folder\en-fr\en-fr.en', encoding='utf-8') as en:
    en_lines = [line.strip() for line in en if line.strip()]
with open(r'D:\Personal\code\ML comp\New folder\en-fr\en-fr.fr', encoding='utf-8') as fr:
    fr_lines = [line.strip() for line in fr if line.strip()]

df = pd.DataFrame({'en': en_lines, 'fr': fr_lines})
df= df.sample(n=60000, random_state=42).reset_index(drop=True)

In [53]:
df.head()

Unnamed: 0,en,fr
0,It explains how the Committee for Medicinal Pr...,Il explique comment le comité des médicaments ...
1,"Concomitant use of CYP3A4 inhibitors (e. g., a...",L’ usage concomitant d’ inhibiteurs du CYP3A4 ...
2,"- If you have any further questions, ask your ...","- Si vous avez toute autres question, si vous ..."
3,Date of first authorisation:,Date de première autorisation:
4,0(0.0) 0(0.0),Hémorragie Neutropénie fébrile


In [54]:
df.isnull().sum()

en    0
fr    0
dtype: int64

In [55]:
len(df)

60000

In [None]:
df= df.sample(n= 40000).reset_index()

In [57]:
from datasets import Dataset

dataset = Dataset.from_pandas(df)
dataset = dataset.train_test_split(test_size=0.2, seed=42)
train_data = dataset['train']
val_data = dataset['test']

### Importing pretrained seq2seq model

In [58]:
from transformers import MarianMTModel, MarianTokenizer, MarianConfig

model_name = "Helsinki-NLP/opus-mt-en-fr"

tokenizer = MarianTokenizer.from_pretrained(model_name)
config = MarianConfig.from_pretrained(model_name, dropout=0.2)
model = MarianMTModel.from_pretrained(model_name, config = config)



In [59]:
en_token_lengths = [len(tokenizer.encode(s)) for s in df['en']]
fr_token_lengths = [len(tokenizer.encode(s)) for s in df['fr']]

import numpy as np
max_len_en = int(np.percentile(en_token_lengths, 95))
max_len_fr = int(np.percentile(fr_token_lengths, 95))         # similar to prev notebooks
print(max_len_en, max_len_fr)


Token indices sequence length is longer than the specified maximum sequence length for this model (531 > 512). Running this sequence through the model will result in indexing errors


49 86


### Preprocessing Data

In [60]:
def preprocess(batch):
    inputs = batch["en"]
    targets = batch["fr"]
    
    model_inputs = tokenizer(inputs, max_length=100, truncation=True, padding="max_length")
    
    with tokenizer.as_target_tokenizer():
        labels = tokenizer(targets, max_length=100, truncation=True, padding="max_length")
    
    model_inputs["labels"] = labels["input_ids"]
    return model_inputs

tokenized_train = train_data.map(preprocess, batched=True, remove_columns=train_data.column_names)
tokenized_val = val_data.map(preprocess, batched=True, remove_columns=val_data.column_names)


Map: 100%|██████████| 24000/24000 [00:07<00:00, 3159.78 examples/s]
Map: 100%|██████████| 6000/6000 [00:01<00:00, 3125.84 examples/s]


In [61]:
from torch.utils.data import DataLoader
import torch

tokenized_val.set_format("torch", columns=["input_ids", "attention_mask"])

batch_size = 16  
val_loader = DataLoader(
    tokenized_val, 
    batch_size=batch_size
)

device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model.to(device)
model.eval()

all_generations = []

for batch in val_loader:
    batch = {k: v.to(device) for k, v in batch.items()}
    
    with torch.no_grad():
        generated_outputs = model.generate(
            **batch,
            max_new_tokens=100  
        )
    
    decoded = tokenizer.batch_decode(generated_outputs, skip_special_tokens=True)
    all_generations.extend(decoded)

tokenized_val.set_format("torch", columns=["input_ids", "attention_mask", "labels"])

### BLEU Before Fine Tuning:

In [62]:
import evaluate

bleu = evaluate.load("bleu")

references = [[item] for item in val_data["fr"]]
results = bleu.compute(predictions=all_generations, references=references)

print(results)
print(f"BLEU Before fine tuning: {results['bleu']}")

{'bleu': 0.44886527817580946, 'precisions': [0.6645809159229176, 0.4983239186665584, 0.3906051944824158, 0.31380975118043397], 'brevity_penalty': 1.0, 'length_ratio': 1.0134410374766458, 'translation_length': 92213, 'reference_length': 90990}
BLEU Before fine tuning: 0.44886527817580946


## Hyperparameters

In [None]:
from transformers import TrainingArguments

training_args = TrainingArguments(
    output_dir="./results",
    eval_strategy="epoch",
    learning_rate=1e-5,
    per_device_train_batch_size=32,  
    per_device_eval_batch_size=32,
    num_train_epochs=8,
    weight_decay=0.5,
    save_total_limit=2,
    warmup_steps = 500,
    save_strategy="epoch",
    logging_dir="./logs",
    logging_steps=50,
    fp16=True,  
)

## Training

In [72]:
from transformers import Trainer, DataCollatorForSeq2Seq

data_collator = DataCollatorForSeq2Seq(tokenizer, model=model)

trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=tokenized_train,
    eval_dataset=tokenized_val,
    tokenizer=tokenizer,
    data_collator=data_collator,
)

  trainer = Trainer(


In [73]:
trainer.train()

Epoch,Training Loss,Validation Loss
1,0.1341,0.178673
2,0.1432,0.17666
3,0.127,0.176538
4,0.1227,0.176177
5,0.1348,0.174572
6,0.123,0.17435
7,0.1148,0.174438
8,0.1118,0.174653


TrainOutput(global_step=6000, training_loss=0.12570794701576232, metrics={'train_runtime': 1231.3048, 'train_samples_per_second': 155.932, 'train_steps_per_second': 4.873, 'total_flos': 5084754739200000.0, 'train_loss': 0.12570794701576232, 'epoch': 8.0})

## BLEU Post fine tuning

In [66]:
all_generations = []
for batch in val_loader:
    batch = {k: v.to(device) for k, v in batch.items()}
    
    with torch.no_grad():
        generated_outputs = model.generate(
            **batch,
            max_new_tokens=100  
        )
    
    decoded = tokenizer.batch_decode(generated_outputs, skip_special_tokens=True)
    all_generations.extend(decoded)

In [67]:
import evaluate

bleu = evaluate.load("bleu")

references = [[item] for item in val_data["fr"]]
results = bleu.compute(predictions=all_generations, references=references)

print(results)
print(f"BLEU after fine tuning: {results['bleu']}")

{'bleu': 0.48076335647936286, 'precisions': [0.7001557030378657, 0.5345151791099495, 0.42398254338780067, 0.3431856611430504], 'brevity_penalty': 0.9952298944405686, 'length_ratio': 0.9952412353005825, 'translation_length': 90557, 'reference_length': 90990}
BLEU after fine tuning: 0.48076335647936286


In [75]:
import torch
from transformers import MarianMTModel, MarianTokenizer, pipeline


model_path = r"D:\Personal\code\ML comp\New folder\results\checkpoint-6000"

tokenizer = MarianTokenizer.from_pretrained(model_path)
model = MarianMTModel.from_pretrained(model_path)

device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model.to(device)

translator_pipeline = pipeline(
    "translation",
    model = model,
    tokenizer = tokenizer,
    device = device
)
input_text = "The dysregulation of pro-inflammatory cytokines, particularly interleukin-6 and tumor necrosis factor-α, precipitates a cascade of endothelial dysfunction and microvascular thrombosis, ultimately culminating in multi-organ failure."
translated = translator_pipeline(input_text, src_lang = "en", tgt_lang = "fr")
print(translated[0]["translation_text"])



Device set to use cuda


La dysrégulation des cytokines pro-inflammatoires, en particulier de l interleukine-6 et du facteur-α de nécrose tumorale, accélère une cascade de dysfonction endothéliale et de thrombose microvasculaire, aboutissant en fin de compte à une défaillance multi-organe.


###### A note: I was able to achieve a BLEU of around 55 after fine tuning (44->55) but lost the weights and the saved notebook.