In [1]:
import cv2
import numpy
print("OpenCV:", cv2.__version__)
print("NumPy:", numpy.__version__)

OpenCV: 4.12.0
NumPy: 2.2.6


In [2]:
import torch
import numpy as np
import torch
from torch.utils.data import Dataset, DataLoader
from transformers import MarianTokenizer, MarianMTModel
from tqdm import tqdm
import pandas as pd
import re
from sklearn.model_selection import train_test_split
from nltk.translate.bleu_score import corpus_bleu, SmoothingFunction
from tqdm import tqdm
import nltk
nltk.download('punkt')

[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\SRI\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!


True

In [3]:
from huggingface_hub import login
login(token="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")

In [4]:
def preprocess_text(text):
    text = text.lower()
    text = re.sub(r'[^\w\s]', '', text)
    text = re.sub(r'\s+', ' ', text)
    return text.strip()

def load_data(data_path):
    df = pd.read_csv(data_path)
    eng_col, fr_col = df.columns[:2]
    eng_sentences = [preprocess_text(str(x)) for x in df[eng_col]]
    fr_sentences = [preprocess_text(str(x)) for x in df[fr_col]]
    pairs = [(e, f) for e, f in zip(eng_sentences, fr_sentences) if e and f]
    return zip(*pairs)

eng_sentences, fr_sentences = load_data("eng_-french.csv")

eng_train, eng_test, fr_train, fr_test = train_test_split(list(eng_sentences), list(fr_sentences), test_size=0.2, random_state=42)

In [5]:
model_name = "Helsinki-NLP/opus-mt-en-fr"
tokenizer = MarianTokenizer.from_pretrained(model_name)
model = MarianMTModel.from_pretrained(model_name)

device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model.to(device)



MarianMTModel(
  (model): MarianModel(
    (shared): Embedding(59514, 512, padding_idx=59513)
    (encoder): MarianEncoder(
      (embed_tokens): Embedding(59514, 512, padding_idx=59513)
      (embed_positions): MarianSinusoidalPositionalEmbedding(512, 512)
      (layers): ModuleList(
        (0-5): 6 x MarianEncoderLayer(
          (self_attn): MarianAttention(
            (k_proj): Linear(in_features=512, out_features=512, bias=True)
            (v_proj): Linear(in_features=512, out_features=512, bias=True)
            (q_proj): Linear(in_features=512, out_features=512, bias=True)
            (out_proj): Linear(in_features=512, out_features=512, bias=True)
          )
          (self_attn_layer_norm): LayerNorm((512,), eps=1e-05, elementwise_affine=True)
          (activation_fn): SiLU()
          (fc1): Linear(in_features=512, out_features=2048, bias=True)
          (fc2): Linear(in_features=2048, out_features=512, bias=True)
          (final_layer_norm): LayerNorm((512,), eps=1e-05

In [6]:
class TranslationDataset(Dataset):
    def __init__(self, eng, fr, tokenizer, max_len=64):
        self.eng = eng
        self.fr = fr
        self.tokenizer = tokenizer
        self.max_len = max_len

    def __len__(self):
        return len(self.eng)

    def __getitem__(self, idx):
        src = self.eng[idx]
        tgt = self.fr[idx]
        src_enc = self.tokenizer(
            src, max_length=self.max_len, truncation=True, padding="max_length", return_tensors="pt")
        tgt_enc = self.tokenizer(
            tgt, max_length=self.max_len, truncation=True, padding="max_length", return_tensors="pt")
        input_ids = src_enc["input_ids"].squeeze()
        attention_mask = src_enc["attention_mask"].squeeze()
        labels = tgt_enc["input_ids"].squeeze()

        labels[labels == tokenizer.pad_token_id] = -100
        return input_ids, attention_mask, labels

In [7]:
train_dataset = TranslationDataset(eng_train, fr_train, tokenizer)
test_dataset = TranslationDataset(eng_test, fr_test, tokenizer)

train_loader = DataLoader(train_dataset, batch_size=8, shuffle=True)

In [8]:
optimizer = torch.optim.AdamW(model.parameters(), lr=5e-5)
num_epochs = 2

model.train()
for epoch in range(num_epochs):
    total_loss = 0
    loop = tqdm(train_loader, desc=f"Epoch {epoch+1}/{num_epochs}")
    for input_ids, attention_mask, labels in loop:
        input_ids = input_ids.to(device)
        attention_mask = attention_mask.to(device)
        labels = labels.to(device)

        outputs = model(input_ids=input_ids, attention_mask=attention_mask, labels=labels)
        loss = outputs.loss

        optimizer.zero_grad()
        loss.backward()
        optimizer.step()

        total_loss += loss.item()
        loop.set_postfix(loss=loss.item())

    print(f"Epoch {epoch+1} | Average Loss: {total_loss/len(train_loader):.4f}")

Epoch 1/2: 100%|█████████████████████████████████████████████████████| 17562/17562 [43:50<00:00,  6.68it/s, loss=0.808]


Epoch 1 | Average Loss: 0.7942


Epoch 2/2: 100%|█████████████████████████████████████████████████████| 17562/17562 [43:57<00:00,  6.66it/s, loss=0.615]

Epoch 2 | Average Loss: 0.4870





In [9]:
model.save_pretrained("./finetuned_en_fr")
tokenizer.save_pretrained("./finetuned_en_fr")



('./finetuned_en_fr\\tokenizer_config.json',
 './finetuned_en_fr\\special_tokens_map.json',
 './finetuned_en_fr\\vocab.json',
 './finetuned_en_fr\\source.spm',
 './finetuned_en_fr\\target.spm',
 './finetuned_en_fr\\added_tokens.json')

In [10]:
model.eval()

def translate(sentence):
    inputs = tokenizer(sentence, return_tensors="pt", truncation=True, padding=True).to(device)
    with torch.no_grad():
        generated = model.generate(**inputs, max_length=64)
    return tokenizer.decode(generated[0], skip_special_tokens=True)

test_samples = ["I love drinking juice.", "The weather is nice today."]
for s in test_samples:
    print(f"English: {s}")
    print(f"French:  {translate(s)}")
    print()


English: I love drinking juice.
French:  jaime boire du jus

English: The weather is nice today.
French:  le temps est beau aujourdhui



In [11]:
test_samples = [
    "I have always dreamed of traveling to France to experience its rich culture, delicious food, and beautiful architecture.",
    "Despite the challenges we faced during the project, our team managed to deliver outstanding results on time.",
    "Technology continues to evolve rapidly, transforming the way we communicate, work, and learn every single day.",
    "After a long day at work, I like to relax by reading a good book or watching my favorite series.",
    "Machine learning models require large amounts of data to identify patterns and make accurate predictions.",
    "The seminar on renewable energy sources provided valuable insights into sustainable power generation.",
    "Collaboration between different research teams accelerated the discovery of innovative solutions.",
    "Standing on the edge of the cliff, she could feel the wind rush past her face as the sun dipped below the horizon.",
    "The bond between the two friends grew stronger as they faced obstacles together and celebrated each victory.",
    "It was one of those rainy evenings when all you want to do is curl up under a blanket and listen to the sound of the storm."
]

for s in test_samples:
    print(f"English: {s}")
    print(f"French:  {translate(s)}\n")

English: I have always dreamed of traveling to France to experience its rich culture, delicious food, and beautiful architecture.
French:  jai toujours rêvé de voyager en français pour faire lexpérience de sa culture riche et de la nourriture délicieuse et de la belle artiste

English: Despite the challenges we faced during the project, our team managed to deliver outstanding results on time.
French:  malgré les défis que nous avons rencontrés au cours du projet noël notre équipe sest débrouillée pour produire des résultats évidents à lheure

English: Technology continues to evolve rapidly, transforming the way we communicate, work, and learn every single day.
French:  la technologie continue à croître rapidement sauter la façon dont nous communiquons le travail de quantité le maintenant et apprenons par jour les humains

English: After a long day at work, I like to relax by reading a good book or watching my favorite series.
French:  après une longue journée au travail dixneuf jappréc

In [12]:
def evaluate_model(model, tokenizer, eng_sentences, fr_sentences, num_samples=100):
    model.eval()
    references = []
    hypotheses = []
    smooth_fn = SmoothingFunction().method1  


    sample_eng = eng_sentences[:num_samples]
    sample_fr = fr_sentences[:num_samples]

    for eng, fr in tqdm(zip(sample_eng, sample_fr), total=len(sample_eng), desc="Evaluating"):
        inputs = tokenizer(eng, return_tensors="pt", truncation=True, padding=True).to(device)
        with torch.no_grad():
            generated = model.generate(**inputs, max_length=64)
        pred = tokenizer.decode(generated[0], skip_special_tokens=True)

        # Tokenize both predicted and reference text
        hypotheses.append(nltk.word_tokenize(pred.lower()))
        references.append([nltk.word_tokenize(fr.lower())])

    bleu_score = corpus_bleu(references, hypotheses, smoothing_function=smooth_fn)
    print(f"\n BLEU Score on Test Data: {bleu_score:.4f}")

    # Display a few sample translations
    print("\nSample Translations:")
    for i in range(3):
        print(f"English:  {sample_eng[i]}")
        print(f"Expected: {sample_fr[i]}")
        print(f"Predicted:{' '.join(hypotheses[i])}")
        print()


evaluate_model(model, tokenizer, list(eng_test), list(fr_test))

Evaluating: 100%|████████████████████████████████████████████████████████████████████| 100/100 [00:36<00:00,  2.72it/s]


 BLEU Score on Test Data: 0.5155

Sample Translations:
English:  take a seat
Expected: prends place
Predicted:assiedstoi

English:  i wish tom was here
Expected: jaimerais que tom soit là
Predicted:jaimerais que tom soit là

English:  how did the audition go
Expected: comment sest passée laudition
Predicted:comment sest déroulée laudition






In [13]:
def evaluate_long_sentences(model, tokenizer, eng_sentences, fr_sentences, min_length=15, num_samples=20):
    model.eval()
    references = []
    hypotheses = []
    smooth_fn = SmoothingFunction().method1  
    long_eng = []
    long_fr = []
    for e, f in zip(eng_sentences, fr_sentences):
        if len(e.split()) >= min_length:
            long_eng.append(e)
            long_fr.append(f)
        if len(long_eng) >= num_samples:
            break

    if not long_eng:
        print("No long sentences found in the dataset. Try reducing min_length.")
        return

    for eng, fr in tqdm(zip(long_eng, long_fr), total=len(long_eng), desc="Evaluating Long Sentences"):
        inputs = tokenizer(eng, return_tensors="pt", truncation=True, padding=True).to(device)
        with torch.no_grad():
            generated = model.generate(**inputs, max_length=128, num_beams=4)  
        pred = tokenizer.decode(generated[0], skip_special_tokens=True)

        hypotheses.append(nltk.word_tokenize(pred.lower()))
        references.append([nltk.word_tokenize(fr.lower())])

    bleu_score = corpus_bleu(references, hypotheses, smoothing_function=smooth_fn)
    print(f"\n BLEU Score on Long Sentences: {bleu_score:.4f}")

    print("\nSample Translations (Long Sentences):")
    for i in range(min(3, len(long_eng))):
        print(f"\nEnglish:  {long_eng[i]}")
        print(f"Expected: {long_fr[i]}")
        print(f"Predicted:{' '.join(hypotheses[i])}")


evaluate_long_sentences(model, tokenizer, list(eng_test), list(fr_test))

Evaluating Long Sentences: 100%|███████████████████████████████████████████████████████| 20/20 [00:15<00:00,  1.27it/s]


 BLEU Score on Long Sentences: 0.4860

Sample Translations (Long Sentences):

English:  my father believed that anyone who could not make a living in japan was lazy
Expected: mon père pensait que celui qui ne pouvait gagner sa vie au japon était un paresseux
Predicted:mon père croyait que quiconque ne pouvait pas vivre au japon était paresseux

English:  do you think your parents spent enough time with you when you were in your teens
Expected: pensezvous que vos parents ont passé suffisamment de temps avec vous lorsque vous étiez adolescents
Predicted:pensestu que tes parents ont passé suffisamment de temps avec toi lorsque tu étais adolescent

English:  father took his place at the head of the table and began to say grace
Expected: père pris place au bout de la table et entama le bénédicité
Predicted:père prit sa place à la tête de la table et commença à dire grâce



