<a href="https://colab.research.google.com/github/Sahar-Sheikhi/Multilingual_Translation/blob/main/mbart.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
# Mount Google Drive at /content/drive so files stored under that path can be
# read by the cells below.
from google.colab import drive
drive.mount('/content/drive')


Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [None]:
import pandas as pd

# Folder on the mounted Drive holding the CSV splits. The trailing slash matters
# because file names are appended to it directly.
data_path = "/content/drive/MyDrive/translation_data/"

# Read the train, validation and test splits into one DataFrame each.
train_df, val_df, test_df = (
    pd.read_csv(data_path + csv_name)
    for csv_name in ("train_data.csv", "val_data.csv", "test_data.csv")
)


In [None]:

import pandas as pd

# Read the training CSV and print its first rows as a quick sanity check of the
# column layout.
df = pd.read_csv('/content/drive/MyDrive/translation_data/train_data.csv')
print(df.head())


                English             Italian
0         MEMBRANE UD-1       MEMBRANA UD-1
1      PLASTIC BASE LT8   BASE CABINATO LT8
2      COMMUTATOR 6 POS   COMMUTATORE 6 POS
3     MOTOR CASING K120  CARTER MOTORE K120
4  SCREW F3,5X13 N11725                VITE


# Disable wandb logging so training runs without a wandb login

In [None]:
# Remove the wandb package from the runtime (-y skips the confirmation prompt).
!pip uninstall -y wandb



[0m

In [None]:
# Install the libraries imported by the cells below (transformers, datasets,
# evaluate) together with the other packages listed on this line.
!pip install transformers datasets sentencepiece sacrebleu evaluate accelerate


Collecting datasets
  Downloading datasets-3.5.0-py3-none-any.whl.metadata (19 kB)
Collecting sacrebleu
  Downloading sacrebleu-2.5.1-py3-none-any.whl.metadata (51 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m51.8/51.8 kB[0m [31m3.6 MB/s[0m eta [36m0:00:00[0m
[?25hCollecting evaluate
  Downloading evaluate-0.4.3-py3-none-any.whl.metadata (9.2 kB)
Collecting dill<0.3.9,>=0.3.0 (from datasets)
  Downloading dill-0.3.8-py3-none-any.whl.metadata (10 kB)
Collecting xxhash (from datasets)
  Downloading xxhash-3.5.0-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (12 kB)
Collecting multiprocess<0.70.17 (from datasets)
  Downloading multiprocess-0.70.16-py311-none-any.whl.metadata (7.2 kB)
Collecting fsspec<=2024.12.0,>=2023.1.0 (from fsspec[http]<=2024.12.0,>=2023.1.0->datasets)
  Downloading fsspec-2024.12.0-py3-none-any.whl.metadata (11 kB)
Collecting portalocker (from sacrebleu)
  Downloading portalocker-3.1.1-py3-none-any.whl.metadata (8.6 

In [None]:

import pandas as pd
from datasets import load_dataset, Dataset, DatasetDict


# Read the three CSV splits from the mounted Drive.
train_df = pd.read_csv('/content/drive/MyDrive/translation_data/train_data.csv')
val_df = pd.read_csv('/content/drive/MyDrive/translation_data/val_data.csv')
test_df = pd.read_csv('/content/drive/MyDrive/translation_data/test_data.csv')

# Wrap each DataFrame as a Hugging Face Dataset and keep only its leading rows:
# at most 300 for training and at most 50 each for validation and test.
train_dataset = Dataset.from_pandas(train_df)
train_dataset = train_dataset.select(range(min(300, len(train_dataset))))

val_dataset = Dataset.from_pandas(val_df)
val_dataset = val_dataset.select(range(min(50, len(val_dataset))))

test_dataset = Dataset.from_pandas(test_df)
test_dataset = test_dataset.select(range(min(50, len(test_dataset))))

# Report the size of every split after subsampling.
for split_label, split_dataset in (
    ("Train", train_dataset),
    ("Validation", val_dataset),
    ("Test", test_dataset),
):
    print(f"{split_label} size: {len(split_dataset)}")


from transformers import MBart50TokenizerFast

# mBART-50 many-to-many checkpoint; the tokenizer is configured with English
# (en_XX) as the source language and Italian (it_IT) as the target language.
model_checkpoint = "facebook/mbart-large-50-many-to-many-mmt"
tokenizer = MBart50TokenizerFast.from_pretrained(model_checkpoint, src_lang="en_XX", tgt_lang="it_IT")

# Token-length cap that tokenize_data uses for both truncation and padding.
max_length = 128

def tokenize_data(example):
    """Tokenize a batch of English/Italian sentence pairs for mBART-50.

    Args:
        example: Batch mapping with an 'English' column (source text) and an
            'Italian' column (target text).

    Returns:
        The tokenizer encoding holding `input_ids` and `attention_mask` for the
        English side and `labels` for the Italian side, each truncated/padded
        to `max_length` tokens.
    """
    # `text_target` makes the tokenizer encode the Italian side in target mode,
    # so the labels carry the tokenizer's `tgt_lang` code (it_IT). Calling the
    # tokenizer on the targets as ordinary text would tag them with the source
    # language code (en_XX) and teach the model the wrong output language tag.
    #
    # NOTE(review): label padding uses the pad token id rather than -100, so
    # padded positions presumably still contribute to the loss — confirm
    # whether they should be masked.
    return tokenizer(
        example['English'],
        text_target=example['Italian'],
        max_length=max_length,
        truncation=True,
        padding="max_length",
    )

# Tokenize every split in batches and drop the raw text columns, leaving only
# the fields produced by tokenize_data.
_map_kwargs = dict(batched=True, remove_columns=['English', 'Italian'])

tokenized_train = train_dataset.map(tokenize_data, **_map_kwargs)
tokenized_val = val_dataset.map(tokenize_data, **_map_kwargs)
tokenized_test = test_dataset.map(tokenize_data, **_map_kwargs)




Train size: 300
Validation size: 50
Test size: 50


Map:   0%|          | 0/300 [00:00<?, ? examples/s]

Map:   0%|          | 0/50 [00:00<?, ? examples/s]

Map:   0%|          | 0/50 [00:00<?, ? examples/s]

#  Setup Data Collator and Metrics

In [None]:
from transformers import DataCollatorForSeq2Seq
import evaluate
import numpy as np

# Collator that pads every batch to the length of its longest sequence.
# `model` is deliberately not passed: that argument must be a loaded model
# object (a checkpoint-name string is ignored), and the model itself is only
# created in a later cell.
data_collator = DataCollatorForSeq2Seq(tokenizer, padding=True)

# SacreBLEU metric consumed by compute_metrics.
metric = evaluate.load("sacrebleu")

def compute_metrics(eval_preds):
    """Compute the SacreBLEU score for generated translations.

    Args:
        eval_preds: Pair `(predictions, labels)` of token-id arrays. The
            predictions may arrive wrapped in a tuple, in which case the first
            element is used.

    Returns:
        dict with a single key "bleu" holding the SacreBLEU score.
    """
    preds, labels = eval_preds
    if isinstance(preds, tuple):
        preds = preds[0]

    # -100 is a padding/ignore marker, not a vocabulary id, so the tokenizer
    # cannot decode it. Swap it for the pad token in BOTH arrays; the pad token
    # is then dropped by skip_special_tokens.
    preds = np.where(preds != -100, preds, tokenizer.pad_token_id)
    labels = np.where(labels != -100, labels, tokenizer.pad_token_id)

    decoded_preds = tokenizer.batch_decode(preds, skip_special_tokens=True)
    decoded_labels = tokenizer.batch_decode(labels, skip_special_tokens=True)

    # Strip stray whitespace; sacrebleu expects a list of references per prediction.
    predictions = [text.strip() for text in decoded_preds]
    references = [[text.strip()] for text in decoded_labels]

    result = metric.compute(predictions=predictions, references=references)
    return {"bleu": result["score"]}


# Initialize the mBART Model

In [None]:
from transformers import MBartForConditionalGeneration

# Load the pretrained weights of the checkpoint named by `model_checkpoint`;
# this is the model object handed to the trainer.
model = MBartForConditionalGeneration.from_pretrained(model_checkpoint)


# Define Training Arguments

In [None]:
from transformers import Seq2SeqTrainingArguments

# Hyperparameters for fine-tuning. Evaluation and checkpointing both happen once
# per epoch, and at most two checkpoints are kept on disk.
training_args = Seq2SeqTrainingArguments(
    output_dir="mbart-eng-it-domain-specific",
    eval_strategy="epoch",
    save_strategy="epoch",
    learning_rate=2e-5,
    per_device_train_batch_size=4,  # batch size per device during training
    per_device_eval_batch_size=4,   # batch size per device during evaluation
    num_train_epochs=3,
    weight_decay=0.01,
    predict_with_generate=True,     # evaluate on generated text so BLEU can be computed
    save_total_limit=2,
    fp16=True,
    # The string "none" is what disables logging integrations (wandb etc.).
    # Passing Python `None` instead falls back to the library default, which
    # reports to every installed integration.
    report_to="none",
)

# Initialize the Trainer

In [None]:
from transformers import Seq2SeqTrainer

# Wire the model, data, collator and metric together for fine-tuning.
trainer = Seq2SeqTrainer(
    model=model,
    args=training_args,
    train_dataset=tokenized_train,
    eval_dataset=tokenized_val,
    # `processing_class` replaces the deprecated `tokenizer` keyword, which
    # triggers a FutureWarning and is scheduled for removal.
    processing_class=tokenizer,
    data_collator=data_collator,
    compute_metrics=compute_metrics,
)


  trainer = Seq2SeqTrainer(


# Start training

In [None]:
# Run fine-tuning with the trainer configured in the previous cell.
trainer.train()


# Evaluate the model

In [None]:
# Score the tokenized test split. The "bleu" value returned by compute_metrics
# is read back from the results under the key 'eval_bleu'.
eval_results = trainer.evaluate(tokenized_test)
print(f"BLEU score on test set: {eval_results['eval_bleu']:.2f}")


NameError: name 'trainer' is not defined

# Save model and tokenizer

In [None]:
# Persist the fine-tuned model and tokenizer to two locations:
#   * "mbart-eng-it-trained": local directory that the pipeline cell further
#     down loads by relative path;
#   * the Drive folder: the path the "Load the Saved Model and Tokenizer" cell
#     reads from, which survives a runtime reset. Without this copy that cell
#     fails because the directory does not exist.
for save_dir in (
    "mbart-eng-it-trained",
    "/content/drive/MyDrive/translation_models/mbart-eng-it-trained",
):
    trainer.save_model(save_dir)
    tokenizer.save_pretrained(save_dir)


# Inference - Testing translations

In [None]:
# from transformers import pipeline

# translator = pipeline(
#     "translation",
#     model="mbart-eng-it-trained",
#     tokenizer="mbart-eng-it-trained",
#     src_lang="en_XX",
#     tgt_lang="it_IT"
# )

# sentence = "This is a domain-specific sentence for translation."
# translated_sentence = translator(sentence)[0]['translation_text']
# print(f"Translated sentence: {translated_sentence}")


#  Load and Use Saved mBART Translator

In [None]:
# Mount Google Drive (needed when this section is run in a fresh runtime) and
# record the Drive folder the saved model is loaded from.
from google.colab import drive
drive.mount('/content/drive')

model_path = "/content/drive/MyDrive/translation_models/mbart-eng-it-trained"



Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [None]:
# NOTE(review): this only binds a notebook-level variable named
# `local_files_only`; no visible cell reads it (the loading cell passes the
# keyword as a literal). Likely a leftover — confirm before removing.
local_files_only=True


# Load the Saved Model and Tokenizer

In [None]:
import os

from transformers import MBartForConditionalGeneration, MBart50TokenizerFast

model_path = "/content/drive/MyDrive/translation_models/mbart-eng-it-trained"

# When the path is not an existing directory, from_pretrained interprets it as
# a Hub repo id and fails with an unhelpful HFValidationError. Check up front
# and report the actual problem instead.
if not os.path.isdir(model_path):
    raise FileNotFoundError(
        f"No saved model directory found at {model_path!r}. "
        "Make sure Google Drive is mounted and that the fine-tuned model and "
        "tokenizer were saved or copied to this location."
    )

# Load strictly from disk (no Hub lookup), with English as source and Italian
# as target language for the tokenizer.
model = MBartForConditionalGeneration.from_pretrained(model_path, local_files_only=True)
tokenizer = MBart50TokenizerFast.from_pretrained(model_path, src_lang="en_XX", tgt_lang="it_IT", local_files_only=True)


HFValidationError: Repo id must be in the form 'repo_name' or 'namespace/repo_name': '/content/drive/MyDrive/translation_models/mbart-eng-it-trained'. Use `repo_type` argument if needed.

# Translate English to Italian

In [None]:
from transformers import pipeline

# Build the translation pipeline in this cell so that it does not rely on a
# `translator` object that is only created further down the notebook; on a
# top-to-bottom run that dependency raises a NameError.
translator = pipeline(
    "translation",
    model=model,
    tokenizer=tokenizer,
    src_lang="en_XX",
    tgt_lang="it_IT"
)

# Translate one English sentence and show the Italian output.
input_sentence = "This is a test sentence for translation."
translation = translator(input_sentence)[0]['translation_text']
print("Translated:", translation)


# Create a Translator Pipeline

In [None]:
from transformers import pipeline

# Translation pipeline built from the in-memory `model` and `tokenizer`
# objects, translating from English (en_XX) to Italian (it_IT).
translator = pipeline(
    "translation",
    model=model,
    tokenizer=tokenizer,
    src_lang="en_XX",
    tgt_lang="it_IT"
)


# VECTORDB

In [None]:
# Install FAISS (CPU build) and sentence-transformers, both imported by the
# vector-index cells below.
!pip install faiss-cpu sentence-transformers


# Load Your Trained Model

In [None]:
from transformers import pipeline

# Translation pipeline loaded by name from "mbart-eng-it-trained" — the same
# relative directory the save cell writes the model and tokenizer to —
# translating from English (en_XX) to Italian (it_IT).
translator_pipeline = pipeline(
    "translation",
    model="mbart-eng-it-trained",
    tokenizer="mbart-eng-it-trained",
    src_lang="en_XX",
    tgt_lang="it_IT"
)


# Prepare a Knowledge Base and Vector Index

In [None]:
from sentence_transformers import SentenceTransformer
import faiss
import numpy as np

# Small in-memory knowledge base of reference sentences.
reference_texts = [
    "A clinical diagnosis requires a comprehensive patient history.",
    "In legal terms, liability refers to responsibility.",
    "GDP stands for gross domestic product.",
    "Molarity is a unit of concentration in chemistry.",
]

# Embed every reference sentence as a NumPy vector.
embedder = SentenceTransformer('all-MiniLM-L6-v2')
embeddings = embedder.encode(reference_texts, convert_to_numpy=True)

# Flat L2 index sized to the embedding width, filled with all reference vectors.
embedding_dim = embeddings.shape[1]
index = faiss.IndexFlatL2(embedding_dim)
index.add(embeddings)

# Map each index position back to its sentence.
doc_store = dict(enumerate(reference_texts))


# Define a Retriever Function

In [None]:
def retrieve_context(query, top_k=2):
    """Return the reference texts closest to `query` in the FAISS index.

    Args:
        query: Text to embed and look up.
        top_k: Maximum number of texts to return. Values larger than the
            number of indexed vectors are clamped; values <= 0 yield an empty
            list.

    Returns:
        List of reference strings in the order returned by the index search,
        holding at most `top_k` items.
    """
    # Never ask for more neighbours than the index holds: FAISS fills the
    # missing slots with index -1, which is not a key of doc_store.
    k = min(int(top_k), index.ntotal)
    if k <= 0:
        return []

    # FAISS expects a 2-D float32 array of query vectors.
    query_vec = np.asarray(embedder.encode([query]), dtype="float32")
    _, indices = index.search(query_vec, k)

    # Defensive filter in case the search still reports an unfilled (-1) slot.
    return [doc_store[int(i)] for i in indices[0] if i >= 0]


# Define a RAG-Style Translator Wrapper

In [None]:
def rag_translate(input_text, translator_pipeline, context_k=2):
    """Translate `input_text` with retrieved reference texts prepended to it.

    Args:
        input_text: Text to translate.
        translator_pipeline: Callable applied to the combined text; the
            'translation_text' entry of its first result is returned.
        context_k: Number of reference texts requested from retrieve_context.

    Returns:
        The 'translation_text' value of the pipeline's first result.
    """
    snippets = retrieve_context(input_text, top_k=context_k)
    # Retrieved snippets come first, followed by the text itself, space-separated.
    prefix = " ".join(snippets)
    prompt = prefix + " " + input_text
    outputs = translator_pipeline(prompt)
    return outputs[0]['translation_text']


# Test VDB

In [None]:
# Run one sample query through the retrieval-augmented translation wrapper and
# print the result.
query = "What is the meaning of molarity in a lab test?"
translated = rag_translate(query, translator_pipeline)
print("RAG Translation:", translated)


#  Save & Load VectorDB (FAISS)

In [None]:
# Write the FAISS index to "context_index.faiss" (a relative path) so it can be
# read back without re-embedding the reference texts.
faiss.write_index(index, "context_index.faiss")

# Load later:
# index = faiss.read_index("context_index.faiss")



In [None]:

# Read the index back from the file written by the save cell, replacing the
# in-memory `index`.
index = faiss.read_index("context_index.faiss")
