In [1]:
# Install necessary libraries
!pip install transformers
!pip install sentencepiece



In [2]:
# Import libraries
from transformers import MarianTokenizer, MarianMTModel

In [3]:

# Function to load model and tokenizer
def load_model_and_tokenizer(src_lang, tgt_lang):
    """
    Load the MarianMT model and tokenizer for the given source and target language pair.
    """
    model_name = f"Helsinki-NLP/opus-mt-{src_lang}-{tgt_lang}"
    tokenizer = MarianTokenizer.from_pretrained(model_name)
    model = MarianMTModel.from_pretrained(model_name)
    return tokenizer, model

# Translation function
def translate(texts, tokenizer, model):
    """
    Translate a list of texts using the provided tokenizer and model.
    """
    inputs = tokenizer(texts, return_tensors="pt", padding=True, truncation=True)
    translated = model.generate(**inputs)
    return [tokenizer.decode(t, skip_special_tokens=True) for t in translated]

# Example usage
if __name__ == "__main__":
    # Define the language pair
    src_lang = "en"  # Source language (English)
    tgt_lang = "hi"  # Target language (Hindi)

    """
        Language Pair Codes:
        en = English
        hi = Hindi
        fr = French
        es = Spanish
        de = German
        """

    # Load the model and tokenizer
    tokenizer, model = load_model_and_tokenizer(src_lang, tgt_lang)

    # Texts to translate
    texts_to_translate = [
        "Hello, how are you?",
        "This is a multilingual translation system.",
        "Let's work together to make progress."
    ]

    # Perform translation
    translated_texts = translate(texts_to_translate, tokenizer, model)

    # Print results
    for i, (src, tgt) in enumerate(zip(texts_to_translate, translated_texts)):
        print(f"Text {i+1}:")
        print(f"Source: {src}")
        print(f"Translated: {tgt}")
        print("-" * 50)


The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


tokenizer_config.json:   0%|          | 0.00/44.0 [00:00<?, ?B/s]

source.spm:   0%|          | 0.00/812k [00:00<?, ?B/s]

target.spm:   0%|          | 0.00/1.07M [00:00<?, ?B/s]

vocab.json:   0%|          | 0.00/2.10M [00:00<?, ?B/s]

config.json:   0%|          | 0.00/1.39k [00:00<?, ?B/s]



pytorch_model.bin:   0%|          | 0.00/306M [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/306M [00:00<?, ?B/s]

generation_config.json:   0%|          | 0.00/293 [00:00<?, ?B/s]

Text 1:
Source: Hello, how are you?
Translated: हैलो, तुम कैसे हो?
--------------------------------------------------
Text 2:
Source: This is a multilingual translation system.
Translated: यह अलग - अलग भाषाओं में अनुवाद करने की व्यवस्था है ।
--------------------------------------------------
Text 3:
Source: Let's work together to make progress.
Translated: चलो एक साथ प्रगति करने के लिए काम करते हैं.
--------------------------------------------------


In [7]:
# Function to translate into multiple target languages
def translate_to_multiple_languages(input_text, src_lang, target_languages):
    """
    Translate the input text from the source language to multiple target languages.
    """
    translations = {}
    for tgt_lang in target_languages:
        tokenizer, model = load_model_and_tokenizer(src_lang, tgt_lang)
        translated_text = translate(input_text, tokenizer, model)
        translations[tgt_lang] = translated_text
    return translations

# Main program
if __name__ == "__main__":
    # Define source and target languages
    src_lang = "en"  # Source language (English)
    target_languages = ["hi", "fr", "es", "de"]  # Hindi, French, Spanish, German

    # Take input from user
    input_text = input("Enter a sentence in English: ")

    # Translate to all target languages
    translations = translate_to_multiple_languages(input_text, src_lang, target_languages)

    # Print results
    print("\nTranslations:")
    for lang, translated_text in translations.items():
        print(f"{lang.upper()}: {translated_text}")

Enter a sentence in English: I have been working all day to complete this task.

Translations:
HI: ['मैं इस कार्य को पूरा करने के लिए पूरे दिन काम कर रहा हूँ.']
FR: ["J'ai travaillé toute la journée pour mener à bien cette tâche."]
ES: ['He estado trabajando todo el día para completar esta tarea.']
DE: ['Ich habe den ganzen Tag gearbeitet, um diese Aufgabe zu erledigen.']


In [8]:
!pip install nltk




In [12]:
for source, reference in test_sentences:
    predicted = translate(source, tokenizer, model)
    print(f"Source: {source}")
    print(f"Reference: {reference}")
    print(f"Predicted: {predicted}")
    print("-" * 50)


Source: How are you?
Reference: आप कैसे हैं?
Predicted: आप कैसे हैं?
--------------------------------------------------
Source: Good morning
Reference: Bonjour
Predicted: सुप्रभात
--------------------------------------------------
Source: Thank you
Reference: Gracias
Predicted: धन्यवाद
--------------------------------------------------
Source: See you later
Reference: Bis später
Predicted: आप बाद में देखें
--------------------------------------------------


In [18]:
!pip install bert-score

from bert_score import score  # Ensure bert-score is installed

# Evaluate model with BLEU and collect predictions for BERTScore
def evaluate_model_with_bert_and_bleu(test_data, target_languages):
    """
    Evaluate the translation model using BLEU and BERTScore.
    """
    results = {}
    smoothing = SmoothingFunction().method1  # Add smoothing
    all_predictions = []  # Store predictions for BERTScore
    all_references = []  # Store references for BERTScore

    for tgt_lang in target_languages:
        tokenizer, model = load_model_and_tokenizer(src_lang, tgt_lang)
        scores = []

        print(f"\nEvaluating translations to {tgt_lang.upper()}:")

        for source, reference in test_data:
            predicted = translate(source, tokenizer, model)
            all_predictions.append(predicted)  # Save predicted translations
            all_references.append(reference)  # Save reference translations

            score = sentence_bleu([reference.split()], predicted.split(), smoothing_function=smoothing)
            scores.append(score)
            print(f"Source: {source}")
            print(f"Reference: {reference}")
            print(f"Predicted: {predicted}")
            print("-" * 50)

        results[tgt_lang] = sum(scores) / len(scores)

    return results, all_predictions, all_references

# Run evaluation and collect data
results, predictions, references = evaluate_model_with_bert_and_bleu(test_sentences, target_languages)

# Calculate BERTScore
P, R, F1 = score(predictions, references, lang="en")  # Use appropriate language code
print("\nBERTScore Results:")
print(f"Precision: {P.mean().item():.4f}")
print(f"Recall: {R.mean().item():.4f}")
print(f"F1 Score: {F1.mean().item():.4f}")



Evaluating translations to HI:
Source: How are you?
Reference: आप कैसे हैं?
Predicted: आप कैसे हैं?
--------------------------------------------------
Source: Good morning
Reference: Bonjour
Predicted: सुप्रभात
--------------------------------------------------
Source: Thank you
Reference: Gracias
Predicted: धन्यवाद
--------------------------------------------------
Source: See you later
Reference: Bis später
Predicted: आप बाद में देखें
--------------------------------------------------

Evaluating translations to FR:
Source: How are you?
Reference: आप कैसे हैं?
Predicted: Comment allez-vous ?
--------------------------------------------------
Source: Good morning
Reference: Bonjour
Predicted: Bonjour.
--------------------------------------------------
Source: Thank you
Reference: Gracias
Predicted: Je vous remercie.
--------------------------------------------------
Source: See you later
Reference: Bis später
Predicted: A tout à l'heure.
----------------------------------------------

Some weights of RobertaModel were not initialized from the model checkpoint at roberta-large and are newly initialized: ['roberta.pooler.dense.bias', 'roberta.pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.



BERTScore Results:
Precision: 0.8153
Recall: 0.8044
F1 Score: 0.8083
