In [None]:
!pip install datasets nltk transformers[sentencepiece] sacrebleu -q
!pip install sacremoses

[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m51.8/51.8 kB[0m [31m2.6 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m480.6/480.6 kB[0m [31m11.3 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m104.0/104.0 kB[0m [31m6.8 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m116.3/116.3 kB[0m [31m6.1 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m179.3/179.3 kB[0m [31m11.6 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m134.8/134.8 kB[0m [31m3.8 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m194.1/194.1 kB[0m [31m8.3 MB/s[0m eta [36m0:00:00[0m
[?25h[31mERROR: pip's dependency resolver does not currently take into account all the packages that are installed. This behaviour is the source of the foll

In [None]:
import numpy as np
from nltk.translate.bleu_score import corpus_bleu, sentence_bleu
import nltk
from tqdm import tqdm

def simple_tokenize(text):
    """
    Whitespace tokenizer that trims edge punctuation and lowercases each word

    Punctuation is only removed from the start and end of a word; characters
    inside a word are kept. Words that consist solely of punctuation are
    dropped.
    """
    edge_punctuation = '.,!?()[]{}":;'
    cleaned = []
    # str.split() with no arguments already collapses runs of whitespace
    for word in text.split():
        core = word.strip(edge_punctuation)
        if core:
            cleaned.append(core.lower())
    return cleaned

def calculate_metrics(model, tokenizer, test_dataset, source_lang="Hindi", target_lang="English"):
    """
    Calculate BLEU scores and exact-match accuracy for the translation model

    Every example is translated on its own with ``model.generate``. The
    reference and the prediction are tokenized with ``simple_tokenize`` and
    scored with NLTK's ``corpus_bleu``. Accuracy counts predictions that equal
    the reference after stripping surrounding whitespace and lowercasing.
    Examples that raise during processing are reported and left out of every
    metric.

    Args:
        model: The trained translation model (must provide ``generate``)
        tokenizer: The tokenizer (called on a list of texts, must provide ``decode``)
        test_dataset: Iterable of examples that can be indexed by column name
        source_lang: Name of the source language column
        target_lang: Name of the target language column

    Returns:
        Dict with the keys 'accuracy', 'corpus_bleu', 'bleu_1', 'bleu_2',
        'bleu_3' and 'bleu_4', or None when no example could be processed or
        the BLEU computation failed.
    """
    # Lists to store references and hypotheses
    references = []
    hypotheses = []

    # Counter for exact matches (simple accuracy)
    exact_matches = 0
    total = 0

    # Process each example in the test dataset
    print("Calculating metrics...")
    for example in tqdm(test_dataset):
        try:
            source_text = example[source_lang]
            target_text = example[target_lang]

            # Generate translation
            inputs = tokenizer([source_text], return_tensors="tf", padding=True, truncation=True, max_length=128)
            outputs = model.generate(**inputs, max_length=128)

            # Decode translation
            predicted_text = tokenizer.decode(outputs[0], skip_special_tokens=True)

            # Use simple tokenization instead of NLTK
            reference = simple_tokenize(target_text)
            hypothesis = simple_tokenize(predicted_text)

            # Add to lists for corpus BLEU (one reference per hypothesis)
            references.append([reference])
            hypotheses.append(hypothesis)

            # Check for exact match
            if predicted_text.strip().lower() == target_text.strip().lower():
                exact_matches += 1
            total += 1

        except Exception as e:
            # Best effort: a single bad example must not abort the whole evaluation
            print(f"Error processing example: {str(e)}")
            continue

    if total == 0:
        print("No examples were successfully processed.")
        return None

    # Calculate metrics
    try:
        # Calculate accuracy
        accuracy = exact_matches / total

        # Cumulative BLEU-n weights. Each tuple sums to exactly 1; the rounded
        # 0.33 values used previously summed to 0.99 and inflated BLEU-3.
        ngram_weights = {
            'bleu_1': (1.0, 0, 0, 0),
            'bleu_2': (0.5, 0.5, 0, 0),
            'bleu_3': (1 / 3, 1 / 3, 1 / 3, 0),
            'bleu_4': (0.25, 0.25, 0.25, 0.25),
        }
        bleu = {
            name: corpus_bleu(references, hypotheses, weights=weights)
            for name, weights in ngram_weights.items()
        }

        # corpus_bleu defaults to uniform 4-gram weights, so the headline
        # corpus score is the BLEU-4 value; no need to compute it twice.
        corpus_bleu_score = bleu['bleu_4']

        # Print results
        print("\nTranslation Metrics:")
        print(f"Total examples processed: {total}")
        print(f"Accuracy (Exact Match): {accuracy:.4f}")
        print(f"Corpus BLEU Score: {corpus_bleu_score:.4f}")
        print(f"BLEU-1: {bleu['bleu_1']:.4f}")
        print(f"BLEU-2: {bleu['bleu_2']:.4f}")
        print(f"BLEU-3: {bleu['bleu_3']:.4f}")
        print(f"BLEU-4: {bleu['bleu_4']:.4f}")

        return {
            'accuracy': accuracy,
            'corpus_bleu': corpus_bleu_score,
            'bleu_1': bleu['bleu_1'],
            'bleu_2': bleu['bleu_2'],
            'bleu_3': bleu['bleu_3'],
            'bleu_4': bleu['bleu_4']
        }

    except Exception as e:
        print(f"Error calculating metrics: {str(e)}")
        return None

def show_example_translations(model, tokenizer, test_dataset, num_examples=5,
                              source_lang="Hindi", target_lang="English"):
    """
    Show some example translations from the test set

    Picks up to ``num_examples`` distinct random rows, translates each one and
    prints the source, the reference and the prediction. Any error is caught
    and printed instead of being raised.

    Args:
        model: The trained translation model (must provide ``generate``)
        tokenizer: The tokenizer (called on a list of texts, must provide ``decode``)
        test_dataset: Dataset supporting ``len()`` and integer indexing
        num_examples: Maximum number of examples to print
        source_lang: Name of the source language column
        target_lang: Name of the target language column
    """
    print("\nExample Translations:")
    print("-" * 50)

    try:
        sample_size = min(num_examples, len(test_dataset))
        if sample_size <= 0:
            # Nothing to sample from (empty dataset or non-positive request)
            return

        # Get random examples
        indices = np.random.choice(len(test_dataset), sample_size, replace=False)

        for idx in indices:
            # np.random.choice yields numpy.int64 values, which the dataset's
            # indexing rejects ("Wrong key type"); convert to a plain int.
            example = test_dataset[int(idx)]
            source_text = example[source_lang]
            target_text = example[target_lang]

            # Generate translation
            inputs = tokenizer([source_text], return_tensors="tf", padding=True, truncation=True, max_length=128)
            outputs = model.generate(**inputs, max_length=128)

            # Decode translation
            predicted_text = tokenizer.decode(outputs[0], skip_special_tokens=True)

            print(f"Source:     {source_text}")
            print(f"Target:     {target_text}")
            print(f"Predicted:  {predicted_text}")
            print("-" * 50)

    except Exception as e:
        print(f"Error showing example translations: {str(e)}")

In [None]:
import os
import tensorflow as tf
from datasets import load_dataset, Dataset
from transformers import AutoTokenizer, TFAutoModelForSeq2SeqLM, DataCollatorForSeq2Seq, AdamWeightDecay

import pandas as pd
from sklearn.model_selection import train_test_split
# Load the parallel corpus. The rest of the notebook reads the 'Hindi' and
# 'English' columns, so the spreadsheet must provide exactly those headers.
df = pd.read_excel('/content/translated_hindi_to_english.xlsx')

# Convert the dataframe into a Hugging Face Dataset
dataset = Dataset.from_pandas(df)

# Train-test split. The fixed seed keeps the held-out split identical across
# kernel restarts, so evaluation numbers stay comparable between runs.
raw_datasets = dataset.train_test_split(test_size=0.2, seed=42)

In [None]:
# Token-length caps applied when tokenizing source and target sentences
max_input_length = max_target_length = 128

# Pretrained Hindi -> English checkpoint to fine-tune, and its tokenizer
model_checkpoint = "Helsinki-NLP/opus-mt-hi-en"
tokenizer = AutoTokenizer.from_pretrained(model_checkpoint)

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


tokenizer_config.json:   0%|          | 0.00/42.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/1.38k [00:00<?, ?B/s]

source.spm:   0%|          | 0.00/1.06M [00:00<?, ?B/s]

target.spm:   0%|          | 0.00/813k [00:00<?, ?B/s]

vocab.json:   0%|          | 0.00/2.06M [00:00<?, ?B/s]

In [None]:
# Dataset column names holding the source (Hindi) and target (English) sentences
source_lang, target_lang = "Hindi", "English"

In [None]:
# Preprocessing function
def preprocess_function(examples):
    """
    Tokenize a batch of sentence pairs for seq2seq training

    Args:
        examples: Batch mapping that holds the source sentences under the
            ``source_lang`` column and the target sentences under the
            ``target_lang`` column

    Returns:
        The tokenized source encoding with an added "labels" entry that holds
        the token ids of the target sentences.
    """
    inputs = examples[source_lang]
    targets = examples[target_lang]
    model_inputs = tokenizer(inputs, max_length=max_input_length, truncation=True)

    # `text_target` tokenizes the text as target-language labels; it replaces
    # the deprecated `as_target_tokenizer()` context manager.
    labels = tokenizer(text_target=targets, max_length=max_target_length, truncation=True)

    model_inputs["labels"] = labels["input_ids"]
    return model_inputs

# Apply preprocessing to every split of the dataset. batched=True hands
# preprocess_function a batch of examples per call instead of a single row.
tokenized_datasets = raw_datasets.map(preprocess_function, batched=True)

Map:   0%|          | 0/3724 [00:00<?, ? examples/s]



Map:   0%|          | 0/932 [00:00<?, ? examples/s]

In [None]:
# Load the pre-trained TensorFlow seq2seq model for the checkpoint named in
# model_checkpoint; this is the model that gets fine-tuned below
model = TFAutoModelForSeq2SeqLM.from_pretrained(model_checkpoint)

tf_model.h5:   0%|          | 0.00/305M [00:00<?, ?B/s]

All model checkpoint layers were used when initializing TFMarianMTModel.

All the layers of TFMarianMTModel were initialized from the model checkpoint at Helsinki-NLP/opus-mt-hi-en.
If your task is similar to the task the model of the checkpoint was trained on, you can already use TFMarianMTModel for predictions without further training.


generation_config.json:   0%|          | 0.00/293 [00:00<?, ?B/s]

In [None]:
# Training hyperparameters
num_train_epochs = 5  # number of epochs passed to model.fit; adjust as needed
batch_size = 16
learning_rate = 2e-5
weight_decay = 0.01

In [None]:
# Optimizer used for fine-tuning
optimizer = AdamWeightDecay(weight_decay_rate=weight_decay, learning_rate=learning_rate)

# Collator that pads each batch dynamically and returns TensorFlow tensors
data_collator = DataCollatorForSeq2Seq(tokenizer, model=model, return_tensors="tf")

# TensorFlow input pipelines: training batches are shuffled,
# validation batches keep the dataset order
train_dataset = model.prepare_tf_dataset(
    tokenized_datasets["train"], shuffle=True, batch_size=batch_size, collate_fn=data_collator
)
validation_dataset = model.prepare_tf_dataset(
    tokenized_datasets["test"], shuffle=False, batch_size=batch_size, collate_fn=data_collator
)

In [None]:
# Compile the model with the optimizer only. No loss is passed here;
# presumably the model falls back to its own built-in loss - TODO confirm.
model.compile(optimizer=optimizer)

In [None]:
# Fine-tune the model for num_train_epochs epochs on train_dataset,
# with validation_dataset supplied as the validation data
model.fit(train_dataset, validation_data=validation_dataset, epochs=num_train_epochs)

Epoch 1/5
Epoch 2/5
Epoch 3/5
Epoch 4/5
Epoch 5/5

Evaluating model performance...
Calculating metrics...


  0%|          | 0/932 [00:06<?, ?it/s]


LookupError: 
**********************************************************************
  Resource [93mpunkt_tab[0m not found.
  Please use the NLTK Downloader to obtain the resource:

  [31m>>> import nltk
  >>> nltk.download('punkt_tab')
  [0m
  For more information see: https://www.nltk.org/data.html

  Attempted to load [93mtokenizers/punkt_tab/english/[0m

  Searched in:
    - '/root/nltk_data'
    - '/usr/nltk_data'
    - '/usr/share/nltk_data'
    - '/usr/lib/nltk_data'
    - '/usr/share/nltk_data'
    - '/usr/local/share/nltk_data'
    - '/usr/lib/nltk_data'
    - '/usr/local/lib/nltk_data'
**********************************************************************


In [None]:
# Save the fine-tuned model and its tokenizer into the same directory so both
# can later be reloaded from that single path with from_pretrained
model.save_pretrained("tf_hi_to_en_model/")
tokenizer.save_pretrained("tf_hi_to_en_model/")

Non-default generation parameters: {'max_length': 512, 'num_beams': 6, 'bad_words_ids': [[61126]]}


('tf_hi_to_en_model/tokenizer_config.json',
 'tf_hi_to_en_model/special_tokens_map.json',
 'tf_hi_to_en_model/vocab.json',
 'tf_hi_to_en_model/source.spm',
 'tf_hi_to_en_model/target.spm',
 'tf_hi_to_en_model/added_tokens.json')

In [None]:
# Make sure the export directory exists (-p: no error if it is already there)
!mkdir -p saved_translator_model

# Save model and tokenizer into the export directory
model.save_pretrained("saved_translator_model")
tokenizer.save_pretrained("saved_translator_model")

# Zip the export directory recursively for easy download
!zip -r translator_model.zip saved_translator_model/


Non-default generation parameters: {'max_length': 512, 'num_beams': 6, 'bad_words_ids': [[61126]]}


  adding: saved_translator_model/ (stored 0%)
  adding: saved_translator_model/generation_config.json (deflated 43%)
  adding: saved_translator_model/target.spm (deflated 51%)
  adding: saved_translator_model/special_tokens_map.json (deflated 35%)
  adding: saved_translator_model/config.json (deflated 61%)
  adding: saved_translator_model/source.spm (deflated 60%)
  adding: saved_translator_model/tokenizer_config.json (deflated 68%)
  adding: saved_translator_model/tf_model.h5 (deflated 7%)
  adding: saved_translator_model/vocab.json (deflated 76%)


In [None]:
# Calculate exact-match accuracy and BLEU scores on the held-out 'test' split
print("\nEvaluating model performance...")
metrics = calculate_metrics(model, tokenizer, raw_datasets['test'])

# Show some example translations sampled at random from the same split
show_example_translations(model, tokenizer, raw_datasets['test'])


Evaluating model performance...
Calculating metrics...


100%|██████████| 932/932 [3:31:59<00:00, 13.65s/it]



Translation Metrics:
Total examples processed: 932
Accuracy (Exact Match): 0.1373
Corpus BLEU Score: 0.3554
BLEU-1: 0.6630
BLEU-2: 0.5290
BLEU-3: 0.4349
BLEU-4: 0.3554

Example Translations:
--------------------------------------------------
Error showing example translations: Wrong key type: '334' of type '<class 'numpy.int64'>'. Expected one of int, slice, range, str or Iterable.


In [None]:
import numpy as np
from tqdm import tqdm
import sacrebleu
from datasets import load_dataset

def evaluate_with_sacrebleu(model, tokenizer, test_dataset, source_lang="Hindi", target_lang="English"):
    """
    Evaluate translation model using sacrebleu

    Every example is translated on its own with ``model.generate``. The raw
    prediction and reference strings are scored with ``sacrebleu.corpus_bleu``.
    Accuracy counts predictions that equal the reference after stripping
    surrounding whitespace and lowercasing. Examples that raise during
    processing are reported and left out of every metric. Up to five random
    source / reference / prediction triples are printed at the end.

    Args:
        model: The trained translation model (must provide ``generate``)
        tokenizer: The tokenizer (called on a list of texts, must provide ``decode``)
        test_dataset: Iterable of examples that can be indexed by column name
        source_lang: Name of the source language column
        target_lang: Name of the target language column

    Returns:
        Dict with the keys 'accuracy', 'bleu_score', 'precisions',
        'brevity_penalty', 'length_ratio', 'sys_len' and 'ref_len', or None
        when no example could be processed or the metric computation failed.
    """
    # Parallel lists: entry i of each list belongs to the same processed example
    sources = []
    references = []
    hypotheses = []

    # Counter for exact matches
    exact_matches = 0

    # Process each example in the test dataset
    print("Generating translations for evaluation...")
    for example in tqdm(test_dataset):
        try:
            source_text = example[source_lang]
            target_text = example[target_lang]

            # Generate translation
            inputs = tokenizer([source_text], return_tensors="tf", padding=True, truncation=True, max_length=128)
            outputs = model.generate(**inputs, max_length=128)

            # Decode translation
            predicted_text = tokenizer.decode(outputs[0], skip_special_tokens=True)

            # Do everything that can raise before touching the lists, so a
            # failing example never leaves them out of step with each other
            is_exact_match = predicted_text.strip().lower() == target_text.strip().lower()

            sources.append(source_text)
            references.append(target_text)
            hypotheses.append(predicted_text)
            if is_exact_match:
                exact_matches += 1

        except Exception as e:
            # Best effort: a single bad example must not abort the whole evaluation
            print(f"Error processing example: {str(e)}")
            continue

    total = len(hypotheses)
    if total == 0:
        print("No examples were successfully processed.")
        return None

    try:
        # Calculate accuracy
        accuracy = exact_matches / total

        # sacrebleu expects a list of reference streams, each as long as the
        # hypothesis list; there is a single reference per sentence here
        bleu = sacrebleu.corpus_bleu(hypotheses, [references])

        # Print detailed results
        print("\nTranslation Metrics:")
        print(f"Total examples processed: {total}")
        print(f"Accuracy (Exact Match): {accuracy:.4f}")
        print("\nSacreBLEU Scores:")
        print(f"BLEU: {bleu.score:.2f}")
        print(f"Precisions: {[f'{p:.1f}' for p in bleu.precisions]}")
        print(f"Brevity penalty: {bleu.bp:.3f}")
        print(f"Length ratio: {bleu.sys_len / bleu.ref_len:.3f}")
        print(f"Translation length: {bleu.sys_len}")
        print(f"Reference length: {bleu.ref_len}")

        # Show some example translations
        print("\nExample Translations:")
        print("-" * 80)
        num_examples = min(5, total)
        indices = np.random.choice(total, num_examples, replace=False)

        for idx in indices:
            # Read the source from the parallel list rather than test_dataset:
            # skipped examples shift positions, and the dataset's indexing
            # rejects the numpy integers produced by np.random.choice
            print(f"Source:     {sources[idx]}")
            print(f"Reference:  {references[idx]}")
            print(f"Generated:  {hypotheses[idx]}")
            print("-" * 80)

        return {
            'accuracy': accuracy,
            'bleu_score': bleu.score,
            'precisions': bleu.precisions,
            'brevity_penalty': bleu.bp,
            'length_ratio': bleu.sys_len / bleu.ref_len,
            'sys_len': bleu.sys_len,
            'ref_len': bleu.ref_len
        }

    except Exception as e:
        print(f"Error calculating metrics: {str(e)}")
        return None

def compare_translations(model, tokenizer, test_dataset, num_examples=5,
                         source_lang="Hindi", target_lang="English"):
    """
    Compare source, reference, and generated translations side by side

    Picks up to ``num_examples`` distinct random rows, translates each one and
    prints it together with its sentence-level sacrebleu score. Any error is
    caught and printed instead of being raised.

    Args:
        model: The trained translation model (must provide ``generate``)
        tokenizer: The tokenizer (called on a list of texts, must provide ``decode``)
        test_dataset: Dataset supporting ``len()`` and integer indexing
        num_examples: Maximum number of examples to print
        source_lang: Name of the source language column
        target_lang: Name of the target language column
    """
    print("\nDetailed Translation Comparison:")
    print("=" * 100)

    try:
        sample_size = min(num_examples, len(test_dataset))
        if sample_size <= 0:
            # Nothing to sample from (empty dataset or non-positive request)
            return

        indices = np.random.choice(len(test_dataset), sample_size, replace=False)

        for idx in indices:
            # np.random.choice yields numpy.int64 values, which the dataset's
            # indexing rejects ("Wrong key type"); convert to a plain int.
            idx = int(idx)
            example = test_dataset[idx]
            source_text = example[source_lang]
            reference_text = example[target_lang]

            # Generate translation
            inputs = tokenizer([source_text], return_tensors="tf", padding=True, truncation=True, max_length=128)
            outputs = model.generate(**inputs, max_length=128)
            generated_text = tokenizer.decode(outputs[0], skip_special_tokens=True)

            # Calculate sentence-level BLEU for this example
            sent_bleu = sacrebleu.sentence_bleu(generated_text, [reference_text])

            # The label is the 1-based position of the row in the dataset
            print(f"Example {idx + 1}:")
            print(f"Source:     {source_text}")
            print(f"Reference:  {reference_text}")
            print(f"Generated:  {generated_text}")
            print(f"Sentence BLEU: {sent_bleu.score:.2f}")
            print("-" * 100)

    except Exception as e:
        print(f"Error in translation comparison: {str(e)}")

# Usage example: run the corpus-level SacreBLEU evaluation first, then print
# a few side-by-side comparisons. Both calls read the 'test' split of
# raw_datasets and use the model and tokenizer currently in scope.
if __name__ == "__main__":
    print("Evaluating model with SacreBLEU metrics...")
    metrics = evaluate_with_sacrebleu(model, tokenizer, raw_datasets['test'])

    print("\nGenerating detailed translation comparisons...")
    compare_translations(model, tokenizer, raw_datasets['test'], num_examples=5)

Evaluating model with SacreBLEU metrics...
Generating translations for evaluation...


  2%|▏         | 14/932 [03:07<3:24:52, 13.39s/it]


KeyboardInterrupt: 

In [None]:
# Load the fine-tuned model for inference
tokenizer = AutoTokenizer.from_pretrained("tf_hi_to_en_model/")
model = TFAutoModelForSeq2SeqLM.from_pretrained("tf_hi_to_en_model/")

# Translation example
input_text = "मुझे खांसी, सर्दी और पीठ दर्द है"
tokenized = tokenizer([input_text], return_tensors="np")
out = model.generate(**tokenized, max_length=128)
print(out)

# Decode the translation. decode() is called directly, like every other decode
# call in this notebook; the deprecated as_target_tokenizer() context manager
# is not needed for decoding.
print(tokenizer.decode(out[0], skip_special_tokens=True))

All model checkpoint layers were used when initializing TFMarianMTModel.

All the layers of TFMarianMTModel were initialized from the model checkpoint at tf_hi_to_en_model/.
If your task is similar to the task the model of the checkpoint was trained on, you can already use TFMarianMTModel for predictions without further training.


tf.Tensor([[61126    53    54 37221     2  5780    10   456  2629     0 61126]], shape=(1, 11), dtype=int32)
I have cough, cold and back pain


