In [None]:
!pip install transformers
!pip install datasets
!pip install sentencepiece
!pip install evaluate
!pip install sacrebleu

### Importing Datasets 

In [None]:
from datasets import load_dataset

# Load the "kde4" dataset configured for the English ("en") / Hindi ("hi") pair.
raw_datasets = load_dataset(
    "kde4",
    lang1="en",
    lang2="hi",
)



  0%|          | 0/1 [00:00<?, ?it/s]

#### Preprocessing : 

In [None]:
# Split the "train" data 90/10 with a fixed seed, then expose the held-out
# "test" part under the name "validation".
split_datasets = raw_datasets["train"].train_test_split(
    seed=20,
    train_size=0.9,
)
split_datasets["validation"] = split_datasets.pop("test")



In [None]:
## Importing Tokenizer
# sentencepiece is not referenced directly in this cell; presumably it is imported
# so that a missing install fails here rather than inside the tokenizer — TODO confirm.
import sentencepiece
from transformers import AutoTokenizer

model_checkpoint = "Helsinki-NLP/opus-mt-en-hi"
# `return_tensors` belongs to the tokenizer *call*, not to `from_pretrained`; passed
# here it had no effect on the encodings (they come back as plain Python lists),
# so it is no longer passed.
tokenizer = AutoTokenizer.from_pretrained(model_checkpoint)



In [None]:
## Pick one English/Hindi sentence pair (row 10 of the raw training data)
sample_pair = raw_datasets['train'][10]['translation']
en_sentence = sample_pair['en']
hi_sentence = sample_pair['hi']

## Calling the tokenizer: the source is encoded as 'input_ids', the target as 'labels'
inputs = tokenizer(en_sentence, text_target=hi_sentence)
print(inputs)
for tag, field in (("En : ", 'input_ids'), ("Hi : ", 'labels')):
    print(tag , tokenizer.decode(inputs[field]))

{'input_ids': [3736, 924, 20442, 0], 'attention_mask': [1, 1, 1, 1], 'labels': [266, 924, 2754, 0]}
En :  ▁Every 5▁Minutes</s>
Hi :  हर 5 मिनट</s>


In [None]:
### Preprocessing function

# Maximum number of tokens kept per sentence; longer sequences are truncated.
max_length = 128


def preprocess_function(examples):
    """Tokenize a batch of English/Hindi translation pairs.

    Args:
        examples: batch whose "translation" entry is a sequence of dicts,
            each holding an "en" (source) and a "hi" (target) string.

    Returns:
        The tokenizer output for the batch: sources are passed as the inputs,
        targets as `text_target`, both truncated to `max_length`.
    """
    pairs = examples["translation"]
    sources = [pair["en"] for pair in pairs]
    targets = [pair["hi"] for pair in pairs]
    return tokenizer(
        sources,
        text_target=targets,
        max_length=max_length,
        truncation=True,
    )

In [None]:
# Tokenize every split in batches; the original columns are removed so that only
# the tokenizer outputs remain.
raw_columns = split_datasets["train"].column_names
tokenized_datasets = split_datasets.map(
    preprocess_function,
    remove_columns=raw_columns,
    batched=True,
)



In [None]:
# Display the tokenized splits (feature names and row counts per split).
tokenized_datasets

DatasetDict({
    train: Dataset({
        features: ['input_ids', 'attention_mask', 'labels'],
        num_rows: 87504
    })
    validation: Dataset({
        features: ['input_ids', 'attention_mask', 'labels'],
        num_rows: 9723
    })
})

### Loading the pretrained model for fine-tuning

In [None]:
from transformers import TFAutoModelForSeq2SeqLM

model_checkpoint = "Helsinki-NLP/opus-mt-en-hi"
# from_pt=True: initialise the TensorFlow model from the checkpoint's PyTorch weights.
model = TFAutoModelForSeq2SeqLM.from_pretrained(
    model_checkpoint,
    from_pt=True,
)

All PyTorch model weights were used when initializing TFMarianMTModel.

All the weights of TFMarianMTModel were initialized from the PyTorch model.
If your task is similar to the task the model of the checkpoint was trained on, you can already use TFMarianMTModel for predictions without further training.


### Data Collation : 

In [None]:
from transformers import DataCollatorForSeq2Seq

# Collator that turns lists of tokenized examples into padded TensorFlow batches.
data_collator = DataCollatorForSeq2Seq(
    tokenizer,
    return_tensors="tf",
    model=model,
)

In [None]:
## Let's check the data collator on two training examples (rows 1 and 2)
sample_rows = [tokenized_datasets["train"][i] for i in range(1, 3)]
batch = data_collator(sample_rows)
print(batch.keys())

# Padded label positions show up as -100; decoder inputs are printed for comparison.
for field in ('labels', 'decoder_input_ids'):
    print(batch[field])

dict_keys(['input_ids', 'attention_mask', 'labels', 'decoder_input_ids'])
tf.Tensor(
[[ 8161 10238     0  -100  -100  -100]
 [10453    20    17   345    13     0]], shape=(2, 6), dtype=int32)
tf.Tensor(
[[61949  8161 10238     0 61949 61949]
 [61949 10453    20    17   345    13]], shape=(2, 6), dtype=int32)


In [None]:
### Apply the data collator batch-wise to each split.
### Training: shuffled, batch size 32. Validation: original order, batch size 16.
tf_train_dataset = model.prepare_tf_dataset(
    tokenized_datasets["train"],
    batch_size=32,
    shuffle=True,
    collate_fn=data_collator,
)
tf_eval_dataset = model.prepare_tf_dataset(
    tokenized_datasets["validation"],
    batch_size=16,
    shuffle=False,
    collate_fn=data_collator,
)

Old behaviour: columns=['a'], labels=['labels'] -> (tf.Tensor, tf.Tensor)  
             : columns='a', labels='labels' -> (tf.Tensor, tf.Tensor)  
New behaviour: columns=['a'],labels=['labels'] -> ({'a': tf.Tensor}, {'labels': tf.Tensor})  
             : columns='a', labels='labels' -> (tf.Tensor, tf.Tensor) 


In [None]:
import evaluate

# Load the "sacrebleu" metric; it is used to score translations later in the notebook.
metric = evaluate.load("sacrebleu")

In [None]:
# Sanity-check the metric on one hand-written prediction against one reference.
# `references` holds a list of acceptable references per prediction, hence the nesting.
predictions = ["This plugin lets you translate web pages between several languages automatically."]
references = [["This plugin allows you to automatically translate web pages between several languages."]]
metric.compute(references=references, predictions=predictions)

{'score': 46.750469682990165,
 'counts': [11, 6, 4, 3],
 'totals': [12, 11, 10, 9],
 'precisions': [91.66666666666667,
  54.54545454545455,
  40.0,
  33.333333333333336],
 'bp': 0.9200444146293233,
 'sys_len': 12,
 'ref_len': 13}

In [None]:
import numpy as np
import tensorflow as tf
from tqdm import tqdm

# Dedicated collator for generation: sequences are padded to a multiple of 128,
# presumably to limit the number of distinct shapes the XLA-compiled generation
# function sees — TODO confirm.
generation_data_collator = DataCollatorForSeq2Seq(
    tokenizer,
    model=model,
    pad_to_multiple_of=128,
    return_tensors="tf",
)

# Validation data in its original order, 8 examples per batch.
tf_generate_dataset = model.prepare_tf_dataset(
    tokenized_datasets["validation"],
    batch_size=8,
    shuffle=False,
    collate_fn=generation_data_collator,
)


@tf.function(jit_compile=True)
def generate_with_xla(batch):
    """Run `model.generate` on one collated batch inside a JIT-compiled tf.function.

    Args:
        batch: mapping that provides "input_ids" and "attention_mask".

    Returns:
        The result of `model.generate`, limited to 128 newly generated tokens.
    """
    generation_kwargs = dict(
        input_ids=batch["input_ids"],
        attention_mask=batch["attention_mask"],
        max_new_tokens=128,
    )
    return model.generate(**generation_kwargs)


def compute_metrics():
    """Score the model's translations of `tf_generate_dataset` with `metric`.

    Every batch is translated with `generate_with_xla`; predictions and labels are
    decoded back to text (special tokens skipped, surrounding whitespace stripped)
    and the whole set is scored in a single `metric.compute` call.

    Returns:
        dict with one key, "bleu", holding the metric's "score" value.
    """
    hypotheses = []
    gold = []

    for features, label_ids in tqdm(tf_generate_dataset):
        generated = generate_with_xla(features)
        pred_texts = tokenizer.batch_decode(generated, skip_special_tokens=True)
        hypotheses.extend(text.strip() for text in pred_texts)

        label_ids = label_ids.numpy()
        # -100 marks padded label positions and is not a real token id; swap it for
        # the pad token so the tokenizer can decode the row.
        label_ids = np.where(label_ids != -100, label_ids, tokenizer.pad_token_id)
        label_texts = tokenizer.batch_decode(label_ids, skip_special_tokens=True)
        # The metric takes a list of references per prediction, hence the one-item lists.
        gold.extend([text.strip()] for text in label_texts)

    scores = metric.compute(predictions=hypotheses, references=gold)
    return {"bleu": scores["score"]}


Old behaviour: columns=['a'], labels=['labels'] -> (tf.Tensor, tf.Tensor)  
             : columns='a', labels='labels' -> (tf.Tensor, tf.Tensor)  
New behaviour: columns=['a'],labels=['labels'] -> ({'a': tf.Tensor}, {'labels': tf.Tensor})  
             : columns='a', labels='labels' -> (tf.Tensor, tf.Tensor) 


### Fine-tuning the model

In [None]:
## Compiling model 

from transformers import create_optimizer
from transformers.keras_callbacks import PushToHubCallback
import tensorflow as tf

# Total number of optimizer steps = batches per epoch x number of epochs.
# `tf_train_dataset` is the batched tf.data.Dataset (not the original Hugging Face
# Dataset), so its len() already counts batches rather than samples.
num_epochs = 3
steps_per_epoch = len(tf_train_dataset)
num_train_steps = steps_per_epoch * num_epochs

# Optimizer with weight decay and no warmup; the schedule it uses is returned alongside it.
optimizer, schedule = create_optimizer(
    num_train_steps=num_train_steps,
    num_warmup_steps=0,
    init_lr=5e-5,
    weight_decay_rate=0.01,
)
# No loss is passed: the model's internal loss computation is used.
model.compile(optimizer=optimizer)

# Optional: train in mixed-precision float16 (left disabled)
#tf.keras.mixed_precision.set_global_policy("mixed_float16")

No loss specified in compile() - the model's internal loss computation will be used as the loss. Don't panic - this is a common way to train TensorFlow models in Transformers! To disable this behaviour please pass a loss argument, or explicitly pass `loss=None` if you do not want your model to compute a loss.


In [None]:
from huggingface_hub import notebook_login

# Log in to the Hugging Face Hub from inside the notebook; the training cell
# uses PushToHubCallback, which uploads to the Hub.
notebook_login()

VBox(children=(HTML(value='<center> <img\nsrc=https://huggingface.co/front/assets/huggingface_logo-noborder.sv…

In [None]:

from transformers.keras_callbacks import PushToHubCallback

# Keras callback that pushes the model (and the tokenizer) to the Hub, using
# "finetuned-en-to-hi" as the local output directory.
callback = PushToHubCallback(
    tokenizer=tokenizer,
    output_dir="finetuned-en-to-hi",
)

# Train for `num_epochs` epochs, evaluating on the validation batches.
model.fit(
    tf_train_dataset,
    epochs=num_epochs,
    validation_data=tf_eval_dataset,
    callbacks=[callback],
)

/content/finetuned-en-to-hi is already a clone of https://huggingface.co/VinayakMane47/finetuned-en-to-hi. Make sure you pull the latest changes with `repo.git_pull()`.


Epoch 1/3
Epoch 2/3
Epoch 3/3


<keras.callbacks.History at 0x7f429c382890>

In [None]:
### Computing metrics on the validation set with the fine-tuned model
bleu_result = compute_metrics()
print(bleu_result)

100%|██████████| 1216/1216 [18:34<00:00,  1.09it/s]


{'bleu': 36.638250389305675}


### Using the fine-tuned model

In [None]:
from transformers import pipeline

# Replace this with your own checkpoint
model_checkpoint = "VinayakMane47/finetuned-en-to-hi"
# Build a translation pipeline from the fine-tuned checkpoint.
translator = pipeline(
    task="translation",
    model=model_checkpoint,
)


In [None]:
# Try the translation pipeline on a short English sentence.
translator("tell me your name please")

[{'translation_text': 'कृपया अपना नाम मुझे बताएँ'}]

In [None]:
# NOTE(review): this cell repeats the preceding pipeline-construction cell verbatim;
# running it rebuilds `translator` from the same checkpoint. Consider removing it.
from transformers import pipeline

# Replace this with your own checkpoint
model_checkpoint = "VinayakMane47/finetuned-en-to-hi"
translator = pipeline("translation", model=model_checkpoint)