In [1]:
!pip install datasets evaluate transformers[sentencepiece] sacrebleu tqdm



In [2]:
# Mount Google Drive at /content/drive; the fine-tuned model is saved under it
# near the end of the notebook.
from google.colab import drive
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [3]:
from datasets import load_dataset

# Load the English ("en") / Hindi ("hi") language pair of the KDE4 corpus.
raw_datasets = load_dataset("kde4", lang1="en", lang2="hi")



  0%|          | 0/1 [00:00<?, ?it/s]

In [4]:
# Show the available splits and their row counts.
raw_datasets

DatasetDict({
    train: Dataset({
        features: ['id', 'translation'],
        num_rows: 97227
    })
})

In [5]:
# Look at one raw record: an 'id' plus a 'translation' dict keyed by language code.
raw_datasets['train'][300]

{'id': '300',
 'translation': {'en': '& Recurse subfolders',
  'hi': 'सब- फ़ोल्डर शामिल करें (s)'}}

In [6]:
def filter_redundant(record):
  """Decide whether a record is worth keeping.

  The Hindi text of the record is stripped of every ASCII letter (a-z, A-Z);
  the record is kept only when more than 5 characters remain.

  Args:
    record: mapping with record['translation']['hi'] holding the Hindi string.

  Returns:
    True when the stripped Hindi text is longer than 5 characters, else False.
  """
  ascii_letters = 'abcdefghijklmnopqrstuvwxyzABCDEFGHIJKLMNOPQRSTUVWXYZ'
  hindi_text = record['translation']['hi']
  remaining = ''.join(ch for ch in hindi_text if ch not in ascii_letters)
  return len(remaining) > 5

In [7]:
def preprocess(record):
  """Strip ASCII letters (a-z, A-Z) from the Hindi side of a record.

  Args:
    record: mapping with record['translation'] holding 'en' and 'hi' strings.

  Returns:
    A dict {'translation': {'en': ..., 'hi': ...}} where the English text is
    unchanged and the Hindi text has every ASCII letter removed.
  """
  ascii_letters = 'abcdefghijklmnopqrstuvwxyzABCDEFGHIJKLMNOPQRSTUVWXYZ'
  pair = record['translation']
  cleaned_hindi = ''.join(ch for ch in pair['hi'] if ch not in ascii_letters)
  return {'translation': {'en': pair['en'], 'hi': cleaned_hindi}}

In [8]:
# Keep only the records that filter_redundant accepts.
processed_datasets = raw_datasets.filter(filter_redundant)



In [9]:
# Run preprocess over every remaining record (removes ASCII letters from the Hindi text).
processed_datasets = processed_datasets.map(preprocess)



In [10]:
# Show how many rows are left after filtering.
processed_datasets

DatasetDict({
    train: Dataset({
        features: ['id', 'translation'],
        num_rows: 84475
    })
})

In [11]:
# Spot-check one processed record.
# NOTE(review): in the recorded output the English and Hindi sentences do not look like
# translations of each other, which suggests the corpus has misaligned pairs — confirm.
processed_datasets['train'][2595]

{'id': '3008',
 'translation': {'en': 'Default path to the color profiles folder. You must store all your color profiles in this directory.',
  'hi': 'यदि आप इस विकल्प को सक्षम करते हैं, आप मूल छवि देख सकेंगे.'}}

In [12]:
# Split the filtered data: 90% for training, the remainder as a "test" split.
# The seed is fixed so the split is the same on every run.
split_datasets = processed_datasets["train"].train_test_split(train_size=0.9, seed=20)
split_datasets



DatasetDict({
    train: Dataset({
        features: ['id', 'translation'],
        num_rows: 76027
    })
    test: Dataset({
        features: ['id', 'translation'],
        num_rows: 8448
    })
})

In [13]:
# Rename the held-out "test" split to "validation".
split_datasets["validation"] = split_datasets.pop("test")

In [14]:
from transformers import pipeline

# Wrap the pretrained checkpoint in a translation pipeline to try it out
# before any fine-tuning.
model_checkpoint = "Helsinki-NLP/opus-mt-en-hi"
translator = pipeline("translation", model=model_checkpoint)



In [15]:
# Translate the English sentence of the record inspected earlier with the pretrained model.
translator("Default path to the color profiles folder. You must store all your color profiles in this directory.")

[{'translation_text': 'रंग प्रोफ़ाइल फ़ोल्डर के लिए डिफ़ॉल्ट पथ. इस डिरेक्ट्री में आपको अपने सभी रंग प्रोफ़ाइलों को भंडारित करना होगा.'}]

In [16]:
from transformers import AutoTokenizer

model_checkpoint = "Helsinki-NLP/opus-mt-en-hi"
# `return_tensors` is an option of the tokenizer *call*, not of loading, so it is not
# passed here. Tensors are requested where they are needed: the data collators in this
# notebook are created with return_tensors="tf".
tokenizer = AutoTokenizer.from_pretrained(model_checkpoint)

In [17]:
# Upper bound on the tokenized length of inputs and targets; longer texts are truncated.
max_length = 128


def preprocess_function(examples):
    """Tokenize a batch of translation pairs.

    Args:
        examples: batch whose "translation" entry is a sequence of dicts with
            "en" (source) and "hi" (target) strings.

    Returns:
        The tokenizer's output for the English texts, with the Hindi texts
        passed as ``text_target``; both truncated to ``max_length``.
    """
    pairs = examples["translation"]
    source_texts = [pair["en"] for pair in pairs]
    target_texts = [pair["hi"] for pair in pairs]
    return tokenizer(
        source_texts,
        text_target=target_texts,
        max_length=max_length,
        truncation=True,
    )

In [18]:
# Tokenize both splits in batches. All original columns are removed so that only
# what preprocess_function returns is kept.
tokenized_datasets = split_datasets.map(
    preprocess_function,
    batched=True,
    remove_columns=split_datasets["train"].column_names,
)



In [19]:
from transformers import TFAutoModelForSeq2SeqLM

# Load the checkpoint as a TensorFlow model; from_pt=True initializes it from the
# checkpoint's PyTorch weights.
model = TFAutoModelForSeq2SeqLM.from_pretrained(model_checkpoint, from_pt=True)

All PyTorch model weights were used when initializing TFMarianMTModel.

All the weights of TFMarianMTModel were initialized from the PyTorch model.
If your task is similar to the task the model of the checkpoint was trained on, you can already use TFMarianMTModel for predictions without further training.


In [20]:
from transformers import DataCollatorForSeq2Seq

# Collator used to assemble training/validation batches as TensorFlow tensors.
data_collator = DataCollatorForSeq2Seq(tokenizer, model=model, return_tensors="tf")

In [21]:
# Build the tf.data pipelines used by model.fit. Only the training data is shuffled;
# training uses batches of 32 and validation batches of 16.
tf_train_dataset = model.prepare_tf_dataset(
    tokenized_datasets["train"],
    collate_fn=data_collator,
    shuffle=True,
    batch_size=32,
)
tf_eval_dataset = model.prepare_tf_dataset(
    tokenized_datasets["validation"],
    collate_fn=data_collator,
    shuffle=False,
    batch_size=16,
)

In [22]:
import evaluate

# SacreBLEU is the metric used to score translations in this notebook.
metric = evaluate.load("sacrebleu")

In [23]:
# Sanity-check the metric on a hand-written pair. `predictions` is a flat list of
# strings, while `references` holds one list of reference strings per prediction.
predictions = [
    "This plugin lets you translate web pages between several languages automatically."
]
references = [
    [
        "This plugin allows you to automatically translate web pages between several languages."
    ]
]
metric.compute(predictions=predictions, references=references)

{'score': 46.750469682990165,
 'counts': [11, 6, 4, 3],
 'totals': [12, 11, 10, 9],
 'precisions': [91.66666666666667,
  54.54545454545455,
  40.0,
  33.333333333333336],
 'bp': 0.9200444146293233,
 'sys_len': 12,
 'ref_len': 13}

In [24]:
import numpy as np
import tensorflow as tf
from tqdm import tqdm

# Separate collator for generation that pads to a multiple of 128.
# NOTE(review): presumably this keeps input shapes constant for the XLA-compiled
# generation function used in compute_metrics — confirm.
generation_data_collator = DataCollatorForSeq2Seq(
    tokenizer, model=model, return_tensors="tf", pad_to_multiple_of=128
)

# Validation split again, unshuffled, in batches of 8, collated with the padding
# collator above.
tf_generate_dataset = model.prepare_tf_dataset(
    tokenized_datasets["validation"],
    collate_fn=generation_data_collator,
    shuffle=False,
    batch_size=8,
)

In [25]:
# Collate a single training example (index 1) to inspect the padded result.
generation_data_collator([tokenized_datasets["train"][i] for i in range(1, 2)])

{'input_ids': <tf.Tensor: shape=(1, 128), dtype=int32, numpy=
array([[25988,     0, 61949, 61949, 61949, 61949, 61949, 61949, 61949,
        61949, 61949, 61949, 61949, 61949, 61949, 61949, 61949, 61949,
        61949, 61949, 61949, 61949, 61949, 61949, 61949, 61949, 61949,
        61949, 61949, 61949, 61949, 61949, 61949, 61949, 61949, 61949,
        61949, 61949, 61949, 61949, 61949, 61949, 61949, 61949, 61949,
        61949, 61949, 61949, 61949, 61949, 61949, 61949, 61949, 61949,
        61949, 61949, 61949, 61949, 61949, 61949, 61949, 61949, 61949,
        61949, 61949, 61949, 61949, 61949, 61949, 61949, 61949, 61949,
        61949, 61949, 61949, 61949, 61949, 61949, 61949, 61949, 61949,
        61949, 61949, 61949, 61949, 61949, 61949, 61949, 61949, 61949,
        61949, 61949, 61949, 61949, 61949, 61949, 61949, 61949, 61949,
        61949, 61949, 61949, 61949, 61949, 61949, 61949, 61949, 61949,
        61949, 61949, 61949, 61949, 61949, 61949, 61949, 61949, 61949,
        61949, 

In [26]:
@tf.function(jit_compile=True)
def generate_with_xla(batch):
    """Generate output token ids for one batch using the global `model`.

    The function is wrapped in ``tf.function`` with ``jit_compile=True``.

    Args:
        batch: mapping providing "input_ids" and "attention_mask".

    Returns:
        Whatever ``model.generate`` returns for the batch, limited to at most
        128 newly generated tokens.
    """
    input_ids = batch["input_ids"]
    attention_mask = batch["attention_mask"]
    generated = model.generate(
        input_ids=input_ids, attention_mask=attention_mask, max_new_tokens=128
    )
    return generated


def compute_metrics():
    """Score the global `model` on `tf_generate_dataset` with the global `metric`.

    Every batch is translated with `generate_with_xla`; predictions and labels are
    decoded to text, stripped of surrounding whitespace, and collected. The metric
    is computed once over the whole set.

    Returns:
        dict with a single key "bleu" holding the metric's "score" value.
    """
    hypotheses = []
    reference_lists = []

    for batch, labels in tqdm(tf_generate_dataset):
        generated_ids = generate_with_xla(batch)
        hypothesis_texts = tokenizer.batch_decode(
            generated_ids, skip_special_tokens=True
        )

        # Label positions holding -100 are swapped for the pad token id so the
        # tokenizer can decode them (and then skip them as special tokens).
        label_ids = labels.numpy()
        label_ids = np.where(label_ids != -100, label_ids, tokenizer.pad_token_id)
        reference_texts = tokenizer.batch_decode(label_ids, skip_special_tokens=True)

        hypotheses += [text.strip() for text in hypothesis_texts]
        # One single-element reference list per prediction.
        reference_lists += [[text.strip()] for text in reference_texts]

    scores = metric.compute(predictions=hypotheses, references=reference_lists)
    return {"bleu": scores["score"]}

In [27]:
# Baseline: score the pretrained checkpoint on the validation split before fine-tuning.
print(compute_metrics())

100%|██████████| 1056/1056 [05:45<00:00,  3.05it/s]


{'bleu': 59.84206161428571}


In [28]:
from transformers import create_optimizer
# NOTE(review): PushToHubCallback is imported but no callback is passed to model.fit
# in the visible cells.
from transformers.keras_callbacks import PushToHubCallback
import tensorflow as tf

# The number of training steps is the number of samples in the dataset, divided by the batch size then multiplied
# by the total number of epochs. Note that the tf_train_dataset here is a batched tf.data.Dataset,
# not the original Hugging Face Dataset, so its len() is already num_samples // batch_size.
num_epochs = 3
num_train_steps = len(tf_train_dataset) * num_epochs

# Optimizer with weight decay and no warmup, starting at a learning rate of 5e-5.
# `schedule` is returned alongside the optimizer but is not used again in the visible cells.
optimizer, schedule = create_optimizer(
    init_lr=5e-5,
    num_warmup_steps=0,
    num_train_steps=num_train_steps,
    weight_decay_rate=0.01,
)
# No loss is passed to compile.
# NOTE(review): presumably the model's own internally computed loss is used — confirm.
model.compile(optimizer=optimizer)

# Train in mixed-precision float16
# NOTE(review): this policy is set after the model has already been instantiated and
# compiled. Keras documents the global policy as applying to layers created afterwards,
# so it may not affect this model's training — confirm, and if mixed precision is
# wanted, set the policy before the model is loaded.
tf.keras.mixed_precision.set_global_policy("mixed_float16")

In [29]:
# Display the name of the checkpoint the model was loaded from.
model_checkpoint

'Helsinki-NLP/opus-mt-en-hi'

In [30]:
# Fine-tune for num_epochs epochs, with the validation pipeline passed as validation_data.
model.fit(
    tf_train_dataset,
    validation_data=tf_eval_dataset,
    epochs=num_epochs,
)

Epoch 1/3
Epoch 2/3
Epoch 3/3


<keras.callbacks.History at 0x7f8256396170>

In [31]:
# Persist the fine-tuned model to Google Drive (path is relative to the current working
# directory). The tokenizer is saved to the same directory so that the saved folder can
# be reloaded on its own.
model_output_dir = 'drive/MyDrive/Model/Translation-En-Hi2'
model.save_pretrained(model_output_dir)
tokenizer.save_pretrained(model_output_dir)

In [32]:
# Score the fine-tuned model on the same validation split as the baseline.
# NOTE(review): the recorded score (49.08) is lower than the pre-fine-tuning
# baseline (59.84) recorded earlier — worth investigating before relying on this model.
print(compute_metrics())

100%|██████████| 1056/1056 [14:43<00:00,  1.19it/s]


{'bleu': 49.07704783185189}
