### 1. Install Dependencies

In [None]:
!pip install transformers
!pip install datasets
!pip install sentencepiece
!pip install evaluate
!pip install sacrebleu

### 2. Importing dataset

##### Data Download URL -http://www.manythings.org/anki/

In [2]:
file_path = './mar.txt' 
lines = open(file_path, encoding='UTF-8').read().strip().split('\n')
lines[5000:5010]

["I'm calling you.\tमी तुम्हाला बोलवतेय.\tCC-BY 2.0 (France) Attribution: tatoeba.org #1342127 (CK) & #2088716 (sabretou)",
 "I'm getting old.\tमी म्हातारा होत चाललोय.\tCC-BY 2.0 (France) Attribution: tatoeba.org #1619683 (Spamster) & #2492374 (sabretou)",
 "I'm getting old.\tमी म्हातारी होत चाललेय.\tCC-BY 2.0 (France) Attribution: tatoeba.org #1619683 (Spamster) & #2492377 (sabretou)",
 "I'm going crazy.\tमी वेडा होत चाललोय.\tCC-BY 2.0 (France) Attribution: tatoeba.org #1890984 (CK) & #9837548 (sabretou)",
 "I'm going crazy.\tमी वेडी होत चाललेय.\tCC-BY 2.0 (France) Attribution: tatoeba.org #1890984 (CK) & #9837549 (sabretou)",
 "I'm going to go.\tमी जाणार आहे.\tCC-BY 2.0 (France) Attribution: tatoeba.org #1312 (axcutul) & #6143641 (sabretou)",
 "I'm going today.\tमी आज जातोय.\tCC-BY 2.0 (France) Attribution: tatoeba.org #8916221 (CK) & #10514496 (sabretou)",
 "I'm going today.\tमी आज जातेय.\tCC-BY 2.0 (France) Attribution: tatoeba.org #8916221 (CK) & #10514497 (sabretou)",
 "I'm half 

### 3. Data Cleaning 

In [3]:
import string
import re 
exclude = set(string.punctuation) # Set of all special characters
remove_digits = str.maketrans('', '', string.digits) # Set of all digits

In [4]:

## Preproccesing Engilsh sentences 
def preprocess_eng_sentence(sent):
    '''Function to preprocess English sentence'''
    sent = sent.lower() # lower casing
    sent = re.sub("'", '', sent) # remove the quotation marks if any
    sent = ''.join(ch for ch in sent if ch not in exclude) #exclude punctuations 
    sent = sent.translate(remove_digits) # remove the digits
    sent = sent.strip()
    sent = re.sub(" +", " ", sent) # remove extra spaces
    
    return sent



## Preprocessing Marathi Snetences 
def preprocess_mar_sentence(sent):
    '''Function to preprocess Marathi sentence'''
    sent = re.sub("'", '', sent) # remove the quotation marks if any
    sent = ''.join(ch for ch in sent if ch not in exclude)
    sent = re.sub("[२३०८१५७९४६]", "", sent) # remove the digits
    sent = sent.strip()
    sent = re.sub(" +", " ", sent) # remove extra spaces
    
    return sent

  
# Generate pairs of cleaned English and Marathi sentences
sent_pairs = []
eng_sentence = []
mar_sentence = []
for line in lines:
    sent_pair = []
    eng, mar = line.split('\t')[0] ,line.split('\t')[1] 
    eng = preprocess_eng_sentence(eng)
    mar = preprocess_mar_sentence(mar)
    eng_sentence.append(eng)
    mar_sentence.append(mar)
    
    
## Converting data to transformer acceptable dataset format 
from tqdm import tqdm 
import pandas as pd 
list1 = list() 

for i in tqdm(range(len(eng_sentence))): 
  list1.append({"en":eng_sentence[i] , "mar":mar_sentence[i]} )

df = pd.DataFrame(columns=['translation'])
df['translation'] = list1
df = df.reset_index().rename(columns={"index":"id"})


from datasets import Dataset,DatasetDict
full_dataset = Dataset.from_pandas(df)
raw_dataset = DatasetDict({'train': full_dataset })
validation_data = raw_dataset['train'].train_test_split(train_size=0.8,seed=20)
raw_dataset['validation'] = validation_data.pop("test")
raw_dataset



100%|██████████| 46966/46966 [00:00<00:00, 939276.11it/s]


DatasetDict({
    train: Dataset({
        features: ['id', 'translation'],
        num_rows: 46966
    })
    validation: Dataset({
        features: ['id', 'translation'],
        num_rows: 9394
    })
})

### 3.2 Tokenizing 

In [5]:
## Importing Tokenizer 
import sentencepiece
from transformers import AutoTokenizer

model_checkpoint = "Helsinki-NLP/opus-mt-en-mr"
tokenizer = AutoTokenizer.from_pretrained(model_checkpoint, return_tensors="tf")



In [6]:
## Getting english and marathi sample
en_sentence = raw_dataset['train'][46960]['translation']['en']
mar_sentence = raw_dataset['train'][46960]['translation']['mar']

## Calling tokeinzer 
inputs = tokenizer(en_sentence, text_target=mar_sentence)
print(inputs)
print("En : " , tokenizer.decode(inputs['input_ids']))
print("Hi : " , tokenizer.decode(inputs['labels']))


{'input_ids': [186, 1710, 76, 40168, 25, 11840, 16333, 11539, 4721, 132, 24, 4, 502, 1666, 58592, 1060, 10, 4, 229, 0], 'attention_mask': [1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1], 'labels': [360, 1905, 36, 28308, 9384, 7726, 123, 602, 437, 81, 7671, 140, 2054, 112, 891, 99, 576, 1045, 21577, 1425, 1623, 0]}
En :  ▁if▁religion▁were▁synonymous with▁morality brazil▁would be the▁most▁uncorrupted▁country in the▁world</s>
Hi :  जर धर्म व नीतिमत्ता समानार्थी शब्द असते तर ब्राजील जगातला सर्वात अभ्रष्ट देश असता</s>


In [7]:

max_length = 128 ## max length of sentence to tokenize if it is more than that we truncate thos results 

def preprocess_function(examples):
    inputs = [ex["en"] for ex in examples["translation"]]
    targets = [ex["mar"] for ex in examples["translation"]]
    model_inputs = tokenizer(
        inputs, text_target=targets, max_length=max_length, truncation=True
    )
    return model_inputs


tokenized_datasets = raw_dataset.map(
    preprocess_function,
    batched=True,
    remove_columns=raw_dataset["train"].column_names,
)
tokenized_datasets

Map:   0%|          | 0/46966 [00:00<?, ? examples/s]

Map:   0%|          | 0/9394 [00:00<?, ? examples/s]

DatasetDict({
    train: Dataset({
        features: ['input_ids', 'attention_mask', 'labels'],
        num_rows: 46966
    })
    validation: Dataset({
        features: ['input_ids', 'attention_mask', 'labels'],
        num_rows: 9394
    })
})

### 3.3 Data Collation

In [8]:
## Calling pretrained Model : 
from transformers import TFAutoModelForSeq2SeqLM
model_checkpoint = "Helsinki-NLP/opus-mt-en-mr"
model = TFAutoModelForSeq2SeqLM.from_pretrained(model_checkpoint, from_pt=True)


from transformers import DataCollatorForSeq2Seq

data_collator = DataCollatorForSeq2Seq(tokenizer, model=model, return_tensors="tf")


All PyTorch model weights were used when initializing TFMarianMTModel.

All the weights of TFMarianMTModel were initialized from the PyTorch model.
If your task is similar to the task the model of the checkpoint was trained on, you can already use TFMarianMTModel for predictions without further training.


In [9]:
### Apply data collator to each each dataset batchwise , here batch size = 32 
tf_train_dataset = model.prepare_tf_dataset(
    tokenized_datasets["train"],
    collate_fn=data_collator,
    shuffle=True,
    batch_size=32,
)
tf_eval_dataset = model.prepare_tf_dataset(
    tokenized_datasets["validation"],
    collate_fn=data_collator,
    shuffle=False,
    batch_size=16,
)

Old behaviour: columns=['a'], labels=['labels'] -> (tf.Tensor, tf.Tensor)  
             : columns='a', labels='labels' -> (tf.Tensor, tf.Tensor)  
New behaviour: columns=['a'],labels=['labels'] -> ({'a': tf.Tensor}, {'labels': tf.Tensor})  
             : columns='a', labels='labels' -> (tf.Tensor, tf.Tensor) 


#### 3.4 metrics - BLEU 

In [10]:
import numpy as np
import tensorflow as tf
from tqdm import tqdm

import evaluate

metric = evaluate.load("sacrebleu")

generation_data_collator = DataCollatorForSeq2Seq(
    tokenizer, model=model, return_tensors="tf", pad_to_multiple_of=128
)

tf_generate_dataset = model.prepare_tf_dataset(
    tokenized_datasets["validation"],
    collate_fn=generation_data_collator,
    shuffle=False,
    batch_size=8,
)


@tf.function(jit_compile=True)
def generate_with_xla(batch):
    return model.generate(
        input_ids=batch["input_ids"],
        attention_mask=batch["attention_mask"],
        max_new_tokens=128,
    )


def compute_metrics():
    all_preds = []
    all_labels = []

    for batch, labels in tqdm(tf_generate_dataset):
        predictions = generate_with_xla(batch)
        decoded_preds = tokenizer.batch_decode(predictions, skip_special_tokens=True)
        labels = labels.numpy()
        labels = np.where(labels != -100, labels, tokenizer.pad_token_id)
        decoded_labels = tokenizer.batch_decode(labels, skip_special_tokens=True)
        decoded_preds = [pred.strip() for pred in decoded_preds]
        decoded_labels = [[label.strip()] for label in decoded_labels]
        all_preds.extend(decoded_preds)
        all_labels.extend(decoded_labels)

    result = metric.compute(predictions=all_preds, references=all_labels)
    return {"bleu": result["score"]}


Old behaviour: columns=['a'], labels=['labels'] -> (tf.Tensor, tf.Tensor)  
             : columns='a', labels='labels' -> (tf.Tensor, tf.Tensor)  
New behaviour: columns=['a'],labels=['labels'] -> ({'a': tf.Tensor}, {'labels': tf.Tensor})  
             : columns='a', labels='labels' -> (tf.Tensor, tf.Tensor) 


### 4 . Fine Tuning Model

In [11]:
## Compiling model 

from transformers import create_optimizer
from transformers.keras_callbacks import PushToHubCallback
import tensorflow as tf

# The number of training steps is the number of samples in the dataset, divided by the batch size then multiplied
# by the total number of epochs. Note that the tf_train_dataset here is a batched tf.data.Dataset,
# not the original Hugging Face Dataset, so its len() is already num_samples // batch_size.
num_epochs = 3
num_train_steps = len(tf_train_dataset) * num_epochs

optimizer, schedule = create_optimizer(
    init_lr=5e-5,
    num_warmup_steps=0,
    num_train_steps=num_train_steps,
    weight_decay_rate=0.01,
)
model.compile(optimizer=optimizer)

# Train in mixed-precision float16
#tf.keras.mixed_precision.set_global_policy("mixed_float16")

No loss specified in compile() - the model's internal loss computation will be used as the loss. Don't panic - this is a common way to train TensorFlow models in Transformers! To disable this behaviour please pass a loss argument, or explicitly pass `loss=None` if you do not want your model to compute a loss.


In [12]:
## Pushing Model to huggingface 

from huggingface_hub import notebook_login

notebook_login()

VBox(children=(HTML(value='<center> <img\nsrc=https://huggingface.co/front/assets/huggingface_logo-noborder.sv…

In [13]:

from transformers.keras_callbacks import PushToHubCallback

callback = PushToHubCallback(
    output_dir="finetuned-en-to-mar", tokenizer=tokenizer
)

model.fit(
    tf_train_dataset,
    validation_data=tf_eval_dataset,
    callbacks=[callback],
    epochs=num_epochs,
)

/content/finetuned-en-to-mar is already a clone of https://huggingface.co/VinayakMane47/finetuned-en-to-mar. Make sure you pull the latest changes with `repo.git_pull()`.


Epoch 1/3

Adding files tracked by Git LFS: ['source.spm', 'target.spm']. This may take a bit of time if the files are large.


Epoch 2/3
Epoch 3/3


<keras.callbacks.History at 0x7feb6c13f430>

In [14]:
### Computing metrics 
print(compute_metrics())

100%|██████████| 1175/1175 [04:21<00:00,  4.49it/s]


{'bleu': 31.668840418068974}


#### 5. Inferencing 

In [None]:
from transformers import pipeline

# Replace this with your own checkpoint
model_checkpoint = "VinayakMane47/finetuned-en-to-mar"
translator = pipeline("translation", model=model_checkpoint)

In [26]:
translator("whats your name")

[{'translation_text': 'तुझं नाव काय आहे'}]