In [1]:
## installing important libraries

!pip install ipywidgets 
!pip install -U accelerate 
!pip install -U transformers 
!pip install transformers[torch]
!pip install accelerate -U
!pip install sacrebleu
!pip install evaluate



Collecting accelerate
  Downloading accelerate-0.24.0-py3-none-any.whl (260 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m261.0/261.0 kB[0m [31m5.4 MB/s[0m eta [36m0:00:00[0m
Installing collected packages: accelerate
  Attempting uninstall: accelerate
    Found existing installation: accelerate 0.22.0
    Uninstalling accelerate-0.22.0:
      Successfully uninstalled accelerate-0.22.0
Successfully installed accelerate-0.24.0
Collecting transformers
  Downloading transformers-4.34.1-py3-none-any.whl (7.7 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m7.7/7.7 MB[0m [31m50.6 MB/s[0m eta [36m0:00:00[0m
Collecting tokenizers<0.15,>=0.14 (from transformers)
  Downloading tokenizers-0.14.1-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (3.8 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m3.8/3.8 MB[0m [31m83.7 MB/s[0m eta [36m0:00:00[0m
Installing collected packages: tokenizers, transformer

**Importing necessary packages and libraries**

In [2]:
import pandas as pd 
from tqdm import tqdm
import transformers
import numpy as np
import tensorflow as tf
import accelerate




In [3]:
data_dir = "/kaggle/input/hindienglish-corpora/Hindi_English_Truncated_Corpus.csv"
data = pd.read_csv(data_dir)
data.drop(columns=["source"],inplace=True)
data.dropna(inplace=True)

num_epoch = 10

**Creating Train Test Split sets**

In [4]:
from datasets import Dataset
data = Dataset.from_pandas(data)
data = data.train_test_split(test_size=0.15)
data

DatasetDict({
    train: Dataset({
        features: ['english_sentence', 'hindi_sentence', '__index_level_0__'],
        num_rows: 108464
    })
    test: Dataset({
        features: ['english_sentence', 'hindi_sentence', '__index_level_0__'],
        num_rows: 19141
    })
})

In [5]:
data["train"][2] ## Sample datapoints


{'english_sentence': 'Niral was very close to Mahadevi,and Mahadevi used to tie the holly racky to his brother for 40 years.',
 'hindi_sentence': 'निराला जी से उनकी अत्यधिक निकटता थी उनकी पुष्ट कलाइयों में महादेवी जी लगभग चालीस वर्षों तक राखी बाँधती रहीं।',
 '__index_level_0__': 11120}

In [6]:
## using pre-trained models for the tasks ##

from transformers import AutoTokenizer
## from huggingface lib
model_checkpoint = "Helsinki-NLP/opus-mt-en-hi"

tokenizer = AutoTokenizer.from_pretrained(model_checkpoint, return_tensors ="pt")






Downloading (…)okenizer_config.json:   0%|          | 0.00/44.0 [00:00<?, ?B/s]

Downloading (…)lve/main/config.json:   0%|          | 0.00/1.39k [00:00<?, ?B/s]

Downloading (…)olve/main/source.spm:   0%|          | 0.00/812k [00:00<?, ?B/s]

Downloading (…)olve/main/target.spm:   0%|          | 0.00/1.07M [00:00<?, ?B/s]

Downloading (…)olve/main/vocab.json:   0%|          | 0.00/2.10M [00:00<?, ?B/s]



In [7]:
inputs = tokenizer(data["train"][0]['english_sentence'], text_target = data["train"][0]['hindi_sentence'])

inputs

{'input_ids': [81, 16376, 25121, 16, 2, 288, 33687, 6203, 10, 14836, 2, 0], 'attention_mask': [1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1], 'labels': [18035, 58373, 2, 395, 15, 52136, 331, 27044, 9, 4890, 2, 0]}

In [8]:
max_length =  128

def preprocess_function(data):
    inputs = [tiki for tiki in data["english_sentence"]]
    targets = [tiki for tiki in data["hindi_sentence"]]
    model_inputs = tokenizer(inputs, text_target=targets, max_length=max_length, truncation=True)
    
    return model_inputs

In [9]:
tokenized_datasets = data.map(preprocess_function,batched=True, remove_columns=data["train"].column_names)


  0%|          | 0/109 [00:00<?, ?ba/s]

  0%|          | 0/20 [00:00<?, ?ba/s]

In [10]:
from transformers import TFAutoModelForSeq2SeqLM as ex

model = ex.from_pretrained(model_checkpoint, from_pt=True)


Downloading pytorch_model.bin:   0%|          | 0.00/306M [00:00<?, ?B/s]

All PyTorch model weights were used when initializing TFMarianMTModel.

All the weights of TFMarianMTModel were initialized from the PyTorch model.
If your task is similar to the task the model of the checkpoint was trained on, you can already use TFMarianMTModel for predictions without further training.


In [11]:
from transformers import DataCollatorForSeq2Seq as xe
data_collator = xe(tokenizer, model=model, return_tensors="tf", pad_to_multiple_of=128)


In [12]:
batch_size = 16
tf_train_dataset = model.prepare_tf_dataset(tokenized_datasets["train"], collate_fn = data_collator, shuffle = True, batch_size=batch_size)

tf_eval_dataset = model.prepare_tf_dataset(tokenized_datasets["test"], collate_fn = data_collator, shuffle = False, batch_size=batch_size)


In [13]:
import transformers 
import accelerate
import evaluate 
metric = evaluate.load("sacrebleu")

Downloading builder script:   0%|          | 0.00/8.15k [00:00<?, ?B/s]

In [14]:
@tf.function(jit_compile=True)
def generate_with_xla(batch):
    return model.generate(input_ids=batch["input_ids"], attention_mask=batch["attention_mask"], max_new_tokens=128)


def compute_metrics():
    all_preds = []
    all_labels = []
    
    for batch, labels in tqdm(tf_eval_dataset):
        predictions = generate_with_xla(batch)
        decoded_preds = tokenizer.batch_decode(predictions, skip_special_tokens=True)
        labels = labels.numpy()
        labels = np.where(labels!=-100, labels, tokenizer.pad_token_id)
        decoded_labels = tokenizer.batch_decode(labels, skip_special_tokens=True)
        decoded_preds = [pred.strip() for pred in decoded_preds]
        
        decoded_labels = [[label.strip()] for label in decoded_labels]
        all_preds.extend(decoded_preds)
        all_labels.extend(decoded_labels)
    
    result = metric.compute(predictions=all_preds, references=all_labels)
    
    return {"bleu": result["score"]}






In [15]:
print(compute_metrics())

100%|██████████| 1197/1197 [21:27<00:00,  1.08s/it]


{'bleu': 7.420647829388972}


In [16]:
from huggingface_hub import notebook_login, login
toke = "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"
login(token=toke, write_permission=True)

Token will not been saved to git credential helper. Pass `add_to_git_credential=True` if you want to set the git credential as well.
Token is valid (permission: write).
Your token has been saved to /root/.cache/huggingface/token
Login successful


In [17]:
from transformers import create_optimizer
from transformers.keras_callbacks import PushToHubCallback
import tensorflow as tf
num_train_steps = len(tf_train_dataset) * num_epoch

optimizer, schedule = create_optimizer(init_lr = 5e-5, num_warmup_steps=0, num_train_steps=num_epoch, weight_decay_rate=0.01)
model.compile(optimizer=optimizer)



In [18]:
from transformers.keras_callbacks import PushToHubCallback as Pcb
callback = Pcb(output_dir="translation_output", tokenizer=tokenizer)
model.fit(tf_train_dataset, validation_data = tf_eval_dataset, callbacks=[callback], epochs = num_epoch)

print(compute_metrics())


Cloning https://huggingface.co/raphaelelel/translation_output into local empty directory.


Download file tf_model.h5:   0%|          | 8.00k/292M [00:00<?, ?B/s]

Download file source.spm:   4%|4         | 32.0k/793k [00:00<?, ?B/s]

Download file target.spm:   1%|          | 7.45k/1.02M [00:00<?, ?B/s]

Clean file source.spm:   0%|          | 1.00k/793k [00:00<?, ?B/s]

Clean file target.spm:   0%|          | 1.00k/1.02M [00:00<?, ?B/s]

Clean file tf_model.h5:   0%|          | 1.00k/292M [00:00<?, ?B/s]

Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10


100%|██████████| 1197/1197 [35:03<00:00,  1.76s/it]


{'bleu': 5.08844611271075}


In [19]:
# /kaggle/working/translation_output
from transformers import pipeline

model_checkpoint = "/kaggle/working/translation_output"
translator = pipeline("translation", model=model_checkpoint)


All model checkpoint layers were used when initializing TFMarianMTModel.

All the layers of TFMarianMTModel were initialized from the model checkpoint at /kaggle/working/translation_output.
If your task is similar to the task the model of the checkpoint was trained on, you can already use TFMarianMTModel for predictions without further training.


In [20]:
translator("hello")

[{'translation_text': 'हैलो'}]

In [21]:
len(data["train"]["english_sentence"])

108464

In [22]:
for j in range(5):
    index = np.random.randint(len(data['train']["english_sentence"]), size = 1)
    print("English sentence: ", data['train']["english_sentence"][index[0]])
    print("Original Hindi Sentence: ", data['train']["hindi_sentence"][index[0]])
    print("Translated Hindi : ", translator(data['train']["english_sentence"][index[0]])[0]["translation_text"])
    print("\n")
    
    
    

English sentence:  For example, the main Tomb was completed in 1643, but the the other structures kept on getting erected later.
Original Hindi Sentence:  उदाहरणतः मुख्य मकबरा 1643 में पूर्ण हुआ था किंतु शेष समूह इमारतें बनती रहीं।
Translated Hindi :  मिसाल के लिए, सन्‌ 1643 में मुख्य कब्र बनकर तैयार हो गयी ।


English sentence:  you know, productivity, quality, patient-centered care,
Original Hindi Sentence:  जैसे कि, उत्पादक्ता, गुणवत्ता, रोगी-केन्द्रित सेवा,
Translated Hindi :  तुम्हें पता है, समृद्ध, गुणवत्ता, रोगी देखभाल, पता है,


English sentence:  The theory is that when people no longer achieve a ' high ' from cannabis , they try something stronger .
Original Hindi Sentence:  लोग यह सिद्धांत को प्रस्तुत करते हैं कि जब आदमी को कैनबिस से ऊँचा नशा नही चढ़ता है तो वह इससे तेज़ चीज चाहता है .
Translated Hindi :  सिद्धांत यह है कि जब लोग अब तक एक ' उच्च' प्राप्त नहीं कर रहे हैं , वे कुछ मजबूत की कोशिश करते हैं .


English sentence:  Film Club had done a similar job with Rudyard Kipl

In [23]:
print( " -- Validation/Testing Set -- " )
for i in range(10):
    index = np.random.randint(len(data['test']["english_sentence"]),size=1)
    print("English Sentence: ", data['test']["english_sentence"][index[0]])
    print("Original Hindi Sentence: ", data['test']["hindi_sentence"][index[0]])
    print("Translated Hindi Sentence: ", translator( data['test']["english_sentence"][index[0]])[0]["translation_text"])
    print('\n')

 -- Validation/Testing Set -- 
English Sentence:  on July 6,1944,through his speech with Gandhi in all India radio he said the reason for getting help from Japan and the establishment of Indian national army.
Original Hindi Sentence:  6 जुलाई 1944 को आजाद हिंद रेडिओ पर अपने भाषण के माध्यम से गाँधीजी से बात करते हुए नेताजी ने जापान से सहायता लेने का अपना कारण और अर्जी-हुकुमत-ए-आजाद-हिंद तथा आज़ाद हिन्द फौज की स्थापना के उद्येश्य के बारे में बताया।
Translated Hindi Sentence:  जुलाई ६, १९१९4 को पूरे भारत रेडियो में गांधी के साथ अपनी भाषण के माध्यम से उन्होंने कहा कि जापान से सहायता प्राप्त करने और भारतीय राष्ट्रीय सेना की स्थापना के कारण।


English Sentence:  and smelt the bakery air,
Original Hindi Sentence:  और उसकी नायब गंध ने आपका मन मोह लिया,
Translated Hindi Sentence:  और (बादलों को) उभार कर फैला देती हैं


English Sentence:  Intriguingly , while these agreements have been arrived at , the state Government is still fighting a case in the Supreme Court against the very sale of BALCO 