## Importing libraries and modules

In [1]:
# Enable auto-reload
%load_ext autoreload
%autoreload 2

In [2]:
# Suppress warnings
import warnings
warnings.filterwarnings("ignore")

# Add one level up directory to the path
import sys
sys.path.append("..")

# Import libraries
import torch
import random
import pandas as pd

# Import custom modules
from src.data import *
from src.machine_translation import *
from src.data.utils import get_dataset
from src.data.preprocess import clean_text
from src.data.tokenizer import CustomBartTokenizer
from src.machine_translation.translate import translate
from src.machine_translation.net import CodeMixedModel, CodeMixedModelHGTrainer
from src.machine_translation.models.bart_conditional import BartForConditionalGeneration
from src.machine_translation.utils import get_tokenized_dataset_models, get_data_loader_models, calculate_sacrebleu_score, calculate_chrf_score, calculate_bert_score, calculate_tokens

## Data acquisition, cleaning and processing

In [3]:
%%time
train_df, validation_df, test_df = get_dataset()

CPU times: user 4.77 s, sys: 178 ms, total: 4.95 s
Wall time: 8.12 s


In [4]:
train_df

Unnamed: 0,hi_en,en,source
0,film ka kya naam hai,What's the name of the movie,cmu_hinglish_dog
1,"namaste, sada hua tomatoes score mahaan hai, l...","Hi, the rotten tomatoes score is great but the...",cmu_hinglish_dog
2,kya aapako lagata hai ki aapako film pasand aa...,Do you think you will like the movie,cmu_hinglish_dog
3,yah kis tarah kee philm hai,What kind of movie is it,cmu_hinglish_dog
4,film kab banee thee?,when was the movie made?,cmu_hinglish_dog
...,...,...,...
174438,Thik hai\n,Ok.,linc
174439,Thik hai bhai\n,ok bro,linc
174440,Kya ham chalu kar sakte hai?\n,shall we continue?,linc
174441,Kya aapko pasand hai hamare saath\n,do you like we can,linc


In [5]:
validation_df

Unnamed: 0,hi_en,en,source
0,movie kis baare me hai?,What is the movie about?,cmu_hinglish_dog
1,Movie ek chhote bacche Kevin k baare me hai ji...,the movie is about a young child named Kevin w...,cmu_hinglish_dog
2,Kya wo jaan bhuj k abandon karte hai?,Did they abandon him on purpose?,cmu_hinglish_dog
3,"nahi, wo uska track lose kardete hai kyuki bah...",no they had lost track of him since they had m...,cmu_hinglish_dog
4,Kya wo realize karte hai k wo chhut gaya aur u...,Did they realize they lost track of him and co...,cmu_hinglish_dog
...,...,...,...
3238,Sunkar good movie hai. Kya ham finish kar skat...,Sounds like a good movie. Can we finish now?,linc
3239,Yep. Thanks baat karne ke liye\n,Yep. Thanks for chatting,linc
3240,"Thanks, mei dekhati hui. Achi baat hai\n","thanks, I will watch it. SOunds good",linc
3241,kya tumhe movie Despicable Me pasand hai?\n,Did you like the movie Despicable Me?,linc


In [6]:
test_df

Unnamed: 0,hi_en,en,source
0,oye ledki kitni mazakiya movie dekh rhi ho ?,ah mean girls. such a funny movie. have you se...,cmu_hinglish_dog
1,ha muje bhi hassi mazak ki movies pasand par y...,"Yeah, even though I love comedies, this wasn't...",cmu_hinglish_dog
2,kese tina ka jhuti writing isme achi the usne ...,how come. thought tina fey's writing was fanta...,cmu_hinglish_dog
3,muje kahani bhute achi lagi aur unhone ise dac...,I loved the story & how true they made it in h...,cmu_hinglish_dog
4,ab samja muje nhi pta tha ki aap sahi admi hai...,I see. i didn't realize it was 14 years ago. y...,cmu_hinglish_dog
...,...,...,...
7067,alarm ko abhi stop kare,Stop alarm now,top
7068,Har ghante ke liye alarm set kare,set alarm every hour,top
7069,Bobby ko text kare,text Bobby,top
7070,Muje shaam 6 baje laundry ko pick up karne ke ...,remind me to pick up laundry at 6 pm,top


In [7]:
%%time
# Cleaning train_df
train_df = train_df.applymap(clean_text)
train_df

CPU times: user 2.86 s, sys: 2.72 ms, total: 2.87 s
Wall time: 2.87 s


Unnamed: 0,hi_en,en,source
0,film ka kya naam hai,what's the name of the movie,cmu_hinglish_dog
1,"namaste, sada hua tomatoes score mahaan hai, l...","hi, the rotten tomatoes score is great but the...",cmu_hinglish_dog
2,kya aapako lagata hai ki aapako film pasand aa...,do you think you will like the movie,cmu_hinglish_dog
3,yah kis tarah kee philm hai,what kind of movie is it,cmu_hinglish_dog
4,film kab banee thee?,when was the movie made?,cmu_hinglish_dog
...,...,...,...
174438,thik hai,ok.,linc
174439,thik hai bhai,ok bro,linc
174440,kya ham chalu kar sakte hai?,shall we continue?,linc
174441,kya aapko pasand hai hamare saath,do you like we can,linc


In [8]:
%%time
# Cleaning validation_df
validation_df = validation_df.applymap(clean_text)
validation_df

CPU times: user 53.4 ms, sys: 590 µs, total: 54 ms
Wall time: 53.7 ms


Unnamed: 0,hi_en,en,source
0,movie kis baare me hai?,what is the movie about?,cmu_hinglish_dog
1,movie ek chhote bacche kevin k baare me hai ji...,the movie is about a young child named kevin w...,cmu_hinglish_dog
2,kya wo jaan bhuj k abandon karte hai?,did they abandon him on purpose?,cmu_hinglish_dog
3,"nahi, wo uska track lose kardete hai kyuki bah...",no they had lost track of him since they had m...,cmu_hinglish_dog
4,kya wo realize karte hai k wo chhut gaya aur u...,did they realize they lost track of him and co...,cmu_hinglish_dog
...,...,...,...
3238,sunkar good movie hai. kya ham finish kar skat...,sounds like a good movie. can we finish now?,linc
3239,yep. thanks baat karne ke liye,yep. thanks for chatting,linc
3240,"thanks, mei dekhati hui. achi baat hai","thanks, i will watch it. sounds good",linc
3241,kya tumhe movie despicable me pasand hai?,did you like the movie despicable me?,linc


In [9]:
%%time
# Cleaning test_df
test_df = test_df.applymap(clean_text)
test_df

CPU times: user 107 ms, sys: 523 µs, total: 108 ms
Wall time: 107 ms


Unnamed: 0,hi_en,en,source
0,oye ledki kitni mazakiya movie dekh rhi ho ?,ah mean girls. such a funny movie. have you se...,cmu_hinglish_dog
1,ha muje bhi hassi mazak ki movies pasand par y...,"yeah, even though i love comedies, this wasn't...",cmu_hinglish_dog
2,kese tina ka jhuti writing isme achi the usne ...,how come. thought tina fey's writing was fanta...,cmu_hinglish_dog
3,muje kahani bhute achi lagi aur unhone ise dac...,i loved the story & how true they made it in h...,cmu_hinglish_dog
4,ab samja muje nhi pta tha ki aap sahi admi hai...,i see. i didn't realize it was 14 years ago. y...,cmu_hinglish_dog
...,...,...,...
7067,alarm ko abhi stop kare,stop alarm now,top
7068,har ghante ke liye alarm set kare,set alarm every hour,top
7069,bobby ko text kare,text bobby,top
7070,muje shaam 6 baje laundry ko pick up karne ke ...,remind me to pick up laundry at 6 pm,top


## Data tokenization

In [10]:
%%time
bart_tokenizer = CustomBartTokenizer().build()
bart_tokenizer_scratch = CustomBartTokenizer().build(data=train_df["hi_en"], tokenizer_style=STYLE.SCRATCH.value)
bart_tokenizer_append = CustomBartTokenizer().build(data=train_df["hi_en"], tokenizer_style=STYLE.APPEND.value)







CPU times: user 27.6 s, sys: 76.9 ms, total: 27.7 s
Wall time: 28.3 s


In [11]:
query = "Hello! Ye CSCI 544 ka project he. Ye project Code-Mixed Machine Translation ke bara me he. Hamane hamari khoon bahahe."
query = clean_text(text=query)

In [12]:
token_idx = bart_tokenizer.encode(query, add_special_tokens=True)
token_word = bart_tokenizer.convert_ids_to_tokens(token_idx)
print('#'*100)
print("This is an example of using the default Bart Tokenizer.\n")
print(f"Sentence: {query}\n")
print(f"Tokenized word: {token_word}\n")
print(f"Tokenized idx: {token_idx}")
print('#'*100)

####################################################################################################
This is an example of using the default Bart Tokenizer.

Sentence: hello! ye csci 544 ka project he. ye project code-mixed machine translation ke bara me he. hamane hamari khoon bahahe.

Tokenized word: ['<s>', 'hello', '!', 'Ġye', 'Ġc', 'sci', 'Ġ5', '44', 'Ġka', 'Ġproject', 'Ġhe', '.', 'Ġye', 'Ġproject', 'Ġcode', '-', 'm', 'ixed', 'Ġmachine', 'Ġtranslation', 'Ġke', 'Ġbar', 'a', 'Ġme', 'Ġhe', '.', 'Ġham', 'ane', 'Ġham', 'ari', 'Ġkh', 'oon', 'Ġb', 'aha', 'he', '.', '</s>']

Tokenized idx: [0, 42891, 328, 32440, 740, 43428, 195, 3305, 4661, 695, 37, 4, 32440, 695, 3260, 12, 119, 24194, 3563, 19850, 7321, 2003, 102, 162, 37, 4, 11402, 1728, 11402, 1512, 16447, 3863, 741, 11695, 700, 4, 2]
####################################################################################################


In [13]:
token_idx = bart_tokenizer_scratch.encode(query, add_special_tokens=True)
token_word = bart_tokenizer_scratch.convert_ids_to_tokens(token_idx)
print('#'*100)
print("This is an example of the Bart Tokenizer that is trained from scratch.\n")
print(f"Sentence: {query}\n")
print(f"Tokenized word: {token_word}\n")
print(f"Tokenized idx: {token_idx}")
print('#'*100)

####################################################################################################
This is an example of the Bart Tokenizer that is trained from scratch.

Sentence: hello! ye csci 544 ka project he. ye project code-mixed machine translation ke bara me he. hamane hamari khoon bahahe.

Tokenized word: ['<s>', 'hello', '!', 'Ġye', 'Ġc', 'sci', 'Ġ5', '44', 'Ġka', 'Ġproject', 'Ġhe', '.', 'Ġye', 'Ġproject', 'Ġcode', '-', 'mix', 'ed', 'Ġmachine', 'Ġtrans', 'lation', 'Ġke', 'Ġbara', 'Ġme', 'Ġhe', '.', 'Ġhamane', 'Ġhamari', 'Ġkhoon', 'Ġbaha', 'he', '.', '</s>']

Tokenized idx: [0, 2317, 4, 520, 288, 21554, 466, 11342, 312, 3070, 356, 17, 520, 3070, 6438, 16, 19530, 517, 6262, 3397, 18402, 271, 17190, 284, 356, 17, 11714, 4630, 10623, 13967, 282, 17, 2]
####################################################################################################


In [14]:
token_idx = bart_tokenizer_append.encode(query, add_special_tokens=True)
token_word = bart_tokenizer_append.convert_ids_to_tokens(token_idx)
print('#'*100)
print("This is an example of the Bart Tokenizer to which we append our new data.\n")
print(f"Sentence: {query}\n")
print(f"Tokenized word: {token_word}\n")
print(f"Tokenized idx: {token_idx}")
print('#'*100)

####################################################################################################
This is an example of the Bart Tokenizer to which we append our new data.

Sentence: hello! ye csci 544 ka project he. ye project code-mixed machine translation ke bara me he. hamane hamari khoon bahahe.

Tokenized word: ['<s>', 'hello', '!', 'Ġ', 'ye', 'Ġ', 'csc', 'i', 'Ġ', '54', '4', 'Ġ', 'ka', 'Ġ', 'project', 'Ġ', 'he', '.', 'Ġ', 'ye', 'Ġ', 'project', 'Ġ', 'code', '-', 'mix', 'ed', 'Ġ', 'mach', 'ine', 'Ġ', 'trans', 'lation', 'Ġ', 'ke', 'Ġ', 'bara', 'Ġ', 'me', 'Ġ', 'he', '.', 'Ġ', 'hamane', 'Ġ', 'hamari', 'Ġ', 'khoo', 'n', 'Ġ', 'baha', 'he', '.', '</s>']

Tokenized idx: [0, 42891, 328, 1437, 4717, 1437, 60283, 118, 1437, 4283, 306, 1437, 2348, 1437, 28258, 1437, 700, 4, 1437, 4717, 1437, 28258, 1437, 20414, 12, 39915, 196, 1437, 82894, 833, 1437, 9981, 35019, 1437, 1071, 1437, 31533, 1437, 1794, 1437, 700, 4, 1437, 62217, 1437, 84301, 1437, 58789, 282, 1437, 80269, 700, 4, 2]
########

In [15]:
%%time
# Now that we know how to get a BART-Tokenizer - default, appeded and scratch, let us get the tokenizer for our code-mixed language
# and the target language.
# Code-Mixed Language - Build a Bart-Tokenizer from scratch.
# Target Language (English) - Use the default Bart-Tokenizer.
hi_en_bart_tokenizer = CustomBartTokenizer().build(
    data=train_df["hi_en"],
    tokenizer_style=MBART_TOKENIZER_BPE_BINDING_BART_TOKENIZER_ENCODER_STYLE,
    tokenizer_bart_from_pretrained_path=MBART_TOKENIZER_BPE_BINDING_BART_TOKENIZER_ENCODER_FROM_PRETRAINED
)
en_bart_tokenizer = CustomBartTokenizer().build(
    tokenizer_style=MBART_TOKENIZER_BPE_BINDING_BART_TOKENIZER_DECODER_STYLE,
    tokenizer_bart_from_pretrained_path=MBART_TOKENIZER_BPE_BINDING_BART_TOKENIZER_DECODER_FROM_PRETRAINED
)




CPU times: user 13.6 s, sys: 34.9 ms, total: 13.6 s
Wall time: 13.8 s


In [16]:
print(f"Hinglish tokenizer vocab size: {hi_en_bart_tokenizer.vocab_size}")
print(f"English tokenizer vocab size: {en_bart_tokenizer.vocab_size}")

Hinglish tokenizer vocab size: 50265
English tokenizer vocab size: 50265


In [17]:
%%time
print(f"There are {train_df['hi_en'].apply(lambda x: calculate_tokens(x, hi_en_bart_tokenizer)).sum()} hinglish tokens in the train data.")
print(f"There are {train_df['en'].apply(lambda x: calculate_tokens(x, en_bart_tokenizer)).sum()} english tokens in the train data.")
print(f"There are {validation_df['hi_en'].apply(lambda x: calculate_tokens(x, hi_en_bart_tokenizer)).sum()} hinglish tokens in the validation data.")
print(f"There are {validation_df['en'].apply(lambda x: calculate_tokens(x, en_bart_tokenizer)).sum()} english tokens in the validation data.")
print(f"There are {test_df['hi_en'].apply(lambda x: calculate_tokens(x, hi_en_bart_tokenizer)).sum()} hinglish tokens in the test data.")
print(f"There are {test_df['en'].apply(lambda x: calculate_tokens(x, en_bart_tokenizer)).sum()} english tokens in the test data.")

There are 1764738 hinglish tokens in the train data.
There are 1807859 english tokens in the train data.
There are 38502 hinglish tokens in the validation data.
There are 38366 english tokens in the validation data.
There are 70895 hinglish tokens in the test data.
There are 69686 english tokens in the test data.
CPU times: user 8.9 s, sys: 6.08 ms, total: 8.91 s
Wall time: 8.92 s


## Dataset and Data Loaders

In [18]:
# Get the tokenized dataset
__tokenized_datasets__ = get_tokenized_dataset_models(
    train_df=train_df,
    validation_df=validation_df,
    test_df=test_df,
    encoder_tokenizer=hi_en_bart_tokenizer,
    decoder_tokenizer=en_bart_tokenizer
)
__tokenized_datasets__

defaultdict(dict,
            {'hi_en__en': {'train': <src.machine_translation.data.CodeMixedTokenizedDataset at 0x29dc0c250>,
              'validation': <src.machine_translation.data.CodeMixedTokenizedDataset at 0x29dc0cac0>,
              'test': <src.machine_translation.data.CodeMixedTokenizedDataset at 0x29dc0c790>}})

In [19]:
__train_dataset__, __validation_dataset__, __test_dataset__ = __tokenized_datasets__["hi_en__en"].values()

In [20]:
__train_dataset__.visualize()

####################################################################################################
Dataset
Number of instances:  174443
Denoising stage:  False
Source language:  hi_en
Target language:  en
Key: input_ids, Value: tensor([    0,  3497,    15, 10386,  1071,  2025,  2185,  6089,   278,    15,
         1088,  4175,  2430,  2185,   392, 33673,  1711,  1963,   291,  1104,
         1067,  1758,   278,    17,     2]), Value shape: torch.Size([25])
Key: attention_mask, Value: tensor([1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
        1]), Value shape: torch.Size([25])
Key: labels, Value: tensor([    0,  3592,     6,     5, 34485, 18553,  1471,    16,   372,    53,
            5, 32820,  7745,  1471,  1302,    10,   410,   614,    10,  1569,
            9,    42,  1318,     4,     2]), Value shape: torch.Size([25])
####################################################################################################


In [21]:
# Get the data loaders for denoising
__data_loaders__ = get_data_loader_models(
    train_df=train_df,
    validation_df=validation_df,
    test_df=test_df,
    train_batch_size=4, 
    validation_batch_size=4, 
    test_batch_size=4,
    encoder_tokenizer=hi_en_bart_tokenizer,
    decoder_tokenizer=hi_en_bart_tokenizer,
    denoising_stage=True
)

In [22]:
__data_loaders__

defaultdict(dict,
            {'hi_en__en': {'object': <src.machine_translation.data.CodeMixedDataLoader at 0x2ad1ef790>,
              'train': <torch.utils.data.dataloader.DataLoader at 0x2ad1ef130>,
              'validation': <torch.utils.data.dataloader.DataLoader at 0x2ad1ef0a0>,
              'test': <torch.utils.data.dataloader.DataLoader at 0x2ad1ef5e0>}})

In [23]:
__data_loader__, __train_data_loader__, __validation_data_loader__, __test_data_loader__ = __data_loaders__["hi_en__en"].values()

In [24]:
%%time
__data_loader__.visualize()

####################################################################################################
Train Dataloader
Batch Size:  4
Number of batches:  43611
Batch source language shape:  torch.Size([4, 25])
Batch source language:  ['kal subha 10 am ke liye ek alarm ko set karen', 'likho ki enjoy your weekend .', 'mujhe family reunion plans ke baare me reinders dikhao', 'timer par kitna samay hai']
Batch source tokens:  tensor([[    0,   404,   626,   552,   489,   271,   283,   379,   320,   280,
           334,   437,     2,     1,     1,     1,     1,     1,     1,     1,
             1,     1,     1,     1,     1],
        [    0, 17083,   294,  2476,  1939,   496,   322,     2,     1,     1,
             1,     1,     1,     1,     1,     1,     1,     1,     1,     1,
             1,     1,     1,     1,     1],
        [    0,   348,  1210,  3964,  3073,   271,   631,   284, 38776,   685,
             2,     1,     1,     1,     1,     1,     1,     1,     1,     1,
           

100%|█████| 43611/43611 [00:41<00:00, 1048.52it/s]


Validation of train loader successful.
####################################################################################################
Val Dataloader
Batch Size:  4
Number of batches:  811
Validating validation laoder...


100%|███████████| 811/811 [00:31<00:00, 26.03it/s]


Validation of validation loader successful.
####################################################################################################
Test Dataloader
Batch Size:  4
Number of batches:  1768
Validating test laoder...


100%|█████████| 1768/1768 [00:31<00:00, 56.44it/s]

Validation of test loader successful.
####################################################################################################
CPU times: user 8.14 s, sys: 4.8 s, total: 12.9 s
Wall time: 1min 59s





In [25]:
# Get the data loaders for translation
__data_loaders__ = get_data_loader_models(
    train_df=train_df,
    validation_df=validation_df,
    test_df=test_df,
    train_batch_size=4, 
    validation_batch_size=4, 
    test_batch_size=4,
    encoder_tokenizer=hi_en_bart_tokenizer,
    decoder_tokenizer=en_bart_tokenizer
)

In [26]:
__data_loaders__

defaultdict(dict,
            {'hi_en__en': {'object': <src.machine_translation.data.CodeMixedDataLoader at 0x2ad1ee1c0>,
              'train': <torch.utils.data.dataloader.DataLoader at 0x2ad1eea00>,
              'validation': <torch.utils.data.dataloader.DataLoader at 0x2ae78b8e0>,
              'test': <torch.utils.data.dataloader.DataLoader at 0x2ae78bca0>}})

In [27]:
__data_loader__, __train_data_loader__, __validation_data_loader__, __test_data_loader__ = __data_loaders__["hi_en__en"].values()

In [28]:
%%time
__data_loader__.visualize()

####################################################################################################
Train Dataloader
Batch Size:  4
Number of batches:  43611
Batch source language shape:  torch.Size([4, 25])
Batch source language:  ['aaj raat 10 : 30 ke liye ek alarm set karen', 'aaj chicago me traffic kaisa hai', 'hahah tum jaante ho', 'kya 101 backed up hai']
Batch source tokens:  tensor([[   0,  479,  465,  552,  477,  586,  271,  283,  379,  320,  334,  437,
            2,    1,    1,    1,    1,    1,    1,    1,    1,    1,    1,    1,
            1],
        [   0,  479, 1407,  284,  374,  428,  278,    2,    1,    1,    1,    1,
            1,    1,    1,    1,    1,    1,    1,    1,    1,    1,    1,    1,
            1],
        [   0, 9070,  770, 5924,  310,    2,    1,    1,    1,    1,    1,    1,
            1,    1,    1,    1,    1,    1,    1,    1,    1,    1,    1,    1,
            1],
        [   0,  316, 4278, 4743,  522,  278,    2,    1,    1,    1,    1,    1

100%|█████| 43611/43611 [00:42<00:00, 1031.34it/s]


Validation of train loader successful.
####################################################################################################
Val Dataloader
Batch Size:  4
Number of batches:  811
Validating validation laoder...


100%|███████████| 811/811 [00:31<00:00, 25.86it/s]


Validation of validation loader successful.
####################################################################################################
Test Dataloader
Batch Size:  4
Number of batches:  1768
Validating test laoder...


100%|█████████| 1768/1768 [00:31<00:00, 55.99it/s]

Validation of test loader successful.
####################################################################################################
CPU times: user 8.79 s, sys: 4.87 s, total: 13.7 s
Wall time: 2min 11s





## Model definition

In [29]:
__model__ = BartForConditionalGeneration()
__model__.model

BartForConditionalGeneration(
  (model): BartModel(
    (shared): Embedding(50265, 1024, padding_idx=1)
    (encoder): BartEncoder(
      (embed_tokens): Embedding(50265, 1024, padding_idx=1)
      (embed_positions): BartLearnedPositionalEmbedding(1026, 1024)
      (layers): ModuleList(
        (0-11): 12 x BartEncoderLayer(
          (self_attn): BartAttention(
            (k_proj): Linear(in_features=1024, out_features=1024, bias=True)
            (v_proj): Linear(in_features=1024, out_features=1024, bias=True)
            (q_proj): Linear(in_features=1024, out_features=1024, bias=True)
            (out_proj): Linear(in_features=1024, out_features=1024, bias=True)
          )
          (self_attn_layer_norm): LayerNorm((1024,), eps=1e-05, elementwise_affine=True)
          (activation_fn): GELUActivation()
          (fc1): Linear(in_features=1024, out_features=4096, bias=True)
          (fc2): Linear(in_features=4096, out_features=1024, bias=True)
          (final_layer_norm): LayerN

In [30]:
__batch_size__ = 8
__en_seq_length__ = random.randint(13, 23)
__de_seq_length__ = random.randint(13, 23)
__encoder_input__ = torch.randint(low=0, high=hi_en_bart_tokenizer.vocab_size, size=(__batch_size__, __en_seq_length__))
__decoder_input__ = torch.randint(low=0, high=en_bart_tokenizer.vocab_size, size=(__batch_size__, __de_seq_length__))
__encoder_input__ = __encoder_input__.to(MBART_MODEL_CONDITIONAL_GENERATION_DEVICE)
__decoder_input__ = __decoder_input__.to(MBART_MODEL_CONDITIONAL_GENERATION_DEVICE)
__out__ = __model__.model(input_ids=__encoder_input__, decoder_input_ids=__decoder_input__, return_dict=True)
print(f"Model has a total of {__model__.model.num_parameters()} number of parameters")
print("Model encoder input size: ", __encoder_input__.size())
print("Model decoder input size: ", __decoder_input__.size())
print("Model output type: ", type(__out__))
print("Model output size: ", __out__.logits.size())
print("Model output:\n", __out__.logits)

Model has a total of 406291456 number of parameters
Model encoder input size:  torch.Size([8, 15])
Model decoder input size:  torch.Size([8, 18])
Model output type:  <class 'transformers.modeling_outputs.Seq2SeqLMOutput'>
Model output size:  torch.Size([8, 18, 50265])
Model output:
 tensor([[[ 1.1430e+01, -1.3800e+00,  5.1728e+00,  ..., -1.7569e+00,
          -2.3358e+00,  5.7826e+00],
         [-2.0708e+01, -3.1234e+00,  5.2102e+00,  ..., -1.6977e+00,
          -1.1658e+00,  4.0243e+00],
         [-1.5470e+01, -3.1732e+00,  1.2642e+00,  ..., -3.2186e+00,
          -3.1851e+00,  1.1097e+00],
         ...,
         [-1.0355e+00, -3.3531e+00,  1.7390e+00,  ..., -3.0118e+00,
          -3.0561e+00,  5.9904e-01],
         [-1.5965e+00, -2.9003e+00,  2.0379e+00,  ..., -1.9576e+00,
          -2.7434e+00, -4.6175e-01],
         [-1.5275e+00, -3.5317e+00,  1.4155e+00,  ..., -3.4971e+00,
          -3.3030e+00, -2.0949e-01]],

        [[ 1.1569e+01, -1.5596e+00,  5.9848e+00,  ..., -1.4894e-02,
  

## Model Training

In [31]:
# Get the tokenized dataset
dataset = get_tokenized_dataset_models(
    train_df=train_df,
    validation_df=validation_df,
    test_df=test_df,
    encoder_tokenizer=hi_en_bart_tokenizer,
    decoder_tokenizer=en_bart_tokenizer
)
train_dataset, validation_dataset, test_dataset = dataset[MBART_DATALOADER_TRANSLATION_MODE].values()

In [32]:
# Initalize the model
mt_hg_model = CodeMixedModelHGTrainer(
    train_dataset = train_dataset,
    validation_dataset = validation_dataset,
    test_dataset = test_dataset,
    encoder_tokenizer=hi_en_bart_tokenizer,
    decoder_tokenizer=en_bart_tokenizer
)

Getting the model...

Configuring optimizer and scheduler...

Configuring collator...

Configuring trainer...

Configuring training arguments...


In [33]:
%%time
# Train the model, skip during submission time.
mt_hg_model.fit(skip_training=True)


Validating the model...


Evaluting validation data:   0%| | 0/102 [00:00<?,Some weights of RobertaModel were not initialized from the model checkpoint at roberta-large and are newly initialized: ['roberta.pooler.dense.bias', 'roberta.pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
Evaluting validation data: 100%|█| 102/102 [16:09<

***** validation metrics *****
  bert_score_f1        = 0.9322406665576413
  bert_score_precision =  0.932659924656166
  bert_score_recall    = 0.9320171836551125
  chrf_score           =            56.6586
  sacrebleu_score      =            34.5518
  validation_samples   =               3243
CPU times: user 24min 18s, sys: 10min 42s, total: 35min
Wall time: 16min 31s





In [34]:
%%time
# Evaluate model on test data
mt_hg_model.predict(model_path=MBART_MODEL_CONDITIONAL_GENERATION_RESUME_FROM_CHECKPOINT)

Loading the model...

Generating translations...



Evaluating test data: 100%|█| 221/221 [34:43<00:00

SacreBLEU score: 50.39604085424078
CHRF score: 70.40279194217455
BERT score - Precision: 0.957356490211654
BERT score - Recall: 0.9575719921027913
BERT score - F1: 0.9574025283689818
CPU times: user 50min 52s, sys: 22min 47s, total: 1h 13min 40s
Wall time: 34min 48s





In [39]:
# Run inference
src = "film ka kya name hai?"
print(f"\nSRC: {src}\nTGT: {mt_hg_model.infer(model_path=MBART_MODEL_CONDITIONAL_GENERATION_RESUME_FROM_CHECKPOINT, src=src)}")

Loading the model...

Generating translation for the source string...

Source string:  film ka kya name hai?
Translated string:  tensor([[    2,     0, 12196,    16,     5,   766,     9,     5,  1569,   116,
             2]])

SRC: film ka kya name hai?
TGT: what is the name of the movie?


In [38]:
%%time
# Generate translations for the test data
translated_df = translate(
    data=test_df,
    encoder_tokenizer=hi_en_bart_tokenizer,
    decoder_tokenizer=en_bart_tokenizer
)
translated_df

Getting translation for the data: 100%|█| 221/221 

CPU times: user 40min 29s, sys: 19min 44s, total: 1h 14s
Wall time: 30min 7s





Unnamed: 0,hi_en,en,source,translations
0,oye ledki kitni mazakiya movie dekh rhi ho ?,ah mean girls. such a funny movie. have you se...,cmu_hinglish_dog,how many times do you see this movie on your t...
1,ha muje bhi hassi mazak ki movies pasand par y...,"yeah, even though i love comedies, this wasn't...",cmu_hinglish_dog,yeah i also have a hard time with movies that ...
2,kese tina ka jhuti writing isme achi the usne ...,how come. thought tina fey's writing was fanta...,cmu_hinglish_dog,the new turing novel has a pretty unique writi...
3,muje kahani bhute achi lagi aur unhone ise dac...,i loved the story & how true they made it in h...,cmu_hinglish_dog,i really liked the story and how they tried to...
4,ab samja muje nhi pta tha ki aap sahi admi hai...,i see. i didn't realize it was 14 years ago. y...,cmu_hinglish_dog,"now that i think about it, i'm not sure you co..."
...,...,...,...,...
7067,alarm ko abhi stop kare,stop alarm now,top,stop the alarm now
7068,har ghante ke liye alarm set kare,set alarm every hour,top,set an alarm for every hour
7069,bobby ko text kare,text bobby,top,text bobby.
7070,muje shaam 6 baje laundry ko pick up karne ke ...,remind me to pick up laundry at 6 pm,top,remind me to pick up laundry at 6 pm


In [40]:
%%time
# Calculate metrics for these translations
bleu_scores = translated_df.groupby('source').apply(calculate_sacrebleu_score)
o_bleu_scores = calculate_sacrebleu_score(translated_df)
chrf_scores = translated_df.groupby('source').apply(calculate_chrf_score)
o_chrf_scores = calculate_chrf_score(translated_df)
bert_scores = translated_df.groupby('source').apply(calculate_bert_score)
o_bert_scores = calculate_bert_score(translated_df)

Some weights of RobertaModel were not initialized from the model checkpoint at roberta-large and are newly initialized: ['roberta.pooler.dense.bias', 'roberta.pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
Some weights of RobertaModel were not initialized from the model checkpoint at roberta-large and are newly initialized: ['roberta.pooler.dense.bias', 'roberta.pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
Some weights of RobertaModel were not initialized from the model checkpoint at roberta-large and are newly initialized: ['roberta.pooler.dense.bias', 'roberta.pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


CPU times: user 7min 17s, sys: 1min 13s, total: 8min 30s
Wall time: 3min 43s


In [42]:
# Visualize results, grouped by data source
res = pd.concat([bleu_scores, chrf_scores, bert_scores], axis=1)
res.columns = ["SacreBERT Score", "ChrF Score", "BERT Score"]
res[["BERTScore-Precision", "BERTScore-Recall", "BERTScore-F1"]] = res["BERT Score"].apply(pd.Series)
res.drop("BERT Score", axis=1, inplace=True)
res

Unnamed: 0_level_0,SacreBERT Score,ChrF Score,BERTScore-Precision,BERTScore-Recall,BERTScore-F1
source,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
cmu_hinglish_dog,15.08743,33.217919,0.904818,0.89806,0.901277
top,58.648575,76.29658,0.961932,0.957228,0.959495


In [43]:
# Overall results
print(f"Overall score on our test data by our model:\n\nSacreBLEU Score: {o_bleu_scores}\nChrF Score: {o_chrf_scores}\nBERTScore-Precision: {o_bert_scores[0]}\nBERTScore-Recall: {o_bert_scores[1]}\nBERTScore-F1: {o_bert_scores[2]}")

Overall score on our test data by our model:

SacreBLEU Score: 50.37833694526229
ChrF Score: 68.44914629567602
BERTScore-Precision: 0.9547688149472032
BERTScore-Recall: 0.9498065320260789
BERTScore-F1: 0.9521926869027215


In [52]:
samples = [
    "miami se west palm tak kitni der lagegi",
    "Kab melige coffee mujhe?",
    "kya aap kal ke liye mera alarm set kar sakte he",
    "kya mujhe forecast milsakta hai please ?",
    "kitna time lega?"
    "me tumse pyaar kartha hu"
]
for sample in samples:
    print(f"\nSRC: {sample}\nTGT: {mt_hg_model.infer(model_path=MBART_MODEL_CONDITIONAL_GENERATION_RESUME_FROM_CHECKPOINT, src=sample, need_print=False)}")


SRC: miami se west palm tak kitni der lagegi
TGT: how long will it take from miami to west palm

SRC: Kab melige coffee mujhe?
TGT: okay, will you give me a fresh coffee?

SRC: kya aap kal ke liye mera alarm set kar sakte he
TGT: can you set my alarm for tomorrow

SRC: kya mujhe forecast milsakta hai please ?
TGT: can i have the forecast please?

SRC: kitna time lega?me tumse pyaar kartha hu
TGT: how much time has passed? i love you
