## Importing libraries and modules

In [1]:
# Enable auto-reload
%load_ext autoreload
%autoreload 2

In [2]:
# Suppress warnings
import warnings
warnings.filterwarnings("ignore")

# Add one level up directory to the path
import sys
sys.path.append("..")

# Import libraries
import torch
import random

# Import custom modules
from src.data import *
from src.data.utils import get_dataset
from src.data.preprocess import clean_text
from src.data.tokenizer import CustomBartTokenizer
from src.machine_translation.utils import get_data_loader_models
from src.machine_translation.models.bart_conditional import BartForConditionalGeneration

## Data acquisition, cleaning and processing

In [3]:
%%time
train_df, validation_df, test_df = get_dataset()

CPU times: user 4.31 s, sys: 156 ms, total: 4.47 s
Wall time: 7.43 s


In [4]:
train_df.en.info()

<class 'pandas.core.series.Series'>
Index: 174443 entries, 0 to 8059
Series name: en
Non-Null Count   Dtype 
--------------   ----- 
174443 non-null  object
dtypes: object(1)
memory usage: 2.7+ MB


In [5]:
train_df

Unnamed: 0,hi_en,en
0,film ka kya naam hai,What's the name of the movie
1,"namaste, sada hua tomatoes score mahaan hai, l...","Hi, the rotten tomatoes score is great but the..."
2,kya aapako lagata hai ki aapako film pasand aa...,Do you think you will like the movie
3,yah kis tarah kee philm hai,What kind of movie is it
4,film kab banee thee?,when was the movie made?
...,...,...
8055,Thik hai\n,Ok.
8056,Thik hai bhai\n,ok bro
8057,Kya ham chalu kar sakte hai?\n,shall we continue?
8058,Kya aapko pasand hai hamare saath\n,do you like we can


In [6]:
validation_df

Unnamed: 0,hi_en,en
0,movie kis baare me hai?,What is the movie about?
1,Movie ek chhote bacche Kevin k baare me hai ji...,the movie is about a young child named Kevin w...
2,Kya wo jaan bhuj k abandon karte hai?,Did they abandon him on purpose?
3,"nahi, wo uska track lose kardete hai kyuki bah...",no they had lost track of him since they had m...
4,Kya wo realize karte hai k wo chhut gaya aur u...,Did they realize they lost track of him and co...
...,...,...
936,Sunkar good movie hai. Kya ham finish kar skat...,Sounds like a good movie. Can we finish now?
937,Yep. Thanks baat karne ke liye\n,Yep. Thanks for chatting
938,"Thanks, mei dekhati hui. Achi baat hai\n","thanks, I will watch it. SOunds good"
940,kya tumhe movie Despicable Me pasand hai?\n,Did you like the movie Despicable Me?


In [7]:
test_df

Unnamed: 0,hi_en,en
0,oye ledki kitni mazakiya movie dekh rhi ho ?,ah mean girls. such a funny movie. have you se...
1,ha muje bhi hassi mazak ki movies pasand par y...,"Yeah, even though I love comedies, this wasn't..."
2,kese tina ka jhuti writing isme achi the usne ...,how come. thought tina fey's writing was fanta...
3,muje kahani bhute achi lagi aur unhone ise dac...,I loved the story & how true they made it in h...
4,ab samja muje nhi pta tha ki aap sahi admi hai...,I see. i didn't realize it was 14 years ago. y...
...,...,...
6508,alarm ko abhi stop kare,Stop alarm now
6509,Har ghante ke liye alarm set kare,set alarm every hour
6510,Bobby ko text kare,text Bobby
6511,Muje shaam 6 baje laundry ko pick up karne ke ...,remind me to pick up laundry at 6 pm


In [8]:
%%time
train_df = train_df.applymap(clean_text)
train_df

CPU times: user 2.15 s, sys: 2.43 ms, total: 2.15 s
Wall time: 2.17 s


Unnamed: 0,hi_en,en
0,film ka kya naam hai,what's the name of the movie
1,"namaste, sada hua tomatoes score mahaan hai, l...","hi, the rotten tomatoes score is great but the..."
2,kya aapako lagata hai ki aapako film pasand aa...,do you think you will like the movie
3,yah kis tarah kee philm hai,what kind of movie is it
4,film kab banee thee?,when was the movie made?
...,...,...
8055,thik hai,ok.
8056,thik hai bhai,ok bro
8057,kya ham chalu kar sakte hai?,shall we continue?
8058,kya aapko pasand hai hamare saath,do you like we can


In [9]:
%%time
validation_df = validation_df.applymap(clean_text)
validation_df

CPU times: user 43.3 ms, sys: 594 µs, total: 43.9 ms
Wall time: 43.5 ms


Unnamed: 0,hi_en,en
0,movie kis baare me hai?,what is the movie about?
1,movie ek chhote bacche kevin k baare me hai ji...,the movie is about a young child named kevin w...
2,kya wo jaan bhuj k abandon karte hai?,did they abandon him on purpose?
3,"nahi, wo uska track lose kardete hai kyuki bah...",no they had lost track of him since they had m...
4,kya wo realize karte hai k wo chhut gaya aur u...,did they realize they lost track of him and co...
...,...,...
936,sunkar good movie hai. kya ham finish kar skat...,sounds like a good movie. can we finish now?
937,yep. thanks baat karne ke liye,yep. thanks for chatting
938,"thanks, mei dekhati hui. achi baat hai","thanks, i will watch it. sounds good"
940,kya tumhe movie despicable me pasand hai?,did you like the movie despicable me?


In [10]:
%%time
test_df = test_df.applymap(clean_text)
test_df

CPU times: user 86.5 ms, sys: 896 µs, total: 87.3 ms
Wall time: 86.9 ms


Unnamed: 0,hi_en,en
0,oye ledki kitni mazakiya movie dekh rhi ho ?,ah mean girls. such a funny movie. have you se...
1,ha muje bhi hassi mazak ki movies pasand par y...,"yeah, even though i love comedies, this wasn't..."
2,kese tina ka jhuti writing isme achi the usne ...,how come. thought tina fey's writing was fanta...
3,muje kahani bhute achi lagi aur unhone ise dac...,i loved the story & how true they made it in h...
4,ab samja muje nhi pta tha ki aap sahi admi hai...,i see. i didn't realize it was 14 years ago. y...
...,...,...
6508,alarm ko abhi stop kare,stop alarm now
6509,har ghante ke liye alarm set kare,set alarm every hour
6510,bobby ko text kare,text bobby
6511,muje shaam 6 baje laundry ko pick up karne ke ...,remind me to pick up laundry at 6 pm


## Data tokenization

In [11]:
bart_tokenizer = CustomBartTokenizer().build()
bart_tokenizer_scratch = CustomBartTokenizer().build(data=train_df["hi_en"], tokenizer_style=STYLE.SCRATCH.value)
bart_tokenizer_append = CustomBartTokenizer().build(data=train_df["hi_en"], tokenizer_style=STYLE.APPEND.value)









In [12]:
query = "Hello! Ye CSCI 544 ka project he. Ye project Code-Mixed Machine Translation ke bara me he. Hamane hamari khoon bahahe."
query = clean_text(text=query)

In [13]:
token_idx = bart_tokenizer.encode(query, add_special_tokens=True)
token_word = bart_tokenizer.convert_ids_to_tokens(token_idx)
print('#'*100)
print("This is an example of using the default Bart Tokenizer.\n")
print(f"Sentence: {query}\n")
print(f"Tokenized word: {token_word}\n")
print(f"Tokenized idx: {token_idx}")
print('#'*100)

####################################################################################################
This is an example of using the default Bart Tokenizer.

Sentence: hello! ye csci 544 ka project he. ye project code-mixed machine translation ke bara me he. hamane hamari khoon bahahe.

Tokenized word: ['<s>', 'hello', '!', 'Ġye', 'Ġc', 'sci', 'Ġ5', '44', 'Ġka', 'Ġproject', 'Ġhe', '.', 'Ġye', 'Ġproject', 'Ġcode', '-', 'm', 'ixed', 'Ġmachine', 'Ġtranslation', 'Ġke', 'Ġbar', 'a', 'Ġme', 'Ġhe', '.', 'Ġham', 'ane', 'Ġham', 'ari', 'Ġkh', 'oon', 'Ġb', 'aha', 'he', '.', '</s>']

Tokenized idx: [0, 42891, 328, 32440, 740, 43428, 195, 3305, 4661, 695, 37, 4, 32440, 695, 3260, 12, 119, 24194, 3563, 19850, 7321, 2003, 102, 162, 37, 4, 11402, 1728, 11402, 1512, 16447, 3863, 741, 11695, 700, 4, 2]
####################################################################################################


In [14]:
token_idx = bart_tokenizer_scratch.encode(query, add_special_tokens=True)
token_word = bart_tokenizer_scratch.convert_ids_to_tokens(token_idx)
print('#'*100)
print("This is an example of the Bart Tokenizer that is trained from scratch.\n")
print(f"Sentence: {query}\n")
print(f"Tokenized word: {token_word}\n")
print(f"Tokenized idx: {token_idx}")
print('#'*100)

####################################################################################################
This is an example of the Bart Tokenizer that is trained from scratch.

Sentence: hello! ye csci 544 ka project he. ye project code-mixed machine translation ke bara me he. hamane hamari khoon bahahe.

Tokenized word: ['<s>', 'hello', '!', 'Ġye', 'Ġc', 'sci', 'Ġ5', '44', 'Ġka', 'Ġproject', 'Ġhe', '.', 'Ġye', 'Ġproject', 'Ġcode', '-', 'mix', 'ed', 'Ġmachine', 'Ġtrans', 'lation', 'Ġke', 'Ġbara', 'Ġme', 'Ġhe', '.', 'Ġhamane', 'Ġhamari', 'Ġkhoon', 'Ġbaha', 'he', '.', '</s>']

Tokenized idx: [0, 2317, 4, 520, 288, 21554, 466, 11342, 312, 3070, 356, 17, 520, 3070, 6438, 16, 19530, 517, 6262, 3397, 18402, 271, 17190, 284, 356, 17, 11714, 4630, 10623, 13967, 282, 17, 2]
####################################################################################################


In [15]:
token_idx = bart_tokenizer_append.encode(query, add_special_tokens=True)
token_word = bart_tokenizer_append.convert_ids_to_tokens(token_idx)
print('#'*100)
print("This is an example of the Bart Tokenizer to which we append our new data.\n")
print(f"Sentence: {query}\n")
print(f"Tokenized word: {token_word}\n")
print(f"Tokenized idx: {token_idx}")
print('#'*100)

####################################################################################################
This is an example of the Bart Tokenizer to which we append our new data.

Sentence: hello! ye csci 544 ka project he. ye project code-mixed machine translation ke bara me he. hamane hamari khoon bahahe.

Tokenized word: ['<s>', 'hello', '!', 'Ġ', 'ye', 'Ġ', 'csc', 'i', 'Ġ', '54', '4', 'Ġ', 'ka', 'Ġ', 'project', 'Ġ', 'he', '.', 'Ġ', 'ye', 'Ġ', 'project', 'Ġ', 'code', '-', 'mix', 'ed', 'Ġ', 'mach', 'ine', 'Ġ', 'trans', 'lation', 'Ġ', 'ke', 'Ġ', 'bara', 'Ġ', 'me', 'Ġ', 'he', '.', 'Ġ', 'hamane', 'Ġ', 'hamari', 'Ġ', 'khoo', 'n', 'Ġ', 'baha', 'he', '.', '</s>']

Tokenized idx: [0, 42891, 328, 1437, 4717, 1437, 62406, 118, 1437, 4283, 306, 1437, 2348, 1437, 28258, 1437, 700, 4, 1437, 4717, 1437, 28258, 1437, 20414, 12, 39915, 196, 1437, 63938, 833, 1437, 9981, 35019, 1437, 1071, 1437, 31533, 1437, 1794, 1437, 700, 4, 1437, 72715, 1437, 68736, 1437, 64426, 282, 1437, 83241, 700, 4, 2]
########

In [16]:
# Now that we know how to get a BART-Tokenizer - default, appeded and scratch, let us get the tokenizer for our code-mixed language
# and the target language.
# Code-Mixed Language - Build a Bart-Tokenizer from scratch.
# Target Language (English) - Use the default Bart-Tokenizer.
hi_en_bart_tokenizer = CustomBartTokenizer().build(
    data=train_df["hi_en"],
    tokenizer_style=MBART_TOKENIZER_BPE_BINDING_BART_TOKENIZER_ENCODER_STYLE,
    tokenizer_bart_from_pretrained_path=MBART_TOKENIZER_BPE_BINDING_BART_TOKENIZER_ENCODER_FROM_PRETRAINED
)
en_bart_tokenizer = CustomBartTokenizer().build(
    tokenizer_style=MBART_TOKENIZER_BPE_BINDING_BART_TOKENIZER_DECODER_STYLE,
    tokenizer_bart_from_pretrained_path=MBART_TOKENIZER_BPE_BINDING_BART_TOKENIZER_DECODER_FROM_PRETRAINED
)






In [17]:
print(f"Hinglish tokenizer vocab size: {hi_en_bart_tokenizer.vocab_size}")
print(f"English tokenizer vocab size: {en_bart_tokenizer.vocab_size}")

Hinglish tokenizer vocab size: 50265
English tokenizer vocab size: 50265


## Dataset and Data Loaders

In [18]:
# Get the data loaders for denoising
__data_loaders__ = get_data_loader_models(
    train_df=train_df,
    validation_df=validation_df,
    test_df=test_df,
    train_batch_size=4, 
    validation_batch_size=4, 
    test_batch_size=4,
    encoder_tokenizer=hi_en_bart_tokenizer,
    decoder_tokenizer=hi_en_bart_tokenizer,
    denoising_stage=True
)

In [19]:
__data_loaders__

defaultdict(dict,
            {'hi_en__en': {'object': <src.machine_translation.data.CodeMixedDataLoader at 0x177653460>,
              'train': <torch.utils.data.dataloader.DataLoader at 0x1776539d0>,
              'validation': <torch.utils.data.dataloader.DataLoader at 0x177653850>,
              'test': <torch.utils.data.dataloader.DataLoader at 0x177653790>}})

In [20]:
__data_loader__, __train_data_loader__, __validation_data_loader__, __test_data_loader__ = __data_loaders__["hi_en__en"].values()

In [21]:
__data_loader__.visualize()

####################################################################################################
Train Dataloader
Batch Size:  4
Number of batches:  43611
Batch source language shape:  torch.Size([4, 18])
Batch source language:  ['kya aj windy hone wala he ?', 'jee haan, yah ek phanee enimeshan philm hai.', 'the kids ko har dusre din empty their trash cans ke liye yaad dilana chahiye .', 'route 100 par traffic kaisa hai']
Batch source tokens:  tensor([[    0,   316,   687,  2946,   460,   565,   356,   318,     2,     1,
             1,     1,     1,     1,     1,     1,     1,     1],
        [    0,  6803,  2765,    15,  1215,   379, 26401, 34517,  1963,   278,
            17,     2,     1,     1,     1,     1,     1,     1],
        [    0,   917,  1315,   280,   616,  2040,   566,  9829,  2951,  2060,
         11453,   271,   283,   393,  3266,   486,   322,     2],
        [    0,  3400,  2600,   337,   374,   428,   278,     2,     1,     1,
             1,     1,     1,     

100%|█| 43611/43611 [00:42<00:00, 1022.80i


Validation of train loader successful.
####################################################################################################
Val Dataloader
Batch Size:  4
Number of batches:  811
Validating validation laoder...


100%|███| 811/811 [00:31<00:00, 26.10it/s]


Validation of validation loader successful.
####################################################################################################
Test Dataloader
Batch Size:  4
Number of batches:  1768
Validating test laoder...


100%|█| 1768/1768 [00:31<00:00, 56.40it/s]

Validation of test loader successful.
####################################################################################################





In [22]:
# Get the data loaders for translation
__data_loaders__ = get_data_loader_models(
    train_df=train_df,
    validation_df=validation_df,
    test_df=test_df,
    train_batch_size=4, 
    validation_batch_size=4, 
    test_batch_size=4,
    encoder_tokenizer=hi_en_bart_tokenizer,
    decoder_tokenizer=en_bart_tokenizer
)

In [23]:
__data_loaders__

defaultdict(dict,
            {'hi_en__en': {'object': <src.machine_translation.data.CodeMixedDataLoader at 0x29d07e250>,
              'train': <torch.utils.data.dataloader.DataLoader at 0x29d07ee80>,
              'validation': <torch.utils.data.dataloader.DataLoader at 0x29d07e220>,
              'test': <torch.utils.data.dataloader.DataLoader at 0x29d07ed00>}})

In [24]:
__data_loader__, __train_data_loader__, __validation_data_loader__, __test_data_loader__ = __data_loaders__["hi_en__en"].values()

In [25]:
__data_loader__.visualize()

####################################################################################################
Train Dataloader
Batch Size:  4
Number of batches:  43611
Batch source language shape:  torch.Size([4, 39])
Batch source language:  ["i didn't realize i had already seen it until i started reading this. i remember it was really good. if a movie isn't good i usually don't recall it this clearly.", 'india se pop music bajao .', 'meatloaf timer me panch minute add karen', 'kal evening ke liye sone jaane ke liye alarm set kare']
Batch source tokens:  tensor([[    0,    76,  8037,  3036,  5741,   516,  2826,  3741,  5519,   842,
          5461,   516, 10670,  4372,  1302,    17,   516,  4343,   842,  1609,
          2206,  1484,    17,  1810,   265,   633, 14081,  3036,  1484,   516,
          5026,  1313,  3036, 13284,   842,  1302, 14441,    17,     2],
        [    0,  4262,   291,  1639,   585,   549,   322,     2,     1,     1,
             1,     1,     1,     1,     1,     1,     1,  

100%|█| 43611/43611 [00:43<00:00, 1011.87i


Validation of train loader successful.
####################################################################################################
Val Dataloader
Batch Size:  4
Number of batches:  811
Validating validation laoder...


100%|███| 811/811 [00:31<00:00, 25.78it/s]


Validation of validation loader successful.
####################################################################################################
Test Dataloader
Batch Size:  4
Number of batches:  1768
Validating test laoder...


100%|█| 1768/1768 [00:31<00:00, 55.84it/s]

Validation of test loader successful.
####################################################################################################





## Model definition

In [26]:
__model__ = BartForConditionalGeneration()
__model__.model

BartForConditionalGeneration(
  (model): BartModel(
    (shared): Embedding(50265, 1024, padding_idx=1)
    (encoder): BartEncoder(
      (embed_tokens): Embedding(50265, 1024, padding_idx=1)
      (embed_positions): BartLearnedPositionalEmbedding(1026, 1024)
      (layers): ModuleList(
        (0-11): 12 x BartEncoderLayer(
          (self_attn): BartAttention(
            (k_proj): Linear(in_features=1024, out_features=1024, bias=True)
            (v_proj): Linear(in_features=1024, out_features=1024, bias=True)
            (q_proj): Linear(in_features=1024, out_features=1024, bias=True)
            (out_proj): Linear(in_features=1024, out_features=1024, bias=True)
          )
          (self_attn_layer_norm): LayerNorm((1024,), eps=1e-05, elementwise_affine=True)
          (activation_fn): GELUActivation()
          (fc1): Linear(in_features=1024, out_features=4096, bias=True)
          (fc2): Linear(in_features=4096, out_features=1024, bias=True)
          (final_layer_norm): LayerN

In [27]:
__batch_size__ = 8
__en_seq_length__ = random.randint(13, 23)
__de_seq_length__ = random.randint(13, 23)
__encoder_input__ = torch.randint(low=0, high=hi_en_bart_tokenizer.vocab_size, size=(__batch_size__, __en_seq_length__))
__decoder_input__ = torch.randint(low=0, high=en_bart_tokenizer.vocab_size, size=(__batch_size__, __de_seq_length__))
__out__ = __model__.model(input_ids=__encoder_input__, decoder_input_ids=__decoder_input__, return_dict=True)
print(f"Model has a total of {__model__.model.num_parameters()} number of parameters")
print("Model encoder input size: ", __encoder_input__.size())
print("Model decoder input size: ", __decoder_input__.size())
print("Model output type: ", type(__out__))
print("Model output size: ", __out__.logits.size())
print("Model output:\n", __out__.logits)

Model has a total of 406291456 number of parameters
Model encoder input size:  torch.Size([8, 14])
Model decoder input size:  torch.Size([8, 16])
Model output type:  <class 'transformers.modeling_outputs.Seq2SeqLMOutput'>
Model output size:  torch.Size([8, 16, 50265])
Model output:
 tensor([[[ 1.5922e+01, -1.9051e+00,  6.7314e+00,  ..., -4.6098e-01,
          -8.4651e-01,  3.5367e+00],
         [-2.0321e+01, -3.1525e+00,  4.1280e+00,  ..., -1.8069e+00,
          -1.8926e+00, -1.7960e+00],
         [-1.2867e+01, -3.0857e+00,  5.3687e+00,  ..., -1.1396e+00,
          -2.2541e+00,  8.7463e-01],
         ...,
         [-6.5658e+00, -3.5609e+00,  5.4603e+00,  ..., -2.3661e+00,
          -3.4459e+00, -2.0179e+00],
         [-7.9376e+00, -2.7581e+00,  5.1347e+00,  ..., -1.4747e+00,
          -2.3998e+00, -1.3976e+00],
         [-1.1249e+01, -3.4873e+00,  4.9320e+00,  ..., -2.0877e+00,
          -2.6862e+00, -2.1638e+00]],

        [[ 1.9488e+01, -1.2134e+00,  9.2017e+00,  ...,  5.9569e-01,
  