## Importing libraries and modules

In [None]:
# Enable auto-reload: load IPython's autoreload extension and select mode 2, so edits made to
# the project modules imported below are picked up without restarting the kernel.
%load_ext autoreload
%autoreload 2

In [33]:
# Suppress warnings
# NOTE(review): this silences every warning category for the rest of the session, including
# deprecation notices raised by the libraries used further down.
import warnings
warnings.filterwarnings("ignore")

# Add one level up directory to the path
# (the notebook imports the `src` package below, so it is expected to be run from a folder
# that sits one level beneath the directory containing `src` — TODO confirm)
import sys
sys.path.append("..")

# Import libraries
import torch
import random

# Import custom modules
# NOTE(review): the two wildcard imports are the only visible candidates for bringing `STYLE`
# and the `MBART_TOKENIZER_*` constants used in later cells into scope — presumably they come
# from here; confirm before replacing them with explicit imports.
from src.data import *
from src.machine_translation import *
from src.data.utils import get_dataset
from src.data.preprocess import clean_text
from src.data.tokenizer import CustomBartTokenizer
from src.machine_translation.net import CodeMixedModel, CodeMixedModelHGTrainer
from src.machine_translation.utils import get_data_loader_models, get_tokenized_dataset
from src.machine_translation.models.bart_conditional import BartForConditionalGeneration

## Data acquisition, cleaning and processing

In [None]:
%%time
# Fetch the three splits in one call; each is previewed as a frame with `hi_en` (code-mixed)
# and `en` (English) columns in the cells below.
train_df, validation_df, test_df = get_dataset()

Found cached dataset cmu_hinglish_dog (C:/Users/tejam/.cache/huggingface/datasets/cmu_hinglish_dog/default/0.0.0/a646ab55bde6539dc76686b3b758d0e6ad2a1213f05a69e85eaa4e55bb20ddad)
100%|██████████| 3/3 [00:00<00:00, 76.90it/s]
Found cached dataset json (C:/Users/tejam/.cache/huggingface/datasets/findnitai___json/findnitai--english-to-hinglish-85f0c6597edef310/0.0.0/8bb11242116d547c741b2e8a1f18598ffdd40a1d4f2a2872c7a28b697434bc96)
100%|██████████| 1/1 [00:00<00:00, 14.59it/s]


Wall time: 8.2 s


In [4]:
#train_df.en.info()

In [None]:
# Preview the raw training split before cleaning (note the upper-case rows and trailing "\n").
train_df

Unnamed: 0,hi_en,en
0,"HELLO, KYA AAP KO MOVIES PASAND HEIN?",Hello. Do you like movies?
1,"HAAN, OLD TOYS KE BASED HO THO PASAND HEIN, TH...","Yes, but ones based on old toys are werid"
2,"REAL STEEL THO YAAD HEIN, MEIN DEKHA HOON",This one is called Real Steel and I trying to ...
3,"MEIN NAHI, MUJE KOI INTEREST NAHI HEI","I haven't, no interest in it"
4,"MEIN MAANTHA HOON, ROCKY KA REVIEWER KO US ROB...",I agree with the reviewer stating it's Rocky w...
...,...,...
8055,Thik hai\n,Ok.
8056,Thik hai bhai\n,ok bro
8057,Kya ham chalu kar sakte hai?\n,shall we continue?
8058,Kya aapko pasand hai hamare saath\n,do you like we can


In [None]:
# Preview the raw validation split before cleaning.
validation_df

Unnamed: 0,hi_en,en
0,hello,hello
1,"hello yar, mein is movie ko nahi dekha hoon th...","hello there, I have not seen this movie so im ..."
2,acha tho is movie kis baare me hein?,Alright that is fine. What is the movie?
3,is movie tho social network ke bare mein hein,The movie is The Social Network
4,mein aise kuch nahi dekha hoon,I have not seen that one either.
...,...,...
936,Sunkar good movie hai. Kya ham finish kar skat...,Sounds like a good movie. Can we finish now?
937,Yep. Thanks baat karne ke liye\n,Yep. Thanks for chatting
938,"Thanks, mei dekhati hui. Achi baat hai\n","thanks, I will watch it. SOunds good"
940,kya tumhe movie Despicable Me pasand hai?\n,Did you like the movie Despicable Me?


In [None]:
# Preview the raw test split before cleaning.
test_df

Unnamed: 0,hi_en,en
0,Hello! Kaise ho? Tumne recently koi achhi movi...,Hello! How are you? Have you seen any good mov...
1,Tumne recently koi achhi movie dekhi toh uske ...,Can you tell me the name of any good movie you...
2,Hello!,Hello!
3,"Haan, maine abhi La La Land dekhi.","Yes, I just watched La La Land"
4,Tumne dekhi hai?,Have you seen it?
...,...,...
6508,alarm ko abhi stop kare,Stop alarm now
6509,Har ghante ke liye alarm set kare,set alarm every hour
6510,Bobby ko text kare,text Bobby
6511,Muje shaam 6 baje laundry ko pick up karne ke ...,remind me to pick up laundry at 6 pm


In [None]:
%%time
train_df = train_df.applymap(clean_text)
train_df

Wall time: 3.26 s


Unnamed: 0,hi_en,en
0,"hello, kya aap ko movies pasand hein?",hello. do you like movies?
1,"haan, old toys ke based ho tho pasand hein, th...","yes, but ones based on old toys are werid"
2,"real steel tho yaad hein, mein dekha hoon",this one is called real steel and i trying to ...
3,"mein nahi, muje koi interest nahi hei","i haven't, no interest in it"
4,"mein maantha hoon, rocky ka reviewer ko us rob...",i agree with the reviewer stating it's rocky w...
...,...,...
8055,thik hai,ok.
8056,thik hai bhai,ok bro
8057,kya ham chalu kar sakte hai?,shall we continue?
8058,kya aapko pasand hai hamare saath,do you like we can


In [None]:
%%time
validation_df = validation_df.applymap(clean_text)
validation_df

Wall time: 69.5 ms


Unnamed: 0,hi_en,en
0,hello,hello
1,"hello yar, mein is movie ko nahi dekha hoon th...","hello there, i have not seen this movie so im ..."
2,acha tho is movie kis baare me hein?,alright that is fine. what is the movie?
3,is movie tho social network ke bare mein hein,the movie is the social network
4,mein aise kuch nahi dekha hoon,i have not seen that one either.
...,...,...
936,sunkar good movie hai. kya ham finish kar skat...,sounds like a good movie. can we finish now?
937,yep. thanks baat karne ke liye,yep. thanks for chatting
938,"thanks, mei dekhati hui. achi baat hai","thanks, i will watch it. sounds good"
940,kya tumhe movie despicable me pasand hai?,did you like the movie despicable me?


In [None]:
%%time
test_df = test_df.applymap(clean_text)
test_df

Wall time: 138 ms


Unnamed: 0,hi_en,en
0,hello! kaise ho? tumne recently koi achhi movi...,hello! how are you? have you seen any good mov...
1,tumne recently koi achhi movie dekhi toh uske ...,can you tell me the name of any good movie you...
2,hello!,hello!
3,"haan, maine abhi la la land dekhi.","yes, i just watched la la land"
4,tumne dekhi hai?,have you seen it?
...,...,...
6508,alarm ko abhi stop kare,stop alarm now
6509,har ghante ke liye alarm set kare,set alarm every hour
6510,bobby ko text kare,text bobby
6511,muje shaam 6 baje laundry ko pick up karne ke ...,remind me to pick up laundry at 6 pm


## Data tokenization

In [None]:
# Variant 1: `build()` with no arguments.
bart_tokenizer = CustomBartTokenizer().build()

# Variants 2 and 3 are both fitted on the code-mixed training sentences and differ only in
# the tokenizer style that is requested.
bart_tokenizer_scratch = CustomBartTokenizer().build(
    data=train_df["hi_en"],
    tokenizer_style=STYLE.SCRATCH.value,
)
bart_tokenizer_append = CustomBartTokenizer().build(
    data=train_df["hi_en"],
    tokenizer_style=STYLE.APPEND.value,
)

Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.


In [None]:
# Hand-written code-mixed sentence used to compare the tokenizers below; it is passed through
# the same `clean_text` step as the dataset so the tokenizers see comparable input.
raw_query = "Hello! Ye CSCI 544 ka project he. Ye project Code-Mixed Machine Translation ke bara me he. Hamane hamari khoon bahahe."
query = clean_text(text=raw_query)

In [None]:
# Encode the sample with the default tokenizer, then map the ids back to their sub-word pieces.
token_idx = bart_tokenizer.encode(query, add_special_tokens=True)
token_word = bart_tokenizer.convert_ids_to_tokens(token_idx)

# Assemble the whole report first and emit it with a single print call.
banner = '#'*100
report_lines = [
    banner,
    "This is an example of using the default Bart Tokenizer.\n",
    f"Sentence: {query}\n",
    f"Tokenized word: {token_word}\n",
    f"Tokenized idx: {token_idx}",
    banner,
]
print("\n".join(report_lines))

In [None]:
# Encode the sample with the from-scratch tokenizer, then map the ids back to sub-word pieces.
token_idx = bart_tokenizer_scratch.encode(query, add_special_tokens=True)
token_word = bart_tokenizer_scratch.convert_ids_to_tokens(token_idx)

# Assemble the whole report first and emit it with a single print call.
banner = '#'*100
report_lines = [
    banner,
    "This is an example of the Bart Tokenizer that is trained from scratch.\n",
    f"Sentence: {query}\n",
    f"Tokenized word: {token_word}\n",
    f"Tokenized idx: {token_idx}",
    banner,
]
print("\n".join(report_lines))

In [None]:
# Encode the sample with the appended-vocabulary tokenizer, then map the ids back to pieces.
token_idx = bart_tokenizer_append.encode(query, add_special_tokens=True)
token_word = bart_tokenizer_append.convert_ids_to_tokens(token_idx)

# Assemble the whole report first and emit it with a single print call.
banner = '#'*100
report_lines = [
    banner,
    "This is an example of the Bart Tokenizer to which we append our new data.\n",
    f"Sentence: {query}\n",
    f"Tokenized word: {token_word}\n",
    f"Tokenized idx: {token_idx}",
    banner,
]
print("\n".join(report_lines))

####################################################################################################
This is an example of the Bart Tokenizer to which we append our new data.

Sentence: hello! ye csci 544 ka project he. ye project code-mixed machine translation ke bara me he. hamane hamari khoon bahahe.

Tokenized word: ['<s>', 'hello', '!', 'Ġye', 'csc', 'i', 'Ġ5', '44', 'Ġka', 'Ġproject', 'Ġhe', '.', 'Ġye', 'Ġproject', 'Ġcode', '-', 'm', 'ixed', 'mach', 'ine', 'tran', 'sla', 'tion', 'Ġke', 'Ġbar', 'a', 'Ġme', 'Ġhe', '.', 'hamane', 'hamari', 'khoo', 'n', 'baha', 'he', '.', '</s>']

Tokenized idx: [0, 42891, 328, 32440, 79705, 118, 195, 3305, 4661, 695, 37, 4, 32440, 695, 3260, 12, 119, 24194, 69287, 833, 55726, 52487, 24659, 7321, 2003, 102, 162, 37, 4, 72070, 73355, 66618, 282, 80387, 700, 4, 2]
####################################################################################################


In [None]:
# Now that we know how to get a BART tokenizer - default, appended and from scratch - build the
# two tokenizers the translation model actually uses: one for the code-mixed source language
# and one for the target language.
# Code-Mixed Language (encoder side) - intended to be a Bart tokenizer built from scratch on
# the training sentences.
# Target Language (English, decoder side) - intended to be the default Bart tokenizer.
# NOTE(review): the actual style and pretrained path come from the MBART_TOKENIZER_* constants,
# whose values are not visible in this notebook — confirm they match the intent stated above.
hi_en_bart_tokenizer = CustomBartTokenizer().build(
    data=train_df["hi_en"],
    tokenizer_style=MBART_TOKENIZER_BPE_BINDING_BART_TOKENIZER_ENCODER_STYLE,
    tokenizer_bart_from_pretrained_path=MBART_TOKENIZER_BPE_BINDING_BART_TOKENIZER_ENCODER_FROM_PRETRAINED
)
en_bart_tokenizer = CustomBartTokenizer().build(
    tokenizer_style=MBART_TOKENIZER_BPE_BINDING_BART_TOKENIZER_DECODER_STYLE,
    tokenizer_bart_from_pretrained_path=MBART_TOKENIZER_BPE_BINDING_BART_TOKENIZER_DECODER_FROM_PRETRAINED
)

Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.


In [None]:
# Report how many entries each tokenizer's vocabulary holds (encoder side first).
hinglish_vocab_size = hi_en_bart_tokenizer.vocab_size
english_vocab_size = en_bart_tokenizer.vocab_size
print(f"Hinglish tokenizer vocab size: {hinglish_vocab_size}")
print(f"English tokenizer vocab size: {english_vocab_size}")

## Dataset and Data Loaders

## Model definition

In [None]:
# Instantiate the project's BART wrapper with its defaults and display the wrapped network
# held in its `.model` attribute (the repr below lists its layers).
# NOTE(review): the double-underscore names are ordinary notebook globals; a later cell reads
# `__model__`, so renaming it has to be done in both places.
__model__ = BartForConditionalGeneration()
__model__.model

BartForConditionalGeneration(
  (model): BartModel(
    (shared): Embedding(50265, 1024, padding_idx=1)
    (encoder): BartEncoder(
      (embed_tokens): Embedding(50265, 1024, padding_idx=1)
      (embed_positions): BartLearnedPositionalEmbedding(1026, 1024)
      (layers): ModuleList(
        (0): BartEncoderLayer(
          (self_attn): BartAttention(
            (k_proj): Linear(in_features=1024, out_features=1024, bias=True)
            (v_proj): Linear(in_features=1024, out_features=1024, bias=True)
            (q_proj): Linear(in_features=1024, out_features=1024, bias=True)
            (out_proj): Linear(in_features=1024, out_features=1024, bias=True)
          )
          (self_attn_layer_norm): LayerNorm((1024,), eps=1e-05, elementwise_affine=True)
          (activation_fn): GELUActivation()
          (fc1): Linear(in_features=1024, out_features=4096, bias=True)
          (fc2): Linear(in_features=4096, out_features=1024, bias=True)
          (final_layer_norm): LayerNorm((102

In [27]:
# Smoke test: push one batch of random token ids through the model and inspect the output
# shapes. Nothing here is trained or evaluated.
__batch_size__ = 8
# Sequence lengths are drawn at random (no seed is set, so they change from run to run).
__en_seq_length__ = random.randint(13, 23)
__de_seq_length__ = random.randint(13, 23)
# NOTE(review): ids are sampled up to each tokenizer's vocab size; this assumes neither exceeds
# the model's embedding table (50265 rows in the repr above) — confirm for the Hinglish tokenizer.
__encoder_input__ = torch.randint(low=0, high=hi_en_bart_tokenizer.vocab_size, size=(__batch_size__, __en_seq_length__))
__decoder_input__ = torch.randint(low=0, high=en_bart_tokenizer.vocab_size, size=(__batch_size__, __de_seq_length__))
# Follow the device the model weights already live on instead of hard-coding "cuda": identical
# on a GPU machine, and the cell no longer crashes where CUDA is unavailable.
__device__ = next(__model__.model.parameters()).device
__encoder_input__ = __encoder_input__.to(__device__)
__decoder_input__ = __decoder_input__.to(__device__)
# Inference only: disable autograd so the forward pass does not keep the activations of this
# ~406M-parameter model alive for a backward pass that never happens.
with torch.no_grad():
    __out__ = __model__.model(input_ids=__encoder_input__, decoder_input_ids=__decoder_input__, return_dict=True)
print(f"Model has a total of {__model__.model.num_parameters()} number of parameters")
print("Model encoder input size: ", __encoder_input__.size())
print("Model decoder input size: ", __decoder_input__.size())
print("Model output type: ", type(__out__))
print("Model output size: ", __out__.logits.size())
print("Model output:\n", __out__.logits)

Model has a total of 406291456 number of parameters
Model encoder input size:  torch.Size([8, 17])
Model decoder input size:  torch.Size([8, 21])
Model output type:  <class 'transformers.modeling_outputs.Seq2SeqLMOutput'>
Model output size:  torch.Size([8, 21, 50265])
Model output:
 tensor([[[ 13.8037,  -1.5430,   5.5749,  ...,  -2.0705,  -2.5284,   3.1019],
         [-30.4389,  -2.7391,   5.1244,  ...,  -1.2274,  -0.7405,   2.6097],
         [ -6.6546,  -2.7839,   7.6908,  ...,  -1.7490,  -2.9420,   2.4142],
         ...,
         [ -5.3017,  -3.2074,   2.9960,  ...,  -2.9747,  -3.6109,  -2.5332],
         [ -8.5728,  -2.7164,   4.3671,  ...,  -1.2633,  -2.4995,  -0.4308],
         [ -9.6214,  -1.7396,  11.3250,  ...,   2.2406,  -1.3513,   6.8301]],

        [[ 17.2651,  -1.1422,   7.5051,  ...,  -0.1251,  -1.6244,   3.7492],
         [ -9.4207,  -2.9711,   4.2016,  ...,  -1.6579,  -2.4945,  -1.5065],
         [ -0.0342,  -1.3373,   5.1807,  ...,   0.2894,  -0.2027,   1.0067],
       

In [64]:
# Tokenize the three cleaned splits: the Hinglish tokenizer is passed for the encoder side and
# the English tokenizer for the decoder side. The result is a nested mapping keyed by language
# pair and then by split name (see its repr in the next cell).
data_set = get_tokenized_dataset(
    train_df=train_df,
    validation_df=validation_df,
    test_df=test_df,
    encoder_tokenizer=hi_en_bart_tokenizer,
    decoder_tokenizer=en_bart_tokenizer
)

In [65]:
# Show the structure of the tokenized datasets (language pair -> split -> dataset object).
data_set

defaultdict(dict,
            {'hi_en__en': {'train': <src.machine_translation.data.CodeMixedTokenizedDataset at 0x1d7d8657308>,
              'validation': <src.machine_translation.data.CodeMixedTokenizedDataset at 0x1d7d8657d88>,
              'test': <src.machine_translation.data.CodeMixedTokenizedDataset at 0x1d7d8657208>}})

In [66]:
# Pick each split by its key rather than unpacking `.values()` positionally: the assignment no
# longer depends on the insertion order of the dict and fails loudly if a split is missing.
_hi_en_splits = data_set["hi_en__en"]
train_dataset = _hi_en_splits["train"]
validation_dataset = _hi_en_splits["validation"]
test_dataset = _hi_en_splits["test"]

In [67]:
# Build the trainer wrapper around the three tokenized splits.
# Fix: the comma after the `test_dataset` argument was missing, which made this cell a
# SyntaxError before anything could run.
mt_model_HG_Trainer = CodeMixedModelHGTrainer(
    train_dataset=train_dataset,
    validation_dataset=validation_dataset,
    test_dataset=test_dataset,
    # Parameter names to leave trainable; the positional-embedding entries are kept commented
    # out as optional extras. (Presumably everything not listed is frozen — confirm.)
    trainable_layers=[
        "model.shared.weight",
        # "model.encoder.embed_positions.weight",
        # "model.decoder.embed_positions.weight"
    ],
)

In [69]:
# Start training; `fit()` returns two objects, unpacked here as the final and the best model
# (naming taken from the variables — confirm against the trainer implementation).
# NOTE(review): the recorded run below aborted with a CUDA out-of-memory error on an 8 GiB
# GPU, so this cell does not currently complete on the hardware it was last run on.
trained_mt_model_HG_Trainer, best_trained_mt_model_HG_Trainer = mt_model_HG_Trainer.fit()

Loading model...


RuntimeError: CUDA out of memory. Tried to allocate 198.00 MiB (GPU 0; 8.00 GiB total capacity; 7.26 GiB already allocated; 0 bytes free; 7.31 GiB reserved in total by PyTorch)

## Model Training

In [34]:
# Build the data loaders for the three cleaned splits, again pairing the Hinglish tokenizer
# with the encoder and the English tokenizer with the decoder. The result is a nested mapping:
# language pair -> {"object", "train", "validation", "test"} (see the repr below).
data_loaders = get_data_loader_models(
    train_df=train_df,
    validation_df=validation_df,
    test_df=test_df,
    encoder_tokenizer=hi_en_bart_tokenizer,
    decoder_tokenizer=en_bart_tokenizer
)
# Display the mapping so the available keys are visible.
data_loaders

defaultdict(dict,
            {'hi_en__en': {'object': <src.machine_translation.data.CodeMixedDataLoader at 0x1d7da30f5c8>,
              'train': <torch.utils.data.dataloader.DataLoader at 0x1d7da30f0c8>,
              'validation': <torch.utils.data.dataloader.DataLoader at 0x1d7da30f308>,
              'test': <torch.utils.data.dataloader.DataLoader at 0x1d7da30fbc8>}})

In [None]:
# Look every entry up by key instead of unpacking `.values()` positionally, so the four names
# cannot be silently mixed up if the mapping's insertion order ever changes.
_hi_en_loaders = data_loaders["hi_en__en"]
data_loader = _hi_en_loaders["object"]
# (sic) the misspelled name is kept because the CodeMixedModel cell reads exactly this name.
train_data_laoder = _hi_en_loaders["train"]
validation_data_loader = _hi_en_loaders["validation"]
test_data_loader = _hi_en_loaders["test"]
# Correctly spelled alias for new code.
train_data_loader = train_data_laoder

In [30]:
# Build the custom training wrapper from the three data loaders and both tokenizers.
# The log printed below shows that construction loads the model and then freezes it.
mt_model = CodeMixedModel(
    # NOTE(review): `train_data_laoder` is a misspelling of "loader"; it matches the name
    # assigned in the unpacking cell above, so both have to be renamed together.
    train_data_loader=train_data_laoder,
    validation_data_loader=validation_data_loader,
    test_data_loader=test_data_loader,
    encoder_tokenizer=hi_en_bart_tokenizer,
    decoder_tokenizer=en_bart_tokenizer,
    # Parameter names passed as trainable; presumably everything else stays frozen — confirm.
    trainable_layers=[
        "model.shared.weight",
        # "model.encoder.embed_positions.weight",
        # "model.decoder.embed_positions.weight"
    ]
)

Loading model...
No saved model to load as `saved_model_path` was not provided in the `__init__()`...
Freezing the model...
BartForConditionalGeneration(
  (model): BartModel(
    (shared): Embedding(50265, 1024, padding_idx=1)
    (encoder): BartEncoder(
      (embed_tokens): Embedding(50265, 1024, padding_idx=1)
      (embed_positions): BartLearnedPositionalEmbedding(1026, 1024)
      (layers): ModuleList(
        (0): BartEncoderLayer(
          (self_attn): BartAttention(
            (k_proj): Linear(in_features=1024, out_features=1024, bias=True)
            (v_proj): Linear(in_features=1024, out_features=1024, bias=True)
            (q_proj): Linear(in_features=1024, out_features=1024, bias=True)
            (out_proj): Linear(in_features=1024, out_features=1024, bias=True)
          )
          (self_attn_layer_norm): LayerNorm((1024,), eps=1e-05, elementwise_affine=True)
          (activation_fn): GELUActivation()
          (fc1): Linear(in_features=1024, out_features=4096, bia

In [60]:
# NOTE(review): `torch` is already imported in the first code cell; the repeat only matters if
# this cell is run on its own.
import torch
# Ask PyTorch to release the GPU memory it is holding in its cache.
# NOTE(review): the memory summary in the next cell still reports about 7.4 GB allocated, so
# this call presumably cannot reclaim tensors that are still referenced by notebook variables
# (e.g. the models created above) — confirm, and `del` those first if the goal is to recover
# from the out-of-memory error.
torch.cuda.empty_cache()

In [61]:
# Print the full (non-abbreviated) CUDA memory report to see how much GPU memory is still
# allocated after the cache was emptied.
print(torch.cuda.memory_summary(device=None, abbreviated=False))

|                  PyTorch CUDA memory summary, device ID 0                 |
|---------------------------------------------------------------------------|
|            CUDA OOMs: 1            |        cudaMalloc retries: 1         |
|        Metric         | Cur Usage  | Peak Usage | Tot Alloc  | Tot Freed  |
|---------------------------------------------------------------------------|
| Allocated memory      |    7432 MB |    7432 MB |    7614 MB |  186779 KB |
|       from large pool |    7251 MB |    7251 MB |    7283 MB |   32986 KB |
|       from small pool |     181 MB |     181 MB |     331 MB |  153792 KB |
|---------------------------------------------------------------------------|
| Active memory         |    7432 MB |    7432 MB |    7614 MB |  186779 KB |
|       from large pool |    7251 MB |    7251 MB |    7283 MB |   32986 KB |
|       from small pool |     181 MB |     181 MB |     331 MB |  153792 KB |
|---------------------------------------------------------------