## Importing libraries and modules

In [None]:
# Enable auto-reload: in mode 2 IPython reloads all imported modules before
# running each cell, so edits to the project modules imported in this notebook
# take effect without restarting the kernel.
%load_ext autoreload
%autoreload 2

In [None]:
# Suppress warnings
# NOTE(review): this silences every warning category for the rest of the
# session, including deprecation warnings raised by the libraries used below.
import warnings
warnings.filterwarnings("ignore")

# Add one level up directory to the path so the `src` package imported below
# can be resolved (presumably the notebook lives one level below the project
# root — confirm).
import sys
sys.path.append("..")

# Import libraries
import torch
import random

# Import custom modules
# NOTE(review): names used later in this notebook without an explicit import
# (e.g. `STYLE` and the `MBART_*` constants) presumably come from the two
# wildcard imports below — confirm, and prefer importing them by name.
from src.data import *
from src.machine_translation import *
from src.data.utils import get_dataset
from src.data.preprocess import clean_text
from src.data.tokenizer import CustomBartTokenizer
from src.machine_translation.net import CodeMixedModel
from src.machine_translation.utils import get_data_loader_models
from src.machine_translation.models.bart_conditional import BartForConditionalGeneration

## Data acquisition, cleaning and processing

In [None]:
%%time
# Load the train / validation / test splits via the project helper
# `get_dataset` (imported from src.data.utils); it is unpacked into three frames.
train_df, validation_df, test_df = get_dataset()

In [None]:
# Summarise the English column of the training split (dtype, non-null count).
train_df["en"].info()

In [None]:
# Preview the training split as loaded, before any cleaning is applied.
train_df

In [None]:
# Preview the validation split as loaded, before any cleaning is applied.
validation_df

In [None]:
# Preview the test split as loaded, before any cleaning is applied.
test_df

In [None]:
%%time
# Run clean_text over every cell of the training split.
# DataFrame.applymap is deprecated since pandas 2.1 (renamed to DataFrame.map),
# and the deprecation warning is hidden by the notebook-wide warning filter.
# Mapping each column with Series.map is the element-wise equivalent and works
# on both older and newer pandas releases.
# NOTE: this rebinds train_df, so re-running the cell cleans already-cleaned text.
train_df = train_df.apply(lambda column: column.map(clean_text))
train_df

In [None]:
%%time
# Run clean_text over every cell of the validation split.
# DataFrame.applymap is deprecated since pandas 2.1 (renamed to DataFrame.map),
# and the deprecation warning is hidden by the notebook-wide warning filter.
# Mapping each column with Series.map is the element-wise equivalent and works
# on both older and newer pandas releases.
# NOTE: this rebinds validation_df, so re-running the cell cleans already-cleaned text.
validation_df = validation_df.apply(lambda column: column.map(clean_text))
validation_df

In [None]:
%%time
# Run clean_text over every cell of the test split.
# DataFrame.applymap is deprecated since pandas 2.1 (renamed to DataFrame.map),
# and the deprecation warning is hidden by the notebook-wide warning filter.
# Mapping each column with Series.map is the element-wise equivalent and works
# on both older and newer pandas releases.
# NOTE: this rebinds test_df, so re-running the cell cleans already-cleaned text.
test_df = test_df.apply(lambda column: column.map(clean_text))
test_df

## Data tokenization

In [None]:
# Build three tokenizer variants for the side-by-side comparison below:
#   1. bart_tokenizer         - built with no arguments,
#   2. bart_tokenizer_scratch - built on the hi_en training column with STYLE.SCRATCH,
#   3. bart_tokenizer_append  - built on the hi_en training column with STYLE.APPEND.
bart_tokenizer = CustomBartTokenizer().build()
bart_tokenizer_scratch = CustomBartTokenizer().build(
    data=train_df["hi_en"],
    tokenizer_style=STYLE.SCRATCH.value,
)
bart_tokenizer_append = CustomBartTokenizer().build(
    data=train_df["hi_en"],
    tokenizer_style=STYLE.APPEND.value,
)

In [None]:
# Sample code-mixed sentence, passed through the same clean_text used on the dataset.
query = clean_text(
    text="Hello! Ye CSCI 544 ka project he. Ye project Code-Mixed Machine Translation ke bara me he. Hamane hamari khoon bahahe."
)

In [None]:
# Encode the sample sentence with the default tokenizer, then map the ids back
# to token strings so both views can be printed together.
token_idx = bart_tokenizer.encode(query, add_special_tokens=True)
token_word = bart_tokenizer.convert_ids_to_tokens(token_idx)
print(
    "#" * 100,
    "This is an example of using the default Bart Tokenizer.\n",
    f"Sentence: {query}\n",
    f"Tokenized word: {token_word}\n",
    f"Tokenized idx: {token_idx}",
    "#" * 100,
    sep="\n",
)

In [None]:
# Encode the sample sentence with the scratch-style tokenizer, then map the ids
# back to token strings so both views can be printed together.
token_idx = bart_tokenizer_scratch.encode(query, add_special_tokens=True)
token_word = bart_tokenizer_scratch.convert_ids_to_tokens(token_idx)
print(
    "#" * 100,
    "This is an example of the Bart Tokenizer that is trained from scratch.\n",
    f"Sentence: {query}\n",
    f"Tokenized word: {token_word}\n",
    f"Tokenized idx: {token_idx}",
    "#" * 100,
    sep="\n",
)

In [None]:
# Encode the sample sentence with the append-style tokenizer, then map the ids
# back to token strings so both views can be printed together.
token_idx = bart_tokenizer_append.encode(query, add_special_tokens=True)
token_word = bart_tokenizer_append.convert_ids_to_tokens(token_idx)
print(
    "#" * 100,
    "This is an example of the Bart Tokenizer to which we append our new data.\n",
    f"Sentence: {query}\n",
    f"Tokenized word: {token_word}\n",
    f"Tokenized idx: {token_idx}",
    "#" * 100,
    sep="\n",
)

In [None]:
# Now that we know how to get a BART tokenizer - default, appended and scratch - let us build the tokenizers for our
# code-mixed language and the target language.
# Code-mixed language (hi_en): built on the hi_en training column; the tokenizer style and the pretrained path are
#   taken from the MBART_TOKENIZER_BPE_BINDING_BART_TOKENIZER_ENCODER_* constants.
# Target language (English): built without data; the style and the pretrained path are taken from the
#   MBART_TOKENIZER_BPE_BINDING_BART_TOKENIZER_DECODER_* constants.
# NOTE(review): the intent stated originally was "from scratch" for the code-mixed side and "default" for English;
#   that is presumably what these constants select, but they are defined outside this notebook — confirm.
hi_en_bart_tokenizer = CustomBartTokenizer().build(
    data=train_df["hi_en"],
    tokenizer_style=MBART_TOKENIZER_BPE_BINDING_BART_TOKENIZER_ENCODER_STYLE,
    tokenizer_bart_from_pretrained_path=MBART_TOKENIZER_BPE_BINDING_BART_TOKENIZER_ENCODER_FROM_PRETRAINED
)
en_bart_tokenizer = CustomBartTokenizer().build(
    tokenizer_style=MBART_TOKENIZER_BPE_BINDING_BART_TOKENIZER_DECODER_STYLE,
    tokenizer_bart_from_pretrained_path=MBART_TOKENIZER_BPE_BINDING_BART_TOKENIZER_DECODER_FROM_PRETRAINED
)

In [None]:
# Report the vocabulary size of the encoder-side and decoder-side tokenizers.
print(
    f"Hinglish tokenizer vocab size: {hi_en_bart_tokenizer.vocab_size}",
    f"English tokenizer vocab size: {en_bart_tokenizer.vocab_size}",
    sep="\n",
)

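## Dataset and Data Loaders

The data loaders are built from the cleaned splits and the two tokenizers in the *Model Training* section below.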

## Model definition

In [None]:
# Instantiate the project's BartForConditionalGeneration wrapper (imported from
# src.machine_translation.models.bart_conditional) with its default arguments,
# then display its `.model` attribute to inspect the architecture.
__model__ = BartForConditionalGeneration()
__model__.model

In [None]:
# Smoke-test the model with one batch of random token ids and report the
# input/output sizes.
__batch_size__ = 8
# Source and target lengths are drawn independently (13..23 inclusive) so the
# check exercises differing encoder/decoder sequence lengths.
__en_seq_length__ = random.randint(13, 23)
__de_seq_length__ = random.randint(13, 23)
# Random ids in [0, vocab_size) for each side, moved to the configured device.
__encoder_input__ = torch.randint(low=0, high=hi_en_bart_tokenizer.vocab_size, size=(__batch_size__, __en_seq_length__)).to(MBART_MODEL_CONDITIONAL_GENERATION_DEVICE)
__decoder_input__ = torch.randint(low=0, high=en_bart_tokenizer.vocab_size, size=(__batch_size__, __de_seq_length__)).to(MBART_MODEL_CONDITIONAL_GENERATION_DEVICE)
# This pass only inspects shapes, so run it without autograd: otherwise the
# graph and its saved activations stay allocated for as long as the notebook
# global `__out__` is alive.
with torch.no_grad():
    __out__ = __model__.model(input_ids=__encoder_input__, decoder_input_ids=__decoder_input__, return_dict=True)
print(f"Model has a total of {__model__.model.num_parameters()} number of parameters")
print("Model encoder input size: ", __encoder_input__.size())
print("Model decoder input size: ", __decoder_input__.size())
print("Model output type: ", type(__out__))
print("Model output size: ", __out__.logits.size())
print("Model output:\n", __out__.logits)

## Model Training

In [None]:
# Build the data loaders from the cleaned splits. The Hinglish tokenizer is
# passed as the encoder tokenizer and the English tokenizer as the decoder
# tokenizer. The result is indexed by a string key in the next cell and is
# displayed here for inspection.
data_loaders = get_data_loader_models(
    train_df=train_df,
    validation_df=validation_df,
    test_df=test_df,
    encoder_tokenizer=hi_en_bart_tokenizer,
    decoder_tokenizer=en_bart_tokenizer
)
data_loaders

In [None]:
# Unpack the four values stored under the "hi_en__en" key.
# NOTE(review): this relies on the inner mapping holding exactly four entries in
# this order — confirm against get_data_loader_models.
# NOTE(review): `train_data_laoder` is a misspelling of "loader"; it is consumed
# under the same spelling when the model is constructed, so rename both together.
data_loader, train_data_laoder, validation_data_loader, test_data_loader = data_loaders["hi_en__en"].values()

In [None]:
# Assemble the translation model wrapper from the three data loaders and the
# encoder/decoder tokenizers.
# `trainable_layers` lists parameter names; presumably only these are updated
# during training and everything else stays frozen — confirm in CodeMixedModel.
mt_model = CodeMixedModel(
    train_data_loader=train_data_laoder,
    validation_data_loader=validation_data_loader,
    test_data_loader=test_data_loader,
    encoder_tokenizer=hi_en_bart_tokenizer,
    decoder_tokenizer=en_bart_tokenizer,
    trainable_layers=[
        "model.shared.weight",
        # Positional-embedding weights are currently not listed; uncomment to include them:
        # "model.encoder.embed_positions.weight",
        # "model.decoder.embed_positions.weight"
    ]
)

In [None]:
# Fit/train the model. fit() returns two objects, bound here as the trained
# model and the best trained model (presumably the last and the best
# checkpoint respectively — confirm in CodeMixedModel.fit).
trained_mt_model, best_trained_mt_model = mt_model.fit()

In [None]:
# Run the model's test routine (presumably evaluates on the test data loader
# supplied at construction — confirm in CodeMixedModel.test).
mt_model.test()

In [None]:
# Run inference on a single Hinglish example sentence.
mt_model.infer(
    # Optional override left disabled, so infer() uses its own default model:
    # model=mt_model.model,
    src = ["hi, tum kaise ho!"]
)