# Model Dependency

In [1]:
! pip install simpletransformers

Collecting simpletransformers
  Downloading simpletransformers-0.63.9-py3-none-any.whl (250 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m250.5/250.5 kB[0m [31m1.4 MB/s[0m eta [36m0:00:00[0ma [36m0:00:01[0m
Collecting streamlit
  Downloading streamlit-1.14.0-py2.py3-none-any.whl (9.2 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m9.2/9.2 MB[0m [31m23.7 MB/s[0m eta [36m0:00:00[0m00:01[0m00:01[0m
[?25hCollecting seqeval
  Downloading seqeval-1.2.2.tar.gz (43 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m43.6/43.6 kB[0m [31m4.0 MB/s[0m eta [36m0:00:00[0m
[?25h  Preparing metadata (setup.py) ... [?25ldone
Collecting watchdog
  Downloading watchdog-2.1.9-py3-none-manylinux2014_x86_64.whl (78 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m78.4/78.4 kB[0m [31m7.9 MB/s[0m eta [36m0:00:00[0m
Collecting pydeck>=0.1.dev5
  Downloading pydeck-0.8.0-py2.py3-none-any.whl (4.7 MB)
[2K     

# Model Training

In [2]:
import logging
import pandas as pd

from simpletransformers.seq2seq import (
  Seq2SeqModel,
  Seq2SeqArgs,
)

logging.basicConfig(level=logging.INFO)
transformers_logger = logging.getLogger("transformers")
transformers_logger.setLevel(logging.WARNING)

print("LOADING DATASETS")
train_df = pd.read_csv("../input/modeldatasets/final1_train.tsv", sep='\t').astype(str)
test_df = pd.read_csv("../input/modeldatasets/final1_test.tsv", sep='\t').astype(str)
print("LOADED DATASETS SUCCESSFULLY")


train_df["prefix"] = ""
test_df["prefix"] = ""

model_args = Seq2SeqArgs()
model_args.max_seq_length = 56
# model_args.max_length = 20
model_args.train_batch_size = 4
model_args.eval_batch_size = 4
# model_args.rag_embed_batch_size = 8
model_args.num_train_epochs = 5
model_args.no_save = True
model_args.evaluate_generated_text = True
model_args.evaluate_during_training = True
model_args.evaluate_during_training_verbose = True

# Initialize model
model = Seq2SeqModel(
    encoder_decoder_type="mbart",
    encoder_decoder_name="facebook/mbart-large-50-one-to-many-mmt",
    args=model_args,
    use_cuda=True,
)

def count_matches(labels, preds):
    print(labels)
    print(preds)
    return sum(
        [
            1 if label == pred else 0
            for label, pred in zip(labels, preds)
        ]
    )
print("------TRAINING------")
# Train the model
model.train_model(train_df, output_dir="./Outputs/", eval_data=test_df)
results = model.eval_model(test_df)
print("-------------TRAINING DONE---------")

LOADING DATASETS
LOADED DATASETS SUCCESSFULLY


Downloading:   0%|          | 0.00/1.39k [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/2.28G [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/4.83M [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/717 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/528 [00:00<?, ?B/s]

The tokenizer class you load from this checkpoint is not the same type as the class this function is called from. It may result in unexpected tokenization. 
The tokenizer class you load from this checkpoint is 'MBart50Tokenizer'. 
The class this function is called from is 'MBartTokenizer'.
Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.


------TRAINING------


  0%|          | 0/10000 [00:00<?, ?it/s]

`prepare_seq2seq_batch` is deprecated and will be removed in version 5 of HuggingFace Transformers. Use the regular
`__call__` method to prepare your inputs and the tokenizer under the `as_target_tokenizer` context manager to prepare
your targets.

Here is a short example:

model_inputs = tokenizer(src_texts, ...)
with tokenizer.as_target_tokenizer():
    labels = tokenizer(tgt_texts, ...)
model_inputs["labels"] = labels["input_ids"]

See the documentation of your specific tokenizer for more details on the specific arguments to the tokenizer of choice.
For a more complete example, see the implementation of `prepare_seq2seq_batch`.



Epoch:   0%|          | 0/5 [00:00<?, ?it/s]

Running Epoch 0 of 5:   0%|          | 0/2500 [00:00<?, ?it/s]

  0%|          | 0/2000 [00:00<?, ?it/s]

`prepare_seq2seq_batch` is deprecated and will be removed in version 5 of HuggingFace Transformers. Use the regular
`__call__` method to prepare your inputs and the tokenizer under the `as_target_tokenizer` context manager to prepare
your targets.

Here is a short example:

model_inputs = tokenizer(src_texts, ...)
with tokenizer.as_target_tokenizer():
    labels = tokenizer(tgt_texts, ...)
model_inputs["labels"] = labels["input_ids"]

See the documentation of your specific tokenizer for more details on the specific arguments to the tokenizer of choice.
For a more complete example, see the implementation of `prepare_seq2seq_batch`.



Generating outputs:   0%|          | 0/500 [00:00<?, ?it/s]

`prepare_seq2seq_batch` is deprecated and will be removed in version 5 of HuggingFace Transformers. Use the regular
`__call__` method to prepare your inputs and the tokenizer under the `as_target_tokenizer` context manager to prepare
your targets.

Here is a short example:

model_inputs = tokenizer(src_texts, ...)
with tokenizer.as_target_tokenizer():
    labels = tokenizer(tgt_texts, ...)
model_inputs["labels"] = labels["input_ids"]

See the documentation of your specific tokenizer for more details on the specific arguments to the tokenizer of choice.
For a more complete example, see the implementation of `prepare_seq2seq_batch`.



  0%|          | 0/2000 [00:00<?, ?it/s]

Generating outputs:   0%|          | 0/500 [00:00<?, ?it/s]

Running Epoch 1 of 5:   0%|          | 0/2500 [00:00<?, ?it/s]

  0%|          | 0/2000 [00:00<?, ?it/s]

Generating outputs:   0%|          | 0/500 [00:00<?, ?it/s]

  0%|          | 0/2000 [00:00<?, ?it/s]

Generating outputs:   0%|          | 0/500 [00:00<?, ?it/s]

Running Epoch 2 of 5:   0%|          | 0/2500 [00:00<?, ?it/s]

  0%|          | 0/2000 [00:00<?, ?it/s]

Generating outputs:   0%|          | 0/500 [00:00<?, ?it/s]

  0%|          | 0/2000 [00:00<?, ?it/s]

Generating outputs:   0%|          | 0/500 [00:00<?, ?it/s]

Running Epoch 3 of 5:   0%|          | 0/2500 [00:00<?, ?it/s]

  0%|          | 0/2000 [00:00<?, ?it/s]

Generating outputs:   0%|          | 0/500 [00:00<?, ?it/s]

  0%|          | 0/2000 [00:00<?, ?it/s]

Generating outputs:   0%|          | 0/500 [00:00<?, ?it/s]

  0%|          | 0/2000 [00:00<?, ?it/s]

Generating outputs:   0%|          | 0/500 [00:00<?, ?it/s]

Running Epoch 4 of 5:   0%|          | 0/2500 [00:00<?, ?it/s]

  0%|          | 0/2000 [00:00<?, ?it/s]

Generating outputs:   0%|          | 0/500 [00:00<?, ?it/s]

  0%|          | 0/2000 [00:00<?, ?it/s]

Generating outputs:   0%|          | 0/500 [00:00<?, ?it/s]

  0%|          | 0/2000 [00:00<?, ?it/s]

Running Evaluation:   0%|          | 0/500 [00:00<?, ?it/s]

Generating outputs:   0%|          | 0/500 [00:00<?, ?it/s]

-------------TRAINING DONE---------


# Dependencies for Model Evaluation

In [13]:
! pip install evaluate
! pip install rouge_score
! pip install jiwer
! pip install sacrebleu

[0mCollecting sacrebleu
  Downloading sacrebleu-2.3.1-py3-none-any.whl (118 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m118.9/118.9 kB[0m [31m687.9 kB/s[0m eta [36m0:00:00[0ma [36m0:00:01[0m
Installing collected packages: sacrebleu
Successfully installed sacrebleu-2.3.1
[0m

# Model Evaluation

In [9]:
# Load inputs and targets
test_df = pd.read_csv("../input/modeldatasets/final1_test.tsv", sep='\t').astype(str)

hindi_truth = test_df.loc[test_df["prefix"] == "translate english to indic"]["target_text"].tolist()
to_hindi = test_df.loc[test_df["prefix"] == "translate english to indic"]["input_text"].tolist()

english_truth = test_df.loc[test_df["prefix"] == "translate indic to english"]["target_text"].tolist()
to_english = test_df.loc[test_df["prefix"] == "translate indic to english"]["input_text"].tolist()

In [11]:
# Expand string to list of strings to pass as references in evaluation metric function calls
def str_to_list_of_str(lang_truth):
    expanded_lang_truth = []
    for t in lang_truth:
        temp = []
        temp.append(t)
        expanded_lang_truth.append(temp)
    return expanded_lang_truth

expanded_hindi_truth = str_to_list_of_str(hindi_truth)
expanded_english_truth = str_to_list_of_str(english_truth)

In [14]:
# Load evaluation models
import evaluate
sacrebleu = evaluate.load("sacrebleu")
chrf = evaluate.load("chrf")
rouge = evaluate.load("rouge")
ter = evaluate.load("ter")
wer = evaluate.load("wer")

Downloading builder script:   0%|          | 0.00/9.01k [00:00<?, ?B/s]

Downloading builder script:   0%|          | 0.00/6.27k [00:00<?, ?B/s]

Downloading builder script:   0%|          | 0.00/9.99k [00:00<?, ?B/s]

Downloading builder script:   0%|          | 0.00/4.49k [00:00<?, ?B/s]

In [15]:
# Load model predictions
hindi_preds = model.predict(to_hindi)
english_preds = model.predict(to_english)

Generating outputs:   0%|          | 0/250 [00:00<?, ?it/s]

`prepare_seq2seq_batch` is deprecated and will be removed in version 5 of HuggingFace Transformers. Use the regular
`__call__` method to prepare your inputs and the tokenizer under the `as_target_tokenizer` context manager to prepare
your targets.

Here is a short example:

model_inputs = tokenizer(src_texts, ...)
with tokenizer.as_target_tokenizer():
    labels = tokenizer(tgt_texts, ...)
model_inputs["labels"] = labels["input_ids"]

See the documentation of your specific tokenizer for more details on the specific arguments to the tokenizer of choice.
For a more complete example, see the implementation of `prepare_seq2seq_batch`.



Generating outputs:   0%|          | 0/250 [00:00<?, ?it/s]

In [16]:
## BLEU
en_hi_results_bleu = sacrebleu.compute(predictions=hindi_preds, references=expanded_hindi_truth)
print("BLEU SCORE--------Orig Target")
print("English to Hindi")
print(en_hi_results_bleu['score'])

hi_en_results_bleu = sacrebleu.compute(predictions=english_preds, references=expanded_english_truth)
print("Hindi to English")
print(hi_en_results_bleu['score'])

BLEU SCORE--------Orig Target
English to Hindi
4.853599662746455
Hindi to English
3.880075272592362


In [17]:
## CHRF++ 
en_hi_results_chrf = chrf.compute(predictions=hindi_preds, references=expanded_hindi_truth, word_order=2)
print("CHRF++ SCORE--------Orig Target")
print("English to Hindi")
print(en_hi_results_chrf['score'])

hi_en_results_chrf = chrf.compute(predictions=english_preds, references=expanded_english_truth, word_order=2)
print("Hindi to English")
print(hi_en_results_chrf['score'])

CHRF++ SCORE--------Orig Target
English to Hindi
21.75876473363467
Hindi to English
20.683955214157788


In [18]:
# ROUGE
en_hi_results_rouge = rouge.compute(predictions=hindi_preds, references=expanded_hindi_truth)
print("Rouge SCORE--------Orig target")
print("English to Hindi")
print(en_hi_results_rouge)

hi_en_results_rouge = rouge.compute(predictions=english_preds, references=expanded_english_truth)
print("Hindi to English")
print(hi_en_results_rouge)

Rouge SCORE--------Orig target
English to Hindi
{'rouge1': 0.10155574691009478, 'rouge2': 0.025887614870509605, 'rougeL': 0.09998813601857079, 'rougeLsum': 0.09962684610075911}
Hindi to English
{'rouge1': 0.3239534319943863, 'rouge2': 0.12533199461291888, 'rougeL': 0.2695351346128553, 'rougeLsum': 0.2689892099611896}


In [19]:
# TER
en_hi_results_ter = ter.compute(predictions=hindi_preds, references=expanded_hindi_truth)
print("TER SCORE--------Orig Target")
print("English to Hindi")
print(en_hi_results_ter['score'])

hi_en_results_ter = ter.compute(predictions=english_preds, references=expanded_english_truth)
print("Hindi to English")
print(hi_en_results_ter['score'])

TER SCORE--------Orig Target
English to Hindi
84.06992607633491
Hindi to English
84.60040949216258


In [20]:
# WER
en_hi_results_wer = wer.compute(predictions=hindi_preds, references=hindi_truth)
print("WER SCORE--------Orig Target")
print("English to Hindi")
print(en_hi_results_wer)

hi_en_results_wer = wer.compute(predictions=english_preds, references=english_truth)
print("Hindi to English")
print(hi_en_results_wer)

WER SCORE--------Orig Target
English to Hindi
0.8492637725310993
Hindi to English
0.866512267982412


# Sample Sentences

In [21]:
print(model.predict(["I am Jayant"]))

Generating outputs:   0%|          | 0/1 [00:00<?, ?it/s]

['मैं जयंत हूँ']


In [22]:
print(model.predict(["I am working as a doctor in AIMS hospital, Delhi"]))

Generating outputs:   0%|          | 0/1 [00:00<?, ?it/s]

['मैं दिल्ली के एम्स अस्पताल में डॉक्टर के रूप में काम कर रहा हूं']


In [23]:
print(model.predict(["Outputs for MT5 model"]))

Generating outputs:   0%|          | 0/1 [00:00<?, ?it/s]

['एमटी5 मॉडल के लिए आउटपुट']


In [24]:
print(model.predict(["मुझे अच्छी तरह पीने की आशा करता हूं।"]))

Generating outputs:   0%|          | 0/1 [00:00<?, ?it/s]

['I hope to have a good way to drink.']


In [25]:
print(model.predict(["मैं गणेश सिंह हूं"]))

Generating outputs:   0%|          | 0/1 [00:00<?, ?it/s]

['I am Ganesh Singh']


In [26]:
print(model.predict(["मुझे अपने दोस्तों के साथ क्रिकेट खेलना पसंद है"]))

Generating outputs:   0%|          | 0/1 [00:00<?, ?it/s]

['I like to play cricket with friends']
