# Model Dependencies

In [1]:
# Pinned to the release recorded in this notebook's install log so that a fresh
# kernel resolves the same library version. `%pip` (rather than `! pip`)
# installs into the environment of the running kernel.
%pip install simpletransformers==0.63.9

Collecting simpletransformers
  Downloading simpletransformers-0.63.9-py3-none-any.whl (250 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m250.5/250.5 kB[0m [31m1.0 MB/s[0m eta [36m0:00:00[0ma [36m0:00:01[0m
Collecting seqeval
  Downloading seqeval-1.2.2.tar.gz (43 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m43.6/43.6 kB[0m [31m3.0 MB/s[0m eta [36m0:00:00[0m
[?25h  Preparing metadata (setup.py) ... [?25ldone
Collecting streamlit
  Downloading streamlit-1.14.0-py2.py3-none-any.whl (9.2 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m9.2/9.2 MB[0m [31m17.7 MB/s[0m eta [36m0:00:00[0m00:01[0m00:01[0m
Collecting watchdog
  Downloading watchdog-2.1.9-py3-none-manylinux2014_x86_64.whl (78 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m78.4/78.4 kB[0m [31m9.3 MB/s[0m eta [36m0:00:00[0m
Collecting validators>=0.2
  Downloading validators-0.20.0.tar.gz (30 kB)
  Preparing metadata (setu

# Model Training

In [2]:
import logging
import pandas as pd

from simpletransformers.seq2seq import (
  Seq2SeqModel,
  Seq2SeqArgs,
)

# Keep the `transformers` library logger at WARNING and above, while the
# notebook's own root logging is configured at INFO.
transformers_logger = logging.getLogger("transformers")
transformers_logger.setLevel(logging.WARNING)
logging.basicConfig(level=logging.INFO)

print("LOADING DATASETS")
# Both splits are tab-separated files; every column is cast to `str` on load.
# The train file is read first, then the test file.
train_df, test_df = (
    pd.read_csv(tsv_path, sep="\t").astype(str)
    for tsv_path in (
        "../input/modeldatas-te/final1_train.tsv",
        "../input/modeldatas-te/final1_test.tsv",
    )
)
print("LOADED DATASETS SUCCESSFULLY")

# Blank out the `prefix` column on both frames before training.
train_df["prefix"] = test_df["prefix"] = ""

# Start from the Seq2SeqArgs defaults and override only the options below.
model_args = Seq2SeqArgs()
for _option, _value in {
    "max_seq_length": 56,
    "train_batch_size": 4,
    "eval_batch_size": 4,
    "num_train_epochs": 5,
    "no_save": True,
    "evaluate_generated_text": True,
    "evaluate_during_training": True,
    "evaluate_during_training_verbose": True,
}.items():
    setattr(model_args, _option, _value)

# Initialize model
# NOTE(review): the recorded run warns that this checkpoint's tokenizer class is
# 'MBart50Tokenizer' while the "mbart" type loads 'MBartTokenizer' — confirm
# whether a different encoder_decoder_type should be used for this checkpoint.
model = Seq2SeqModel(
    args=model_args,
    use_cuda=True,
    encoder_decoder_type="mbart",
    encoder_decoder_name="facebook/mbart-large-50-one-to-many-mmt",
)

def count_matches(labels, preds):
    """Count the predictions that are exactly equal to their reference label.

    Args:
        labels: Sequence of reference values.
        preds: Sequence of predicted values, aligned position-by-position
            with ``labels``.

    Returns:
        int: Number of positions where ``label == pred``. Pairs are formed
        with ``zip``, so surplus items in the longer sequence are ignored.
    """
    # Deliberately silent: printing the complete label/prediction lists on
    # every call floods the notebook output without adding information.
    return sum(label == pred for label, pred in zip(labels, preds))
print("------TRAINING------")
# Train the model.
# Extra keyword arguments are forwarded as metric functions that are applied to
# the generated text, so `matches=count_matches` makes use of the sequences that
# `evaluate_generated_text = True` already produces at every evaluation.
model.train_model(
    train_df,
    output_dir="./Outputs/",
    eval_data=test_df,
    matches=count_matches,
)
results = model.eval_model(test_df, matches=count_matches)
print("-------------TRAINING DONE---------")
# Show the final evaluation metrics instead of discarding them.
print(results)

LOADING DATASETS
LOADED DATASETS SUCCESSFULLY


Downloading:   0%|          | 0.00/1.39k [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/2.28G [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/4.83M [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/717 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/528 [00:00<?, ?B/s]

The tokenizer class you load from this checkpoint is not the same type as the class this function is called from. It may result in unexpected tokenization. 
The tokenizer class you load from this checkpoint is 'MBart50Tokenizer'. 
The class this function is called from is 'MBartTokenizer'.
Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.


------TRAINING------


  0%|          | 0/10000 [00:00<?, ?it/s]

`prepare_seq2seq_batch` is deprecated and will be removed in version 5 of HuggingFace Transformers. Use the regular
`__call__` method to prepare your inputs and the tokenizer under the `as_target_tokenizer` context manager to prepare
your targets.

Here is a short example:

model_inputs = tokenizer(src_texts, ...)
with tokenizer.as_target_tokenizer():
    labels = tokenizer(tgt_texts, ...)
model_inputs["labels"] = labels["input_ids"]

See the documentation of your specific tokenizer for more details on the specific arguments to the tokenizer of choice.
For a more complete example, see the implementation of `prepare_seq2seq_batch`.



Epoch:   0%|          | 0/5 [00:00<?, ?it/s]

Running Epoch 0 of 5:   0%|          | 0/2500 [00:00<?, ?it/s]

  0%|          | 0/2000 [00:00<?, ?it/s]

`prepare_seq2seq_batch` is deprecated and will be removed in version 5 of HuggingFace Transformers. Use the regular
`__call__` method to prepare your inputs and the tokenizer under the `as_target_tokenizer` context manager to prepare
your targets.

Here is a short example:

model_inputs = tokenizer(src_texts, ...)
with tokenizer.as_target_tokenizer():
    labels = tokenizer(tgt_texts, ...)
model_inputs["labels"] = labels["input_ids"]

See the documentation of your specific tokenizer for more details on the specific arguments to the tokenizer of choice.
For a more complete example, see the implementation of `prepare_seq2seq_batch`.



Generating outputs:   0%|          | 0/500 [00:00<?, ?it/s]

`prepare_seq2seq_batch` is deprecated and will be removed in version 5 of HuggingFace Transformers. Use the regular
`__call__` method to prepare your inputs and the tokenizer under the `as_target_tokenizer` context manager to prepare
your targets.

Here is a short example:

model_inputs = tokenizer(src_texts, ...)
with tokenizer.as_target_tokenizer():
    labels = tokenizer(tgt_texts, ...)
model_inputs["labels"] = labels["input_ids"]

See the documentation of your specific tokenizer for more details on the specific arguments to the tokenizer of choice.
For a more complete example, see the implementation of `prepare_seq2seq_batch`.



  0%|          | 0/2000 [00:00<?, ?it/s]

Generating outputs:   0%|          | 0/500 [00:00<?, ?it/s]

Running Epoch 1 of 5:   0%|          | 0/2500 [00:00<?, ?it/s]

  0%|          | 0/2000 [00:00<?, ?it/s]

Generating outputs:   0%|          | 0/500 [00:00<?, ?it/s]

  0%|          | 0/2000 [00:00<?, ?it/s]

Generating outputs:   0%|          | 0/500 [00:00<?, ?it/s]

Running Epoch 2 of 5:   0%|          | 0/2500 [00:00<?, ?it/s]

  0%|          | 0/2000 [00:00<?, ?it/s]

Generating outputs:   0%|          | 0/500 [00:00<?, ?it/s]

  0%|          | 0/2000 [00:00<?, ?it/s]

Generating outputs:   0%|          | 0/500 [00:00<?, ?it/s]

Running Epoch 3 of 5:   0%|          | 0/2500 [00:00<?, ?it/s]

  0%|          | 0/2000 [00:00<?, ?it/s]

Generating outputs:   0%|          | 0/500 [00:00<?, ?it/s]

  0%|          | 0/2000 [00:00<?, ?it/s]

Generating outputs:   0%|          | 0/500 [00:00<?, ?it/s]

  0%|          | 0/2000 [00:00<?, ?it/s]

Generating outputs:   0%|          | 0/500 [00:00<?, ?it/s]

Running Epoch 4 of 5:   0%|          | 0/2500 [00:00<?, ?it/s]

  0%|          | 0/2000 [00:00<?, ?it/s]

Generating outputs:   0%|          | 0/500 [00:00<?, ?it/s]

  0%|          | 0/2000 [00:00<?, ?it/s]

Generating outputs:   0%|          | 0/500 [00:00<?, ?it/s]

  0%|          | 0/2000 [00:00<?, ?it/s]

Running Evaluation:   0%|          | 0/500 [00:00<?, ?it/s]

Generating outputs:   0%|          | 0/500 [00:00<?, ?it/s]

-------------TRAINING DONE---------


# Dependencies for Model Evaluation

In [3]:
# Evaluation packages, installed ahead of the metric loading further down.
# Versions are pinned to those recorded in this notebook's install log; `%pip`
# installs into the environment of the running kernel.
%pip install evaluate==0.3.0
%pip install rouge_score==0.1.2
%pip install jiwer==2.5.1
# NOTE(review): the captured install log is cut off before sacrebleu, so its
# version is not known here — pin it once confirmed.
%pip install sacrebleu

Collecting evaluate
  Downloading evaluate-0.3.0-py3-none-any.whl (72 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m72.9/72.9 kB[0m [31m565.3 kB/s[0m eta [36m0:00:00[0ma [36m0:00:01[0m
Installing collected packages: evaluate
Successfully installed evaluate-0.3.0
[0mCollecting rouge_score
  Downloading rouge_score-0.1.2.tar.gz (17 kB)
  Preparing metadata (setup.py) ... [?25ldone
Building wheels for collected packages: rouge_score
  Building wheel for rouge_score (setup.py) ... [?25ldone
[?25h  Created wheel for rouge_score: filename=rouge_score-0.1.2-py3-none-any.whl size=24955 sha256=e7a0a4010cd04c33ce16979b5231881735d1842e82aeee10824f5771cf22f2a4
  Stored in directory: /root/.cache/pip/wheels/84/ac/6b/38096e3c5bf1dc87911e3585875e21a3ac610348e740409c76
Successfully built rouge_score
Installing collected packages: rouge_score
Successfully installed rouge_score-0.1.2
[0mCollecting jiwer
  Downloading jiwer-2.5.1-py3-none-any.whl (15 kB)
Collecting levens

# Model Evaluation

In [4]:
# Load inputs and targets for evaluation, split by translation direction.
# The `hindi_*` names stand for the Indic side in general: hindi, telugu and
# "indic language" are treated as interchangeable labels here.
test_df = pd.read_csv("../input/modeldatas-te/final1_test.tsv", sep="\t").astype(str)

# Row selectors for each direction, taken from the `prefix` column.
en_to_indic_rows = test_df["prefix"] == "translate english to indic"
indic_to_en_rows = test_df["prefix"] == "translate indic to english"

hindi_truth = test_df.loc[en_to_indic_rows, "target_text"].tolist()
to_hindi = test_df.loc[en_to_indic_rows, "input_text"].tolist()

english_truth = test_df.loc[indic_to_en_rows, "target_text"].tolist()
to_english = test_df.loc[indic_to_en_rows, "input_text"].tolist()

In [5]:
def str_to_list_of_str(lang_truth):
    """Wrap every reference string in its own single-element list.

    The result is the shape passed as ``references`` to the evaluation metric
    calls: one list of references per prediction.

    Args:
        lang_truth: Iterable of reference strings.

    Returns:
        list: ``[[s] for s in lang_truth]``, in the original order.
    """
    return [[reference] for reference in lang_truth]

# One single-reference list per target sentence, for each direction.
expanded_hindi_truth, expanded_english_truth = (
    str_to_list_of_str(truth) for truth in (hindi_truth, english_truth)
)

In [6]:
# Load evaluation metrics.
# `evaluate` is imported here rather than at the top because it is installed by
# the preceding dependency cell.
import evaluate

sacrebleu, chrf, rouge, ter, wer = (
    evaluate.load(metric_name)
    for metric_name in ("sacrebleu", "chrf", "rouge", "ter", "wer")
)

Downloading builder script:   0%|          | 0.00/8.15k [00:00<?, ?B/s]

Downloading builder script:   0%|          | 0.00/9.01k [00:00<?, ?B/s]

Downloading builder script:   0%|          | 0.00/6.27k [00:00<?, ?B/s]

Downloading builder script:   0%|          | 0.00/9.99k [00:00<?, ?B/s]

Downloading builder script:   0%|          | 0.00/4.49k [00:00<?, ?B/s]

In [7]:
# Generate model predictions for both evaluation input lists
# (the English-to-Indic inputs first, then the Indic-to-English inputs).
hindi_preds, english_preds = (
    model.predict(source_sentences) for source_sentences in (to_hindi, to_english)
)

Generating outputs:   0%|          | 0/250 [00:00<?, ?it/s]

`prepare_seq2seq_batch` is deprecated and will be removed in version 5 of HuggingFace Transformers. Use the regular
`__call__` method to prepare your inputs and the tokenizer under the `as_target_tokenizer` context manager to prepare
your targets.

Here is a short example:

model_inputs = tokenizer(src_texts, ...)
with tokenizer.as_target_tokenizer():
    labels = tokenizer(tgt_texts, ...)
model_inputs["labels"] = labels["input_ids"]

See the documentation of your specific tokenizer for more details on the specific arguments to the tokenizer of choice.
For a more complete example, see the implementation of `prepare_seq2seq_batch`.



Generating outputs:   0%|          | 0/250 [00:00<?, ?it/s]

In [8]:
## BLEU
# sacreBLEU score for each translation direction, against the original targets.
en_hi_results_bleu = sacrebleu.compute(
    references=expanded_hindi_truth,
    predictions=hindi_preds,
)
print(
    "BLEU SCORE--------Orig Target",
    "English to Telugu",
    en_hi_results_bleu["score"],
    sep="\n",
)

hi_en_results_bleu = sacrebleu.compute(
    references=expanded_english_truth,
    predictions=english_preds,
)
print("Telugu to English", hi_en_results_bleu["score"], sep="\n")

BLEU SCORE--------Orig Target
English to Telugu
1.056984642225879
Telugu to English
2.77165029049143


In [9]:
## CHRF++
# chrF computed with word_order=2 for each translation direction.
en_hi_results_chrf = chrf.compute(
    references=expanded_hindi_truth,
    predictions=hindi_preds,
    word_order=2,
)
print(
    "CHRF++ SCORE--------Orig Target",
    "English to Telugu",
    en_hi_results_chrf["score"],
    sep="\n",
)

hi_en_results_chrf = chrf.compute(
    references=expanded_english_truth,
    predictions=english_preds,
    word_order=2,
)
print("Telugu to English", hi_en_results_chrf["score"], sep="\n")

CHRF++ SCORE--------Orig Target
English to Telugu
13.444375418678565
Telugu to English
16.90747497219693


In [10]:
# ROUGE
# The full result dictionary is printed for each translation direction.
# NOTE(review): confirm that the ROUGE tokenizer in use handles Telugu script;
# if it keeps only ASCII alphanumerics, the English-to-Telugu numbers are not
# meaningful.
en_hi_results_rouge = rouge.compute(
    references=expanded_hindi_truth,
    predictions=hindi_preds,
)
print(
    "Rouge SCORE--------Orig target",
    "English to Telugu",
    en_hi_results_rouge,
    sep="\n",
)

hi_en_results_rouge = rouge.compute(
    references=expanded_english_truth,
    predictions=english_preds,
)
print("Telugu to English", hi_en_results_rouge, sep="\n")

Rouge SCORE--------Orig target
English to Telugu
{'rouge1': 0.10724028055278056, 'rouge2': 0.03443059163059163, 'rougeL': 0.10589457209457213, 'rougeLsum': 0.10652276473526479}
Telugu to English
{'rouge1': 0.2590765244710618, 'rouge2': 0.08451881035925332, 'rougeL': 0.21129069536863224, 'rougeLsum': 0.21097844867621773}


In [11]:
# TER
# TER score for each translation direction, against the original targets.
en_hi_results_ter = ter.compute(
    references=expanded_hindi_truth,
    predictions=hindi_preds,
)
print(
    "TER SCORE--------Orig Target",
    "English to Telugu",
    en_hi_results_ter["score"],
    sep="\n",
)

hi_en_results_ter = ter.compute(
    references=expanded_english_truth,
    predictions=english_preds,
)
print("Telugu to English", hi_en_results_ter["score"], sep="\n")

TER SCORE--------Orig Target
English to Telugu
93.53098603328594
Telugu to English
88.81368364897307


In [12]:
# WER
# Unlike the other metric cells, WER is given the flat reference lists
# (one string per prediction) rather than the expanded list-of-lists form.
en_hi_results_wer = wer.compute(
    references=hindi_truth,
    predictions=hindi_preds,
)
print(
    "WER SCORE--------Orig Target",
    "English to Telugu",
    en_hi_results_wer,
    sep="\n",
)

hi_en_results_wer = wer.compute(
    references=english_truth,
    predictions=english_preds,
)
print("Telugu to English", hi_en_results_wer, sep="\n")

WER SCORE--------Orig Target
English to Telugu
0.9386855799899279
Telugu to English
0.9032075220058682


# Sample Sentences

In [13]:
# Spot check: a short English sentence (the recorded run answered in Telugu).
sample_output = model.predict(["I am the President"])
print(sample_output)

Generating outputs:   0%|          | 0/1 [00:00<?, ?it/s]

['ఈ కార్యక్రమంలో అధ్యక్షుడు మిమ్మీ పదునైన ఈ స్పందిస్ లో ఒక పాక్ష']


In [14]:
# Spot check: a longer English sentence containing named entities.
sample_output = model.predict(["I am working as a doctor in AIMS hospital, Delhi"])
print(sample_output)

Generating outputs:   0%|          | 0/1 [00:00<?, ?it/s]

['ఢిల్లీలోని ఎయిమ్స్ ఆస్పత్రిలో డాక్టర్ గా పని చేస్తున్నాను.']


In [15]:
# Spot check: an English phrase containing a technical term.
sample_output = model.predict(["Outputs for MT5 model"])
print(sample_output)

Generating outputs:   0%|          | 0/1 [00:00<?, ?it/s]

['మెటి5 మోడల్ అవుట్పుట్ లు']


In [16]:
# Spot check: Telugu input.
# NOTE(review): in the recorded run the model returned this input unchanged
# instead of an English translation.
sample_output = model.predict(["మెటి5 మోడల్ అవుట్పుట్ లు"])
print(sample_output)

Generating outputs:   0%|          | 0/1 [00:00<?, ?it/s]

['మెటి5 మోడల్ అవుట్పుట్ లు']


In [17]:
# Spot check: a Telugu sentence (the recorded run answered in English).
sample_output = model.predict(["నాకు టెలివిజన్ చూడటం ఇష్టం"])
print(sample_output)

Generating outputs:   0%|          | 0/1 [00:00<?, ?it/s]

['I love to have a telescope']


In [18]:
# Spot check: a longer Telugu sentence (the recorded run answered in English).
sample_output = model.predict(["నేను రాత్రి ఆలస్యంగా నిద్రపోవడాన్ని ద్వేషిస్తున్నాను"])
print(sample_output)

Generating outputs:   0%|          | 0/1 [00:00<?, ?it/s]

['I deny that I will not be able to sleep in the night']
