<a href="https://colab.research.google.com/github/ReynaQuita/NLP/blob/main/Combining_Translation_and_Dialogue.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
! pip install transformers sentencepiece



# **STEPS**

**Translate Chinese to English**

In [2]:
# Checkpoint ids for the two stages of the pipeline: translation and dialogue.
model_translation, model_dialogue = (
    "facebook/mbart-large-50-many-to-many-mmt",
    "microsoft/DialoGPT-medium",
)

In [3]:
from transformers import AutoTokenizer, MBart50TokenizerFast, MBartForConditionalGeneration, AutoModelForCausalLM

In [4]:
# `model_translation` starts out as the checkpoint id (a str) but is rebound to
# the loaded model at the end of this cell. Resolve the id first so the cell
# can be re-run without passing a model object to `from_pretrained`.
translation_checkpoint = (
    model_translation
    if isinstance(model_translation, str)
    else model_translation.config.name_or_path
)
tokenizer_translation = MBart50TokenizerFast.from_pretrained(translation_checkpoint)
model_translation = MBartForConditionalGeneration.from_pretrained(translation_checkpoint)

In [11]:
# Chinese sample sentence used by the step-by-step cells
# (alternative sample: "昨天我去了台北").
article_zh = "你叫什麼名字？"

In [12]:
# Chinese -> English: set the tokenizer's source language to zh_CN, then make
# generation start from the en_XX language-code token.
tokenizer_translation.src_lang = "zh_CN"
encoded_zh = tokenizer_translation(article_zh, return_tensors="pt")
generated_tokens = model_translation.generate(
    forced_bos_token_id=tokenizer_translation.lang_code_to_id["en_XX"],
    **encoded_zh,
)
# A single sentence was encoded, so keep the first decoded string.
translation_zh2en = tokenizer_translation.batch_decode(
    generated_tokens,
    skip_special_tokens=True,
)[0]

In [13]:
# Display the decoded English translation.
translation_zh2en

"What's your name?"

**Generate the Dialogue**

In [8]:
# `model_dialogue` starts out as the checkpoint id (a str) but is rebound to
# the loaded model at the end of this cell. Resolve the id first so the cell
# can be re-run without passing a model object to `from_pretrained`.
dialogue_checkpoint = (
    model_dialogue
    if isinstance(model_dialogue, str)
    else model_dialogue.config.name_or_path
)
tokenizer_dialogue = AutoTokenizer.from_pretrained(dialogue_checkpoint)
model_dialogue = AutoModelForCausalLM.from_pretrained(dialogue_checkpoint)

HBox(children=(FloatProgress(value=0.0, description='Downloading', max=26.0, style=ProgressStyle(description_w…




HBox(children=(FloatProgress(value=0.0, description='Downloading', max=642.0, style=ProgressStyle(description_…




HBox(children=(FloatProgress(value=0.0, description='Downloading', max=1042301.0, style=ProgressStyle(descript…




HBox(children=(FloatProgress(value=0.0, description='Downloading', max=456318.0, style=ProgressStyle(descripti…




HBox(children=(FloatProgress(value=0.0, description='Downloading', max=862955157.0, style=ProgressStyle(descri…




In [14]:
# Encode the English sentence with the tokenizer's EOS token appended.
encoded_dialogue = tokenizer_dialogue.encode(
    translation_zh2en + tokenizer_dialogue.eos_token,
    return_tensors='pt',
)
generated_tokens_dialogue = model_dialogue.generate(
    encoded_dialogue,
    max_length=1000,
    pad_token_id=tokenizer_dialogue.eos_token_id,
)
# Skip the first `encoded_dialogue.shape[-1]` positions (the prompt length)
# and decode only what follows.
generated_dialogue = tokenizer_dialogue.decode(
    generated_tokens_dialogue[:, encoded_dialogue.shape[-1]:][0],
    skip_special_tokens=True,
)

In [15]:
# Display the decoded reply from the dialogue model.
generated_dialogue

"I'm not sure, but I think it's a reference to the movie The Big Lebowski."

**Translate English to Chinese**

In [16]:
# English -> Chinese: set the tokenizer's source language to en_XX, then make
# generation start from the zh_CN language-code token.
tokenizer_translation.src_lang = "en_XX"
encoded_en = tokenizer_translation(generated_dialogue, return_tensors="pt")
generated_tokens = model_translation.generate(
    forced_bos_token_id=tokenizer_translation.lang_code_to_id["zh_CN"],
    **encoded_en,
)
# A single sentence was encoded, so keep the first decoded string.
translation_en2zh = tokenizer_translation.batch_decode(
    generated_tokens,
    skip_special_tokens=True,
)[0]

In [17]:
# Display the reply translated back to Chinese.
translation_en2zh

'我不确定,但我认为这是电影《大列布罗夫斯基》的引用。'

# **Combine Them All**

In [36]:
def _translate_text(text, src_lang, tgt_lang):
  """Translate `text` from `src_lang` to `tgt_lang` (language codes such as "zh_CN").

  Uses the notebook-level `tokenizer_translation` and `model_translation`.
  Side effect: sets `tokenizer_translation.src_lang` to `src_lang`.
  """
  tokenizer_translation.src_lang = src_lang
  encoded = tokenizer_translation(text, return_tensors="pt")
  generated = model_translation.generate(
    **encoded,
    # Start generation from the target-language code token.
    forced_bos_token_id=tokenizer_translation.lang_code_to_id[tgt_lang]
  )
  # One input sentence -> keep the first decoded string.
  return tokenizer_translation.batch_decode(generated, skip_special_tokens=True)[0]


def _generate_reply(prompt):
  """Return the dialogue model's reply to `prompt`, without the prompt tokens.

  Uses the notebook-level `tokenizer_dialogue` and `model_dialogue`.
  """
  prompt_ids = tokenizer_dialogue.encode(prompt + tokenizer_dialogue.eos_token, return_tensors='pt')
  output_ids = model_dialogue.generate(prompt_ids, max_length=1000, pad_token_id=tokenizer_dialogue.eos_token_id)
  # Skip the first `prompt_ids.shape[-1]` positions (the prompt) and decode the rest.
  return tokenizer_dialogue.decode(output_ids[:, prompt_ids.shape[-1]:][0], skip_special_tokens=True)


def combine_translation_dialogue(article_zh):
  """Answer a Chinese sentence in Chinese by pivoting through English.

  Steps: translate `article_zh` from zh_CN to en_XX, generate a reply to the
  English text with the dialogue model, then translate the reply from en_XX
  back to zh_CN. Each intermediate result is printed.

  Args:
    article_zh: The Chinese input sentence.

  Returns:
    The generated reply translated to Chinese.
  """
  print("Input: {}".format(article_zh))

  #translate chinese to english
  translation_zh2en = _translate_text(article_zh, "zh_CN", "en_XX")
  print("Translation from Chinese to English: {}".format(translation_zh2en))

  #generate dialogue
  generated_dialogue = _generate_reply(translation_zh2en)
  print("Generated Dialogue: {}".format(generated_dialogue))

  #translate english to chinese
  translation_en2zh = _translate_text(generated_dialogue, "en_XX", "zh_CN")
  print("Translation from English to Chinese: {}".format(translation_en2zh))

  return translation_en2zh

In [37]:
# Run the full pipeline on a sample question and keep the returned Chinese reply.
test = combine_translation_dialogue(article_zh="明天你要做什么？")

Input: 明天你要做什么？
Translation from Chinese to English: What are you gonna do tomorrow?
Generated Dialogue: I'm going to go to the gym.
Translation from English to Chinese: 我要去健身房。


In [38]:
# Display the value returned by combine_translation_dialogue.
test

'我要去健身房。'

**Calculate The Perplexity**