# Flan-T5 

In [13]:
# pip install accelerate
from transformers import T5Tokenizer, T5ForConditionalGeneration

# Load the Flan-T5 base checkpoint. device_map="auto" decides where the
# weights are placed (this is what the accelerate package is needed for).
tokenizer = T5Tokenizer.from_pretrained("google/flan-t5-base")
model = T5ForConditionalGeneration.from_pretrained("google/flan-t5-base", device_map="auto")

# Send the prompt to whichever device the model ended up on instead of a
# hard-coded "cuda", so the cell also runs when no GPU is available.
input_text = "translate English to German: How old are you?"
input_ids = tokenizer(input_text, return_tensors="pt").input_ids.to(model.device)

# decode() is called without skip_special_tokens, so markers such as <pad>
# and </s> appear in the printed text.
outputs = model.generate(input_ids)
print(tokenizer.decode(outputs[0]))

Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.


<pad> Wie old sind Sie?</s>


## Tokenization

In [14]:
# Arabic prompt, encoded with the tokenizer that is currently loaded.
input_text = "أكمل البيت الشعري: تلك العيون التي في عينها حور"
# Follow the model's device rather than assuming a GPU is present.
input_ids = tokenizer(input_text, return_tensors="pt").input_ids.to(model.device)

In [16]:
# Exploratory: list every attribute and method name exposed by the tokenizer object.
dir(tokenizer)

['SPECIAL_TOKENS_ATTRIBUTES',
 '__annotations__',
 '__call__',
 '__class__',
 '__delattr__',
 '__dict__',
 '__dir__',
 '__doc__',
 '__eq__',
 '__format__',
 '__ge__',
 '__getattribute__',
 '__getstate__',
 '__gt__',
 '__hash__',
 '__init__',
 '__init_subclass__',
 '__le__',
 '__len__',
 '__lt__',
 '__module__',
 '__ne__',
 '__new__',
 '__reduce__',
 '__reduce_ex__',
 '__repr__',
 '__setattr__',
 '__setstate__',
 '__sizeof__',
 '__str__',
 '__subclasshook__',
 '__weakref__',
 '_add_eos_if_not_present',
 '_add_tokens',
 '_added_tokens_decoder',
 '_added_tokens_encoder',
 '_additional_special_tokens',
 '_auto_class',
 '_batch_encode_plus',
 '_batch_prepare_for_model',
 '_bos_token',
 '_call_one',
 '_cls_token',
 '_compile_jinja_template',
 '_convert_id_to_token',
 '_convert_token_to_id',
 '_convert_token_to_id_with_added_voc',
 '_create_repo',
 '_decode',
 '_decode_use_source_tokenizer',
 '_encode_plus',
 '_eos_token',
 '_eventual_warn_about_too_long_sequence',
 '_eventually_correct_t5_ma

In [15]:
# Show the raw token ids first, then what the model generates from them.
# NOTE(review): relies on `input_ids` left in the kernel by an earlier cell.
print(input_ids)
outputs = model.generate(input_ids)
decoded_text = tokenizer.decode(outputs[0])
print(decoded_text)

tensor([[ 3,  2,  3,  2,  3,  2, 10,  3,  2,  3,  2,  3,  2,  3,  2,  3,  2,  3,
          2,  1]], device='cuda:0')
<pad> <unk> <unk> <unk> <unk> <unk> <unk> <unk> <unk> <unk>


In [6]:
# English prompt asking for an Arabic translation.
input_text = "Translate to Arabic: Let's burn down this city"
# Place the ids on the model's own device rather than a hard-coded "cuda",
# so the cell does not fail on a machine without a GPU.
input_ids = tokenizer(input_text, return_tensors="pt").input_ids.to(model.device)
print(input_ids)
# Special tokens are kept in the decoded text (no skip_special_tokens).
outputs = model.generate(input_ids)
print(tokenizer.decode(outputs[0]))

tensor([[30355,    15,    12, 19248,    10,  1563,    31,     7,  5958,   323,
            48,   690,     1]], device='cuda:0')
<pad> <unk> <unk> <unk> <unk> <unk></s>


In [7]:
# Display the complete tokenizer output for the current input_text, not only its input_ids.
tokenizer(input_text, return_tensors="pt")

{'input_ids': tensor([[30355,    15,    12, 19248,    10,  1563,    31,     7,  5958,   323,
            48,   690,     1]]), 'attention_mask': tensor([[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1]])}

# mT0

In [37]:
from transformers import AutoModelForSeq2SeqLM, AutoTokenizer

# Switch to the multilingual mT0 base checkpoint. Note that this rebinds the
# notebook-wide `tokenizer` and `model` names.
checkpoint = "bigscience/mt0-base"

tokenizer = AutoTokenizer.from_pretrained(checkpoint)
# device_map="auto" picks the device, so inputs below follow model.device.
model = AutoModelForSeq2SeqLM.from_pretrained(checkpoint, torch_dtype="auto", device_map="auto")

# Move the encoded prompt to the model's device instead of a hard-coded
# "cuda", which would raise when no GPU is available.
inputs = tokenizer.encode("Translate to English: Je t’aime.", return_tensors="pt").to(model.device)
outputs = model.generate(inputs)
print(tokenizer.decode(outputs[0]))

<pad> I love you.</s>


In [38]:
# Same French sentence, this time asking for Arabic. Inputs follow the
# model's device rather than assuming "cuda".
inputs = tokenizer.encode("Translate to Arabic: Je t’aime.", return_tensors="pt").to(model.device)
outputs = model.generate(inputs)
print(tokenizer.decode(outputs[0]))

<pad> أحبك.</s>


In [41]:
# Ask the model to complete an Arabic verse. Inputs follow the model's
# device rather than assuming "cuda".
inputs = tokenizer.encode("Finish the following verse:ان العيون التي في طرفها حور ..", return_tensors="pt").to(model.device)
outputs = model.generate(inputs)
print(tokenizer.decode(outputs[0]))

<pad> حور ينظر إلى عينه.</s>


# AraT5

In [42]:
from transformers import T5Tokenizer, AutoModelForSeq2SeqLM

# Tokenizer and model are both loaded from the same AraT5 v2 checkpoint;
# this rebinds the notebook-wide `tokenizer` and `model` names.
arat5_checkpoint = "UBC-NLP/AraT5v2-base-1024"
tokenizer = T5Tokenizer.from_pretrained(arat5_checkpoint)
model = AutoModelForSeq2SeqLM.from_pretrained(arat5_checkpoint)
model = model.to("cuda")

# The prompt ends with the <extra_id_0> sentinel token.
ar_prompt = "عاصمة ألمانيا هي <extra_id_0> "
encoding = tokenizer(ar_prompt, return_tensors="pt")
input_ids = encoding.input_ids.to("cuda")
outputs = model.generate(input_ids)

# Print the tokenization of the prompt, then the generation with special tokens stripped.
prompt_tokens = tokenizer.tokenize(ar_prompt)
print("Tokenized input:", prompt_tokens)
decoded_text = tokenizer.decode(outputs[0], skip_special_tokens=True)
print("Decoded output:", decoded_text)

Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.


Tokenized input: ['▁عاصمة', '▁ألمانيا', '▁هي', '<extra_id_0>']
Decoded output: برلين


In [69]:
# Verse-completion prompt ending with the <extra_id_0> sentinel token.
prompt = "اكمل البيت الشعري: ان العيون التي في طرفها حور قتلننا <extra_id_0> "
encoding = tokenizer(prompt, return_tensors="pt")
inputs = encoding.input_ids.to("cuda")
# The result is kept in `outputs` for a later cell to decode.
outputs = model.generate(inputs)

In [70]:
# Report how the prompt was tokenized, then the generated text with
# special tokens stripped.
# NOTE(review): relies on `prompt` and `outputs` left in the kernel by an earlier cell.
prompt_tokens = tokenizer.tokenize(prompt)
print("Tokenized input:", prompt_tokens)
decoded_text = tokenizer.decode(outputs[0], skip_special_tokens=True)
print("Decoded output:", decoded_text)

Tokenized input: ['▁اكمل', '▁البيت', '▁الشعري', ':', '▁ان', '▁العيون', '▁التي', '▁في', '▁طرف', 'ها', '▁حور', '▁قتل', 'ننا', '<extra_id_0>']
Decoded output: ولم يحيين
