# **Install transformers**

In [None]:
# -- Install the Hugging Face transformers library
!pip install transformers

# **TextGen_GreedySearch**

1.   Input text
2.   Language model
3.   Inner probability distribution
4.   Decoding algorithm : Greedy search
5.   Generate output text



In [None]:
# -- Imports: torch, numpy, GPT2Tokenizer, GPT2LMHeadModel
import torch
import numpy as np
from transformers import GPT2Tokenizer, GPT2LMHeadModel

# Greedy-search text generation with GPT-2.
# Pipeline: tokenize the prompt -> run greedy decoding ONCE (keeping the
# per-step scores) -> print per-token transition scores -> decode to text.

# -- A. Create the main objects: the tokenizer (mytokenizer) and the language model (mymodel)
mytokenizer = GPT2Tokenizer.from_pretrained("gpt2")
mymodel = GPT2LMHeadModel.from_pretrained("gpt2")

# -- B. The input text (input_text) and the maximum length of the generated sequence (length).
#       `max_length` counts the prompt tokens too, so at most
#       length - input_ids.shape[1] new tokens are generated.
input_text = "The Olmec colossal heads are at least seventeen monumental stone representations of human heads sculpted from large basalt boulders. The heads date from at least before"
print(f"-0 >>  input_text: {input_text}\n")
length = 128

# -- C. Encoding: 'mytokenizer' translates each token of the input text into its token ID (input_ids).
input_ids = mytokenizer.encode(input_text, return_tensors='pt')
print(f"-1 >>  input_ids: {input_ids}\n")

# -- D. Generation: run greedy search a single time and keep both the sequences and the scores.
#       Greedy decoding is deterministic, so a second generate() call only to obtain the
#       scores would redo the same work and return the same tokens.
output_ids_ss = mymodel.generate(
    input_ids,
    # Single un-padded prompt: every position is a real token.
    attention_mask=torch.ones_like(input_ids),
    max_length=length,
    # Make the decoding strategy explicit: no sampling, no beams == greedy search.
    do_sample=False,
    num_beams=1,
    # GPT-2 has no pad token; name the EOS fallback explicitly instead of relying on
    # generate()'s implicit (and warned-about) default.
    pad_token_id=mytokenizer.eos_token_id,
    return_dict_in_generate=True,
    output_scores=True,
)
# Token IDs of prompt + continuation (same tensor a plain generate() call returns).
output_ids = output_ids_ss.sequences
print(f"-2 >>  output_ids: {output_ids}\n")
print(f"-3 >>  output_ids_ss: {output_ids_ss}\n")

# Transition score of each generated token. With normalize_logits=True the scores are
# normalized per step, so they are log-probabilities and exp(score) is a probability.
output_ids_scores = mymodel.compute_transition_scores(
    output_ids_ss.sequences, output_ids_ss.scores, normalize_logits=True
)
print(f"-4 >>  output_ids_scores: {output_ids_scores}\n")

# -- E. Decoding: 1. Print the scores for each token generated with greedy search

###--inputs_length = the length of the input prompt for decoder-only models / =1 for encoder-decoder models
inputs_length = 1 if mymodel.config.is_encoder_decoder else input_ids.shape[1]
# Drop the prompt so that only newly generated tokens remain (aligned with the scores).
generated_tokens = output_ids_ss.sequences[:, inputs_length:]

###--One table row per generated token (the "logits" column holds the normalized score)
print("-5 >>  generated_tokens")
print("| token_ids | token string | logits | probability")
for tok, score in zip(generated_tokens[0], output_ids_scores[0]):
    token_id = tok.item()
    log_prob = score.item()
    print(f"| {token_id:9d} | {mytokenizer.decode(token_id):12s} | {log_prob:.3f} | {np.exp(log_prob):.2%}")

# -- E. Decoding: 2. Build output text, which includes the input_text
generated_text = mytokenizer.decode(output_ids[0])
print(f"\n-6 >>  Generated text: {generated_text}")
