In [1]:
!pip install transformers -q

In [2]:
from transformers import pipeline, set_seed

  from .autonotebook import tqdm as notebook_tqdm


In [3]:
# Build a text-generation pipeline backed by the pretrained 'gpt2' checkpoint.
generator = pipeline(task='text-generation', model='gpt2')

In [4]:
# Fix the random seed before generating so repeated runs start from the same state.
set_seed(123)

# Ask for three continuations of the prompt with max_length=50; the bare call
# is the last expression, so the list of generations is displayed as the result.
generator(
    "Hey readers, today is",
    max_length=50,
    num_return_sequences=3,
)

Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.


[{'generated_text': "Hey readers, today is not the last time we'll be seeing one of our favorite indie rock bands playing…\n\nSo we do have something news for you!\n\nHuge thanks to our friends at Cencept, the official soundtrack artist"},
 {'generated_text': 'Hey readers, today is Christmas. This is not Christmas, because Christmas is so long and I hope everyone still has the peace of mind of the two million years ago, but rather, this is a year of great things for you on board your journey'},
 {'generated_text': "Hey readers, today is CTA Day!\n\nWe're proud to be hosting a special event on July 26th. Here are all sorts of fun facts you can learn about CTA at your local CTA stop (but don't think that"}]

### Using a transformer model to generate features for training other models

In [23]:
from transformers import GPT2Tokenizer

# Sentence to convert into GPT-2 model input.
text = 'Let us encode this sentence'

# Load the tokenizer that belongs to the pretrained 'gpt2' checkpoint.
tokenizer = GPT2Tokenizer.from_pretrained('gpt2')

# Tokenize the sentence: the text is mapped to integer token ids and returned
# as PyTorch tensors ('pt') together with an attention mask. The mask is all 1s
# here, meaning every token is processed when the encoding is passed to the model.
encoded_input = tokenizer(text, return_tensors='pt')
encoded_input

{'input_ids': tensor([[ 5756,   514, 37773,   428,  6827]]), 'attention_mask': tensor([[1, 1, 1, 1, 1]])}

In [26]:
import torch
from transformers import GPT2Model

# Load the pretrained GPT-2 model.
model = GPT2Model.from_pretrained('gpt2')

# We only want the hidden states as features, so run the forward pass with
# autograd disabled: no computation graph is built and no activations are kept
# around for a backward pass that will never happen.
with torch.no_grad():
    output = model(**encoded_input)

# Shape is [batch_size, num_tokens, embed_dim]
output['last_hidden_state'].shape

torch.Size([1, 5, 768])

In [28]:
# The hidden states computed above can serve as input features:
# we can train a classifier on this GPT-2 based feature representation
# instead of using other methods of encoding words as numbers.