In [17]:
import transformers   # Transformers library from Huggingface
import torch 

# Load GPT-2 model and the tokenizer from the transformers library 

In [8]:
# One checkpoint name shared by the tokenizer and the model, so the two cannot drift apart.
MODEL_NAME = 'gpt2-large'

# Tokenizer for the chosen checkpoint (converts text to token ids and back).
gpt_tokenizer = transformers.GPT2Tokenizer.from_pretrained(MODEL_NAME)
# Pretrained GPT-2 weights with the language-modelling head, used for text generation.
gpt_model = transformers.GPT2LMHeadModel.from_pretrained(MODEL_NAME)

Downloading:   0%|          | 0.00/1.04M [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/456k [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/1.36M [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/764 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/3.25G [00:00<?, ?B/s]

## Function to generate text

In [58]:
def generate_text(prompt_text, tokenizer, model, n_seqs=1, max_length=25):
    """Sample one or more continuations of ``prompt_text`` from a language model.

    Args:
        prompt_text: Text the model should continue.
        tokenizer: Object providing ``encode`` and ``decode`` (e.g. a GPT-2 tokenizer).
        model: Object providing ``generate`` (e.g. ``GPT2LMHeadModel``).
        n_seqs: Number of sequences to sample for the prompt.
        max_length: Maximum number of tokens to generate in addition to the prompt.

    Returns:
        A list with one string per generated sequence. Each string is
        ``prompt_text`` followed by the sampled continuation, with all newline
        characters removed.
    """
    # return_tensors="pt" requests PyTorch tensors; the result is a batch holding one sequence.
    encoded_prompt = tokenizer.encode(prompt_text, add_special_tokens=False, return_tensors="pt")

    # The token count is the size of the last dimension. len() on the batch would
    # return the batch size (1), which under-counts any multi-token prompt.
    prompt_token_count = encoded_prompt.shape[-1]

    output_sequences = model.generate(
        input_ids=encoded_prompt,
        # The model has to generate something beyond the prompt, so the prompt's
        # own token count is added on top of the requested budget.
        max_length=max_length + prompt_token_count,
        temperature=1.0,
        top_k=0,                 # 0 turns top-k filtering off; top_p below does the truncation
        top_p=0.9,
        repetition_penalty=1.2,  # discourages repeated phrases
        do_sample=True,
        num_return_sequences=n_seqs,
    )

    if len(output_sequences.shape) > 2:
        output_sequences.squeeze_()  # trailing underscore: the squeeze happens in place

    # The decoded prompt is the same for every sequence, so measure it once.
    prompt_char_count = len(
        tokenizer.decode(encoded_prompt[0], clean_up_tokenization_spaces=True)
    )

    generated_sequences = []
    for generated_sequence in output_sequences:
        # Convert the token ids to a plain list and decode them back to text.
        text = tokenizer.decode(generated_sequence.tolist())
        # Drop the decoded prompt and re-attach the caller's exact prompt string.
        total_sequence = prompt_text + text[prompt_char_count:]
        # Strip newlines from every sequence, not just the first one.
        generated_sequences.append(total_sequence.replace('\n', ''))
    return generated_sequences

In [61]:
# Try the generator: sample a single sequence (n_seqs keeps its default of 1)
# for the prompt "pollution" with max_length=200.
generated_output = generate_text(
    prompt_text="pollution",
    tokenizer=gpt_tokenizer,
    model=gpt_model,
    max_length=200,
)


Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.


In [62]:
# Display the first generated sequence (the only one here, since n_seqs was left at its default of 1).
generated_output[0]

'pollution" or "industrial pollution", and then try to prove that "something is changing". In reality, none of this happens. It\'s very much the opposite: namely, we are doing less destruction now than in any other historical period.But this isn\'t a death-knell, even though it was almost instantaneous! Probably the most important natural thing to be fought against today is energy, especially electric power (electricity is produced by plants). To transform all its use into renewable energy would consume over 30 % of existing production capacity worldwide! The world resources in 2000 were only 20 % of present peak them! For every 15 GW installed, there was only 5% reduction in demand for electricity. But as speeded up, over 30 % more production capacity gets extinguished each year! Any question about rising prices? Well, we are producing more energy already than we will spend on new hydrocarbons in 2050 anyway (we can use nukes), so why should we make any effort?'