In [1]:
import transformers   # Transformers library from Huggingface
import torch 

# Load GPT-2 model and the tokenizer from the transformers library 

In [2]:
gpt_tokenizer = transformers.GPT2Tokenizer.from_pretrained('gpt2-large')
# Loading in the pretrained GPT-2 model itself.
gpt_model = transformers.GPT2LMHeadModel.from_pretrained('gpt2-large')

## Function to generate text

In [3]:
def generate_text(prompt_text, tokenizer, model, n_seqs=1, max_length=25):
    # n_seqs is the number of sentences to generate
    # max_length is the maximum length of the sentence
    encoded_prompt = tokenizer.encode(prompt_text, add_special_tokens=False, return_tensors="pt")
    # Encoding the text using the gpt tokenizer. The return tensors are of type "pt" since we are using PyTorch.
    output_sequences = model.generate(
      input_ids=encoded_prompt,
      max_length=max_length+len(encoded_prompt), # The model has to generate something, so add the length of the original sequence to max_length
      temperature=0.7,   # applied to softmax to control the liklihood of high and low frequency words
      
      # Top-K considers only top 6 most likely words from the probability distribution while sampling (Here we          are setting it to 0 and consider Top-p(nucleus) sampling)
      top_k=0,
      # Top-p considers all the words from the distribution with conditional probability less than set value
      top_p=0.9,  
      repetition_penalty=1.2, # To ensure that we dont get repeated phrases
      do_sample=True,
      num_return_sequences=n_seqs   # returns a tuple of tensors containing sequence indices and the generated sequence itself.
   ) # We feed the encoded input into the model.
    
    if len(output_sequences.shape) > 2:
        output_sequences.squeeze_() # the _ indicates that the operation will be done in-place.
    generated_sequences = []
    for generated_sequence_idx, generated_sequence in enumerate(output_sequences):
        generated_sequence = generated_sequence.tolist()  # Covert the output sequence to list.
        text = tokenizer.decode(generated_sequence)  # Decode and convert the tokens to text.
        total_sequence = (prompt_text + text[len(tokenizer.decode(encoded_prompt[0], clean_up_tokenization_spaces=True, )) :])
        generated_sequences.append(total_sequence)
        generated_sequences[0] = generated_sequences[0].replace('\n','')
    return generated_sequences

In [4]:
# Time to check our model output.

generated_output = generate_text("electric vehicles",
         gpt_tokenizer,
         gpt_model,
         max_length=200)


Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.


In [5]:
generated_output[0]

'electric vehicles (EVs) and electric buses, which are capable of running on electricity from the grid.The price for electricity is expected to fall by 2 percent this year due to a drop in coal use."If we take into account that coal-fired power stations will be shut down globally at an average rate of one per week by 2030, the carbon emissions from diesel-fuelled vehicles will continue falling," said Tim Buckley, head of fuel analysis for IHS Markit\'s Global Fuel & Energy Outlook 2016.Meanwhile, a report released last month by International Energy Agency forecast global demand for energy would rise by 9 billion tons in 2015 and 1.2 trillion tons in 2020 as economies recover.Europe will remain the biggest market for EVs over the next three years, accounting for 30 percent of new sales, followed by China with 18 percent growth and India with 11 percent, according to IEA data."We\'re seeing a big shift towards cleaner'