# ***0. Installing Dependencies***

In [34]:
!pip install transformers #installing transformers

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/


In [36]:
from transformers import GPT2LMHeadModel, GPT2Tokenizer 
##importing Language model
#importing tokenizer to create pytorch tensors and tokenize sentences


# ***1. Load Model***

In [37]:
tokenizer = GPT2Tokenizer.from_pretrained('gpt2-large') #intialized tokenizer using gpt-large 
model = GPT2LMHeadModel.from_pretrained("gpt2-large", pad_token_id=tokenizer.eos_token_id)

In [38]:
tokenizer.eos_token_id


50256

# ***2. Tokenize sentences***

In [39]:
sentence = input("Tell me a sentence: ")
maxLength = int(input("How many words would you like it to be (whole numbers only): "))
#can input maximum words 
input_ids = tokenizer.encode(sentence, return_tensors ='pt') #returns pytorch tensors

Tell me a sentence: I love coding
How many words would you like it to be (whole numbers only): 200


In [41]:
tokenizer.decode(input_ids[0][1]) #input ids are sentences stored as a pytorch array


' love'

# ***3. Generate and Decode Text***

In [42]:
output = model.generate(input_ids, max_length = maxLength, no_repeat_ngram_size = 2, early_stopping = True)
#generating output text using the model
#takes in sentence tokens(input_ids)
#can specify max lenght, repeat, and early_stopping 
#uses beamsearch with 5 trees


In [43]:
output
#returns generated text as a pytorch tensor token to decode

tensor([[   40,  1842, 19617,    11,   475,   314,  5465,  3597,  2438,    13,
           314,  1842,  3597,  3788,    11,   290,   314,  1101,   407,   257,
         24292,    13,   198,   198,    40,  1101,   257, 11915,    13,   843,
           314,   588,   284,  1486,    13,   887,   314,   836,   470,   588,
          3597,    13,  1406,   314,  1053,   587,  2111,   284,  3785,   503,
           703,   284,  3551,  2438,   326,   314,   460,  1682,   779,    13,
           632,   338,   587,   257,  6531,    13,   383,   717,  1517,   314,
           750,   373,  3551,   257,  1430,   326,   561,  1309,   502,  3551,
           262,  2438,   314,  2227,   284,    13,  3244,   314,  2630,   257,
          1218,  1430,   284,  1309,   262,   717,  1430,  1057,    13,  1320,
           373,   257,  1256,   286,   670,    13,   357,    40,  1053,   635,
          3194,   257,  1178,   584,  4056,   326,  1309,   345,  3551,   534,
           898,  2438,  2014,   198,    13,   764,  

In [44]:
tokenizer.decode(output[0], skip_special_tokens = True) 

"I love coding, but I hate writing code. I love writing software, and I'm not a programmer.\n\nI'm a designer. And I like to design. But I don't like writing. So I've been trying to figure out how to write code that I can actually use. It's been a struggle. The first thing I did was write a program that would let me write the code I wanted to. Then I wrote a second program to let the first program run. That was a lot of work. (I've also written a few other programs that let you write your own code.)\n...\nThe second thing that happened was that when I started writing the program, I realized that it was actually a very simple program. There was no real logic. No real code to read. Just a bunch of numbers. A bunch. Of numbers that were all the same size. All the numbers were the size of the smallest number that could be represented in a computer"

# ***4. Output result***

In [49]:
text = tokenizer.decode(output[0], skip_special_tokens = True) 
#stores output in text without any special keywords using the skip_special_tokens


In [50]:
with open('blogposticecream.txt', 'w') as f:
  f.write(text)