## GPT-2

In [None]:
from itertools import islice

import tensorflow as tf
import torch
from transformers import AutoModelForCausalLM, GPT2LMHeadModel, GPT2Tokenizer

In [None]:
# Load the pretrained "gpt2" checkpoint: first the tokenizer, then the LM-head model.
tokenizer = GPT2Tokenizer.from_pretrained(
    "gpt2",
)

# The tokenizer's end-of-sequence id is handed to the model config as its padding id.
model = GPT2LMHeadModel.from_pretrained(
    "gpt2",
    pad_token_id=tokenizer.eos_token_id,
)

In [None]:
# Sample one long continuation of a fixed prompt with GPT-2 and append it to gpt2.txt.

# Prompt text; the bracketed citation markers such as [4] are part of the input.
prompt = 'The domestic dog (Canis familiaris or Canis lupus familiaris)[4] is a domesticated wolf. The dog descended from an ancient, extinct wolf,[5][6] with the modern grey wolf being the nearest living relative.[7] The dog was the first species to be domesticated,[8][7] by hunter–gatherers over 15,000 years ago,[6] before the development of agriculture.'

# Encode the prompt as a PyTorch tensor of token ids.
input_ids = tokenizer.encode(prompt, return_tensors='pt')

# The model is the PyTorch GPT2LMHeadModel, so sampling draws from torch's RNG.
# Seeding TensorFlow does not affect it; seed torch to make the sample reproducible.
torch.manual_seed(5)

# GPT-2 has a fixed number of position embeddings (model.config.n_positions).
# Asking for more tokens than that indexes past the embedding table, so cap the
# requested upper bound at the model's limit.
max_length = min(1500, model.config.n_positions)

sample_output = model.generate(
    input_ids,
    do_sample=True,
    min_length=1000,
    max_length=max_length,
    # nucleus (top_p) and top_k filtering are applied together
    top_p=0.95,
    top_k=40,
)

# Decoded text contains the prompt followed by the sampled continuation.
output = tokenizer.decode(sample_output[0], skip_special_tokens=True)

# Open the file only once there is something to write, and let the context
# manager close it even if a write fails. Append mode keeps earlier samples.
with open("gpt2.txt", "a", encoding="utf-8") as f:
    f.write(output)
    f.write('\nEND\n')


# Runtime: 3min 42s for one sample
# Extrapolated: 10,000 articles ≈ 616.67 hours

## distilGPT-2

In [None]:
import tensorflow as tf
from transformers import AutoTokenizer, AutoModelWithLMHead

In [None]:
# Load the pretrained "distilgpt2" checkpoint (tokenizer first, the model needs its EOS id).
tokenizer = AutoTokenizer.from_pretrained("distilgpt2")

# AutoModelForCausalLM is the supported replacement for the deprecated
# AutoModelWithLMHead when loading a left-to-right language model.
# pad_token_id is set to the EOS id, matching how the GPT-2 model is loaded in
# this notebook, so generate() has an explicit padding token.
model = AutoModelForCausalLM.from_pretrained(
    "distilgpt2",
    pad_token_id=tokenizer.eos_token_id,
)

In [None]:
# Sample one continuation of a fixed prompt with distilGPT-2 and append it to distil_gpt2.txt.

# Prompt text; the bracketed citation markers such as [4] are part of the input.
prompt = 'The domestic dog (Canis familiaris or Canis lupus familiaris)[4] is a domesticated wolf. The dog descended from an ancient, extinct wolf,[5][6] with the modern grey wolf being the nearest living relative.[7] The dog was the first species to be domesticated,[8][7] by hunter–gatherers over 15,000 years ago,[6] before the development of agriculture.'

# Encode the prompt as a PyTorch tensor of token ids.
input_ids = tokenizer.encode(prompt, return_tensors='pt')

# Inputs are PyTorch tensors, so sampling uses torch's RNG; a TensorFlow seed
# would not make this reproducible.
torch.manual_seed(5)

sample_output = model.generate(
    input_ids,
    do_sample=True,
    min_length=200,
    max_length=500,
    # nucleus (top_p) and top_k filtering are applied together
    top_p=0.95,
    top_k=40,
    # explicit padding id so generation does not depend on how the model was loaded
    pad_token_id=tokenizer.eos_token_id,
)

# Decoded text contains the prompt followed by the sampled continuation.
output = tokenizer.decode(sample_output[0], skip_special_tokens=True)

# Open the file only once there is something to write; the context manager
# closes it even if a write fails. Append mode keeps earlier samples.
with open("distil_gpt2.txt", "a", encoding="utf-8") as f:
    f.write(output)
    f.write('\nEND\n')


# Runtime: 2min 56s for one sample
# Extrapolated: 10,000 articles ≈ 489 hours

## distilBERT

In [None]:
from transformers import AutoTokenizer, AutoModelForMaskedLM

In [None]:
# Load the pretrained "distilbert-base-uncased" checkpoint: tokenizer, then the
# masked-language-model variant of the network.
tokenizer = AutoTokenizer.from_pretrained(
    "distilbert-base-uncased",
)

model = AutoModelForMaskedLM.from_pretrained(
    "distilbert-base-uncased",
)

In [None]:
%%time

prompt = 'hi, this is a test.'

input_ids = tokenizer.encode(prompt, return_tensors='pt')
    
tf.random.set_seed(0)
    
sample_output = model.generate(
    input_ids, 
    do_sample=True, 
    min_length=200,
    max_length=300,
    top_k=40
)
    
output = tokenizer.decode(sample_output[0], skip_special_tokens=True)

print(output)

# Unusable output (DistilBERT is a masked language model, not a left-to-right text generator)
# Runtime: 2min 40s for one sample

## Using BBC news & distilGPT-2

In [None]:
# pip3 install datasets

from datasets import load_dataset

# Download (or reuse the cached copy of) the "xsum" dataset; splits are looked up by name below.
dataset = load_dataset(
    "xsum",
)

In [None]:
# Materialise each split as a plain dict of column lists, in train / test / validation order.
train_dict, test_dict, val_dict = (
    dataset[split_name].to_dict()
    for split_name in ("train", "test", "validation")
)

# Pool the article bodies of all three splits into a single list, keeping that order.
docs = [
    *train_dict["document"],
    *test_dict["document"],
    *val_dict["document"],
]

In [None]:
import spacy

# Build a synthetic-text corpus: for every article in `docs`, take its first
# sentences as a prompt, let distilGPT-2 continue it, and append only the
# continuation to dataset.txt.

# This section is about distilGPT-2, so load it here explicitly. On a
# top-to-bottom run the most recent `tokenizer` / `model` bindings are the
# DistilBERT masked-LM ones, which would silently be used otherwise.
tokenizer = AutoTokenizer.from_pretrained("distilgpt2")
model = AutoModelForCausalLM.from_pretrained(
    "distilgpt2",
    pad_token_id=tokenizer.eos_token_id,
)

# The 'en' shortcut name was removed in spaCy 3; load the English pipeline by
# its package name. A pipeline that sets sentence boundaries is required for `.sents`.
nlp = spacy.load("en_core_web_sm")

# NOTE(review): presumably a resume offset from an earlier partial run (the
# output file is opened in append mode) — confirm before a fresh run.
START_INDEX = 336
PROMPT_SENTENCES = 3    # number of leading sentences used as the prompt
MIN_NEW_TOKENS = 200    # lower bound on generated tokens beyond the prompt
MAX_NEW_TOKENS = 400    # upper bound on generated tokens beyond the prompt

# Hard limit on total sequence length (prompt + continuation) for this model.
max_positions = model.config.n_positions

skipped = 0

# NOTE(review): continuations are written back to back with no delimiter
# (unchanged from the original behaviour); add one if samples must be separable.
with open("dataset.txt", "a", encoding="utf-8") as f:
    for item in docs[START_INDEX:]:
        # generate prompt: the first PROMPT_SENTENCES sentences, each preceded by a space
        leading_sentences = islice(nlp(item).sents, PROMPT_SENTENCES)
        prompt = ''.join(' ' + sent.text.strip() for sent in leading_sentences)

        # encode input
        input_ids = tokenizer.encode(prompt, return_tensors='pt')

        # min_length / max_length are counted in tokens, so measure the prompt
        # in tokens (not characters).
        prompt_size = input_ids.shape[1]

        # Skip articles that yield no prompt, or whose prompt leaves no room for
        # the minimum continuation inside the model's context window. This
        # replaces catching the IndexError raised when generation overruns it.
        if prompt_size == 0 or prompt_size + MIN_NEW_TOKENS > max_positions:
            skipped += 1
            continue

        # Sampling runs in PyTorch, so seed torch; re-seeding per article makes
        # each continuation reproducible independently of the others.
        torch.manual_seed(3)

        # generate output
        sample_output = model.generate(
            input_ids,
            do_sample=True,
            min_length=prompt_size + MIN_NEW_TOKENS,
            max_length=min(prompt_size + MAX_NEW_TOKENS, max_positions),
            # top_p & top_k sampling
            top_p=0.95,
            top_k=50,
        )

        # decode and write output, excluding the prompt: drop the prompt's
        # tokens rather than slicing the decoded string by character count.
        generated_ids = sample_output[0][prompt_size:]
        f.write(tokenizer.decode(generated_ids, skip_special_tokens=True))

print(f"skipped {skipped} of {len(docs[START_INDEX:])} documents (empty or over-long prompt)")