## GPT-2

In [None]:
import tensorflow as tf
from transformers import GPT2LMHeadModel, GPT2Tokenizer

In [None]:
# Load the tokenizer and the LM-head model from the same pretrained checkpoint.
GPT2_CHECKPOINT = "gpt2"

tokenizer = GPT2Tokenizer.from_pretrained(GPT2_CHECKPOINT)

# The end-of-sequence token id doubles as the padding id for this model.
model = GPT2LMHeadModel.from_pretrained(
    GPT2_CHECKPOINT,
    pad_token_id=tokenizer.eos_token_id,
)

In [None]:
import torch  # local import: generate() below samples with PyTorch's RNG

# Seed text that the model is asked to continue.
prompt = 'The domestic dog (Canis familiaris or Canis lupus familiaris)[4] is a domesticated wolf. The dog descended from an ancient, extinct wolf,[5][6] with the modern grey wolf being the nearest living relative.[7] The dog was the first species to be domesticated,[8][7] by hunter–gatherers over 15,000 years ago,[6] before the development of agriculture.'

# The prompt is encoded as PyTorch tensors ('pt') for the PyTorch model class.
input_ids = tokenizer.encode(prompt, return_tensors='pt')

# Seed the PyTorch generator so the sampled text is reproducible.
# (tf.random.set_seed only seeds TensorFlow and has no effect on this model.)
torch.manual_seed(5)

# Sample with combined top-k / nucleus (top-p) filtering.
# NOTE(review): max_length=1500 exceeds the 1024-token maximum mentioned
# elsewhere in this notebook for GPT-2 -- confirm these bounds are intended.
sample_output = model.generate(
    input_ids,
    do_sample=True,
    min_length=1000,
    max_length=1500,
    top_p=0.95,
    top_k=40,
)

output = tokenizer.decode(sample_output[0], skip_special_tokens=True)

# Append the sample followed by an END marker line. The file is opened only
# once the text exists, and `with` guarantees the handle is closed.
with open("gpt2.txt", "a") as f:
    f.write(output)
    f.write('\nEND\n')


# 3 min 42 s for one sample
# 10,000 articles ≈ 616.67 hours

## distilGPT-2

In [None]:
import tensorflow as tf
from transformers import AutoTokenizer, AutoModelWithLMHead

In [None]:
# Load the tokenizer and the LM-head model for the "distilgpt2" checkpoint.
tokenizer = AutoTokenizer.from_pretrained("distilgpt2")

# Pass the end-of-sequence id as the padding id, matching how the GPT-2 model
# is loaded in the other sections of this notebook.
# NOTE(review): AutoModelWithLMHead is deprecated in recent transformers
# releases in favour of AutoModelForCausalLM; switching also requires changing
# the import cell -- confirm against the installed version.
model = AutoModelWithLMHead.from_pretrained(
    "distilgpt2",
    pad_token_id=tokenizer.eos_token_id,
)

In [None]:
import torch  # local import: generate() below samples with PyTorch's RNG

# Seed text that the model is asked to continue.
prompt = 'The domestic dog (Canis familiaris or Canis lupus familiaris)[4] is a domesticated wolf. The dog descended from an ancient, extinct wolf,[5][6] with the modern grey wolf being the nearest living relative.[7] The dog was the first species to be domesticated,[8][7] by hunter–gatherers over 15,000 years ago,[6] before the development of agriculture.'

# The prompt is encoded as PyTorch tensors ('pt').
input_ids = tokenizer.encode(prompt, return_tensors='pt')

# Seed the PyTorch generator so the sampled text is reproducible.
# (tf.random.set_seed only seeds TensorFlow and has no effect on this model.)
torch.manual_seed(5)

# Sample with combined top-k / nucleus (top-p) filtering.
sample_output = model.generate(
    input_ids,
    do_sample=True,
    min_length=200,
    max_length=500,
    top_p=0.95,
    top_k=40,
)

output = tokenizer.decode(sample_output[0], skip_special_tokens=True)

# Append the sample followed by an END marker line. The file is opened only
# once the text exists, and `with` guarantees the handle is closed.
with open("distil_gpt2.txt", "a") as f:
    f.write(output)
    f.write('\nEND\n')


# 2 min 56 s for one sample
# 10,000 articles ≈ 489 hours

## distilBERT

In [None]:
from transformers import AutoTokenizer, AutoModelForMaskedLM

In [None]:
# Load the tokenizer and the masked-LM model from the same checkpoint.
DISTILBERT_CHECKPOINT = "distilbert-base-uncased"

tokenizer = AutoTokenizer.from_pretrained(DISTILBERT_CHECKPOINT)

model = AutoModelForMaskedLM.from_pretrained(DISTILBERT_CHECKPOINT)

In [None]:
%%time

import torch  # local import: this section's import cell does not provide an RNG to seed

prompt = 'hi, this is a test.'

# The prompt is encoded as PyTorch tensors ('pt').
input_ids = tokenizer.encode(prompt, return_tensors='pt')

# Seed the PyTorch generator so the sampled text is reproducible.
# (The previous tf.random.set_seed call relied on `tf` leaking in from an
# earlier section and does not seed PyTorch sampling.)
torch.manual_seed(0)

# NOTE(review): the model loaded for this section is a masked-LM head
# (AutoModelForMaskedLM); that is presumably why the result recorded after
# this cell is "Unusable output" -- confirm.
sample_output = model.generate(
    input_ids,
    do_sample=True,
    min_length=200,
    max_length=300,
    top_k=40,
)

output = tokenizer.decode(sample_output[0], skip_special_tokens=True)

print(output)

# Unusable output
# 2min 40s for one sample

## Using BBC news & distilGPT-2

In [None]:
# pip3 install datasets

from datasets import load_dataset

# Load the 'xsum' dataset; the next cell reads its "train", "test" and
# "validation" splits.
dataset = load_dataset('xsum')

In [None]:
# Convert each split to a plain dict, in the order train, test, validation.
train_dict, test_dict, val_dict = (
    dataset[split_name].to_dict()
    for split_name in ("train", "test", "validation")
)

# Concatenate the "document" column of all three splits into one list.
docs = [
    document
    for split_dict in (train_dict, test_dict, val_dict)
    for document in split_dict["document"]
]

In [None]:
import spacy
import torch  # local import: generate() below samples with PyTorch's RNG
from itertools import islice

# NOTE(review): the 'en' shortcut name is only resolvable on spaCy 2.x
# installs -- confirm the installed version / model name.
nlp = spacy.load('en')

# NOTE(review): `tokenizer` and `model` are whatever an earlier cell left in
# the kernel. The section title says distilGPT-2, so make sure that loading
# cell (not the distilBERT one) was the last to run.

PROMPT_SENTENCES = 3  # number of leading sentences used as the prompt
skipped = 0           # documents dropped because generation raised IndexError

# `with` closes the output file even if an unexpected error aborts the loop.
with open("dataset.txt", "a") as f:
    # The slice start resumes an earlier, interrupted run (the file is opened
    # in append mode).
    for item in docs[336:]:
        try:
            # Build the prompt from the first PROMPT_SENTENCES sentences, each
            # preceded by a space. (The previous loop broke on `i > 3` after
            # appending, so it collected four sentences, not the three stated.)
            # `.text` is used instead of the spaCy-2-only `.string`.
            leading_sents = islice(nlp(item).sents, PROMPT_SENTENCES)
            prompt = ''.join(' ' + sent.text.strip() for sent in leading_sents)

            # encode input
            input_ids = tokenizer.encode(prompt, return_tensors='pt')

            # Length limits are passed to generate() alongside the token ids,
            # so the prompt must be measured in tokens, not characters.
            prompt_size = input_ids.shape[-1]

            # Same seed for every document, as before, but applied to the
            # PyTorch RNG that the sampler actually uses.
            torch.manual_seed(3)

            # generate output with top_p & top_k sampling
            sample_output = model.generate(
                input_ids,
                do_sample=True,
                min_length=prompt_size + 200,
                max_length=prompt_size + 400,
                top_p=0.95,
                top_k=50,
            )

            # Decode only the newly generated tokens. Slicing by token
            # position excludes the prompt without assuming the decoded text
            # reproduces the prompt character-for-character.
            generated_ids = sample_output[0][prompt_size:]
            f.write(tokenizer.decode(generated_ids, skip_special_tokens=True))

        # handle max token error (max=1024): skip the document but keep count
        except IndexError:
            skipped += 1

print(f"skipped {skipped} documents (IndexError during generation)")

# Using Wiki dump & distilGPT-2

### Dependencies

In [1]:
import tensorflow as tf
from transformers import GPT2LMHeadModel, GPT2Tokenizer
import nltk

from tqdm import tqdm
import re

from gensim.corpora.wikicorpus import extract_pages, filter_wiki, process_article



### Preprocess Wikipedia dump

In [2]:
def text_from_raw(raw):
    """Turn raw wiki markup into cleaned plain text.

    The markup is passed through ``filter_wiki`` and stripped of surrounding
    whitespace; afterwards every run of newlines is collapsed to a single
    newline and every run of apostrophes to a single apostrophe.
    """
    plain = filter_wiki(raw).strip()
    # Applied in this order: newline runs first, then apostrophe runs.
    for pattern, replacement in (("\n+", "\n"), ("'+", "'")):
        plain = re.sub(pattern, replacement, plain)
    return plain

def dump_to_list(filename, max_articles=1e10):
    """Yield cleaned articles from a Wikipedia XML dump file.

    Despite the name this is a generator; wrap the call in ``list(...)`` to
    materialize the results.

    Args:
        filename: Path of the dump file, opened in text mode.
        max_articles: Upper bound on the number of pages to yield. Any value
            accepted by ``int()`` works, including the float default.

    Yields:
        ``(title, text, page_id)`` tuples, where ``text`` is the page markup
        cleaned by ``text_from_raw`` and ``page_id`` is converted to ``int``.
    """
    from itertools import islice

    # The default is a float (1e10); the old `range(max_articles)` raised
    # TypeError for it, so the default was unusable. Convert once up front.
    limit = int(max_articles)

    with open(filename) as handle:
        # islice stops after `limit` pages without pulling (and discarding)
        # one extra page from the parser, which zip(pg_iter, range(n)) did.
        pages = islice(extract_pages(handle), limit)
        for title, raw_text, page_id in tqdm(pages):
            yield title, text_from_raw(raw_text), int(page_id)

In [3]:
# Materialize up to 10000 (title, cleaned text, page id) tuples from the dump
# shard into a list so later cells can iterate over them repeatedly.
wiki_file = list(dump_to_list("./enwiki-latest-pages-articles16.xml-p20460153p20570392", max_articles=10000))

10000it [00:11, 837.08it/s]


In [18]:
# Keep only the article texts (second field of each tuple) whose
# whitespace-separated word count lies strictly between 300 and 700.
contents = [
    article_text
    for article_text in (entry[1] for entry in wiki_file)
    if 300 < len(article_text.split()) < 700
]


len(contents)


# Do not mind this

# j = 0

# for item in contents:
#     prompt = ''
#     for i in range(3):
#         prompt += ' ' + nltk.tokenize.sent_tokenize(item)[i]
#     print(prompt)
#     j += 1 
#     if j == 3:
#         break
    

# temp = nltk.tokenize.sent_tokenize(contents[1])

# new = ''
# for i in range(3):
#     new += ' '+temp[i]
    
# print(new)

# wiki_gen.txt now contains 651 samples

1005

In [25]:
# f = open('wiki_human.txt', 'w')
# for item in contents[:653]:
#     f.write(item)

### Load tokenizer & model

In [5]:
# Load the tokenizer and the LM-head model from the same pretrained checkpoint.
GPT2_CHECKPOINT = "gpt2"

tokenizer = GPT2Tokenizer.from_pretrained(GPT2_CHECKPOINT)

# The end-of-sequence token id doubles as the padding id for this model.
model = GPT2LMHeadModel.from_pretrained(
    GPT2_CHECKPOINT,
    pad_token_id=tokenizer.eos_token_id,
)

### Generation

In [19]:
import torch  # local import: generate() below samples with PyTorch's RNG

counter = 0   # samples written during this run
skipped = 0   # articles dropped (too few sentences, or IndexError in generate)

# `with` closes the output file even if an unexpected error aborts the loop.
with open("wiki_gen.txt", "a") as f:
    # The slice start resumes an earlier, interrupted run (the file is opened
    # in append mode).
    for item in contents[280:]:
        # Sentence-split once per article instead of once per prompt sentence.
        sentences = nltk.tokenize.sent_tokenize(item)

        # Articles with fewer than three sentences used to be dropped via the
        # IndexError handler; skip them explicitly instead.
        if len(sentences) < 3:
            skipped += 1
            continue

        # generate prompt: first three sentences, each preceded by a space
        prompt = ''.join(' ' + sentence for sentence in sentences[:3])

        # encode input
        input_ids = tokenizer.encode(prompt, return_tensors='pt')

        # Length limits are passed to generate() alongside the token ids, so
        # the prompt must be measured in tokens, not characters.
        prompt_size = input_ids.shape[-1]

        # Same seed for every article, as before, but applied to the PyTorch
        # RNG that the sampler actually uses.
        torch.manual_seed(10)

        # generate output
        try:
            sample_output = model.generate(
                input_ids,
                do_sample=True,
                min_length=prompt_size + 200,
                max_length=prompt_size + 400,
                # top_p sampling (top_k=50 is intentionally disabled)
                top_p=0.94,
            )
        # handle max token error (max=1024)
        except IndexError:
            skipped += 1
            continue

        # Decode only the newly generated tokens so the prompt is excluded
        # exactly; drop the whitespace that joins prompt and continuation.
        generated_ids = sample_output[0][prompt_size:]
        f.write(tokenizer.decode(generated_ids, skip_special_tokens=True).lstrip())
        f.write('\n\n\n\n')

        counter += 1

        # Stop after 350 samples in this run.
        if counter == 350:
            break

print(counter)
print(f"skipped {skipped} articles")

Token indices sequence length is longer than the specified maximum sequence length for this model (1266 > 1024). Running this sequence through the model will result in indexing errors


350


In [18]:
# Write every selected human-written article to wiki_human.txt.
# Mode 'w' replaces any existing file. The previous loop body was the bare
# attribute `f.write` (never called), which left the file empty and open.
with open('wiki_human.txt', 'w') as f:
    for article in contents:
        f.write(article)

["The 'Assistant Secretary of State for Consular Affairs' is the head of the Bureau of Consular Affairs within the United States Department of State.  The Assistant Secretary of State for Consular Affairs reports to the Under Secretary of State for Management.  From 1953 to 1977, the position was called 'Administrator of the Bureau of Security and Consular Affairs'.\n==List of the Assistant Secretaries of State for Security and Consular Affairs, 1953—77==\nName\nAssumed office\nLeft office\nPresident(s) served under\nR. W. Scott McLeod\nMarch 3, 1953\nMarch 9, 1957\n Dwight D. Eisenhower\nRoderic L. O'Connor\nMay 28, 1957\nDecember 29, 1958\nJohn W. Hanes III\nJanuary 1, 1959\nOctober 4, 1962\nDwight D. Eisenhower and John F. Kennedy\nAbba P. Schwartz\nOctober 5, 1962\nMarch 6, 1966\nJohn F. Kennedy and Lyndon B. Johnson\nBarbara M. Watson\nAugust 12, 1968\nDecember 31, 1974\nLyndon B. Johnson and Richard Nixon\nLeonard F. Walentynowicz\nJanuary 2, 1975\nMarch 7, 1977\nGerald Ford\n==L