In [None]:
!pip install -q transformers torch torchvision

[K     |████████████████████████████████| 3.8 MB 5.3 MB/s 
[K     |████████████████████████████████| 248 kB 43.1 MB/s 
[K     |████████████████████████████████| 67 kB 5.0 MB/s 
[K     |████████████████████████████████| 895 kB 43.4 MB/s 


In [None]:
import pandas as pd
from google.colab import drive

# Hugging Face checkpoint name shared by the tokenizer and model cells below.
model_checkpoint = "msintaha/gpt2-finetuned-rocstories"
batch_size = 2

# Make Google Drive available inside the Colab filesystem.
drive.mount('/content/drive')

# Read the stories table from the working directory through a binary file handle.
with open('100KStories.csv', 'rb') as stories_fh:
    csv_data = pd.read_csv(stories_fh)

In [None]:
import torch
from torch.utils.data import Dataset, DataLoader, random_split, RandomSampler, SequentialSampler
from transformers import GPT2LMHeadModel,  GPT2Tokenizer, GPT2Config
# Load the checkpoint's tokenizer and declare the start/end/padding markers
# that the dataset class and the generation prompts rely on.
special_tokens = dict(
    bos_token='<|startoftext|>',
    eos_token='<|endoftext|>',
    pad_token='<|pad|>',
)
tokenizer = GPT2Tokenizer.from_pretrained(model_checkpoint, **special_tokens)

class GPT2Dataset(Dataset):
  """Dataset of pre-tokenized texts for GPT-2.

  Every text is wrapped in '<|startoftext|>' / '<|endoftext|>' markers and
  tokenized once at construction time. Indexing returns the pair
  (input_ids tensor, attention_mask tensor) for one text.
  """

  def __init__(self, txt_list, tokenizer, gpt2_type="gpt2", max_length=768):
    """Tokenize every entry of `txt_list`.

    txt_list   -- iterable of strings to encode.
    tokenizer  -- callable returning a mapping with 'input_ids' and
                  'attention_mask'; it is called with truncation=True,
                  padding="max_length" and the given max_length.
    gpt2_type  -- accepted but not used by this class.
    max_length -- length passed to the tokenizer for truncation/padding.
    """
    self.tokenizer = tokenizer
    self.input_ids = []
    self.attn_masks = []

    for story in txt_list:
      wrapped = '<|startoftext|>' + story + '<|endoftext|>'
      encoded = tokenizer(
          wrapped,
          truncation=True,
          max_length=max_length,
          padding="max_length",
      )

      # Keep ids and masks in two parallel lists, one tensor per text.
      self.input_ids.append(torch.tensor(encoded['input_ids']))
      self.attn_masks.append(torch.tensor(encoded['attention_mask']))

  def __len__(self):
    """Number of encoded texts."""
    return len(self.input_ids)

  def __getitem__(self, idx):
    """Return (input_ids, attention_mask) for the text at position `idx`."""
    ids = self.input_ids[idx]
    mask = self.attn_masks[idx]
    return ids, mask

Downloading:   0%|          | 0.00/878k [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/446k [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/44.0 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/383 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/811 [00:00<?, ?B/s]

In [None]:
import numpy as np
import random

# Seed every RNG before the model is built, so that anything initialised
# randomly while loading/resizing the model is covered by the seed as well as
# the sampling done later. torch.cuda.manual_seed_all is a no-op without CUDA.
seed_val = 42

random.seed(seed_val)
np.random.seed(seed_val)
torch.manual_seed(seed_val)
torch.cuda.manual_seed_all(seed_val)

# Load the checkpoint's configuration; the only override is turning off
# hidden-state outputs.
configuration = GPT2Config.from_pretrained(model_checkpoint, output_hidden_states=False)

# Instantiate the language model from the same checkpoint.
model = GPT2LMHeadModel.from_pretrained(model_checkpoint, config=configuration)

# The tokenizer was created with extra special tokens (bos/eos/pad), so the
# embedding matrix is resized to the tokenizer's vocabulary size; otherwise the
# tokenizer ids and the model's embedding rows would not line up.
model.resize_token_embeddings(len(tokenizer))

# Run on the GPU when one is available, otherwise fall back to the CPU instead
# of crashing. `device` is reused by the generation cells to place the prompts.
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model = model.to(device)

Downloading:   0%|          | 0.00/907 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/487M [00:00<?, ?B/s]

## Create pkl files for 1-, 2-, and 3-sentence inputs in separate dicts (skip this step if you already have the pkl files)

In [None]:
import pickle

# Rows from position 501 onward are used to build the generation inputs.
# NOTE(review): presumably the first 501 rows are reserved for something else — confirm.
data_for_generation = csv_data[501:]

three_sentence_input, two_sentence_input, one_sentence_input = {}, {}, {}

sentence_columns = ['sentence1', 'sentence2', 'sentence3', 'sentence4', 'sentence5']

# (number of leading sentences used as the prompt, dict that receives the entry)
prompt_splits = (
    (3, three_sentence_input),
    (2, two_sentence_input),
    (1, one_sentence_input),
)

for _, story in data_for_generation.iterrows():
    sentences = [story[column] for column in sentence_columns]
    full_story = ' '.join(sentences)

    # Same story, three different prompt/continuation cut points.
    for prompt_len, target in prompt_splits:
        target[story['storyid']] = {
            'storytitle': story['storytitle'],
            'original': full_story,
            'input': ' '.join(sentences[:prompt_len]),
            'human': ' '.join(sentences[prompt_len:]),
        }

# (payload, destination file, confirmation message), written in this order.
pickle_targets = (
    (three_sentence_input, 'three_sentence_input.pkl', 'Saved three_sentence_input Pickle File!'),
    (two_sentence_input, 'two_sentence_input.pkl', 'Saved two_sentence_input Pickle File!'),
    (one_sentence_input, 'one_sentence_input.pkl', 'Saved one_sentence_input Pickle File!'),
)

for payload, filename, message in pickle_targets:
    with open(filename, 'wb') as pickle_file:
        pickle.dump(payload, pickle_file)
        print(message)

## Load from pkl

In [None]:
import pickle


def _load_pickle(path):
    """Return the object stored in the pickle file at `path`."""
    # pickle.load can execute arbitrary code: only use it on files you created yourself.
    with open(path, 'rb') as handle:
        return pickle.load(handle)


# Restore the three prompt dictionaries written by the cell above.
three_sentence_input = _load_pickle('three_sentence_input.pkl')
two_sentence_input = _load_pickle('two_sentence_input.pkl')
one_sentence_input = _load_pickle('one_sentence_input.pkl')

In [None]:
!pip install -q spacy
!python -m spacy download en_core_web_lg 

In [None]:
# Mount Google Drive at /content/drive. The data-loading cell near the top of
# this notebook issues the same mount call.
from google.colab import drive
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


## Generate machine text for 2-sentence inputs

In [None]:
import spacy
import numpy as np
import en_core_web_lg
nlp = en_core_web_lg.load()

# Inference only: switch the model to evaluation mode.
model.eval()

print(len(two_sentence_input))
# Work on the stories at list positions 10000..13499 of the 2-sentence inputs.
two_sentence_data = {k: two_sentence_input[k] for k in list(two_sentence_input)[10000:13500]}
print(len(two_sentence_data))

count = 0  # number of stories for which no sampled output passed the filter
# Iterate over a snapshot so entries can be deleted from two_sentence_data inside the loop.
for key, value in list(two_sentence_data.items()):
  prompt = "<|startoftext|>" + value['input']
  print(prompt)

  # Encode the prompt as a batch of one and move it to the model's device.
  generated = torch.tensor(tokenizer.encode(prompt)).unsqueeze(0)
  generated = generated.to(device)

  # Sample 10 candidate sequences for this prompt.
  sample_outputs = model.generate(
      generated,
      do_sample=True,
      top_k=50,
      max_length=500,
      top_p=0.95,
      num_return_sequences=10,
  )

  # Keep only candidates that split into exactly five pieces on '. '.
  output_sequences = []
  for sample_output in sample_outputs:
    decoded_output = tokenizer.decode(sample_output, skip_special_tokens=True)
    if len(decoded_output.split('. ')) == 5:
      print('OUTPUT', decoded_output)
      output_sequences.append(decoded_output)

  if output_sequences:
    # Parse the original story once per prompt; it is the same for every candidate.
    original = nlp(value['original'])
    output_score_map = {}
    for output in output_sequences:
      output_score_map[output] = original.similarity(nlp(output))

    # NOTE(review): this keeps the candidate with the LOWEST similarity to the
    # original (the original code sorted ascending and took the first item).
    # If the closest match was intended this should be max() — confirm.
    two_sentence_data[key]['machine'] = min(output_score_map, key=output_score_map.get)
    print('MACHINE ::', two_sentence_data[key]['machine'])
  else:
    # No usable candidate: drop the story from the result set.
    count += 1
    del two_sentence_data[key]

print('Did not find text for {} sentences'.format(count))

with open('two_sentence_input_with_machine.pkl', 'wb') as pickle_file:
    pickle.dump(two_sentence_data, pickle_file)
    print('Saved two_sentence_data Pickle File!')

# Offer the result file as a browser download from Colab.
from google.colab import files
files.download('two_sentence_input_with_machine.pkl')

## Generate machine text for 3-sentence inputs

In [None]:
import spacy
import numpy as np
import en_core_web_lg
nlp = en_core_web_lg.load()

# Inference only: switch the model to evaluation mode.
model.eval()

print(len(three_sentence_input))
# Work on the stories at list positions 13501..17000 of the 3-sentence inputs.
# NOTE(review): the 2-sentence cell's slice ends at 13500 (exclusive), so list
# position 13500 falls in neither slice — confirm this gap is intentional.
three_sentence_data = {k: three_sentence_input[k] for k in list(three_sentence_input)[13501:17001]}
print(len(three_sentence_data))

count = 0  # number of stories for which no sampled output passed the filter
# Iterate over a snapshot so entries can be deleted from three_sentence_data inside the loop.
for key, value in list(three_sentence_data.items()):
  prompt = "<|startoftext|>" + value['input']
  print(prompt)

  # Encode the prompt as a batch of one and move it to the model's device.
  generated = torch.tensor(tokenizer.encode(prompt)).unsqueeze(0)
  generated = generated.to(device)

  # Sample 10 candidate sequences for this prompt.
  sample_outputs = model.generate(
      generated,
      do_sample=True,
      top_k=50,
      max_length=500,
      top_p=0.95,
      num_return_sequences=10,
  )

  # Keep only candidates that split into exactly five pieces on '. '.
  output_sequences = []
  for sample_output in sample_outputs:
    decoded_output = tokenizer.decode(sample_output, skip_special_tokens=True)
    if len(decoded_output.split('. ')) == 5:
      print('OUTPUT', decoded_output)
      output_sequences.append(decoded_output)

  if output_sequences:
    # Parse the original story once per prompt; it is the same for every candidate.
    original = nlp(value['original'])
    output_score_map = {}
    for output in output_sequences:
      output_score_map[output] = original.similarity(nlp(output))

    # NOTE(review): this keeps the candidate with the LOWEST similarity to the
    # original (the original code sorted ascending and took the first item).
    # If the closest match was intended this should be max() — confirm.
    three_sentence_data[key]['machine'] = min(output_score_map, key=output_score_map.get)
    print('MACHINE ::', three_sentence_data[key]['machine'])
  else:
    # No usable candidate: drop the story from the result set.
    count += 1
    del three_sentence_data[key]

print('Did not find text for {} sentences'.format(count))

with open('three_sentence_input_with_machine.pkl', 'wb') as pickle_file:
    pickle.dump(three_sentence_data, pickle_file)
    print('Saved three_sentence_data Pickle File!')

# Offer the result file as a browser download from Colab.
from google.colab import files
files.download('three_sentence_input_with_machine.pkl')

## Generate machine text for 1-sentence inputs

In [None]:
import spacy
import numpy as np
import en_core_web_lg
nlp = en_core_web_lg.load()

# Inference only: switch the model to evaluation mode.
model.eval()

print(len(one_sentence_input))
# Work on the stories at list positions 17001..20500 of the 1-sentence inputs.
one_sentence_data = {k: one_sentence_input[k] for k in list(one_sentence_input)[17001:20501]}
print(len(one_sentence_data))

count = 0  # number of stories for which no sampled output passed the filter
# Iterate over a snapshot so entries can be deleted from one_sentence_data inside the loop.
for key, value in list(one_sentence_data.items()):
  prompt = "<|startoftext|>" + value['input']
  print(prompt)

  # Encode the prompt as a batch of one and move it to the model's device.
  generated = torch.tensor(tokenizer.encode(prompt)).unsqueeze(0)
  generated = generated.to(device)

  # Sample 10 candidate sequences for this prompt.
  sample_outputs = model.generate(
      generated,
      do_sample=True,
      top_k=50,
      max_length=500,
      top_p=0.95,
      num_return_sequences=10,
  )

  # Keep only candidates that split into exactly five pieces on '. '.
  output_sequences = []
  for sample_output in sample_outputs:
    decoded_output = tokenizer.decode(sample_output, skip_special_tokens=True)
    if len(decoded_output.split('. ')) == 5:
      print('OUTPUT', decoded_output)
      output_sequences.append(decoded_output)

  if output_sequences:
    # Parse the original story once per prompt; it is the same for every candidate.
    original = nlp(value['original'])
    output_score_map = {}
    for output in output_sequences:
      output_score_map[output] = original.similarity(nlp(output))

    # NOTE(review): this keeps the candidate with the LOWEST similarity to the
    # original (the original code sorted ascending and took the first item).
    # If the closest match was intended this should be max() — confirm.
    one_sentence_data[key]['machine'] = min(output_score_map, key=output_score_map.get)
    print('MACHINE ::', one_sentence_data[key]['machine'])
  else:
    # No usable candidate: drop the story from the result set.
    count += 1
    del one_sentence_data[key]

print('Did not find text for {} sentences'.format(count))

with open('one_sentence_input_with_machine.pkl', 'wb') as pickle_file:
    pickle.dump(one_sentence_data, pickle_file)
    print('Saved one_sentence_data Pickle File!')

# Offer the result file as a browser download from Colab.
from google.colab import files
files.download('one_sentence_input_with_machine.pkl')

In [None]:
import numpy as np
import pickle

print(len(one_sentence_input))

# Take the stories at list positions 30000..39999 and set 'human' to the full
# original story. Fresh inner dicts are built so that the entries of
# one_sentence_input are left untouched (they previously had their 'human'
# field overwritten in place, which changed what earlier cells would see on re-run).
human_generated = {
    k: {**one_sentence_input[k], 'human': one_sentence_input[k]['original']}
    for k in list(one_sentence_input)[30000:40000]
}

with open('human_generated.pkl', 'wb') as pickle_file:
    pickle.dump(human_generated, pickle_file)
    print('Saved human_generated Pickle File!')

# Offer the result file as a browser download from Colab.
from google.colab import files
files.download('human_generated.pkl')