In [2]:
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [3]:
!pip install -q transformers

[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m5.8/5.8 MB[0m [31m55.1 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m7.6/7.6 MB[0m [31m103.1 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m182.4/182.4 KB[0m [31m13.3 MB/s[0m eta [36m0:00:00[0m
[?25h

In [4]:
import pandas as pd
import numpy as np
import random
import torch
from torch.utils.data import Dataset, DataLoader
from transformers import OpenAIGPTTokenizer, OpenAIGPTLMHeadModel, AdamW, get_linear_schedule_with_warmup
from tqdm import tqdm, trange
import torch.nn.functional as F
import csv, os

import nltk
nltk.download('punkt')

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.


True

## Data Preparation

In [95]:
# Load the raw examples. As the preview below shows, each row carries an MWE
# together with its Previous / Target / Next sentences, plus two leftover
# "Unnamed" index columns from earlier CSV exports.
df_main = pd.read_csv("../Data/Actual Data/data_en_non_idio.csv")
df_main.head()

Unnamed: 0.1,Unnamed: 0,MWE,Previous,Target,Next
0,5,double dutch,"We came out feeling entirely stuffed, and welc...",Another nice thing about Double Dutch is that ...,On our visit the staff were very helpful and a...
1,6,double dutch,Dutch children would usually chant a singing r...,Since settlers from other areas of the world c...,The Double Dutch game remained a favorite side...
2,7,double dutch,In lamb with a single to Bellefield Double Dut...,"At 6,400gns, Auldhouseburn sold another by the...",The remainder of the Auldhouseburn pen of gimm...
3,8,double dutch,In the second of Nigel Bird's darkly humorous ...,Double Dutch hasn't committed murder in almost...,"When his victim is found, DI Wilson and his po..."
4,21,night owl,"In this way, I compete against myself, and am ...",Samaha’s most interesting occurrence for the y...,"A trailing runner saw the number, picked it up..."


In [96]:
# Derive the two text views used in the rest of the notebook:
#   Context = Previous + ' ' + Target
#   All     = Previous + ' ' + Target + ' ' + Next
# `All` is built on top of `Context`, which yields the same strings.
df_main['Context'] = df_main['Previous'] + ' ' + df_main['Target']
df_main['All'] = df_main['Context'] + ' ' + df_main['Next']
df_main.head()

Unnamed: 0.1,Unnamed: 0,MWE,Previous,Target,Next,Context,All
0,5,double dutch,"We came out feeling entirely stuffed, and welc...",Another nice thing about Double Dutch is that ...,On our visit the staff were very helpful and a...,"We came out feeling entirely stuffed, and welc...","We came out feeling entirely stuffed, and welc..."
1,6,double dutch,Dutch children would usually chant a singing r...,Since settlers from other areas of the world c...,The Double Dutch game remained a favorite side...,Dutch children would usually chant a singing r...,Dutch children would usually chant a singing r...
2,7,double dutch,In lamb with a single to Bellefield Double Dut...,"At 6,400gns, Auldhouseburn sold another by the...",The remainder of the Auldhouseburn pen of gimm...,In lamb with a single to Bellefield Double Dut...,In lamb with a single to Bellefield Double Dut...
3,8,double dutch,In the second of Nigel Bird's darkly humorous ...,Double Dutch hasn't committed murder in almost...,"When his victim is found, DI Wilson and his po...",In the second of Nigel Bird's darkly humorous ...,In the second of Nigel Bird's darkly humorous ...
4,21,night owl,"In this way, I compete against myself, and am ...",Samaha’s most interesting occurrence for the y...,"A trailing runner saw the number, picked it up...","In this way, I compete against myself, and am ...","In this way, I compete against myself, and am ..."


In [99]:
# Locate the rows whose concatenated passage ("All") is NaN.
# Nothing is dropped here: the index of the affected rows is only displayed.
nan_rows = df_main.index[df_main["All"].isna().to_numpy()]
nan_rows

Int64Index([], dtype='int64')

In [100]:
# some stats

from statistics import mean

def data_stats(df):
    """Print the longest entry, the shortest entry and the mean entry length.

    Parameters
    ----------
    df : object exposing ``.apply`` and positional ``.iloc`` access
        The notebook passes a pandas Series of strings.

    Returns
    -------
    None
        Everything is reported through ``print``.
    """
    lengths = list(df.apply(len))
    longest = max(lengths)
    shortest = min(lengths)

    # list.index gives the first position holding the value, so ties resolve
    # to the earliest entry.
    print("Max lentgh:", longest)
    print(df.iloc[lengths.index(longest)])
    print("\nSmallest length:", shortest)
    print(df.iloc[lengths.index(shortest)])
    print("\nAvg length:", mean(lengths))

In [101]:
data_stats(df_main['Context'])

Max lentgh: 988
The Board also considered more recent events taking place in the Highland Capital Management, L.P. (also referred to as the “Debtor” or “HCMLP”) bankruptcy case, including a preliminary injunction issued by the U.S. Bankruptcy Court for the Northern District of Texas (the “Court”) on January 11, 2021, enjoining and restraining James Dondero from taking a variety of actions, including making express or implied threats against the Debtor and its directors, officers, employees, professionals, or agents, in whatever capacity they are acting, and from physically or virtually entering the Debtor’s offices, computer, email, or information systems, including office space that the Debtor shares with employees of NexPoint Advisors, NexPoint’s investment advisor. The Court previously issued a temporary restraining order enjoining James Dondero from threatening the Debtor and its representatives after the Debtor accused him of the same and of interfering with the bankruptcy proceed

In [102]:
data_stats(df_main['All'])

Max lentgh: 1100
They then applied advanced methods from economics to quantify the relationship between historical precipitation variations and historical flooding costs, along with methods from statistics and climate science to evaluate the impact of changes in precipitation on total flooding costs. Together, these analyses revealed that climate change has contributed substantially to the growing cost of flooding in the U.S., and that exceeding the levels of global warming agreed upon in the United Nations Paris Agreement is very likely to lead to greater intensification of the kinds of extreme precipitation events that have been most costly and devastating in recent decades. Previous studies have analyzed pieces of this puzzle, but this is the first study to combine rigorous economic analysis of the historical relationships between climate and flooding costs with really careful extreme event analyses in both historical observations and global climate models, across the whole United S

In [103]:
# split into train and test
from sklearn.model_selection import train_test_split

# Hold out 20% of the rows as the test split; the fixed random_state makes the
# split reproducible across reruns.
df_train, df_test = train_test_split(df_main, test_size=0.2, random_state=42)

In [104]:
class DataPrep(Dataset):
    """Tokenised text passages for language-model fine-tuning.

    Every passage is cut to ``max_length`` *characters*, wrapped as
    ``<prefix>passage<|endoftext|>`` and encoded with the OpenAI-GPT tokenizer.

    Parameters
    ----------
    control_code : str, None, or iterable of str
        * ``str`` / ``None``: used as the ``<|control_code|>`` prefix placed in
          front of every passage.
        * any other iterable (e.g. a pandas Series such as ``df_train['All']``):
          treated as the passages themselves, with no prefix. Previously such a
          value was stringified into the prefix of every sample while the
          passages were silently read from the global ``df_main['All']``, which
          also leaked the held-out test rows into the training data.
    truncate : bool
        Keep at most ``MAX_SAMPLES`` passages.
    gpt2_type : str
        Name of the pretrained tokenizer to load.
    max_length : int
        Maximum number of characters kept from each passage.
    texts : iterable of str, optional
        Passages to encode. When omitted and ``control_code`` is a string or
        ``None``, the legacy behaviour is kept and the global
        ``df_main['All']`` is used.
    """

    # Upper bound on the number of samples kept when ``truncate=True``.
    MAX_SAMPLES = 20000

    def __init__(self, control_code, truncate=False, gpt2_type="openai-gpt",
                 max_length=1024, texts=None):

        self.tokenizer = OpenAIGPTTokenizer.from_pretrained(gpt2_type)
        self.lyrics = []

        code_is_label = control_code is None or isinstance(control_code, str)
        if texts is None and not code_is_label:
            # The caller handed the passages in through the first positional
            # argument: use them and do not fabricate a prefix from their repr.
            texts, prefix = control_code, ""
        else:
            prefix = f"<|{control_code}|>"
            if texts is None:
                # Legacy fallback: whole corpus from the notebook global.
                texts = df_main['All']

        limit = self.MAX_SAMPLES if truncate else None
        for row in texts:
            # Stop before encoding passages that would be thrown away anyway.
            if limit is not None and len(self.lyrics) >= limit:
                break
            self.lyrics.append(torch.tensor(
                self.tokenizer.encode(f"{prefix}{row[:max_length]}<|endoftext|>")
            ))

        self.lyrics_count = len(self.lyrics)

    def __len__(self):
        """Number of encoded passages."""
        return self.lyrics_count

    def __getitem__(self, item):
        """Return the 1-D tensor of token ids for passage ``item``."""
        return self.lyrics[item]

In [105]:
train_dataset = DataPrep(df_train['All'], truncate=True, gpt2_type="openai-gpt")

ftfy or spacy is not installed using BERT BasicTokenizer instead of SpaCy & ftfy.


In [106]:
tokenizer = OpenAIGPTTokenizer.from_pretrained('openai-gpt')
model = OpenAIGPTLMHeadModel.from_pretrained('openai-gpt')

ftfy or spacy is not installed using BERT BasicTokenizer instead of SpaCy & ftfy.
Some weights of OpenAIGPTLMHeadModel were not initialized from the model checkpoint at openai-gpt and are newly initialized: ['position_ids']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


## Fine tuning

In [17]:
# Sequence packing: merge consecutive samples into one longer input tensor.
def pack_tensor(new_tensor, packed_tensor, max_seq_len):
    """Try to add ``new_tensor`` to the running ``packed_tensor``.

    Both tensors are indexed as 2-D (``size()[1]`` is the sequence length).

    Returns a triple ``(tensor, packed_ok, leftover)``:

    * no running pack yet     -> ``(new_tensor, True, None)``
    * combined length fits    -> ``(merged, True, None)`` where ``merged`` is
      ``new_tensor`` followed by ``packed_tensor`` without its first column
    * combined length too big -> ``(packed_tensor, False, new_tensor)``; the
      caller is expected to consume the pack and carry ``new_tensor`` over.
    """
    # Nothing accumulated so far: the new sample starts the pack.
    if packed_tensor is None:
        return new_tensor, True, None

    combined_len = new_tensor.size(1) + packed_tensor.size(1)
    if combined_len <= max_seq_len:
        merged = torch.cat((new_tensor, packed_tensor[:, 1:]), dim=1)
        return merged, True, None

    # Over budget: hand back the untouched pack plus the sample that did not fit.
    return packed_tensor, False, new_tensor

In [24]:
def train(
    dataset, model, tokenizer,
    batch_size=16, epochs=20, lr=2e-5,
    max_seq_len=400, warmup_steps=200,
    gpt2_type="openai-gpt", output_dir=".", output_prefix="wreckgar",
    test_mode=False, save_model_on_epoch=False,
):
    """Fine-tune ``model`` on ``dataset`` with sequence packing and gradient accumulation.

    Samples are drawn one at a time, packed with ``pack_tensor`` into inputs of
    at most ``max_seq_len`` tokens, and the optimizer is stepped once every
    ``batch_size`` forward/backward passes.

    Parameters
    ----------
    dataset : torch Dataset yielding 1-D token-id tensors.
    model : causal LM accepting ``model(input_ids, labels=input_ids)`` and
        returning the loss as its first output.
    tokenizer, gpt2_type, test_mode : accepted for interface compatibility;
        not used by this function.
    batch_size : number of packed passes accumulated per optimizer step.
    epochs : number of passes over ``dataset``.
    lr : peak learning rate for AdamW.
    max_seq_len : token budget of one packed input.
    warmup_steps : optimizer steps of linear warm-up.
    output_dir, output_prefix, save_model_on_epoch : when
        ``save_model_on_epoch`` is true the state dict is written to
        ``<output_dir>/<output_prefix>-<epoch>.pt`` after every epoch.

    Returns
    -------
    The trained model (moved to the training device).

    Fixes relative to the previous version
    --------------------------------------
    * The schedule was created with ``num_training_steps=-1``, which makes the
      linear schedule return a learning rate of 0 for every step after warm-up.
    * The sample that did not fit into a pack (``remainder``) was discarded.
    * ``max_seq_len`` was ignored in favour of a hard-coded 512.
    * The accumulation check fired on the very first pass.
    * CUDA is used when available instead of being required.
    """
    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
    model = model.to(device)
    model.train()

    train_dataloader = DataLoader(dataset, batch_size=1, shuffle=True)
    n_samples = len(train_dataloader)
    last_idx = n_samples - 1

    # Packing merges several samples into one pass, so the exact number of
    # optimizer steps is not known up front. Use the upper bound (one pass per
    # sample); the learning rate then decays linearly without ever reaching 0
    # before training ends.
    total_steps = max(1, (n_samples * epochs + batch_size - 1) // batch_size)

    optimizer = AdamW(model.parameters(), lr=lr)
    scheduler = get_linear_schedule_with_warmup(
        optimizer, num_warmup_steps=warmup_steps, num_training_steps=total_steps
    )

    loss = 0
    pending_passes = 0  # backward passes accumulated since the last optimizer step

    def _optimizer_step():
        """Apply the accumulated gradients and advance the schedule."""
        nonlocal pending_passes
        optimizer.step()
        scheduler.step()
        optimizer.zero_grad()
        pending_passes = 0

    def _backward_pass(packed):
        """Run one forward/backward pass on a packed input."""
        nonlocal loss, pending_passes
        packed = packed.to(device)
        outputs = model(packed, labels=packed)
        loss = outputs[0]
        loss.backward()
        pending_passes += 1
        if pending_passes == batch_size:
            _optimizer_step()

    for epoch in range(epochs):

        print(f"Training epoch {epoch}")
        print(loss)

        input_tensor = None
        for idx, entry in tqdm(enumerate(train_dataloader), total=n_samples):
            (input_tensor, carry_on, remainder) = pack_tensor(entry, input_tensor, max_seq_len)

            # Keep packing until the budget is exhausted or the data runs out.
            if carry_on and idx != last_idx:
                continue

            _backward_pass(input_tensor)
            # Start the next pack with the sample that did not fit (None if
            # everything was consumed).
            input_tensor = remainder

        # The last sample of the epoch may have overflowed the final pack.
        if input_tensor is not None:
            _backward_pass(input_tensor)

        if save_model_on_epoch:
            torch.save(
                model.state_dict(),
                os.path.join(output_dir, f"{output_prefix}-{epoch}.pt"),
            )

    # Do not lose gradients accumulated after the last full batch.
    if pending_passes:
        _optimizer_step()

    return model

In [None]:
#Train the model on the specific data we have
model = train(train_dataset, model, tokenizer)

In [None]:
#Save the model to a pkl or something so it can be reused later on
torch.save(model, 'your_path')

## Text Generation

In [107]:
# load the model 
model = torch.load('your_path')

In [None]:
from nltk.tokenize import sent_tokenize

temperature_generator = 1.0
max_new_tokens = 100   # sampled tokens allowed on top of the prompt
max_attempts = 5       # bounded retries when a sample decodes to nothing

# `generate(max_length=...)` counts *tokens* (prompt included), and the model
# cannot attend beyond its position-embedding table.
max_positions = getattr(model.config, "n_positions", 512)
generation_device = next(model.parameters()).device

next_generations = []
for row_idx, context in enumerate(tqdm(df_test['Context'])):
    inputs = tokenizer.encode(context, return_tensors='pt')
    # Keep the most recent prompt tokens so prompt + continuation always fits.
    inputs = inputs[:, -(max_positions - max_new_tokens):].to(generation_device)
    prompt_len = inputs.shape[1]

    text = ''
    for _ in range(max_attempts):
        outputs = model.generate(
            inputs,
            max_length=prompt_len + max_new_tokens,
            do_sample=True,
            temperature=temperature_generator,
        )
        # Decode only the newly sampled tokens. Stripping the prompt with
        # str.replace is unreliable because the tokenizer lower-cases and
        # re-spaces the text, so the decoded prompt rarely equals `context`.
        text = tokenizer.decode(outputs[0][prompt_len:], skip_special_tokens=True).strip()
        if text:
            break

    # get the first sentence only
    actual_next = sent_tokenize(text)

    if actual_next:
        next_generations.append(actual_next[0])
    else:
        # Always append so that `next_generations` stays aligned with df_test.
        print(f"No sentence generated for row {row_idx}: {context!r}")
        next_generations.append('')


In [None]:
df_test['Context'].iloc[5]

'I led a bad lifestyle. The real damage happened when I used to work the graveyard shift.'

In [None]:
len(next_generations)

In [None]:
df_test['Generated Next'] = next_generations
df_test

In [109]:
def generate(
    model,
    tokenizer,
    prompt,
    entry_count=10,
    entry_length=30,  # maximum number of sampled tokens per entry
    top_p=1.0,
    temperature=1.0,
):
    """Sample ``entry_count`` continuations of ``prompt`` with nucleus (top-p) sampling.

    Parameters
    ----------
    model : callable ``model(input_ids)`` whose first output is the logits
        tensor indexed as ``[batch, position, vocab]``; must provide ``eval()``.
        If it exposes a ``device`` attribute the inputs are moved there.
    tokenizer : object with ``encode(str) -> list[int]`` and ``decode(list[int]) -> str``.
    prompt : str
        Text every entry starts from.
    entry_count : int
        Number of independent samples to draw.
    entry_length : int
        Maximum number of tokens sampled per entry.
    top_p : float
        Cumulative-probability cut-off; the most likely token is always kept.
    temperature : float
        Logit divisor; values <= 0 are treated as 1.0.

    Returns
    -------
    list[str]
        One decoded string (prompt included) per entry. Entries that hit
        ``entry_length`` without sampling a stop id get ``<|endoftext|>``
        appended.

    Changes relative to the previous version: the model is no longer asked to
    compute an unused loss (``labels=``), the prompt and stop marker are encoded
    once instead of on every step, the dead ``generated_num`` counter is gone,
    and token ids are moved with ``.tolist()`` so the function also works when
    the model lives on a GPU.
    """
    model.eval()

    generated_list = []
    filter_value = -float("Inf")

    prompt_ids = tokenizer.encode(prompt)
    # NOTE(review): if the tokenizer has no dedicated `<|endoftext|>` token the
    # marker is split into several ids and sampling *any* of them ends the
    # entry (same as before) -- confirm this is intended.
    stop_ids = set(tokenizer.encode("<|endoftext|>"))
    device = getattr(model, "device", None)

    with torch.no_grad():

        for entry_idx in trange(entry_count):

            entry_finished = False

            generated = torch.tensor(prompt_ids).unsqueeze(0)
            if device is not None:
                generated = generated.to(device)

            for i in range(entry_length):
                logits = model(generated)[0]
                # Only the distribution over the next token matters.
                logits = logits[:, -1, :] / (temperature if temperature > 0 else 1.0)

                sorted_logits, sorted_indices = torch.sort(logits, descending=True)
                cumulative_probs = torch.cumsum(F.softmax(sorted_logits, dim=-1), dim=-1)

                # Mask everything past the nucleus; the shift by one keeps the
                # first token that crosses the threshold, and position 0 is
                # never removed.
                sorted_indices_to_remove = cumulative_probs > top_p
                sorted_indices_to_remove[..., 1:] = sorted_indices_to_remove[
                    ..., :-1
                ].clone()
                sorted_indices_to_remove[..., 0] = 0

                indices_to_remove = sorted_indices[sorted_indices_to_remove]
                logits[:, indices_to_remove] = filter_value

                next_token = torch.multinomial(F.softmax(logits, dim=-1), num_samples=1)
                generated = torch.cat((generated, next_token), dim=1)

                if next_token.item() in stop_ids:
                    entry_finished = True
                    break

            # squeeze(0) keeps a 1-D list even for a single-token sequence.
            output_text = tokenizer.decode(generated.squeeze(0).tolist())
            if not entry_finished:
                output_text = f"{output_text}<|endoftext|>"
            generated_list.append(output_text)

    return generated_list

In [110]:
# Generate one continuation per row of a test dataframe.
def text_generation(test_data):
    """Run ``generate`` once for every ``Context`` entry of ``test_data``.

    Uses the notebook-level ``model`` and ``tokenizer``.

    Parameters
    ----------
    test_data : pandas.DataFrame
        Must contain a ``Context`` column of prompt strings.

    Returns
    -------
    list[list[str]]
        One single-element list per row, in row order (the shape callers
        already index with ``[0]``).

    Changes relative to the previous version: rows are read positionally with
    ``.iloc`` so the frame no longer needs a freshly reset 0..n-1 index; the
    model is moved to the CPU once instead of once per row; and the
    ``while x == ''`` retry was removed because ``generate`` returns a list,
    which never equals ``''``, so the loop always ran exactly once.
    """
    contexts = test_data['Context']
    cpu_model = model.to('cpu')
    return [
        generate(cpu_model, tokenizer, contexts.iloc[i], entry_count=1)
        for i in range(len(test_data))
    ]

In [111]:
df_test = df_test.reset_index()
df_test.head()

Unnamed: 0.1,index,Unnamed: 0,MWE,Previous,Target,Next,Context,All
0,1610,76,entrance hall,"According to Preservation NJ, it's the only re...","In classic Colonial Revival style, the Laurist...",Lauriston is currently at the center of an aff...,"According to Preservation NJ, it's the only re...","According to Preservation NJ, it's the only re..."
1,1402,3015,video game,Grab your bullwhip and fedora because soon you...,"In a surprise announcement on Tuesday morning,...",A new Indiana Jones game with an original stor...,Grab your bullwhip and fedora because soon you...,Grab your bullwhip and fedora because soon you...
2,1718,199,chain reaction,"It is completely free to download and play, an...",The objective of Chain Reaction is to eliminat...,You can enjoy the game on both smartphones and...,"It is completely free to download and play, an...","It is completely free to download and play, an..."
3,1054,2043,end user,"Metro-level edge clouds, both network operator...",Far-edge clouds will be located within a few 1...,"Despite the fact this is a really important, f...","Metro-level edge clouds, both network operator...","Metro-level edge clouds, both network operator..."
4,307,704,grandfather clock,Thank you for subscribingWe have more newslett...,"A nod to controversy from their past, along wi...",The distinctive looking character was in actio...,Thank you for subscribingWe have more newslett...,Thank you for subscribingWe have more newslett...


In [112]:
generated_next = text_generation(df_test)

100%|██████████| 1/1 [00:12<00:00, 12.21s/it]
100%|██████████| 1/1 [00:15<00:00, 15.58s/it]
100%|██████████| 1/1 [00:00<00:00,  2.78it/s]
100%|██████████| 1/1 [00:19<00:00, 19.66s/it]
100%|██████████| 1/1 [00:00<00:00,  1.23it/s]
100%|██████████| 1/1 [00:13<00:00, 13.67s/it]
100%|██████████| 1/1 [00:17<00:00, 17.96s/it]
100%|██████████| 1/1 [00:11<00:00, 11.71s/it]
100%|██████████| 1/1 [00:11<00:00, 11.79s/it]
100%|██████████| 1/1 [00:13<00:00, 13.91s/it]
100%|██████████| 1/1 [00:18<00:00, 19.00s/it]
100%|██████████| 1/1 [00:12<00:00, 12.15s/it]
100%|██████████| 1/1 [00:19<00:00, 19.35s/it]
100%|██████████| 1/1 [00:18<00:00, 18.98s/it]
100%|██████████| 1/1 [00:00<00:00,  2.43it/s]
100%|██████████| 1/1 [00:00<00:00,  2.74it/s]
100%|██████████| 1/1 [00:12<00:00, 12.17s/it]
100%|██████████| 1/1 [00:10<00:00, 10.28s/it]
100%|██████████| 1/1 [00:01<00:00,  1.51s/it]
100%|██████████| 1/1 [00:12<00:00, 12.92s/it]
100%|██████████| 1/1 [00:00<00:00,  4.02it/s]
100%|██████████| 1/1 [00:12<00:00,

In [113]:
len(generated_next)

389

In [114]:
# Reduce every raw generation to its first newly generated sentence and store
# the result as a new dataframe column.
from nltk.tokenize import sent_tokenize

my_generations = []
for i, generation in enumerate(generated_next):
    # The decoded output is lower-cased, so strip the lower-cased prompt.
    prompt = df_test['Context'].iloc[i].lower()
    continuation = generation[0].replace(prompt, '')

    # Remove the end-of-text marker before sentence splitting.
    continuation = continuation.replace(r'<|endoftext|>', '')

    sentences = sent_tokenize(continuation)
    my_generations.append(sentences[0] if sentences else "")

df_test['Generated Next'] = my_generations

In [91]:
generated_next[0]

['and you are the marvellous exception. lovely as a goddess, clever as an athenian and a bad hat like myself, yet one who still has decent feelings. " \n " i do not know the meaning of the word, " said ransom. " and i was not born to be a god. "<|endoftext|>']

In [92]:
df_test['Context'].iloc[0]

'And you are the marvellous exception. Lovely as a goddess, clever as an Athenian and a bad hat like myself, yet one who still has decent feelings.'

In [93]:
df_test['Generated Next'].iloc[0]

' " \n " i do not know the meaning of the word, " said ransom. "'