In [1]:
import pandas as pd
import matplotlib.pyplot as plt
from torch.utils.data import Dataset
from transformers import AutoTokenizer
import torch
from sklearn.model_selection import train_test_split
from transformers import GPT2Config, GPT2Tokenizer, GPT2Model
import torch.nn as nn
import torch.nn.functional as F
from tqdm import tqdm

  from .autonotebook import tqdm as notebook_tqdm


In [11]:
class Generator(nn.Module):
    def __init__(self, gpt, config, max_len=31):
        super(Generator, self).__init__()
        self.gpt = gpt
        self.config = config
        self.max_len = max_len
        self.lm_head = nn.Linear(config.n_embd, config.vocab_size, bias=False)

    def forward(self, inputs):
        # input: [batch, seq]
        context_len = inputs.size(1)
        generated = inputs
        next_token = inputs
        past = None
        with torch.no_grad():
            for step in range(self.max_len):
                outputs = self.gpt(next_token, past_key_values=past)
                hidden = outputs[0][:, -1]
                past = outputs[1]
                next_token_logits = self.lm_head(hidden)
                next_logits, next_token = next_token_logits.topk(k=1, dim=1)
                generated = torch.cat((generated, next_token), dim=1)
        return generated

class PathGenerator():
    def __init__(self):
        print("Load Path Generator..")
        lm_type = 'gpt2'
        config = GPT2Config.from_pretrained(lm_type)
        self.tokenizer = GPT2Tokenizer.from_pretrained(lm_type)
        self.tokenizer.add_tokens(['<PAD>'])
        self.tokenizer.add_tokens(['<SEP>'])
        self.tokenizer.add_tokens(['<END>'])
        gpt = GPT2Model.from_pretrained(lm_type)
        config.vocab_size = len(self.tokenizer)
        gpt.resize_token_embeddings(len(self.tokenizer))
        pretrain_generator_ckpt = "../pg/commonsense-path-generator.ckpt"
        self.generator = Generator(gpt, config)
        self.generator.load_state_dict(torch.load(pretrain_generator_ckpt, map_location=torch.device("cpu")), strict=False)

    def prepare_input(self, head_entity, tail_entity, input_len=16):
        head_entity = head_entity.replace('_', ' ')
        tail_entity = tail_entity.replace('_', ' ')
        input_token = tail_entity + '<SEP>' + head_entity
        input_id = self.tokenizer.encode(input_token, add_special_tokens=False)[:input_len]
        input_id += [self.tokenizer.convert_tokens_to_ids('<PAD>')] * (input_len - len(input_id))
        return torch.tensor([input_id], dtype=torch.long)

    def connect_entities(self, head_entity, tail_entity):
        gen_input = self.prepare_input(head_entity, tail_entity)
        gen_output = self.generator(gen_input)
        path = self.tokenizer.decode(gen_output[0].tolist(), skip_special_tokens=True)
        path = ' '.join(path.replace('<PAD>', '').split())
        return path[path.index('<SEP>')+6:]

In [12]:
PG = PathGenerator()

Load Path Generator..


In [15]:
def clean_string(x):
        x = x.replace(")(", ", ")
        return x.replace("(", "").replace(")","").replace(";", "")

In [16]:
def get_path(x):
        original_explanation_graph = x.split(";")
        head = clean_string(original_explanation_graph[0])
        tail = clean_string(original_explanation_graph[-1])
        #print(f"Explanation was: {original_explanation_graph}, head is now {head}, tail is {tail}")
        path = PG.connect_entities(head, tail)
        return path

In [17]:
df_val = pd.read_csv("../data/dev_original.tsv", sep="\t")

In [18]:
df_train = pd.read_csv("../data/train_original.tsv", sep="\t")

In [19]:
df_train.columns = ["belief", "argument", "label", "explanation"]
df_val.columns = ["belief", "argument", "label", "explanation"]

In [20]:
df = pd.concat([df_train, df_val], axis=0)

In [None]:
new_paths = []
for i, exp in enumerate(tqdm(df["explanation"])):
    new_paths.append(get_path(exp))

100%|█████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▏| 2753/2764 [25:26<00:06,  1.82it/s]

In [None]:
train_val, test = train_test_split(df, random_state=1, test_size=0.1)

In [None]:
train, val = train_test_split(train_val, random_state=1, test_size=0.1)

In [None]:
train.to_csv("../data/train.tsv", sep="\t")

In [None]:
val.to_csv("../data/val.tsv", sep="\t")

In [None]:
test.to_csv("../data/test.tsv", sep="\t")

In [None]:
train

In [None]:
class ExplaGraphs(Dataset):
    def __init__(self, model_name, split="train", use_graphs=True):
        print(f"Use graph explanations = {use_graphs}")
        self.tokenizer = AutoTokenizer.from_pretrained(model_name)
        df = pd.read_csv(f"../data/{split}.tsv", sep="\t", header=0, index_col=0)
        premises, arguments, self.labels, explanations = df.to_numpy().T
        self.label_converter = {"counter": 0, "support": 1}
        self.label_inverter = {0: "counter", 1: "support"}
        explanations = [self.clean_string(x) for x in explanations]
        if use_graphs == True:
            self.features = [prem + " [SEP] " + arg + " [SEP] " + exp for prem,arg,exp in zip(premises, arguments, explanations)]
        else:
            self.features = [prem + " [SEP] " + arg for prem,arg in zip(premises, arguments)]
            
        encodings = self.tokenizer(self.features, truncation=True, padding=True)
        self.input_ids, self.attention_masks = encodings["input_ids"], encodings["attention_mask"]
        
    def clean_string(self, x):
        x = x.replace(")(", ", ")
        return x.replace("(", "").replace(")","").replace(";", "")
    
    def __len__(self):
        return len(self.input_ids)
    
    def __getitem__(self, idx):
        return torch.LongTensor(self.input_ids[idx]), torch.BoolTensor(self.attention_masks[idx]), self.label_converter[self.labels[idx]]

In [None]:
train = ExplaGraphs("bert-base-uncased", split="train")

In [None]:
x = train.features[1]

In [None]:
x