In [1]:
import pandas as pd
import matplotlib.pyplot as plt
from torch.utils.data import Dataset
from transformers import AutoTokenizer
import torch
from sklearn.model_selection import train_test_split
from transformers import GPT2Config, GPT2Tokenizer, GPT2Model
import torch.nn as nn
import torch.nn.functional as F
from tqdm import tqdm
from sentence_transformers import SentenceTransformer, util
from stop import eng_stop_words
from concept_net import PathRetriever
import json

  from .autonotebook import tqdm as notebook_tqdm


# 1) Fuse original splits into one file for both ExplaGraphs and COPA-SSE

### ExplaGraphs

In [2]:
exp_val = pd.read_csv("../data/explagraphs/dev_original.tsv", sep="\t", header=0)
exp_train = pd.read_csv("../data/explagraphs/train_original.tsv", sep="\t", header=0)
exp_train.columns = ["belief", "argument", "label", "gold_graph"]
exp_val.columns = ["belief", "argument", "label", "gold_graph"]
exp_df = pd.concat([exp_train, exp_val], axis=0)
exp_df['id'] = range(880, 880+len(exp_df))

### COPA-SSE

In [3]:
copa_dev = pd.read_json("../data/copa/copa_dev_original.jsonl", lines=True)
copa_test = pd.read_json("../data/copa/copa_test_original.jsonl", lines=True)
copa_df = pd.concat([copa_dev, copa_test], axis=0)

### For COPA-SSE, we keep the human annotated graph with the highest rating as the gold graph

In [4]:
best_explanations = []
for i, row in copa_df.iterrows():
    candidates = list(row["human-explanations"])
    best_score = 0.0
    best_candidate = 0
    for i, cand in enumerate(candidates):
        score = cand["filtered-avg-rating"]
        if score > best_score:
            best_score = score
            best_candidate = i
    best_explanations.append(candidates[best_candidate]["triples"])

In [5]:
copa_df["gold_graph"] = best_explanations

# 2) Transform gold explanations into one single format for both datasets

The triple format of COPA-SSE is neater, transform the ExplaGraphs format into that.

In [6]:
def transform_expla_to_triple(explanation: str):
    print(explanation)
    triples = explanation.split(")(")
    triples = [s.strip("(").strip(")") for s in triples]
    new_triples = []
    for trip in triples:
        head, rel, tail = trip.split(";")
        new_triples.append([head.strip(),rel.strip(),tail.strip()])
    return new_triples

In [7]:
exp_df["gold_graph"] = exp_df["gold_graph"].apply(lambda x: transform_expla_to_triple(x)) #Gives long output

(women and men; is a; citizens)(citizens; causes; have same rights)(have same rights; causes; women)(women; capable of; help the country)(help the country; desires; be in combat)
(marijuana; receives action; popular)(popular; used for; people)(people; at location; everywhere)
(armed forces; desires; nurses and helpers)(nurses and helpers; made of; women)(women; causes; more open)(more open; has subevent; recruiting women candidates)(recruiting women candidates; capable of; partake in war)
(marijuana; is a; recreational drug)(recreational drug; capable of; drug addiction)(drug addiction; is a; dangerous for society)(dangerous for society; not desires; legalized)
(everyone; receives action; has the right)(has the right; desires; choose)(choose; has subevent; what to smoke)(use of marijuana; is a; what to smoke)(what to smoke; not desires; ban)
(combat; desires; physical capabilities)(physical capabilities; part of; men)(men; has property; testosterone)(testosterone; not part of; women)(w

# 3) Append graph explanations of different quality

## 3.0) Entity Linking from Lin et al...

### Expla

In [8]:
grounded_expla = {}
with open("../expla_grounded.jsonl", encoding="utf-8") as f:
    for line in f:
        e = json.loads(line)
        grounded_expla[e["id"]] =  e["path"]

In [9]:
el_paths = []

for idx, row in exp_df.iterrows():
    try:
        x = grounded_expla[row['id']]
    except:
        x = []
    el_paths.append(x)

In [10]:
exp_df["linked_paths"] = el_paths


### Copa

In [11]:
grounded_copa = {}
with open("../copa_grounded.jsonl", encoding="utf-8") as f:
    for line in f:
        e = json.loads(line)
        grounded_copa[e["id"]] =  e["path"]
elc_paths = []

for idx, row in copa_df.iterrows():
    try:
        x = grounded_copa[row['id']]
    except:
        x = []
    elc_paths.append(x)

In [12]:
copa_df["linked_paths"] = elc_paths


## 3.1) Generated Paths from https://arxiv.org/abs/2005.00691

In [13]:
r2t = None
with open('relation2text.json') as json_file:
    r2t = json.load(json_file)
    r2t =  {k.lower(): v for k, v in r2t.items()}
r2t_keys_text = [r.lower() for r in r2t.keys()]

In [14]:
class Generator(nn.Module):
    def __init__(self, gpt, config, max_len=31):
        super(Generator, self).__init__()
        self.gpt = gpt
        self.config = config
        self.max_len = max_len
        self.lm_head = nn.Linear(config.n_embd, config.vocab_size, bias=False)

    def forward(self, inputs):
        # input: [batch, seq]
        context_len = inputs.size(1)
        generated = inputs
        next_token = inputs
        past = None
        with torch.no_grad():
            for step in range(self.max_len):
                outputs = self.gpt(next_token, past_key_values=past)
                hidden = outputs[0][:, -1]
                past = outputs[1]
                next_token_logits = self.lm_head(hidden)
                next_logits, next_token = next_token_logits.topk(k=1, dim=1)
                generated = torch.cat((generated, next_token), dim=1)
        return generated

class PathGenerator():
    def __init__(self):
        print("Load Path Generator..")
        lm_type = 'gpt2'
        config = GPT2Config.from_pretrained(lm_type)
        self.tokenizer = GPT2Tokenizer.from_pretrained(lm_type)
        self.tokenizer.add_tokens(['<PAD>'])
        self.tokenizer.add_tokens(['<SEP>'])
        self.tokenizer.add_tokens(['<END>'])
        gpt = GPT2Model.from_pretrained(lm_type)
        config.vocab_size = len(self.tokenizer)
        gpt.resize_token_embeddings(len(self.tokenizer))
        pretrain_generator_ckpt = "../pg/commonsense-path-generator.ckpt" #Use an already trained model from the paper. 
        self.generator = Generator(gpt, config)
        self.generator.load_state_dict(torch.load(pretrain_generator_ckpt, map_location=torch.device("cpu")), strict=False)

    def prepare_input(self, head_entity, tail_entity, input_len=16):
        head_entity = head_entity.replace('_', ' ')
        tail_entity = tail_entity.replace('_', ' ')
        input_token = tail_entity + '<SEP>' + head_entity
        input_id = self.tokenizer.encode(input_token, add_special_tokens=False)[:input_len]
        input_id += [self.tokenizer.convert_tokens_to_ids('<PAD>')] * (input_len - len(input_id))
        return torch.tensor([input_id], dtype=torch.long)

    def connect_entities(self, head_entity, tail_entity):
        gen_input = self.prepare_input(head_entity, tail_entity)
        gen_output = self.generator(gen_input)
        path = self.tokenizer.decode(gen_output[0].tolist(), skip_special_tokens=True)
        path = ' '.join(path.replace('<PAD>', '').split())
        
        try:
            path = path[path.index('<SEP>')+6:]
        except ValueError as e:
            return None
        entities = path.split(" ")
        final = []
        prev_was_rel = False
        head = ""
        tail = ""
        trip = []
        trip_counter = 0
        for i in range(0, len(entities)): #State machine to construct complete triples from the string representations in the GPT-2 output.
            if prev_was_rel == False and entities[i].strip("_") not in r2t_keys_text:
                if head == "":
                    head = entities[i]
                else:
                    head += " " + entities[i]
            if entities[i].strip("_") in r2t_keys_text:
                trip.append(head)
                prev_was_rel = True
                tail = ""
                head = ""
                trip.append(entities[i])
            if prev_was_rel == True and entities[i].strip("_") not in r2t_keys_text:
                if tail == "":
                    tail = entities[i]
                else:
                    tail += " " + entities[i]
                if i < len(entities)-1:
                    if entities[i+1] in r2t_keys_text:
                        prev_was_rel = False
                        trip.append(tail)
                        if len(trip[0])  == 0 and len(final) > 0:
                            trip[0] = final[trip_counter-1][-1]
                        final.append(trip)
                        trip_counter += 1
                        trip = []
                        tail = ""
                        head = ""
                else:
                    trip.append(tail)
                    if len(trip[0])  == 0 and len(final) > 0:
                            trip[0] = final[trip_counter-1][-1]
                    final.append(trip)
                    
        return final

In [15]:
PG = PathGenerator()

Load Path Generator..


### 3.1.1) COPA-SSE

In [16]:
generated_paths_copa = []
for i, exp in enumerate(tqdm(copa_df["linked_paths"])):
    try:
        head = exp[0][0]
        tail = exp[-1][-1]
        path = PG.connect_entities(head, tail)
        generated_paths_copa.append(path)        
    except:
        generated_paths_copa.append([])
copa_df["generated_graph_linked"] = generated_paths_copa

100%|█████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 1500/1500 [15:17<00:00,  1.64it/s]


### 3.1.2) ExplaGraphs

In [17]:
generated_paths_expa = []
for i, exp in enumerate(tqdm(exp_df["linked_paths"])):
    try:
        head = exp[0][0]
        tail = exp[-1][-1]
        path = PG.connect_entities(head, tail)
        generated_paths_expa.append(path)
        
    except:
        generated_paths_expa.append([])
exp_df["generated_graph_linked"] = generated_paths_expa

100%|█████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 2764/2764 [25:50<00:00,  1.78it/s]


## Add gold generated as well

In [18]:
generated_paths_copa = []
for i, exp in enumerate(tqdm(copa_df["gold_graph"])):
    try:
        head = exp[0][0]
        tail = exp[-1][-1]
        path = PG.connect_entities(head, tail)
        generated_paths_copa.append(path)
    except:
        generated_paths_copa.append([])
copa_df["generated_graph_gold"] = generated_paths_copa

100%|█████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 1500/1500 [14:21<00:00,  1.74it/s]


In [19]:
generated_paths_expa = []
for i, exp in enumerate(tqdm(exp_df["gold_graph"])):
    try:
        head = exp[0][0]
        tail = exp[-1][-1]
        path = PG.connect_entities(head, tail)
        generated_paths_expa.append(path)
        
    except:
        generated_paths_expa.append([])
exp_df["generated_graph_gold"] = generated_paths_expa

100%|█████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 2764/2764 [26:20<00:00,  1.75it/s]


## 3.2) Add pseudo-random graphs via naive retrieval

In [20]:
def find_best_path(belief, argument, model, PR):
        '''
            Finds paths between concepts indentified in the belief and the argument.
            Each path is scored against the original context with cosine similarity using SentenceTransformer.
        '''
        answers_tmp = argument
        q_words = []
        answer_words = []
        for q in belief.split(" "):
            if PR.is_entity(q) and q not in eng_stop_words:
                q_words.append(q)
        for a in argument.split(" "):
            if PR.is_entity(a) and a not in eng_stop_words:
                answer_words.append(a)

        paths = []
        top_score = 0.0
        best_path = ""
        flag = 0

        for q in q_words:
            for a in answer_words:
                if not q == a:
                    path =  PR.get_path(q, a)
                    if path != -1: paths.append(path)
                        
        for path in paths:
            str_path = ""
            for triple in path:
                head, rel, tail = triple
                try:
                    rel = PR.r2t[rel.strip("_")]
                except:
                    continue
                str_path += head + " " + rel + " " + tail + " "
            path_emb = model.encode(str_path, convert_to_tensor=True, show_progress_bar=False)
            question_emb = model.encode(belief, convert_to_tensor=True, show_progress_bar=False)
            score = util.cos_sim(path_emb, question_emb)
            if score > top_score:
                top_score = score
                best_path = path
        
        if best_path != "":
            return best_path
        else:
            return -1

In [21]:
model = SentenceTransformer('all-mpnet-base-v2')
PR = PathRetriever("../data/conceptnet/")

Loading conceptnet...


### 3.2.1) COPA-SSE

In [22]:
random_paths_copa = []
for i, (a, b, c) in enumerate(tqdm(zip(copa_df["p"], copa_df["a1"], copa_df["a2"]))):
    path = find_best_path(a.lower(), b.lower() + " " + c.lower(), model, PR) # Concat a1 and a2
    random_paths_copa.append(path)

1500it [08:13,  3.04it/s]


In [23]:
copa_df["retrieved_graph"] = random_paths_copa

### 3.2.2) ExpaGraphs

In [24]:
random_paths_expa = []
for i, (a, b) in enumerate(tqdm(zip(exp_df["belief"], exp_df["argument"]))):
    path = find_best_path(a.lower(), b.lower(), model, PR) # Concat a1 and a2    
    random_paths_expa.append(path)

2764it [31:31,  1.46it/s]


In [25]:
exp_df["retrieved_graph"] = random_paths_expa

# 4.0 Split and save to disk

### Copa

In [26]:
train_val_copa, test_copa = train_test_split(copa_df, random_state=1, test_size=0.1)
train_copa, val_copa = train_test_split(train_val_copa, random_state=1, test_size=0.1)
train_copa.to_csv("../data/copa/train_v3.tsv", sep="\t")
val_copa.to_csv("../data/copa/val_v3.tsv", sep="\t")
test_copa.to_csv("../data/copa/test_v3.tsv", sep="\t")

### ExplaGraphs

In [27]:
train_val_exp, test_exp = train_test_split(exp_df, random_state=1, test_size=0.1)
train_exp, val_exp = train_test_split(train_val_exp, random_state=1, test_size=0.1)
train_exp.to_csv("../data/explagraphs/train_v3.tsv", sep="\t")
val_exp.to_csv("../data/explagraphs/val_v3.tsv", sep="\t")
test_exp.to_csv("../data/explagraphs/test_v3.tsv", sep="\t")