# This notebook builds a dictionary whose keys are the vocabulary words found in the CommonsenseQA questions and whose values are the triplets found for each word in ConceptNet

In [1]:
import json_lines
import re
import requests
from tqdm import tqdm_notebook as tqdm
from time import sleep
import time
import multiprocessing
import json

In [2]:
def load_files(which_dataset):  # train / dev / test
    """Load every record of ``<which_dataset>_qtoken_split.jsonl``.

    Args:
        which_dataset: File-name prefix of the split to read
            ('train', 'dev' or 'test').

    Returns:
        A list holding each item yielded by ``json_lines.reader``, in file
        order.
    """
    path = which_dataset + '_qtoken_split.jsonl'
    # The reader is handed a handle opened in binary ('rb') mode.
    with open(path, 'rb') as handle:
        records = list(json_lines.reader(handle))
    return records

def build_vocab(questions):  # what we get from load_files
    r"""Collect the unique, lower-cased word tokens used by the questions.

    Tokens are extracted with the pattern ``\b\w+\b`` from each question
    stem and from the text of every answer choice.

    Args:
        questions: Iterable of records shaped like
            ``{'question': {'stem': str, 'choices': [{'text': str}, ...]}}``.

    Returns:
        A sorted list of distinct tokens. Sorting keeps the order (and so
        any later slicing of the vocabulary) reproducible between runs.

    Raises:
        KeyError: If a record lacks one of the keys listed above.
    """
    word_pattern = re.compile(r'\b\w+\b')
    vocab = set()
    for record in questions:
        question = record['question']
        vocab.update(word_pattern.findall(question['stem'].lower()))
        # Tokenise the choices as well: adding the raw string to a list would
        # add its individual characters rather than its words.
        # NOTE(review): multi-word choices are split into single words, the
        # same way the stem is; confirm whole-phrase lookups are not wanted.
        for choice in question['choices']:
            vocab.update(word_pattern.findall(choice['text'].lower()))
    return sorted(vocab)

# We use the API for ConceptNet to obtain triplets

In [3]:
# Fetch the triplets that ConceptNet returns for one word
def get_triplets_single(word):
    """Query the ConceptNet API for ``word`` and extract its triplets.

    Args:
        word: Term appended to the ``/c/en/`` lookup URL.

    Returns:
        ``(word, relations)`` where ``relations`` is a list of
        ``(start_label, relation_label, end_label)`` tuples, one per returned
        edge whose start node is tagged as English. Returns ``None`` when the
        response contains an ``'error'`` key (word not found in ConceptNet).
    """
    # This is the API.
    response = requests.get('http://api.conceptnet.io/c/en/' + word)
    payload = response.json()

    # Words that ConceptNet does not know are skipped.
    if 'error' in payload.keys():
        return None

    # Read all four fields of every edge first, then filter on the language.
    rows = [
        (
            edge['start']['label'],
            edge['start']['language'],
            edge['end']['label'],
            edge['rel']['label'],
        )
        for edge in payload['edges']
    ]

    # Keep only the triplets whose start node is in English.
    # NOTE(review): the language of the end node is not checked.
    relations = [
        (start, rel, end)
        for start, lang, end, rel in rows
        if lang == 'en'
    ]

    return (word, relations)

In [4]:
# Fan the per-word lookups out over a pool of worker processes
def get_triplets_multithread(vocab):
    """Look up every word of ``vocab`` in parallel and collect the triplets.

    Despite the name, the work is spread over 16 worker *processes*
    (``multiprocessing.Pool``), not threads.

    Args:
        vocab: Iterable of words, each passed to ``get_triplets_single``.

    Returns:
        A dict mapping each word to its list of triplets. Words for which
        ``get_triplets_single`` returned ``None`` are left out.
    """
    # The context manager shuts the workers down even when a lookup raises,
    # so a failed run no longer leaves 16 idle processes behind.
    with multiprocessing.Pool(processes=16) as pool:
        pool_outputs = pool.map(get_triplets_single, vocab)

    return dict(pair for pair in pool_outputs if pair is not None)


In [13]:
# Main entry point: load one split, build its vocabulary, fetch the triplets
def main(which_data, max_words=100):
    """Build the word -> triplets dictionary for one dataset split.

    Args:
        which_data: Split name handed to ``load_files``
            ('train', 'dev' or 'test').
        max_words: Upper bound on how many vocabulary words are looked up.
            Defaults to 100, the cap that used to be hard-coded here; pass
            ``None`` to look up the whole vocabulary.
            NOTE(review): the cap of 100 looks like a debugging leftover;
            confirm whether the default should be ``None``.

    Returns:
        The dict produced by ``get_triplets_multithread`` for the selected
        words.
    """
    data = load_files(which_data)
    vocab = build_vocab(data)
    if max_words is not None:
        vocab = vocab[0:max_words]
    return get_triplets_multithread(vocab)

# Run the two cells below, then wait for one hour before continuing (the ConceptNet API limits the number of requests per hour).

In [6]:
# Get the triplets for the 'dev' split: main() loads the questions, builds
# their vocabulary and queries the ConceptNet API for the selected words.
dev_triplets = main('dev')

In [10]:
# Same pipeline as for 'dev', applied to the 'test' split.
test_triplets = main('test')

# We have to break up the training data since the API only allows 6000 requests per hour

In [17]:

# Load the training questions and extract their vocabulary
data = load_files('train')
vocab = build_vocab(data)

# First 3600 words go into one chunk, everything after them into a second one
vocab1, vocab2 = vocab[:3600], vocab[3600:]

In [21]:
# Fetch the triplets for the first chunk of the training vocabulary.
train_triplets1 = get_triplets_multithread(vocab1)

# Wait for an hour before running the next cell

In [23]:
# Fetch the triplets for the remaining chunk of the training vocabulary.
train_triplets2 = get_triplets_multithread(vocab2)

# Merge the two sets of training triplets

In [27]:
# Copy the first chunk's results, then add the second chunk's on top
# (on a duplicate key the entry from train_triplets2 wins).
train_triplets = dict(train_triplets1)
train_triplets.update(train_triplets2)

In [30]:
# Save the triplets as JSON, one file per split.
# Each file is written inside a `with` block so its handle is flushed and
# closed deterministically instead of being left to garbage collection.
with open("dev_triplets.txt", 'w') as f:
    json.dump(dev_triplets, f)
with open("test_triplets.txt", 'w') as f:
    json.dump(test_triplets, f)
with open("train_triplets.txt", 'w') as f:
    json.dump(train_triplets, f)


In [31]:
# Load the triplets back from the JSON files.
# `with` closes each handle as soon as its file has been parsed.
with open("dev_triplets.txt") as f:
    temp_dev_triplets = json.load(f)
with open("test_triplets.txt") as f:
    temp_test_triplets = json.load(f)
with open("train_triplets.txt") as f:
    temp_train_triplets = json.load(f)

In [47]:
# JSON cannot store tuples, so loaded triplets come back as lists:
# convert them back to tuples
def decoder(dictionary):
    """Turn every triplet of every value of ``dictionary`` into a tuple.

    Args:
        dictionary: Mapping from a key to a list of sequences.

    Returns:
        The same ``dictionary`` object, modified in place so that each value
        is a list of tuples.
    """
    # Only existing keys are reassigned, so the dict keeps its size while
    # it is being iterated.
    for key, triplets in dictionary.items():
        dictionary[key] = list(map(tuple, triplets))
    return dictionary

# decoder() converts in place and returns its argument, so each `_..._triplets`
# name refers to the same dict object as the matching `temp_..._triplets`.
_dev_triplets = decoder(temp_dev_triplets)
_test_triplets = decoder(temp_test_triplets)
_train_triplets = decoder(temp_train_triplets)

# Check that they have remained unchanged

In [49]:
# Each line prints True when the dictionary reloaded from disk equals the one
# that was built in memory.
print(_dev_triplets == dev_triplets)
print(_test_triplets == test_triplets)
print(_train_triplets == train_triplets)

True
True
True


# This function queries the ConceptNet API one word at a time (instead of in parallel). It is kept, commented out, for reference only.

In [None]:

# def get_triplets(vocab):
#     triplet_dictionary = {}
#     for i, word in tqdm(enumerate(vocab)):
        
        
#         #This is the API. 
#         obj = requests.get('http://api.conceptnet.io/c/en/' + word).json()
        
        
#         if 'error' not in obj.keys(): #eliminate words not found in ConceptNet
#             list_start = []
#             list_rels = []
#             list_end = []
#             list_lang = []

#             len_edges = len(obj['edges'])
#             for j in range(len_edges):
                
#                 list_start.append(obj['edges'][j]['start']['label'])
#                 list_lang.append(obj['edges'][j]['start']['language'])
#                 list_end.append(obj['edges'][j]['end']['label'])
#                 list_rels.append(obj['edges'][j]['rel']['label'])
#             #we will extract triplets which are exclusively in english because 'murica
#             relations = [(start,rel,end) for start,rel,end,lang in zip(list_start,
#                                                                        list_rels,
#                                                                        list_end,
#                                                                        list_lang) if lang == 'en']
#             triplet_dictionary[word] = relations
#     return triplet_dictionary