# This notebook produces a dictionary whose keys are the vocabulary words found in the CommonsenseQA questions and whose values are the triplets found for those words in ConceptNet

In [13]:
import json_lines
import re
import requests
from tqdm import tqdm_notebook as tqdm
from time import sleep
import time
import multiprocessing
import json

In [18]:
def load_files(which_dataset):  # train / dev / test
    """Return every record of ``data/<which_dataset>_qtoken_split.jsonl`` as a list.

    The file is opened in binary mode and handed to ``json_lines.reader``;
    each item the reader yields becomes one element of the returned list.
    """
    path = 'data/' + which_dataset + '_qtoken_split.jsonl'
    with open(path, 'rb') as handle:
        return list(json_lines.reader(handle))

def build_vocab(questions, num_choices=3):  # questions: what we get from load_files
    """Collect the unique lower-cased words used in the questions.

    Words are taken from each question's stem and from the text of its
    answer choices, using the pattern ``\\b\\w+\\b``.

    Args:
        questions: iterable of records shaped like
            ``{'question': {'stem': str, 'choices': [{'text': str}, ...]}}``.
        num_choices: how many leading answer choices to scan per question.
            Defaults to 3 (the previously hard-coded value); pass ``None``
            to scan every choice. Questions with fewer choices than
            requested are handled without raising.

    Returns:
        A list of the distinct words, in arbitrary order.
    """
    word_pattern = re.compile(r'\b\w+\b')
    # Accumulate straight into a set so duplicates are never stored.
    vocab = set()
    for Q in questions:
        question = Q['question']
        vocab.update(word_pattern.findall(question['stem'].lower()))
        # Slicing (rather than indexing 0..2) tolerates short choice lists.
        for choice in question['choices'][:num_choices]:
            vocab.update(word_pattern.findall(choice['text'].lower()))
    return list(vocab)

# We use the ConceptNet API to download the dictionaries

In [15]:
#get all of one word's triplet
def get_triplets_single(word, timeout=30):
    """Fetch the raw ConceptNet record for one English word.

    Args:
        word: the vocabulary word to look up under ``/c/en/``.
        timeout: seconds to wait for the server before giving up. Without
            a timeout a single stalled connection would block its pool
            worker (and therefore the whole ``pool.map``) indefinitely.

    Returns:
        ``(word, obj)`` where ``obj`` is the decoded JSON response, or
        ``None`` if the request failed or the body was not valid JSON.
        ``get_triplets_multithread`` filters out the ``None`` entries.
    """
    import warnings

    # This is the API.
    url = 'http://api.conceptnet.io/c/en/' + word
    try:
        obj = requests.get(url, timeout=timeout).json()
    except (requests.RequestException, ValueError) as err:
        # ValueError covers a non-JSON body (e.g. an HTML error page).
        warnings.warn('ConceptNet lookup failed for %r: %s' % (word, err))
        return None

    return (word, obj)


def get_nodes(dictionary):  # keys are vocabulary, values are the dictionaries pulled from the API
    """Extract English-only ``(start, relation, end)`` triplets per word.

    Args:
        dictionary: maps each word to its raw API response. A response
            containing an ``'error'`` key is treated as "word not found"
            and the word is left out of the result.

    Returns:
        A dict mapping each remaining word to a list of
        ``(start_label, relation_label, end_label)`` tuples, keeping only
        the edges whose start and end nodes are both tagged ``'en'``.
    """
    nodes_dict = {}
    for word, obj in dictionary.items():
        if 'error' in obj:  # eliminate words not found in ConceptNet
            continue

        relations = []
        for edge in obj['edges']:
            start, end = edge['start'], edge['end']
            triplet = (start['label'], edge['rel']['label'], end['label'])
            # Some nodes carry no 'language' key; .get() then yields None,
            # so such an edge is treated as non-English and skipped.
            if start.get('language') == 'en' and end.get('language') == 'en':
                relations.append(triplet)
        nodes_dict[word] = relations
    return nodes_dict



In [16]:
# using a pool of worker processes (multiprocessing.Pool, not threads)
def get_triplets_multithread(vocab, processes=16):
    """Look up every word of ``vocab`` in parallel.

    Args:
        vocab: iterable of words, each passed to ``get_triplets_single``.
        processes: number of worker processes in the pool (default 16).

    Returns:
        A dict built from the ``(word, obj)`` pairs the workers return;
        ``None`` results are dropped.
    """
    # The context manager shuts the pool down even if map() raises,
    # so worker processes are not leaked on failure.
    with multiprocessing.Pool(processes=processes) as pool:
        pool_outputs = pool.map(get_triplets_single, vocab)
    return dict(x for x in pool_outputs if x is not None)


In [25]:
# main function
def main(which_data):
    """Load one dataset split, build its vocabulary and fetch the raw API records.

    ``which_data`` is forwarded to ``load_files``; the result is whatever
    ``get_triplets_multithread`` returns for the resulting vocabulary.
    """
    return get_triplets_multithread(build_vocab(load_files(which_data)))

In [26]:
# a decoder which will be used later
def decoder(dictionary):
    """Convert every item in every value list to a tuple, in place.

    The same ``dictionary`` object is updated and returned.
    """
    # Only existing keys are reassigned, so iterating over items() is safe.
    for key, items in dictionary.items():
        dictionary[key] = list(map(tuple, items))
    return dictionary

# Run main() for 'dev' and 'test' below, then wait for one hour. 

### Dev set

In [None]:
# Raw API responses for the dev-set vocabulary, keyed by word.
dev_dictionary = main('dev')

In [28]:
#extract the triplets from the dictionaries
dev_triplets = get_nodes(dev_dictionary)


# Store both the raw dictionary and the triplets. The `with` blocks close
# (and so flush) each file before it is read back below.
with open("data/dev_dictionary.txt", 'w') as f:
    json.dump(dev_dictionary, f)
with open("data/dev_triplets.txt", 'w') as f:
    json.dump(dev_triplets, f)

# How to load the raw dictionary and the triplets
#don't need the decoder for dev_dictionary
with open("data/dev_dictionary.txt") as f:
    temp_dev_dictionary = json.load(f)
#require the decoder for dev_triplets (turns the loaded lists back into tuples)
with open("data/dev_triplets.txt") as f:
    temp_dev_triplets = decoder(json.load(f))

In [None]:
# Number of dev-set words that have a triplet entry (words whose API
# response contained 'error' were left out by get_nodes).
len(temp_dev_triplets)

### Test set

In [None]:
# Raw API responses for the test-set vocabulary, keyed by word.
test_dictionary = main('test')

In [27]:
#extract the triplets from the dictionaries
test_triplets = get_nodes(test_dictionary)


# Store both the raw dictionary and the triplets. The files are written
# under data/ so that the loads below read back what was just saved.
with open("data/test_dictionary.txt", 'w') as f:
    json.dump(test_dictionary, f)
with open("data/test_triplets.txt", 'w') as f:
    json.dump(test_triplets, f)

# How to load the raw dictionary and the triplets
#don't need the decoder for test_dictionary
with open("data/test_dictionary.txt") as f:
    temp_test_dictionary = json.load(f)
#require the decoder for test_triplets (turns the loaded lists back into tuples)
with open("data/test_triplets.txt") as f:
    temp_test_triplets = decoder(json.load(f))

## break for an hour

# We have to break up the training data since the API only allows 6000 requests per hour

### Train set

In [20]:
# Build the training vocabulary and cut it into two chunks at index 5000
# so each chunk can be fetched in a separate run.
data = load_files('train')
vocab = build_vocab(data)
print(len(vocab))
vocab1, vocab2 = vocab[:5000], vocab[5000:]

9002


In [21]:
# Fetch the raw API responses for the first chunk of the training vocabulary.
train_dictionary1 = get_triplets_multithread(vocab1)

# Wait for an hour before running the next cell

In [78]:
# Fetch the raw API responses for the remaining training vocabulary.
train_dictionary2 = get_triplets_multithread(vocab2)

# Merge the two sets of training dictionaries

In [79]:
# Combine both chunks into one dict; on a duplicate key the entry from
# train_dictionary2 would win.
train_dictionary = {**train_dictionary1, **train_dictionary2}

In [80]:
#extract the triplets from the dictionaries
train_triplets = get_nodes(train_dictionary)


# Store both the raw dictionary and the triplets. The `with` blocks close
# (and so flush) each file before it is read back below.
with open("data/train_dictionary.txt", 'w') as f:
    json.dump(train_dictionary, f)
with open("data/train_triplets.txt", 'w') as f:
    json.dump(train_triplets, f)

# How to load the raw dictionary and the triplets
#don't need the decoder for train_dictionary
with open("data/train_dictionary.txt") as f:
    temp_train_dictionary = json.load(f)
#require the decoder for train_triplets (turns the loaded lists back into tuples)
with open("data/train_triplets.txt") as f:
    temp_train_triplets = decoder(json.load(f))

# Check that they have remained unchanged after saving/loading

In [None]:
# Raw dictionaries: compare each reloaded copy with the in-memory original.
print(temp_dev_dictionary == dev_dictionary)
print(temp_test_dictionary == test_dictionary)
print(temp_train_dictionary == train_dictionary)

# Triplets: the reloaded copies have already been passed through decoder().
print(temp_dev_triplets == dev_triplets)
print(temp_test_triplets == test_triplets)
print(temp_train_triplets == train_triplets)

# This function queries the API for ConceptNet one by one (instead of multithreading). Just for Reference

In [None]:

# def get_triplets(vocab):
#     triplet_dictionary = {}
#     for i, word in tqdm(enumerate(vocab)):
        
        
#         #This is the API. 
#         obj = requests.get('http://api.conceptnet.io/c/en/' + word).json()
        
        
#         if 'error' not in obj.keys(): #eliminate words not found in ConceptNet
#             list_start = []
#             list_rels = []
#             list_end = []
#             list_lang = []

#             len_edges = len(obj['edges'])
#             for j in range(len_edges):
                
#                 list_start.append(obj['edges'][j]['start']['label'])
#                 list_lang.append(obj['edges'][j]['start']['language'])
#                 list_end.append(obj['edges'][j]['end']['label'])
#                 list_rels.append(obj['edges'][j]['rel']['label'])
#             #we will extract triplets which are exclusively in english because 'murica
#             relations = [(start,rel,end) for start,rel,end,lang in zip(list_start,
#                                                                        list_rels,
#                                                                        list_end,
#                                                                        list_lang) if lang == 'en']
#             triplet_dictionary[word] = relations
#     return triplet_dictionary