In [1]:
import pickle, pickledb
import numpy as np
from itertools import count
from collections import defaultdict
import tensorflow as tf
import tensorflow_hub as hub

train_file = "../files/dataset/pizza_train.tsv"
test_file =  "../files/dataset/pizza_test.tsv"
# instances_file = '../files/dataset/test_instances.tsv'
knocked_file = '../files/dataset/pizza_knockedout.tsv'

NULL_PATH = ((0, 0, 0, 0),)
relations = ["hypernym", "hyponym", "concept", "instance", "none"]
NUM_RELATIONS = len(relations)
prefix = "../junk/Pizza/temp/pizza_threshold_7_10/"

USE_link = "https://tfhub.dev/google/universal-sentence-encoder-large/5?tf-hub-format=compressed"
model = hub.load(USE_link)

f = open("../junk/resolved_use_unbracketed.pkl", "rb")
resolved = pickle.load(f)

def extractUSEEmbeddings(words):
    word_embeddings = model(words)
    return word_embeddings.numpy()

In [6]:
arrow_heads = {">": "up", "<":"down"}

def preprocess_db(db):
    final_db = {}
    for key in db:
        try:
            new_key = key.decode("utf-8")
        except:
            new_key = key
        try:
            new_val = db[key].decode("utf-8")
        except:
            new_val = db[key]
        final_db[new_key] = new_val
    return final_db

def to_list(seq):
    for item in seq:
        if isinstance(item, tuple):
            yield list(to_list(item))
        elif isinstance(item, list):
            yield [list(to_list(elem)) for elem in item]
        else:
            yield item

def extract_direction(edge):

    if edge[0] == ">" or edge[0] == "<":
        direction = "start_" + arrow_heads[edge[0]]
        edge = edge[1:]
    elif edge[-1] == ">" or edge[-1] == "<":
        direction = "end_" + arrow_heads[edge[-1]]
        edge = edge[:-1]
    else:
        direction = ' '
    return direction, edge

def parse_path(path):
    parsed_path = []
    for edge in path.split("*##*"):
        direction, edge = extract_direction(edge)
        if edge.split("/"):
            try:
                embedding, pos, dependency = tuple([a[::-1] for a in edge[::-1].split("/",2)][::-1])
            except:
                print (edge, path)
                raise
            emb_idx, pos_idx, dep_idx, dir_idx = emb_indexer[embedding], pos_indexer[pos], dep_indexer[dependency], dir_indexer[direction]
            parsed_path.append(tuple([emb_idx, pos_idx, dep_idx, dir_idx]))
        else:
            return None
    return tuple(parsed_path)

def parse_tuple(tup):
    x, y = [entity_to_id(word2id_db, elem) for elem in tup]
    paths_x, paths_y = list(extract_paths(relations_db,x,y).items()), list(extract_paths(relations_db,y,x).items())
    path_count_dict_x = { id_to_path(id2path_db, path).replace("X/", tup[0]+"/").replace("Y/", tup[1]+"/") : freq for (path, freq) in paths_x }
    path_count_dict_y = { id_to_path(id2path_db, path).replace("Y/", tup[0]+"/").replace("X/", tup[1]+"/") : freq for (path, freq) in paths_y }
    path_count_dict = {**path_count_dict_x, **path_count_dict_y}
    return path_count_dict

def parse_dataset(dataset):
    parsed_dicts = [parse_tuple(tup) for tup in dataset.keys()]
    parsed_dicts = [{ parse_path(path) : path_count_dict[path] for path in path_count_dict } for path_count_dict in parsed_dicts]
    paths = [{ path : path_count_dict[path] for path in path_count_dict if path} for path_count_dict in parsed_dicts]
    paths = [{NULL_PATH: 1} if not path_list else path_list for i, path_list in enumerate(paths)]
    counts = [list(path_dict.values()) for path_dict in paths]
    paths = [list(path_dict.keys()) for path_dict in paths]
    targets = [rel_indexer[relation] for relation in dataset.values()]
    return list(to_list(paths)), counts, targets

def get_instance_key(tup):
    return tuple([" ".join([tok.text for tok in nlp(elem)]) for elem in tup])

def parse_instance(tup):
    
    paths_x = list(instances_db.get(get_instance_key(tup), {}).items())
    paths_y = list(instances_db.get(get_instance_key(tup[::-1]), {}).items())
    path_count_dict_x = { path.replace("X/", tup[0]+"/").replace("Y/", tup[1]+"/") : freq for (path, freq) in paths_x }
    path_count_dict_y = { path.replace("Y/", tup[0]+"/").replace("X/", tup[1]+"/") : freq for (path, freq) in paths_y }
    path_count_dict = {**path_count_dict_x, **path_count_dict_y}
    return path_count_dict

def parse_instance_dataset(dataset):
    parsed_dicts = [parse_instance(tup) for tup in dataset.keys()]
    parsed_dicts = [{ parse_path(path) : path_count_dict[path] for path in path_count_dict } for path_count_dict in parsed_dicts]
    paths = [{ path : path_count_dict[path] for path in path_count_dict if path} for path_count_dict in parsed_dicts]
    paths = [{NULL_PATH: 1} if not path_list else path_list for i, path_list in enumerate(paths)]
    counts = [list(path_dict.values()) for path_dict in paths]
    paths = [list(path_dict.keys()) for path_dict in paths]
    targets = [rel_indexer[relation] for relation in dataset.values()]
    return list(to_list(paths)), counts, targets

def id_to_entity(db, entity_id):
    entity = db[str(entity_id)]
    return entity

def id_to_path(db, entity_id):
    entity = db[str(entity_id)]
    entity = "/".join(["*##*".join(e.split("_", 1)) for e in entity.split("/")])
    return entity

def entity_to_id(db, entity):
    global success, failed
    entity_id = db.get(entity)
    if entity_id:
        success.append(entity)
        return int(entity_id)
#     closest_entity = resolved.get(entity, "")
#     if closest_entity and closest_entity[0] and float(closest_entity[1]) > threshold:
#         success.append(entity)
#         return int(db[closest_entity[0]])
    failed.append(entity)
    return -1

def extract_paths(db, x, y):
    key = (str(x) + '###' + str(y))
    try:
        relation = db[key]
        return {int(path_count.split(":")[0]): int(path_count.split(":")[1]) for path_count in relation.split(",")}
    except Exception as e:
        return {}

word2id_db = preprocess_db(pickle.load(open(prefix + "pizza_word_to_id_dict.pkl", "rb")))
id2word_db = preprocess_db(pickle.load(open(prefix + "pizza_id_to_word_dict.pkl", "rb")))
path2id_db = preprocess_db(pickle.load(open(prefix + "pizza_path_to_id_dict.pkl", "rb")))
id2path_db = preprocess_db(pickle.load(open(prefix + "pizza_id_to_path_dict.pkl", "rb")))
relations_db = preprocess_db(pickle.load(open(prefix + "pizza_word_occurence_map.pkl", "rb")))


In [None]:
# Creating Instance DB
import spacy, subprocess, itertools, multiprocessing, sys, glob,  en_core_web_lg, neuralcoref
from spacy.tokens.token import Token
from spacy.attrs import ORTH, LEMMA
from collections import Counter

def stringifyEdge(word, root=True):
    try:
        w = word.root
    except:
        w = word

    if isinstance(word, Token):
        word = word.lemma_.strip().lower()
    else:
        word = ' '.join([wd.string.strip().lower() for wd in word])
    pos, deps = w.pos_, w.dep_
    path = '/'.join([word, pos, deps if deps and root else 'ROOT'])
    return path

def stringifyArg(word, edge):
    try:
        word = word.root
    except:
        pass
    pos, deps = word.pos_, word.dep_
    path = '/'.join([edge, pos, deps if deps else 'ROOT'])
    return path

def filterPaths(function, lowestCommonHead, paths):
    path1 = [lowestCommonHead]
    path1.extend(paths[:-1])
    path2 = paths
    return any(node not in function(path) for path, node in list(zip(path1, path2)))

def notPunct(arr):
    firstWord = arr[0]
    return firstWord.tag_ != 'PUNCT' and len(firstWord.string.strip()) > 1

def notEqual(x, y):
    try:
        return x!=y
    except:
        return False

def checkHead(token, lowestCommonHead):
    return isinstance(token, Token) and lowestCommonHead == token

def getPathFromRoot(phrase):
    paths = []
    head = phrase.head
    while phrase != head:
        phrase = phrase.head
        paths.append(phrase)
        head = phrase.head
    paths = paths[::-1]
    return paths

def breakCompoundWords(elem):
    try:
        root = elem.root
        return root
    except:
        return elem

def findMinLength(x, y):
    if len(x) < len(y):
        return (len(x), x)
    return (len(y), y)

def findLowestCommonHead(pathX, pathY, minLength, minArray):
    lowestCommonHead = None
    if minLength:        
        uncommon = [i for i in range(minLength) if pathX[i] != pathY[i]]
        if uncommon:
            idx = uncommon[0] - 1
        else:
            idx = minLength - 1
        lowestCommonHead = minArray[idx]
    else:
        idx = 0
        if pathX:
            lowestCommonHead = pathX[0]
        elif pathY:
            lowestCommonHead = pathY[0]
        else:
            lowestCommonHead = None
    
    return idx, lowestCommonHead

def getShortestPath(tup):

    xinit, yinit = tup[0], tup[1]

    x, y = breakCompoundWords(xinit), breakCompoundWords(yinit)
    
    pathX, pathY = getPathFromRoot(x), getPathFromRoot(y)
    
    minLength, minArray = findMinLength(pathX, pathY)
    
    idx, lowestCommonHead = findLowestCommonHead(pathX, pathY, minLength, minArray)
    
    try:
        pathX = pathX[idx+1:]
        pathY = pathY[idx+1:]
        checkLeft, checkRight = lambda h: h.lefts, lambda h: h.rights
        if lowestCommonHead and (filterPaths(checkLeft, lowestCommonHead, pathX) or filterPaths(checkRight, lowestCommonHead, pathY)):
            return None
        pathX = pathX[::-1]

        paths = [(None, xinit, pathX, lowestCommonHead, pathY, yinit, None)]
        lefts, rights = list(xinit.lefts), list(yinit.rights)

        if lefts and notPunct(lefts):
            paths.append((lefts[0], xinit, pathX, lowestCommonHead, pathY, yinit, None))

        if rights and notPunct(rights):
            paths.append((None, xinit, pathX, lowestCommonHead, pathY, yinit, rights[0]))
        
        return paths
    except Exception as e:
        print (e)
        return None

def stringifyFilterPath(path, maxlen):

    lowestCommonHeads = []
    (leftX, x, pathX, lowestCommonHead, pathY, y, rightY) = path

    isXHead, isYHead = checkHead(x, lowestCommonHead), checkHead(y, lowestCommonHead)
    signX = '' if isXHead else '>'
    leftXPath  = []
    if leftX:
        edge_str = stringifyEdge(leftX)
        leftXPath.append(edge_str + "<")

    signY = '' if isYHead else '<'
    rightYPath = []
    if rightY:
        edge_str = stringifyEdge(rightY)
        rightYPath.append(">" + edge_str)

    lowestCommonHeads = [[stringifyEdge(lowestCommonHead, False)] if lowestCommonHead and not (isYHead or isXHead) else []][0]
    
    if maxlen >= len(pathX + leftXPath + pathY + rightYPath + lowestCommonHeads):
        
        if isinstance(x, Token):
            stringifiedX = x.string.strip().lower()
        else:
            stringifiedX = ' '.join([x_wd.string.strip().lower() for x_wd in x])
        
        if isinstance(y, Token):
            stringifiedY = y.string.strip().lower()
        else:
            stringifiedY = ' '.join([y_wd.string.strip().lower() for y_wd in y])

        stringifiedPathX, stringifiedPathY = [stringifyEdge(word) + ">" for word in pathX], ["<" + stringifyEdge(word) for word in pathY]
        stringifiedArgX, stringifiedArgY = [stringifyArg(x, 'X') + signX], [signY + stringifyArg(y, 'Y')]
        
        stringifiedPath = '_'.join(leftXPath + stringifiedArgX + stringifiedPathX + lowestCommonHeads + stringifiedPathY + stringifiedArgY + rightYPath)

        return (stringifiedX, stringifiedY, stringifiedPath)

    return None

def getDependencyPaths(sentence, nlp, sentenceNounChunks, maxlen):

    nps = [(n, n.start, n.end) for n in sentenceNounChunks]
    nps.extend([(word, pos, pos) for (pos, word) in enumerate(sentence) if word.tag_[:2] == 'NN' and len(word.string.strip()) > 2])
    ls = list(itertools.product(nps, nps))
    pairedConcepts = [(el[0][0], el[1][0]) for el in itertools.product(nps, nps) if el[1][1] > el[0][2] and notEqual(el[0], el[1])]
    pairedConcepts = list(dict.fromkeys(pairedConcepts))
    
    paths = []
    for pair in pairedConcepts:
        appendingElem = getShortestPath(pair)
        if appendingElem:
            filtered = [stringifyFilterPath(path, maxlen) for path in appendingElem]
            paths.extend(filtered)

    return paths

def preprocess_word(noun):
    filt_tokens = ["DET", "ADV", "PUNCT", "CCONJ"]
    start_index = [i for i,token in enumerate(noun) if token.pos_ not in filt_tokens][0]
    np_filt = noun[start_index:].text
    if "(" not in np_filt and ")" in np_filt:
        np_filt = np_filt.replace(")", "")
    elif "(" in np_filt and ")" not in np_filt:
        np_filt = np_filt.replace("(", "")
    return np_filt


nlp = en_core_web_lg.load()


# load NeuralCoref and add it to the pipe of SpaCy's model, for coreference resolution
coref = neuralcoref.NeuralCoref(nlp.vocab)
nlp.add_pipe(coref, name='neuralcoref')
nlp.add_pipe(nlp.create_pipe('sentencizer'), before="parser")
nlp.tokenizer.add_special_case('Inc.', [{ORTH: 'Inc', LEMMA: 'Incorporated'}])

doc = open("../files/dataset/security4.txt").read()
all_nounchunks = list(nlp(doc).noun_chunks).copy()

sentences = [list(nlp(nlp(sent.text)._.coref_resolved.replace("\n", " ").replace("  ", " ")).sents)[0]
             for sent in nlp(doc).sents]
# [preprocess(nlp(para).noun_chunks) for para in paras]
all_deps = []
instances_db = {}
for sentence in sentences:
    noun_chunks = [n for n in all_nounchunks if sentence.start <= n.start < n.end - 1 < sentence.end]
    noun_chunks = list(nlp(sentence.text).noun_chunks)
    dependencies = getDependencyPaths(sentence, nlp, noun_chunks, 10)
    for dep in dependencies:
        if not dep:
            continue
        key = tuple([preprocess_word(nlp(word)) for word in dep[:2]])
        path = "/".join(["*##*".join(e.split("_", 1)) for e in dep[-1].split("/")])
        if key not in instances_db:
            instances_db[key] = [path]
        else:
            instances_db[key].append(path)
instances_db = {key: Counter(instances_db[key]) for key in instances_db}


In [7]:

# thresholds = [0.5, 0.59, 0.6, 0.65, 0.66, 0.67, 0.68, 0.69, 0.7, 0.71, 0.72, 0.73, 0.74, 0.75, 0.76, 0.77, 0.78, 0.79, 0.8, 0.81, 0.82, 0.83, 0.84, 0.85, 0.86, 0.87, 0.88, 0.89, 0.9, 0.91, 0.92, 0.93, 0.94, 0.95, 0.96, 0.97, 0.98, 0.99, 1.0]

# for threshold in thresholds:
threshold = 0.86
    
failed, success = [], []

emb_indexer, pos_indexer, dep_indexer, dir_indexer = [defaultdict(count(0).__next__) for i in range(4)]
unk_emb, unk_pos, unk_dep, unk_dir = emb_indexer["<UNK>"], pos_indexer["<UNK>"], dep_indexer["<UNK>"], dir_indexer["<UNK>"]
rel_indexer = {key: idx for (idx,key) in enumerate(relations)}

train_dataset = {tuple(l.split("\t")[:2]): l.split("\t")[2] for l in open(train_file).read().split("\n")}
test_dataset = {tuple(l.split("\t")[:2]): l.split("\t")[2] for l in open(test_file).read().split("\n")}
# test_instances = {tuple(l.split("\t")[:2]): l.split("\t")[2] for l in open(instances_file).read().split("\n")}
test_knocked = {tuple(l.split("\t")[:2]): l.split("\t")[2] for l in open(knocked_file).read().split("\n")}

paths_train, counts_train, targets_train = parse_dataset(train_dataset)
paths_test, counts_test, targets_test  = parse_dataset(test_dataset)
# paths_instances, counts_instances, targets_instances  = parse_instance_dataset(test_instances)
paths_knocked, counts_knocked, targets_knocked  = parse_dataset(test_knocked)

# nodes_train = [[emb_indexer[tup[0]], emb_indexer[tup[1]]] for tup in train_dataset]
# nodes_test = [[emb_indexer[tup[0]], emb_indexer[tup[1]]] for tup in test_dataset]
# nodes_instances = [[emb_indexer[tup[0]], emb_indexer[tup[1]]] for tup in test_instances]
# nodes_knocked = [[emb_indexer[tup[0]], emb_indexer[tup[1]]] for tup in test_knocked]

# print ("Train len: {}, Test len: {}, Instance len: {}, Knocked len: {}".format(len(paths_train), len(paths_test),  len(paths_instances), len(paths_knocked)))
# print (len(failed), len(success))
# emb_indexer_inv = {emb_indexer[key]: key for key in emb_indexer}
# embeds = extractUSEEmbeddings(list(emb_indexer.keys())[1:])
# emb_vals = np.array(np.zeros((1, embeds.shape[1])).tolist() + embeds.tolist())


# output_file = "../Input/data_instances_sample.pkl"
# f = open(output_file, "wb+")
# pickle.dump([nodes_train, paths_train, counts_train, targets_train, 
#              nodes_test, paths_test, counts_test, targets_test,
#              nodes_instances, paths_instances, counts_instances, targets_instances,
#              nodes_knocked, paths_knocked, counts_knocked, targets_knocked,
#              emb_indexer, emb_indexer_inv, emb_vals, 
#              pos_indexer, dep_indexer, dir_indexer, rel_indexer], f)
# f.close()



In [11]:
pickle.dump([failed, list(word2id_db.keys())], open("../junk/failed_words_pizza", "wb"))

In [118]:
# Extracting Instances from a document

import glob,  en_core_web_lg
import spacy, neuralcoref, itertools
from spacy.attrs import ORTH, LEMMA

def preprocess(noun_chunks):
    all_parsed_chunks = []
    filt_tokens = ["DET", "ADV", "PUNCT", "CCONJ"]
    for np in noun_chunks:
        start_index = [i for i,token in enumerate(np) if token.pos_ not in filt_tokens][0]
        np_filt = np[start_index:].text
        if "(" not in np_filt and ")" in np_filt:
            np_filt = np_filt.replace(")", "")
        elif "(" in np_filt and ")" not in np_filt:
            np_filt = np_filt.replace("(", "")
        all_parsed_chunks.append(np_filt)
    return list(set(all_parsed_chunks))

nlp = en_core_web_lg.load()


# load NeuralCoref and add it to the pipe of SpaCy's model, for coreference resolution
coref = neuralcoref.NeuralCoref(nlp.vocab)
nlp.add_pipe(coref, name='neuralcoref')
nlp.tokenizer.add_special_case('Inc.', [{ORTH: 'Inc', LEMMA: 'Incorporated'}])

for i,file in enumerate(sorted(glob.glob("../files/dataset/security*"))):
    paras = [t.text for t in list(nlp(open(file).read()).sents)]
    paras = [nlp(para)._.coref_resolved.replace("\n", " ").replace("  ", " ") for para in paras]
    instances = [preprocess(nlp(para).noun_chunks) for para in paras]
    instances_pairs = []
    for instances_sent in instances:
        instances_pairs.extend(list(set(list(itertools.combinations(instances_sent, 2)))))

    instances_pairs = ["\t".join(list(pair) + ["none"]) for pair in instances_pairs if pair]

    open("../files/dataset/instances" + str(i) + ".tsv", "w+").write("\n".join(instances_pairs))


In [29]:
import shelve
shelve.open("../junk/db_files/pizza_term_to_id.db")

error: db type is dbm.gnu, but the module is not available

In [5]:
success

['country',
 'equatorial guinea',
 'country',
 'american',
 'american',
 'american',
 'american',
 'american',
 'american',
 'country',
 'food',
 'american',
 'american',
 'american',
 'country',
 'american',
 'american',
 'american',
 'american',
 'country',
 'cameroon',
 'american',
 'american',
 'american',
 'american',
 'american',
 'american',
 'american',
 'american',
 'food',
 'american',
 'american',
 'american',
 'american',
 'american',
 'country',
 'food',
 'brem',
 'american',
 'american',
 'american',
 'country',
 'american',
 'media',
 'food',
 'american',
 'american',
 'food',
 'american',
 'american',
 'country',
 'morocco',
 'country',
 'american',
 'american',
 'american',
 'american',
 'country',
 'american',
 'american',
 'country',
 'american',
 'american',
 'american',
 'american',
 'american',
 'american',
 'cloverfield',
 'american',
 'american',
 'american',
 'american',
 'american',
 'food',
 'country',
 'country',
 'american',
 'country',
 'country',
 'americ

In [34]:
len(db.keys())

1489929

In [None]:
word2id_db_corrected = pickledb.load(prefix + "w2i_corrected.db", True)
id2word_db_corrected = pickledb.load(prefix + "i2w_corrected.db", True)
allkeys = list(word2id_db.getall())
for key in allkeys:
    try:
        word2id_db_corrected[preprocess_word(nlp(key))] = word2id_db[key]
        id2word_db_corrected[word2id_db[key]] = preprocess_word(nlp(key))
    except:
        print ("Dropping ", key)
        word2id_db_corrected[key] = word2id_db[key]
        id2word_db_corrected[word2id_db[key]] = key
word2id_db_corrected.dump()
id2word_db_corrected.dump()

Dropping  tylgiv
Dropping  valtra
Dropping  matsika
Dropping  frenstrup
Dropping  kakkassery
Dropping  only martelly
Dropping  n700
Dropping  mitteldeutschland
Dropping  n5348a
Dropping  hiramic
Dropping  defined fields
Dropping  the s j p harvie professor


Exception in thread Thread-15154:
Traceback (most recent call last):
  File "/home/vlead/anaconda3/lib/python3.7/threading.py", line 926, in _bootstrap_inner
    self.run()
  File "/home/vlead/anaconda3/lib/python3.7/threading.py", line 870, in run
    self._target(*self._args, **self._kwargs)
  File "/home/vlead/anaconda3/lib/python3.7/json/__init__.py", line 179, in dump
    for chunk in iterable:
  File "/home/vlead/anaconda3/lib/python3.7/json/encoder.py", line 431, in _iterencode
    yield from _iterencode_dict(o, _current_indent_level)
  File "/home/vlead/anaconda3/lib/python3.7/json/encoder.py", line 356, in _iterencode_dict
    for key, value in items:
RuntimeError: dictionary changed size during iteration



Dropping  a terminating binary expansion
Dropping  the online canvas design elements
Dropping  the instantaneous angular velocity vector
Dropping  fitting anorexic illnesses
Dropping  a 1920s proposal
Dropping  an international non profit and non governmental student society
Dropping  william a trimble
Dropping  a provincial regiment
Dropping  first real studio experience
Dropping  a lycoming o 360 a4 m
Dropping  other graphics systems
Dropping  polish tradition
Dropping  a practising teacher
Dropping  close diplomatic and economic relationships
Dropping  kiley
Dropping  original or reconstructed fabric
Dropping  scriptural or customary laws
Dropping  national economics challenge champions
Dropping  a long horizontal jump
Dropping  the open bloodstream
Dropping  the officer s blooded horses
Dropping  classical comedy
Dropping  the continental exchanges
Dropping  the most frequent uses
Dropping  major local developers
Dropping  184 restaurants
Dropping  maria s young son
Dropping  utsu


Exception in thread Thread-15446:
Traceback (most recent call last):
  File "/home/vlead/anaconda3/lib/python3.7/threading.py", line 926, in _bootstrap_inner
    self.run()
  File "/home/vlead/anaconda3/lib/python3.7/threading.py", line 870, in run
    self._target(*self._args, **self._kwargs)
  File "/home/vlead/anaconda3/lib/python3.7/json/__init__.py", line 179, in dump
    for chunk in iterable:
  File "/home/vlead/anaconda3/lib/python3.7/json/encoder.py", line 431, in _iterencode
    yield from _iterencode_dict(o, _current_indent_level)
  File "/home/vlead/anaconda3/lib/python3.7/json/encoder.py", line 356, in _iterencode_dict
    for key, value in items:
RuntimeError: dictionary changed size during iteration

Exception in thread Thread-15449:
Traceback (most recent call last):
  File "/home/vlead/anaconda3/lib/python3.7/threading.py", line 926, in _bootstrap_inner
    self.run()
  File "/home/vlead/anaconda3/lib/python3.7/threading.py", line 870, in run
    self._target(*self._ar

Dropping  periodic recitals
Dropping  last weekend s post coup presidential election
Dropping  so2 james suh
Dropping  silvie iii
Dropping  pot au feu
Dropping  its operational readiness
Dropping  no one reason


Exception in thread Thread-15472:
Traceback (most recent call last):
  File "/home/vlead/anaconda3/lib/python3.7/threading.py", line 926, in _bootstrap_inner
    self.run()
  File "/home/vlead/anaconda3/lib/python3.7/threading.py", line 870, in run
    self._target(*self._args, **self._kwargs)
  File "/home/vlead/anaconda3/lib/python3.7/json/__init__.py", line 179, in dump
    for chunk in iterable:
  File "/home/vlead/anaconda3/lib/python3.7/json/encoder.py", line 431, in _iterencode
    yield from _iterencode_dict(o, _current_indent_level)
  File "/home/vlead/anaconda3/lib/python3.7/json/encoder.py", line 356, in _iterencode_dict
    for key, value in items:
RuntimeError: dictionary changed size during iteration



Dropping  the exterior mirror
Dropping  free agent greg holland
Dropping  keio university hospital
Dropping  negative at skew
Dropping  the former coalfield area
Dropping  a coherent personality
Dropping  intevation
Dropping  fgm 148 javelin
Dropping  17 august robert ritter von greim s fliegerkorps v
Dropping  neither military training
Dropping  self service passport control
Dropping  sierra s salon
Dropping  general no l de castelnau
Dropping  debra delee
Dropping  davis second term
Dropping  the oldest literary account
Dropping  each wall inlet
Dropping  the people s nomadic heritage
Dropping  glasgow academicals
Dropping  fine v fib
Dropping  flat end facets
Dropping  dense grids
Dropping  professor dominique martin
Dropping  the fastest overall driver
Dropping  their sledging rations
Dropping  the lambda company
Dropping  the additional rail
Dropping  maintenance flaws
Dropping  a 75 cm long bundle
Dropping  179 fs
Dropping  military miniatures
Dropping  performance and management

Exception in thread Thread-15713:
Traceback (most recent call last):
  File "/home/vlead/anaconda3/lib/python3.7/threading.py", line 926, in _bootstrap_inner
    self.run()
  File "/home/vlead/anaconda3/lib/python3.7/threading.py", line 870, in run
    self._target(*self._args, **self._kwargs)
  File "/home/vlead/anaconda3/lib/python3.7/json/__init__.py", line 179, in dump
    for chunk in iterable:
  File "/home/vlead/anaconda3/lib/python3.7/json/encoder.py", line 431, in _iterencode
    yield from _iterencode_dict(o, _current_indent_level)
  File "/home/vlead/anaconda3/lib/python3.7/json/encoder.py", line 356, in _iterencode_dict
    for key, value in items:
RuntimeError: dictionary changed size during iteration



Dropping  sal n de la paz
Dropping  brian williams lustmord project
Dropping  an exponential behavior
Dropping  this uncommon case
Dropping  only 13 more performances


Exception in thread Thread-15739:
Traceback (most recent call last):
  File "/home/vlead/anaconda3/lib/python3.7/threading.py", line 926, in _bootstrap_inner
    self.run()
  File "/home/vlead/anaconda3/lib/python3.7/threading.py", line 870, in run
    self._target(*self._args, **self._kwargs)
  File "/home/vlead/anaconda3/lib/python3.7/json/__init__.py", line 179, in dump
    for chunk in iterable:
  File "/home/vlead/anaconda3/lib/python3.7/json/encoder.py", line 431, in _iterencode
    yield from _iterencode_dict(o, _current_indent_level)
  File "/home/vlead/anaconda3/lib/python3.7/json/encoder.py", line 356, in _iterencode_dict
    for key, value in items:
RuntimeError: dictionary changed size during iteration



Dropping  then a third wrestling team
Dropping  an old watch


Exception in thread Thread-15748:
Traceback (most recent call last):
  File "/home/vlead/anaconda3/lib/python3.7/threading.py", line 926, in _bootstrap_inner
    self.run()
  File "/home/vlead/anaconda3/lib/python3.7/threading.py", line 870, in run
    self._target(*self._args, **self._kwargs)
  File "/home/vlead/anaconda3/lib/python3.7/json/__init__.py", line 179, in dump
    for chunk in iterable:
  File "/home/vlead/anaconda3/lib/python3.7/json/encoder.py", line 431, in _iterencode
    yield from _iterencode_dict(o, _current_indent_level)
  File "/home/vlead/anaconda3/lib/python3.7/json/encoder.py", line 356, in _iterencode_dict
    for key, value in items:
RuntimeError: dictionary changed size during iteration



Dropping  a sophisticated propaganda machine
Dropping  a successful and effective program
Dropping  50 s strongest track
Dropping  the yshphh
Dropping  the estimated sinking position
Dropping  phoenix s citizens
Dropping  the cbbb
Dropping  re arranged panels
Dropping  his 50th birthday celebration
Dropping  the male eggs
Dropping  montane meadows
Dropping  the troops good spirit
Dropping  paltrow s performance
Dropping  a free demonstration


Exception in thread Thread-15808:
Traceback (most recent call last):
  File "/home/vlead/anaconda3/lib/python3.7/threading.py", line 926, in _bootstrap_inner
    self.run()
  File "/home/vlead/anaconda3/lib/python3.7/threading.py", line 870, in run
    self._target(*self._args, **self._kwargs)
  File "/home/vlead/anaconda3/lib/python3.7/json/__init__.py", line 179, in dump
    for chunk in iterable:
  File "/home/vlead/anaconda3/lib/python3.7/json/encoder.py", line 431, in _iterencode
    yield from _iterencode_dict(o, _current_indent_level)
  File "/home/vlead/anaconda3/lib/python3.7/json/encoder.py", line 356, in _iterencode_dict
    for key, value in items:
RuntimeError: dictionary changed size during iteration



Dropping  roxy attempts
Dropping  either deletion
Dropping  frictional behavior
Dropping  four successive popes
Dropping  engineering design teams
Dropping  felix the cat
Dropping  tidal venuses
Dropping  dsquared2 duo dean and dan caten
Dropping  cooper s most important film
Dropping  consistent subtest scores
Dropping  frances hegarty
Dropping  borland s guitar playing
Dropping  ahsura
Dropping  an unnamed polish clone
Dropping  at least the a credit rating
Dropping  a radio based transatlantic telephone service
Dropping  carddass exclusive storyline series
Dropping  a balance sheet hedge
Dropping  bluebush saltbush steppe


In [1]:
import spacy, subprocess, itertools, multiprocessing, sys, glob,  en_core_web_lg, neuralcoref
from spacy.tokens.token import Token
from spacy.attrs import ORTH, LEMMA
from collections import Counter


def preprocess_word(noun):
    filt_tokens = ["DET", "ADV", "PUNCT", "CCONJ"]
    start_index = [i for i,token in enumerate(noun) if token.pos_ not in filt_tokens][0]
    np_filt = noun[start_index:].text
    if "(" not in np_filt and ")" in np_filt:
        np_filt = np_filt.replace(")", "")
    elif "(" in np_filt and ")" not in np_filt:
        np_filt = np_filt.replace("(", "")
    return np_filt


nlp = en_core_web_lg.load()


In [8]:
import time
t = time.time()
a = nlp("beach on the ocean")
print (time.time()-t)
t = time.time()
preprocess_word(a)
print (time.time()-t)

0.015409708023071289
0.0001728534698486328


In [20]:
import random
lines = [l.split("\t")[:3] for l in open("../files/dataset/pizza_knockedout.tsv", "r").read().split("\n")]
final_lines = []
for line in lines:
    elem = line
    if random.random()>0.5:
        label = line[2]
        if label == "hyponym":
            label = "hypernym"
        elif label == "hypernym":
            label = "hyponym"
        elif label == "concept":
            label = "instance"
        elif label == "instance":
            label = "concept"
        elem = [line[1], line[0], label]
    final_lines.append(elem)
random.shuffle(final_lines)
# final_lines_none = [elem for elem in final_lines if elem=="none"]
# final_lines_none_train = final_lines_none[:int(0.9 * len(final_lines_none))]
# final_lines_none_test = final_lines_none[int(0.9 * len(final_lines_none)):]

# final_lines_rest = [elem for elem in final_lines if elem!="none"]
# final_lines_rest_train = final_lines_rest[:int(0.9 * len(final_lines_rest))]
# final_lines_rest_test = final_lines_rest[int(0.9 * len(final_lines_rest)):]

# final_lines_train = final_lines_none_train + final_lines_rest_train
# final_lines_test = final_lines_none_test + final_lines_rest_test

open("../files/dataset/pizza_knockedout_shuffled.tsv","w+").write("\n".join(["\t".join(line) for line in final_lines]))
# open("../files/dataset/pizza_test.tsv","w+").write("\n".join(["\t".join(line) for line in final_lines_test]))

3227

In [18]:
final_lines

[['Real Italian Pizza', 'Pizza', 'hypernym'],
 ['Pepper Topping', 'Peperonata Topping', 'hyponym'],
 ['Nut Topping', 'Pine Kernel Topping', 'hyponym'],
 ['Vegetable Topping', 'Rocket Topping', 'hyponym'],
 ['Vegetable Topping', 'Leek Topping', 'hyponym'],
 ['Thin And Crispy Base', 'Pizza Base', 'hypernym'],
 ['Seafood Topping', 'Pizza Topping', 'hypernym'],
 ['Fiorentina', 'Named Pizza', 'hypernym'],
 ['Cheesy Vegetable Topping', 'Vegetable Topping', 'hypernym'],
 ['Pepper Topping', 'Sweet Pepper Topping', 'hyponym'],
 ['Cajun', 'Named Pizza', 'hypernym'],
 ['Tomato Topping', 'Sliced Tomato Topping', 'hyponym'],
 ['Margherita', 'Named Pizza', 'hypernym'],
 ['Mixed Seafood Topping', 'Seafood Topping', 'hypernym'],
 ['Seafood Topping', 'Anchovies Topping', 'hyponym'],
 ['Mushroom Topping', 'Vegetable Topping', 'hypernym'],
 ['Pizza', 'Spicy Pizza Equivalent', 'hyponym'],
 ['Artichoke Topping', 'Vegetable Topping', 'hypernym'],
 ['Onion Topping', 'Red Onion Topping', 'hyponym'],
 ['Green 