In [1]:
import pickle, pickledb
import numpy as np
from itertools import count
from collections import defaultdict
import tensorflow as tf
import tensorflow_hub as hub

train_file = "/data/Vivek/original/HypeNET/dataset/custom_train_0.0_0.05.tsv"
test_file =  "/data/Vivek/original/HypeNET/dataset/custom_test_0.0_0.05.tsv"

knocked_file = '../files/dataset/test_knocked.tsv'

NULL_PATH = ((0, 0, 0, 0),)
relations = ["hypernym", "hyponym", "concept", "instance", "none"]
NUM_RELATIONS = len(relations)
prefix = "../junk/db_files/"

USE_link = "https://tfhub.dev/google/universal-sentence-encoder-large/5?tf-hub-format=compressed"
model = hub.load(USE_link)

f = open("../junk/resolved_use_unbracketed.pkl", "rb")
resolved = pickle.load(f)

def extractUSEEmbeddings(words):
    word_embeddings = model(words)
    return word_embeddings.numpy()

In [4]:
arrow_heads = {">": "up", "<":"down"}

def to_list_mixed(seq):
    for item in seq:
        if isinstance(item, tuple):
            yield list(to_list_mixed(item))
        elif isinstance(item, list):
            yield [list(to_list_mixed(elem)) for elem in item]
        else:
            yield item

def extract_direction(edge):

    if edge[0] == ">" or edge[0] == "<":
        direction = "start_" + arrow_heads[edge[0]]
        edge = edge[1:]
    elif edge[-1] == ">" or edge[-1] == "<":
        direction = "end_" + arrow_heads[edge[-1]]
        edge = edge[:-1]
    else:
        direction = ' '
    return direction, edge

def parse_path(path):
    parsed_path = []
    for edge in path.split("*##*"):
        direction, edge = extract_direction(edge)
        if edge.split("/"):
            try:
                embedding, pos, dependency = tuple([a[::-1] for a in edge[::-1].split("/",2)][::-1])
            except:
                print (edge, path)
                raise
            emb_idx, pos_idx, dep_idx, dir_idx = emb_indexer[embedding], pos_indexer[pos], dep_indexer[dependency], dir_indexer[direction]
            parsed_path.append(tuple([emb_idx, pos_idx, dep_idx, dir_idx]))
        else:
            return None
    return tuple(parsed_path)

def parse_tuple(tup, resolve=True):
    x, y = [entity_to_id(word2id_db, elem, resolve) for elem in tup]
    paths_x, paths_y = list(extract_paths(relations_db,x,y).items()), list(extract_paths(relations_db,y,x).items())
    path_count_dict_x = { id_to_path(id2path_db, path).replace("X/", tup[0]+"/").replace("Y/", tup[1]+"/") : freq for (path, freq) in paths_x }
    path_count_dict_y = { id_to_path(id2path_db, path).replace("Y/", tup[0]+"/").replace("X/", tup[1]+"/") : freq for (path, freq) in paths_y }
    path_count_dict = {**path_count_dict_x, **path_count_dict_y}
    return path_count_dict

def parse_dataset(dataset, resolve=True):
    parsed_dicts = [parse_tuple(tup, resolve) for tup in dataset.keys()]
    parsed_dicts = [{ parse_path(path) : path_count_dict[path] for path in path_count_dict } for path_count_dict in parsed_dicts]
    paths = [{ path : path_count_dict[path] for path in path_count_dict if path} for path_count_dict in parsed_dicts]
    paths = [{NULL_PATH: 1} if not path_list else path_list for i, path_list in enumerate(paths)]
    counts = [list(path_dict.values()) for path_dict in paths]
    paths = [list(path_dict.keys()) for path_dict in paths]
    targets = [rel_indexer[relation] for relation in dataset.values()]
    return list(to_list_mixed(paths)), counts, targets

def get_instance_key(tup):
    return tuple([" ".join([tok.text for tok in nlp(elem)]) for elem in tup])

def parse_instance(tup):
    
    paths_x = list(instances_db.get(get_instance_key(tup), {}).items())
    paths_y = list(instances_db.get(get_instance_key(tup[::-1]), {}).items())
    path_count_dict_x = { path.replace("X/", tup[0]+"/").replace("Y/", tup[1]+"/") : freq for (path, freq) in paths_x }
    path_count_dict_y = { path.replace("Y/", tup[0]+"/").replace("X/", tup[1]+"/") : freq for (path, freq) in paths_y }
    path_count_dict = {**path_count_dict_x, **path_count_dict_y}
    return path_count_dict

paths_instances_tot = []
def parse_instance_dataset(dataset):
    parsed_dicts = [parse_instance(tup) for tup in dataset.keys()]
    parsed_dicts = [{ parse_path(path) : path_count_dict[path] for path in path_count_dict } for path_count_dict in parsed_dicts]
    paths = [{ path : path_count_dict[path] for path in path_count_dict if path} for path_count_dict in parsed_dicts]
    global paths_instances_tot
    paths = [{NULL_PATH: 1} if not path_list else path_list for i, path_list in enumerate(paths)]
    paths_instances_tot.append(paths)
    counts = [list(path_dict.values()) for path_dict in paths]
    paths = [list(path_dict.keys()) for path_dict in paths]
    targets = [rel_indexer[relation] for relation in dataset.values()]
    return list(to_list_mixed(paths)), counts, targets

def id_to_entity(db, entity_id):
    entity = db[str(entity_id)]
    return entity

def id_to_path(db, entity_id):
    entity = db[str(entity_id)]
    entity = "/".join(["*##*".join(e.split("_", 1)) for e in entity.split("/")])
    return entity

def entity_to_id(db, entity, resolve=True):
    global success, failed
    entity_id = db.get(entity)
    if entity_id:
        success.append(entity)
        return int(entity_id)
    if not resolve:
        return -1
    closest_entity = resolved.get(entity, "")
    if closest_entity and closest_entity[0] and float(closest_entity[1]) > threshold:
        success.append(entity)
        return int(db[closest_entity[0]])
    failed.append(entity)
    return -1

def extract_paths(db, x, y):
    key = (str(x) + '###' + str(y))
    try:
        relation = db[key]
        return {int(path_count.split(":")[0]): int(path_count.split(":")[1]) for path_count in relation.split(",")}
    except Exception as e:
        return {}

word2id_db = pickledb.load(prefix + "w2i.db", False)
id2word_db = pickledb.load(prefix + "i2w.db", False)
path2id_db = pickledb.load(prefix + "p2i.db", False)
id2path_db = pickledb.load(prefix + "i2p.db", False)
relations_db = pickledb.load(prefix + "relations.db", False)


In [3]:
# Creating Instance DB
import spacy, subprocess, itertools, multiprocessing, sys, glob,  en_core_web_lg, neuralcoref
from spacy.tokens.token import Token
from spacy.attrs import ORTH, LEMMA
from collections import Counter

def stringifyEdge(word, root=True):
    try:
        w = word.root
    except:
        w = word

    if isinstance(word, Token):
        word = word.lemma_.strip().lower()
    else:
        word = ' '.join([wd.string.strip().lower() for wd in word])
    pos, deps = w.pos_, w.dep_
    path = '/'.join([word, pos, deps if deps and root else 'ROOT'])
    return path

def stringifyArg(word, edge):
    try:
        word = word.root
    except:
        pass
    pos, deps = word.pos_, word.dep_
    path = '/'.join([edge, pos, deps if deps else 'ROOT'])
    return path

def filterPaths(function, lowestCommonHead, paths):
    path1 = [lowestCommonHead]
    path1.extend(paths[:-1])
    path2 = paths
    return any(node not in function(path) for path, node in list(zip(path1, path2)))

def notPunct(arr):
    firstWord = arr[0]
    return firstWord.tag_ != 'PUNCT' and len(firstWord.string.strip()) > 1

def notEqual(x, y):
    try:
        return x!=y
    except:
        return False

def checkHead(token, lowestCommonHead):
    return isinstance(token, Token) and lowestCommonHead == token

def getPathFromRoot(phrase):
    paths = []
    head = phrase.head
    while phrase != head:
        phrase = phrase.head
        paths.append(phrase)
        head = phrase.head
    paths = paths[::-1]
    return paths

def breakCompoundWords(elem):
    try:
        root = elem.root
        return root
    except:
        return elem

def findMinLength(x, y):
    if len(x) < len(y):
        return (len(x), x)
    return (len(y), y)

def findLowestCommonHead(pathX, pathY, minLength, minArray):
    lowestCommonHead = None
    if minLength:        
        uncommon = [i for i in range(minLength) if pathX[i] != pathY[i]]
        if uncommon:
            idx = uncommon[0] - 1
        else:
            idx = minLength - 1
        lowestCommonHead = minArray[idx]
    else:
        idx = 0
        if pathX:
            lowestCommonHead = pathX[0]
        elif pathY:
            lowestCommonHead = pathY[0]
        else:
            lowestCommonHead = None
    
    return idx, lowestCommonHead

def getShortestPath(tup):

    xinit, yinit = tup[0], tup[1]

    x, y = breakCompoundWords(xinit), breakCompoundWords(yinit)
    
    pathX, pathY = getPathFromRoot(x), getPathFromRoot(y)
    
    minLength, minArray = findMinLength(pathX, pathY)
    
    idx, lowestCommonHead = findLowestCommonHead(pathX, pathY, minLength, minArray)
    
    try:
        pathX = pathX[idx+1:]
        pathY = pathY[idx+1:]
        checkLeft, checkRight = lambda h: h.lefts, lambda h: h.rights
        if lowestCommonHead and (filterPaths(checkLeft, lowestCommonHead, pathX) or filterPaths(checkRight, lowestCommonHead, pathY)):
            return None
        pathX = pathX[::-1]

        paths = [(None, xinit, pathX, lowestCommonHead, pathY, yinit, None)]
        lefts, rights = list(xinit.lefts), list(yinit.rights)

        if lefts and notPunct(lefts):
            paths.append((lefts[0], xinit, pathX, lowestCommonHead, pathY, yinit, None))

        if rights and notPunct(rights):
            paths.append((None, xinit, pathX, lowestCommonHead, pathY, yinit, rights[0]))
        
        return paths
    except Exception as e:
        print (e)
        return None

def stringifyFilterPath(path, maxlen):

    lowestCommonHeads = []
    (leftX, x, pathX, lowestCommonHead, pathY, y, rightY) = path

    isXHead, isYHead = checkHead(x, lowestCommonHead), checkHead(y, lowestCommonHead)
    signX = '' if isXHead else '>'
    leftXPath  = []
    if leftX:
        edge_str = stringifyEdge(leftX)
        leftXPath.append(edge_str + "<")

    signY = '' if isYHead else '<'
    rightYPath = []
    if rightY:
        edge_str = stringifyEdge(rightY)
        rightYPath.append(">" + edge_str)

    lowestCommonHeads = [[stringifyEdge(lowestCommonHead, False)] if lowestCommonHead and not (isYHead or isXHead) else []][0]
    
    if maxlen >= len(pathX + leftXPath + pathY + rightYPath + lowestCommonHeads):
        
        if isinstance(x, Token):
            stringifiedX = x.string.strip().lower()
        else:
            stringifiedX = ' '.join([x_wd.string.strip().lower() for x_wd in x])
        
        if isinstance(y, Token):
            stringifiedY = y.string.strip().lower()
        else:
            stringifiedY = ' '.join([y_wd.string.strip().lower() for y_wd in y])

        stringifiedPathX, stringifiedPathY = [stringifyEdge(word) + ">" for word in pathX], ["<" + stringifyEdge(word) for word in pathY]
        stringifiedArgX, stringifiedArgY = [stringifyArg(x, 'X') + signX], [signY + stringifyArg(y, 'Y')]
        
        stringifiedPath = '_'.join(leftXPath + stringifiedArgX + stringifiedPathX + lowestCommonHeads + stringifiedPathY + stringifiedArgY + rightYPath)

        return (stringifiedX, stringifiedY, stringifiedPath)

    return None

def getDependencyPaths(sentence, nlp, sentenceNounChunks, maxlen):

    nps = [(n, n.start, n.end) for n in sentenceNounChunks]
    nps.extend([(word, pos, pos) for (pos, word) in enumerate(sentence) if word.tag_[:2] == 'NN' and len(word.string.strip()) > 2])
    ls = list(itertools.product(nps, nps))
    pairedConcepts = [(el[0][0], el[1][0]) for el in itertools.product(nps, nps) if el[1][1] > el[0][2] and notEqual(el[0], el[1])]
    pairedConcepts = list(dict.fromkeys(pairedConcepts))
    
    paths = []
    for pair in pairedConcepts:
        appendingElem = getShortestPath(pair)
        if appendingElem:
            filtered = [stringifyFilterPath(path, maxlen) for path in appendingElem]
            paths.extend(filtered)

    return paths

def preprocess_word(noun):
    try:
        filt_tokens = ["DET", "ADV", "PUNCT", "CCONJ"]
        start_index = [i for i,token in enumerate(noun) if token.pos_ not in filt_tokens][0]
        np_filt = noun[start_index:].text
        if "(" not in np_filt and ")" in np_filt:
            np_filt = np_filt.replace(")", "")
        elif "(" in np_filt and ")" not in np_filt:
            np_filt = np_filt.replace("(", "")
        return np_filt
    except KeyboardInterrupt:
        sys.exit()
        pass
    except Exception:
        return noun.text

nlp = en_core_web_lg.load()


# load NeuralCoref and add it to the pipe of SpaCy's model, for coreference resolution
coref = neuralcoref.NeuralCoref(nlp.vocab)
nlp.add_pipe(coref, name='neuralcoref')
nlp.add_pipe(nlp.create_pipe('sentencizer'), before="parser")
nlp.tokenizer.add_special_case('Inc.', [{ORTH: 'Inc', LEMMA: 'Incorporated'}])



In [5]:
def to_tuple(seq):
    for item in seq:
        if isinstance(item, list):
            yield tuple(to_tuple(item))
        else:
            yield item

def to_list(seq):
    for item in seq:
        if isinstance(item, tuple):
            yield list(to_list(item))
        else:
            yield item

# thresholds = [0.5, 0.59, 0.6, 0.65, 0.66, 0.67, 0.68, 0.69, 0.7, 0.71, 0.72, 0.73, 0.74, 0.75, 0.76, 0.77, 0.78, 0.79, 0.8, 0.81, 0.82, 0.83, 0.84, 0.85, 0.86, 0.87, 0.88, 0.89, 0.9, 0.91, 0.92, 0.93, 0.94, 0.95, 0.96, 0.97, 0.98, 0.99, 1.0]

# for threshold in thresholds:

threshold = 0.86
failed, success = [], []

emb_indexer, pos_indexer, dep_indexer, dir_indexer = [defaultdict(count(0).__next__) for i in range(4)]
unk_emb, unk_pos, unk_dep, unk_dir = emb_indexer["<UNK>"], pos_indexer["<UNK>"], dep_indexer["<UNK>"], dir_indexer["<UNK>"]
rel_indexer = {key: idx for (idx,key) in enumerate(relations)}

train_dataset = {tuple(l.split("\t")[:2]): l.split("\t")[2] for l in open(train_file).read().split("\n")}
test_dataset = {tuple(l.split("\t")[:2]): l.split("\t")[2] for l in open(test_file).read().split("\n")}
test_knocked = {tuple(l.split("\t")[:2]): l.split("\t")[2] for l in open(knocked_file).read().split("\n")}

paths_train, counts_train, targets_train = parse_dataset(train_dataset)
paths_test, counts_test, targets_test  = parse_dataset(test_dataset)
paths_knocked, counts_knocked, targets_knocked  = parse_dataset(test_knocked)

nodes_train = [[emb_indexer[tup[0]], emb_indexer[tup[1]]] for tup in train_dataset]
nodes_test = [[emb_indexer[tup[0]], emb_indexer[tup[1]]] for tup in test_dataset]
nodes_knocked = [[emb_indexer[tup[0]], emb_indexer[tup[1]]] for tup in test_knocked]

all_instances_data = []

instance_files = ["../files/dataset/instances2_orig.tsv", "../files/dataset/instances2_validated.tsv"]
security_files = ["../files/dataset/security2.txt", "../files/dataset/security2.txt"]

for instance_file, security_file in list(zip(instance_files, security_files)):

    print ("Doing ", instance_file)
    
    test_instances = {tuple(l.split("\t")[:2]): l.split("\t")[2] for l in open(instance_file).read().split("\n")}
    paths_instances_old, counts_instances_old, targets_instances = parse_dataset(test_instances, False)
    nodes_instances = [[emb_indexer[tup[0]], emb_indexer[tup[1]]] for tup in test_instances]
    
    doc = open(security_file).read()
    all_nounchunks = list(nlp(doc).noun_chunks).copy()

    sentences = [list(nlp(nlp(sent.text)._.coref_resolved.replace("\n", " ").replace("  ", " ")).sents)[0]
                 for sent in nlp(doc).sents]
    # [preprocess(nlp(para).noun_chunks) for para in paras]
    all_deps = []
    instances_db = {}
    for sentence in sentences:
        noun_chunks = [n for n in all_nounchunks if sentence.start <= n.start < n.end - 1 < sentence.end]
        noun_chunks = list(nlp(sentence.text).noun_chunks)
        dependencies = getDependencyPaths(sentence, nlp, noun_chunks, 10)
        for dep in dependencies:
            if not dep:
                continue
            key = tuple([preprocess_word(nlp(word)) for word in dep[:2]])
            path = "/".join(["*##*".join(e.split("_", 1)) for e in dep[-1].split("/")])
            if key not in instances_db:
                instances_db[key] = [path]
            else:
                instances_db[key].append(path)
    instances_db = {key: Counter(instances_db[key]) for key in instances_db}

    test_instances = {tuple(l.split("\t")[:2]): l.split("\t")[2] for l in open(instance_file).read().split("\n")}
    paths_instances_new, counts_instances_new, targets_instances  = parse_instance_dataset(test_instances)
    nodes_instances = [[emb_indexer[tup[0]], emb_indexer[tup[1]]] for tup in test_instances]
    
    paths_instances = []
    counts_instances = []
    
    paths_instances_old_tup, paths_instances_new_tup = list(to_tuple(paths_instances_old)), list(to_tuple(paths_instances_new))
    for i,(path_old, count_old) in enumerate(zip(paths_instances_old_tup, counts_instances_old)):
        counter = Counter(dict(zip(path_old, count_old))) + Counter(dict(zip(paths_instances_new_tup[i], counts_instances_new[i])))
        if NULL_PATH in counter and len(counter) > 1:
            del counter[NULL_PATH]
        paths_instances.append(list(to_list(list(counter.keys()))))
        counts_instances.append(list(counter.values()))
    
    all_instances_data.extend([nodes_instances, paths_instances_old, counts_instances_old, paths_instances_new, counts_instances_new, paths_instances, counts_instances, targets_instances])

print ("Train len: {}, Test len: {}, Instance len: {}, Knocked len: {}".format(len(paths_train), len(paths_test),  len(paths_instances), len(paths_knocked)))
print (len(failed), len(success))
emb_indexer_inv = {emb_indexer[key]: key for key in emb_indexer}
embeds = extractUSEEmbeddings(list(emb_indexer.keys())[1:])
emb_vals = np.array(np.zeros((1, embeds.shape[1])).tolist() + embeds.tolist())


output_file = "../Input/data_instances_new.pkl"
f = open(output_file, "wb+")
pickle.dump([nodes_train, paths_train, counts_train, targets_train, 
             nodes_test, paths_test, counts_test, targets_test,
             nodes_knocked, paths_knocked, counts_knocked, targets_knocked,
             *all_instances_data, emb_indexer, emb_indexer_inv, emb_vals, 
             pos_indexer, dep_indexer, dir_indexer, rel_indexer], f)
f.close()



Doing  ../files/dataset/instances2_orig.tsv
Doing  ../files/dataset/instances2_validated.tsv
Train len: 10739, Test len: 1197, Instance len: 1025, Knocked len: 5538
6869 30318


In [8]:
paths_instances

[[[[0, 0, 0, 0]]],
 [[[0, 0, 0, 0]]],
 [[[0, 0, 0, 0]]],
 [[[0, 0, 0, 0]]],
 [[[5926, 1, 12, 1], [14493, 3, 2, 2], [5926, 1, 12, 3], [14316, 1, 12, 3]]],
 [[[0, 0, 0, 0]]],
 [[[14309, 1, 1, 1],
   [244, 2, 2, 2],
   [423, 1, 1, 3],
   [1301, 1, 13, 3],
   [14364, 1, 12, 3]]],
 [[[0, 0, 0, 0]]],
 [[[0, 0, 0, 0]]],
 [[[14544, 1, 1, 1],
   [495, 2, 2, 2],
   [423, 1, 1, 3],
   [94, 2, 6, 3],
   [14543, 1, 25, 3]],
  [[14544, 1, 1, 1],
   [495, 2, 2, 2],
   [423, 1, 1, 3],
   [94, 2, 6, 3],
   [14543, 1, 25, 3],
   [51, 7, 19, 4]]],
 [[[14299, 1, 7, 1],
   [11, 4, 2, 2],
   [875, 1, 7, 3],
   [386, 1, 13, 3],
   [14359, 1, 12, 3]]],
 [[[14299, 1, 7, 1],
   [11, 4, 2, 2],
   [875, 1, 7, 3],
   [386, 1, 13, 3],
   [209, 1, 12, 3],
   [14398, 1, 12, 3]]],
 [[[0, 0, 0, 0]]],
 [[[0, 0, 0, 0]]],
 [[[0, 0, 0, 0]]],
 [[[14540, 1, 2, 1],
   [423, 1, 2, 2],
   [49, 4, 6, 3],
   [1163, 1, 7, 3],
   [10, 4, 6, 3],
   [14291, 1, 7, 3]]],
 [[[14540, 1, 2, 1],
   [423, 1, 2, 2],
   [49, 4, 6, 3],
   [483

In [16]:
flatten = lambda l: [item for sublist in l for item in sublist]
num_edges_all = [[len(path) for path in element] for element in paths_instances_old]
max(flatten(num_edges_all))

9

In [32]:
[list(elem) for elem in paths_instances]

[[[[0, 0, 0, 0]]],
 [[[0, 0, 0, 0]]],
 [[[0, 0, 0, 0]]],
 [[[0, 0, 0, 0]]],
 [[[0, 0, 0, 0]]],
 [[[14032, 1, 4, 1],
   [4, 2, 2, 2],
   [193, 1, 4, 3],
   [10, 4, 6, 3],
   [14033, 1, 7, 3]]],
 [[[0, 0, 0, 0]]],
 [[[0, 0, 0, 0]]],
 [[[0, 0, 0, 0]]],
 [[[0, 0, 0, 0]]],
 [[[15, 5, 8, 5],
   [193, 1, 3, 1],
   [18, 2, 2, 2],
   [14754, 6, 24, 3],
   [11, 4, 6, 3],
   [14036, 1, 7, 3]],
  [[193, 1, 3, 1],
   [18, 2, 2, 2],
   [14754, 6, 24, 3],
   [11, 4, 6, 3],
   [14036, 1, 7, 3]],
  [[14036, 1, 3, 1], [14755, 2, 2, 2], [193, 1, 1, 3]]],
 [[[14034, 1, 4, 1],
   [4, 2, 2, 2],
   [4, 2, 12, 3],
   [14035, 6, 17, 3],
   [17, 4, 6, 3],
   [14036, 1, 7, 3]]],
 [[[14037, 1, 7, 1], [17, 4, 2, 2], [1889, 1, 7, 3], [14038, 1, 12, 3]]],
 [[[0, 0, 0, 0]]],
 [[[0, 0, 0, 0]]],
 [[[0, 0, 0, 0]]],
 [[[0, 0, 0, 0]]],
 [[[0, 0, 0, 0]]],
 [[[0, 0, 0, 0]]],
 [[[0, 0, 0, 0]]],
 [[[0, 0, 0, 0]]],
 [[[0, 0, 0, 0]]],
 [[[0, 0, 0, 0]]],
 [[[0, 0, 0, 0]]],
 [[[14036, 1, 3, 1], [687, 2, 2, 2], [476, 1, 1, 3]],
  

In [92]:
# Extracting Instances from a document

import glob,  en_core_web_lg
import spacy, neuralcoref, itertools
from spacy.attrs import ORTH, LEMMA

def preprocess(noun_chunks):
    all_parsed_chunks = []
    filt_tokens = ["DET", "ADV", "PUNCT", "CCONJ"]
    for np in noun_chunks:
        start_index = [i for i,token in enumerate(np) if token.pos_ not in filt_tokens][0]
        np_filt = np[start_index:].text
        if "(" not in np_filt and ")" in np_filt:
            np_filt = np_filt.replace(")", "")
        elif "(" in np_filt and ")" not in np_filt:
            np_filt = np_filt.replace("(", "")
        all_parsed_chunks.append(np_filt)
    return list(set(all_parsed_chunks))

nlp = en_core_web_lg.load()


# load NeuralCoref and add it to the pipe of SpaCy's model, for coreference resolution
coref = neuralcoref.NeuralCoref(nlp.vocab)
nlp.add_pipe(coref, name='neuralcoref')
nlp.tokenizer.add_special_case('Inc.', [{ORTH: 'Inc', LEMMA: 'Incorporated'}])

for i,file in enumerate(sorted(glob.glob("../files/dataset/security*"))):
    paras = [t.text for t in list(nlp(open(file).read()).sents)]
    paras = [nlp(para)._.coref_resolved.replace("\n", " ").replace("  ", " ") for para in paras]
    instances = [preprocess(nlp(para).noun_chunks) for para in paras]
    instances_pairs = []
    for instances_sent in instances:
        instances_pairs.extend(list(set(list(itertools.combinations(instances_sent, 2)))))

    instances_pairs = ["\t".join(list(pair) + ["none"]) for pair in instances_pairs if pair]

    open("../files/dataset/instances" + str(i) + ".tsv", "w+").write("\n".join(instances_pairs))


In [93]:
open("../Outputs/Output_instances_softmax_Instances1 (original)_corrected.tsv","r").read().split("\n")

['CAN YOU DEFEND AGAINST ZERO-DAY THREATS? ',
 'Every day, 8,3001 new, previously undiscovered cyber attacks emerge, including zero-day malware, zero-day phishing and social engineering attacks.',
 'With no associated file signatures, anti-virus, firewalls and other core security solutions cannot identify no associated file signatures, anti-virus, firewalls and other core security solutions as malicious and block no associated file signatures, anti-virus, firewalls and other core security solutions from entering the network.',
 'In fact, even the best AV solutions detect only half of malware strains in the wild. ',
 'With no existing indicators of compromise (IOCs), how do you protect against what you do not know? COMMON NETWORK SECURITY APPROACHES HAVE LIMITATIONS ',
 'To protect against zero-day threats, organizations use several approaches. ',
 'These include: •',
 'Conventional sandboxing solutions, which are susceptible to malware evasion techniques, and by default, are configured

In [2]:
import pickledb
prefix = "../junk/db_files/"
word2id_db = pickledb.load(prefix + "w2i.db", False)
allkeys = list(word2id_db.getall())

In [None]:
word2id_db_corrected = pickledb.load(prefix + "w2i_corrected.db", True)
id2word_db_corrected = pickledb.load(prefix + "i2w_corrected.db", True)
allkeys = list(word2id_db.getall())
for key in allkeys:
    try:
        word2id_db_corrected[preprocess_word(nlp(key))] = word2id_db[key]
        id2word_db_corrected[word2id_db[key]] = preprocess_word(nlp(key))
    except:
        print ("Dropping ", key)
        word2id_db_corrected[key] = word2id_db[key]
        id2word_db_corrected[word2id_db[key]] = key
word2id_db_corrected.dump()
id2word_db_corrected.dump()

Dropping  tylgiv
Dropping  valtra
Dropping  matsika
Dropping  frenstrup
Dropping  kakkassery
Dropping  only martelly
Dropping  n700
Dropping  mitteldeutschland
Dropping  n5348a
Dropping  hiramic
Dropping  defined fields
Dropping  the s j p harvie professor


Exception in thread Thread-15154:
Traceback (most recent call last):
  File "/home/vlead/anaconda3/lib/python3.7/threading.py", line 926, in _bootstrap_inner
    self.run()
  File "/home/vlead/anaconda3/lib/python3.7/threading.py", line 870, in run
    self._target(*self._args, **self._kwargs)
  File "/home/vlead/anaconda3/lib/python3.7/json/__init__.py", line 179, in dump
    for chunk in iterable:
  File "/home/vlead/anaconda3/lib/python3.7/json/encoder.py", line 431, in _iterencode
    yield from _iterencode_dict(o, _current_indent_level)
  File "/home/vlead/anaconda3/lib/python3.7/json/encoder.py", line 356, in _iterencode_dict
    for key, value in items:
RuntimeError: dictionary changed size during iteration



Dropping  a terminating binary expansion
Dropping  the online canvas design elements
Dropping  the instantaneous angular velocity vector
Dropping  fitting anorexic illnesses
Dropping  a 1920s proposal
Dropping  an international non profit and non governmental student society
Dropping  william a trimble
Dropping  a provincial regiment
Dropping  first real studio experience
Dropping  a lycoming o 360 a4 m
Dropping  other graphics systems
Dropping  polish tradition
Dropping  a practising teacher
Dropping  close diplomatic and economic relationships
Dropping  kiley
Dropping  original or reconstructed fabric
Dropping  scriptural or customary laws
Dropping  national economics challenge champions
Dropping  a long horizontal jump
Dropping  the open bloodstream
Dropping  the officer s blooded horses
Dropping  classical comedy
Dropping  the continental exchanges
Dropping  the most frequent uses
Dropping  major local developers
Dropping  184 restaurants
Dropping  maria s young son
Dropping  utsu


Exception in thread Thread-15446:
Traceback (most recent call last):
  File "/home/vlead/anaconda3/lib/python3.7/threading.py", line 926, in _bootstrap_inner
    self.run()
  File "/home/vlead/anaconda3/lib/python3.7/threading.py", line 870, in run
    self._target(*self._args, **self._kwargs)
  File "/home/vlead/anaconda3/lib/python3.7/json/__init__.py", line 179, in dump
    for chunk in iterable:
  File "/home/vlead/anaconda3/lib/python3.7/json/encoder.py", line 431, in _iterencode
    yield from _iterencode_dict(o, _current_indent_level)
  File "/home/vlead/anaconda3/lib/python3.7/json/encoder.py", line 356, in _iterencode_dict
    for key, value in items:
RuntimeError: dictionary changed size during iteration

Exception in thread Thread-15449:
Traceback (most recent call last):
  File "/home/vlead/anaconda3/lib/python3.7/threading.py", line 926, in _bootstrap_inner
    self.run()
  File "/home/vlead/anaconda3/lib/python3.7/threading.py", line 870, in run
    self._target(*self._ar

Dropping  periodic recitals
Dropping  last weekend s post coup presidential election
Dropping  so2 james suh
Dropping  silvie iii
Dropping  pot au feu
Dropping  its operational readiness
Dropping  no one reason


Exception in thread Thread-15472:
Traceback (most recent call last):
  File "/home/vlead/anaconda3/lib/python3.7/threading.py", line 926, in _bootstrap_inner
    self.run()
  File "/home/vlead/anaconda3/lib/python3.7/threading.py", line 870, in run
    self._target(*self._args, **self._kwargs)
  File "/home/vlead/anaconda3/lib/python3.7/json/__init__.py", line 179, in dump
    for chunk in iterable:
  File "/home/vlead/anaconda3/lib/python3.7/json/encoder.py", line 431, in _iterencode
    yield from _iterencode_dict(o, _current_indent_level)
  File "/home/vlead/anaconda3/lib/python3.7/json/encoder.py", line 356, in _iterencode_dict
    for key, value in items:
RuntimeError: dictionary changed size during iteration



Dropping  the exterior mirror
Dropping  free agent greg holland
Dropping  keio university hospital
Dropping  negative at skew
Dropping  the former coalfield area
Dropping  a coherent personality
Dropping  intevation
Dropping  fgm 148 javelin
Dropping  17 august robert ritter von greim s fliegerkorps v
Dropping  neither military training
Dropping  self service passport control
Dropping  sierra s salon
Dropping  general no l de castelnau
Dropping  debra delee
Dropping  davis second term
Dropping  the oldest literary account
Dropping  each wall inlet
Dropping  the people s nomadic heritage
Dropping  glasgow academicals
Dropping  fine v fib
Dropping  flat end facets
Dropping  dense grids
Dropping  professor dominique martin
Dropping  the fastest overall driver
Dropping  their sledging rations
Dropping  the lambda company
Dropping  the additional rail
Dropping  maintenance flaws
Dropping  a 75 cm long bundle
Dropping  179 fs
Dropping  military miniatures
Dropping  performance and management

Exception in thread Thread-15713:
Traceback (most recent call last):
  File "/home/vlead/anaconda3/lib/python3.7/threading.py", line 926, in _bootstrap_inner
    self.run()
  File "/home/vlead/anaconda3/lib/python3.7/threading.py", line 870, in run
    self._target(*self._args, **self._kwargs)
  File "/home/vlead/anaconda3/lib/python3.7/json/__init__.py", line 179, in dump
    for chunk in iterable:
  File "/home/vlead/anaconda3/lib/python3.7/json/encoder.py", line 431, in _iterencode
    yield from _iterencode_dict(o, _current_indent_level)
  File "/home/vlead/anaconda3/lib/python3.7/json/encoder.py", line 356, in _iterencode_dict
    for key, value in items:
RuntimeError: dictionary changed size during iteration



Dropping  sal n de la paz
Dropping  brian williams lustmord project
Dropping  an exponential behavior
Dropping  this uncommon case
Dropping  only 13 more performances


Exception in thread Thread-15739:
Traceback (most recent call last):
  File "/home/vlead/anaconda3/lib/python3.7/threading.py", line 926, in _bootstrap_inner
    self.run()
  File "/home/vlead/anaconda3/lib/python3.7/threading.py", line 870, in run
    self._target(*self._args, **self._kwargs)
  File "/home/vlead/anaconda3/lib/python3.7/json/__init__.py", line 179, in dump
    for chunk in iterable:
  File "/home/vlead/anaconda3/lib/python3.7/json/encoder.py", line 431, in _iterencode
    yield from _iterencode_dict(o, _current_indent_level)
  File "/home/vlead/anaconda3/lib/python3.7/json/encoder.py", line 356, in _iterencode_dict
    for key, value in items:
RuntimeError: dictionary changed size during iteration



Dropping  then a third wrestling team
Dropping  an old watch


Exception in thread Thread-15748:
Traceback (most recent call last):
  File "/home/vlead/anaconda3/lib/python3.7/threading.py", line 926, in _bootstrap_inner
    self.run()
  File "/home/vlead/anaconda3/lib/python3.7/threading.py", line 870, in run
    self._target(*self._args, **self._kwargs)
  File "/home/vlead/anaconda3/lib/python3.7/json/__init__.py", line 179, in dump
    for chunk in iterable:
  File "/home/vlead/anaconda3/lib/python3.7/json/encoder.py", line 431, in _iterencode
    yield from _iterencode_dict(o, _current_indent_level)
  File "/home/vlead/anaconda3/lib/python3.7/json/encoder.py", line 356, in _iterencode_dict
    for key, value in items:
RuntimeError: dictionary changed size during iteration



Dropping  a sophisticated propaganda machine
Dropping  a successful and effective program
Dropping  50 s strongest track
Dropping  the yshphh
Dropping  the estimated sinking position
Dropping  phoenix s citizens
Dropping  the cbbb
Dropping  re arranged panels
Dropping  his 50th birthday celebration
Dropping  the male eggs
Dropping  montane meadows
Dropping  the troops good spirit
Dropping  paltrow s performance
Dropping  a free demonstration


Exception in thread Thread-15808:
Traceback (most recent call last):
  File "/home/vlead/anaconda3/lib/python3.7/threading.py", line 926, in _bootstrap_inner
    self.run()
  File "/home/vlead/anaconda3/lib/python3.7/threading.py", line 870, in run
    self._target(*self._args, **self._kwargs)
  File "/home/vlead/anaconda3/lib/python3.7/json/__init__.py", line 179, in dump
    for chunk in iterable:
  File "/home/vlead/anaconda3/lib/python3.7/json/encoder.py", line 431, in _iterencode
    yield from _iterencode_dict(o, _current_indent_level)
  File "/home/vlead/anaconda3/lib/python3.7/json/encoder.py", line 356, in _iterencode_dict
    for key, value in items:
RuntimeError: dictionary changed size during iteration



Dropping  roxy attempts
Dropping  either deletion
Dropping  frictional behavior
Dropping  four successive popes
Dropping  engineering design teams
Dropping  felix the cat
Dropping  tidal venuses
Dropping  dsquared2 duo dean and dan caten
Dropping  cooper s most important film
Dropping  consistent subtest scores
Dropping  frances hegarty
Dropping  borland s guitar playing
Dropping  ahsura
Dropping  an unnamed polish clone
Dropping  at least the a credit rating
Dropping  a radio based transatlantic telephone service
Dropping  carddass exclusive storyline series
Dropping  a balance sheet hedge
Dropping  bluebush saltbush steppe


In [76]:
from sklearn.metrics import accuracy_score
from scipy import spatial
import glob, math
from orderedset import OrderedSet
from itertools import groupby
from operator import itemgetter

# Returns cosine similarity of two vectors
def cos_sim(a,b):
    return 1 - spatial.distance.cosine(a, b)

def calculate_recall(true, pred):
    true_f, pred_f = [], []
    for i,elem in enumerate(true):
        if elem!="none":
            true_f.append(elem)
            pred_f.append(pred[i])
    return accuracy_score(true_f, pred_f)

def calculate_precision(true, pred):
    true_f, pred_f = [], []
    for i,elem in enumerate(pred):
        if elem!="none":
            pred_f.append(elem)
            true_f.append(true[i])
    return accuracy_score(true_f, pred_f)
flatten = lambda l: [item for sublist in l for item in sublist]

SECURITY_WORD = "Information Security"
for file in glob.glob("../Outputs/Output_instances_softmax_Instances*).tsv"):
    lines = [l.split("\t") for l in open(file, "r").read().split("\n")]
    words = [SECURITY_WORD] + list(set(flatten([l[:2] for l in lines])))
    embeds = extractUSEEmbeddings(words)
    emb_indexer = dict(zip(words, embeds))
    all_fscores = []
    sims = flatten([(cos_sim(emb_indexer[SECURITY_WORD], emb_indexer[elem[0]]), cos_sim(emb_indexer[SECURITY_WORD], emb_indexer[elem[1]])) for elem in lines])
    for threshold in np.arange(round(min(sims), 3), round(max(sims), 3), 0.001):
        lines_short = [elem for elem in lines if cos_sim(emb_indexer[SECURITY_WORD], emb_indexer[elem[0]]) > threshold and cos_sim(emb_indexer[SECURITY_WORD], emb_indexer[elem[1]]) > threshold]
        if not lines_short:
            continue
        pred, gt = list(zip(*[line[-2:] for line in lines_short]))
        recall = calculate_recall(gt, pred)
        precision = calculate_precision(gt, pred)
        f1score = 2 * (precision * recall)/ (precision + recall)
#         print (threshold, precision, recall, f1score, len(lines_short))
        if math.isnan(f1score):
            continue
        all_fscores.append((lines_short, threshold, len(lines_short), precision, recall, f1score))
    opt_elem = max(all_fscores, key = lambda l: l[-1])
    print([(k, [elem[0] for elem in list(list(zip(*g))[1:-1])]) for k, g in groupby(all_fscores, itemgetter(5))][-10:])
    print (file, opt_elem[-1])
    new_file = file.rsplit(".",1)[0] + "_shortened.tsv"
    open(new_file, "w+").write("\n".join(["\t".join(elem) for elem in opt_elem[0]]))

[(0.25, [0.3300000000000004, 13, 0.15384615384615385, 0.6666666666666666]), (0.28571428571428575, [0.3330000000000004, 11, 0.18181818181818182, 0.6666666666666666]), (0.3333333333333333, [0.3340000000000004, 9, 0.2222222222222222, 0.6666666666666666]), (0.36363636363636365, [0.3360000000000004, 8, 0.25, 0.6666666666666666]), (0.4444444444444445, [0.3390000000000004, 7, 0.2857142857142857, 1.0]), (0.5, [0.3540000000000004, 6, 0.3333333333333333, 1.0]), (0.6666666666666666, [0.36100000000000043, 4, 0.5, 1.0]), (0.5, [0.37900000000000045, 3, 0.3333333333333333, 1.0]), (0.6666666666666666, [0.39600000000000046, 2, 0.5, 1.0]), (1.0, [0.40300000000000047, 1, 1.0, 1.0])]
../Outputs/Output_instances_softmax_Instances3 (hybrid).tsv 1.0


  avg = a.mean(axis)
  ret = ret.dtype.type(ret / rcount)


[(0.05555555555555556, [0.30099999999999977, 36, 0.02857142857142857, 1.0]), (0.05882352941176471, [0.30299999999999977, 34, 0.030303030303030304, 1.0]), (0.06060606060606061, [0.3139999999999997, 32, 0.03125, 1.0]), (0.06451612903225806, [0.3149999999999997, 30, 0.03333333333333333, 1.0]), (0.06666666666666667, [0.3169999999999997, 29, 0.034482758620689655, 1.0]), (0.0689655172413793, [0.32199999999999973, 28, 0.03571428571428571, 1.0]), (0.07407407407407407, [0.32299999999999973, 26, 0.038461538461538464, 1.0]), (0.07692307692307693, [0.32899999999999974, 25, 0.04, 1.0]), (0.07999999999999999, [0.33199999999999974, 24, 0.041666666666666664, 1.0]), (0.08695652173913045, [0.33599999999999974, 22, 0.045454545454545456, 1.0])]
../Outputs/Output_instances_softmax_Instances4 (original).tsv 0.08695652173913045




[(0.027027027027027025, [0.18600000000000028, 187, 0.013986013986013986, 0.4]), (0.027972027972027972, [0.18800000000000028, 175, 0.014492753623188406, 0.4]), (0.02857142857142857, [0.18900000000000028, 172, 0.014814814814814815, 0.4]), (0.028776978417266185, [0.19100000000000028, 171, 0.014925373134328358, 0.4]), (0.028985507246376812, [0.19300000000000028, 169, 0.015037593984962405, 0.4]), (0.015503875968992246, [0.19400000000000028, 162, 0.007936507936507936, 0.3333333333333333]), (0.015748031496062992, [0.19500000000000028, 159, 0.008064516129032258, 0.3333333333333333]), (0.01652892561983471, [0.19600000000000029, 151, 0.00847457627118644, 0.3333333333333333]), (0.017391304347826087, [0.19700000000000029, 143, 0.008928571428571428, 0.3333333333333333]), (0.017699115044247787, [0.2010000000000003, 140, 0.00909090909090909, 0.3333333333333333])]
../Outputs/Output_instances_softmax_Instances1 (webpage).tsv 0.040816326530612256
[(0.25, [0.3300000000000004, 13, 0.15384615384615385, 0.6

In [88]:
file

'../Outputs/Output_instances_softmax_Instances2 (hybrid).tsv'

In [98]:
pred, gt = list(zip(*[l.split("\t")[2:] for l in open("../Outputs/Output_instances_softmax_Instances1 (original)_corrected.tsv", "r").read().split("\n")[:408]]))
precision = calculate_precision(gt, pred)
recall = calculate_recall(gt, pred) 
f1score = (2 * precision * recall) / (precision + recall)
f1score

0.24888888888888888

In [16]:
# t = time.time()
import sys
def p(word):
    try:
        return preprocess_word(word)
    except KeyboardInterrupt:
        sys.exit()
        pass
    except Exception:
        return word
t = time.time()
allkeys_corrected = [p(word) for word in list(nlp.pipe(allkeys))]
print (time.time()-t)

KeyboardInterrupt: 

In [19]:

lines = "\n".join([l.strip() for l in open("../files/dataset/instances3_validated.tsv").read().split("\n")])

In [21]:
open("../files/dataset/instances3_validated.tsv", "w+").write(lines)

28747