In [30]:
import pickle, pickledb
import numpy as np
from itertools import count
from collections import defaultdict
import tensorflow as tf
import tensorflow_hub as hub

train_file = "/data/Vivek/original/HypeNET/dataset/custom_train_0.0_0.05.tsv"
test_file =  "/data/Vivek/original/HypeNET/dataset/custom_test_0.0_0.05.tsv"
instances_file = '../files/dataset/test_instances.tsv'
knocked_file = '../files/dataset/test_knocked.tsv'

NULL_PATH = ((0, 0, 0, 0),)
relations = ["hypernym", "hyponym", "concept", "instance", "none"]
NUM_RELATIONS = len(relations)
prefix = "../junk/db_files/"

USE_link = "https://tfhub.dev/google/universal-sentence-encoder-large/5?tf-hub-format=compressed"
model = hub.load(USE_link)

f = open("../junk/resolved_use_unbracketed.pkl", "rb")
resolved = pickle.load(f)

def extractUSEEmbeddings(words):
    word_embeddings = model(words)
    return word_embeddings.numpy()

In [31]:
arrow_heads = {">": "up", "<":"down"}

def to_list(seq):
    for item in seq:
        if isinstance(item, tuple):
            yield list(to_list(item))
        elif isinstance(item, list):
            yield [list(to_list(elem)) for elem in item]
        else:
            yield item

def extract_direction(edge):

    if edge[0] == ">" or edge[0] == "<":
        direction = "start_" + arrow_heads[edge[0]]
        edge = edge[1:]
    elif edge[-1] == ">" or edge[-1] == "<":
        direction = "end_" + arrow_heads[edge[-1]]
        edge = edge[:-1]
    else:
        direction = ' '
    return direction, edge

def parse_path(path):
    parsed_path = []
    for edge in path.split("*##*"):
        direction, edge = extract_direction(edge)
        if edge.split("/"):
            try:
                embedding, pos, dependency = tuple([a[::-1] for a in edge[::-1].split("/",2)][::-1])
            except:
                print (edge, path)
                raise
            emb_idx, pos_idx, dep_idx, dir_idx = emb_indexer[embedding], pos_indexer[pos], dep_indexer[dependency], dir_indexer[direction]
            parsed_path.append(tuple([emb_idx, pos_idx, dep_idx, dir_idx]))
        else:
            return None
    return tuple(parsed_path)

def parse_tuple(tup):
    x, y = [entity_to_id(word2id_db, elem) for elem in tup]
    paths_x, paths_y = list(extract_paths(relations_db,x,y).items()), list(extract_paths(relations_db,y,x).items())
    path_count_dict_x = { id_to_path(id2path_db, path).replace("X/", tup[0]+"/").replace("Y/", tup[1]+"/") : freq for (path, freq) in paths_x }
    path_count_dict_y = { id_to_path(id2path_db, path).replace("Y/", tup[0]+"/").replace("X/", tup[1]+"/") : freq for (path, freq) in paths_y }
    path_count_dict = {**path_count_dict_x, **path_count_dict_y}
    return path_count_dict

def parse_dataset(dataset):
    parsed_dicts = [parse_tuple(tup) for tup in dataset.keys()]
    parsed_dicts = [{ parse_path(path) : path_count_dict[path] for path in path_count_dict } for path_count_dict in parsed_dicts]
    paths = [{ path : path_count_dict[path] for path in path_count_dict if path} for path_count_dict in parsed_dicts]
    paths = [{NULL_PATH: 1} if not path_list else path_list for i, path_list in enumerate(paths)]
    counts = [list(path_dict.values()) for path_dict in paths]
    paths = [list(path_dict.keys()) for path_dict in paths]
    targets = [rel_indexer[relation] for relation in dataset.values()]
    return list(to_list(paths)), counts, targets



def id_to_entity(db, entity_id):
    entity = db[str(entity_id)]
    return entity

def id_to_path(db, entity_id):
    entity = db[str(entity_id)]
    entity = "/".join(["*##*".join(e.split("_", 1)) for e in entity.split("/")])
    return entity

def entity_to_id(db, entity):
    global success, failed
    entity_id = db.get(entity)
    if entity_id:
        success.append(entity)
        return int(entity_id)
    closest_entity = resolved.get(entity, "")
    if closest_entity and closest_entity[0] and float(closest_entity[1]) > threshold:
        success.append(entity)
        return int(db[closest_entity[0]])
    failed.append(entity)
    return -1

def extract_paths(db, x, y):
    key = (str(x) + '###' + str(y))
    try:
        relation = db[key]
        return {int(path_count.split(":")[0]): int(path_count.split(":")[1]) for path_count in relation.split(",")}
    except Exception as e:
        return {}

word2id_db = pickledb.load(prefix + "w2i.db", False)
id2word_db = pickledb.load(prefix + "i2w.db", False)
path2id_db = pickledb.load(prefix + "p2i.db", False)
id2path_db = pickledb.load(prefix + "i2p.db", False)
relations_db = pickledb.load(prefix + "relations.db", False)


In [32]:

thresholds = [0.5, 0.59, 0.6, 0.65, 0.66, 0.67, 0.68, 0.69, 0.7, 0.71, 0.72, 0.73, 0.74, 0.75, 0.76, 0.77, 0.78, 0.79, 0.8, 0.81, 0.82, 0.83, 0.84, 0.85, 0.86, 0.87, 0.88, 0.89, 0.9, 0.91, 0.92, 0.93, 0.94, 0.95, 0.96, 0.97, 0.98, 0.99, 1.0]

for threshold in thresholds:
    
    failed, success = [], []

    emb_indexer, pos_indexer, dep_indexer, dir_indexer = [defaultdict(count(0).__next__) for i in range(4)]
    unk_emb, unk_pos, unk_dep, unk_dir = emb_indexer["<UNK>"], pos_indexer["<UNK>"], dep_indexer["<UNK>"], dir_indexer["<UNK>"]
    rel_indexer = {key: idx for (idx,key) in enumerate(relations)}

    train_dataset = {tuple(l.split("\t")[:2]): l.split("\t")[2] for l in open(train_file).read().split("\n")}
    test_dataset = {tuple(l.split("\t")[:2]): l.split("\t")[2] for l in open(test_file).read().split("\n")}
    test_instances = {tuple(l.split("\t")[:2]): l.split("\t")[2] for l in open(instances_file).read().split("\n")}
    test_knocked = {tuple(l.split("\t")[:2]): l.split("\t")[2] for l in open(knocked_file).read().split("\n")}

    paths_train, counts_train, targets_train = parse_dataset(train_dataset)
    paths_test, counts_test, targets_test  = parse_dataset(test_dataset)
    paths_instances, counts_instances, targets_instances  = parse_dataset(test_instances)
    paths_knocked, counts_knocked, targets_knocked  = parse_dataset(test_knocked)

    nodes_train = [[emb_indexer[tup[0]], emb_indexer[tup[1]]] for tup in train_dataset]
    nodes_test = [[emb_indexer[tup[0]], emb_indexer[tup[1]]] for tup in test_dataset]
    nodes_instances = [[emb_indexer[tup[0]], emb_indexer[tup[1]]] for tup in test_instances]
    nodes_knocked = [[emb_indexer[tup[0]], emb_indexer[tup[1]]] for tup in test_knocked]

    print ("Train len: {}, Test len: {}, Instance len: {}, Knocked len: {}".format(len(paths_train), len(paths_test),  len(paths_instances), len(paths_knocked)))
    print (len(failed), len(success))
    emb_indexer_inv = {emb_indexer[key]: key for key in emb_indexer}
    embeds = extractUSEEmbeddings(list(emb_indexer.keys())[1:])
    emb_vals = np.array(np.zeros((1, embeds.shape[1])).tolist() + embeds.tolist())


    output_file = "../Input/data_use_unbracketed_" + str(threshold) + ".pkl"
    f = open(output_file, "wb+")
    pickle.dump([nodes_train, paths_train, counts_train, targets_train, 
                 nodes_test, paths_test, counts_test, targets_test,
                 nodes_instances, paths_instances, counts_instances, targets_instances,
                 nodes_knocked, paths_knocked, counts_knocked, targets_knocked,
                 emb_indexer, emb_indexer_inv, emb_vals, 
                 pos_indexer, dep_indexer, dir_indexer, rel_indexer], f)
    f.close()



Train len: 10739, Test len: 1197, Instance len: 275, Knocked len: 5538
0 35498
Train len: 10739, Test len: 1197, Instance len: 275, Knocked len: 5538
122 35376
Train len: 10739, Test len: 1197, Instance len: 275, Knocked len: 5538
148 35350
Train len: 10739, Test len: 1197, Instance len: 275, Knocked len: 5538
579 34919
Train len: 10739, Test len: 1197, Instance len: 275, Knocked len: 5538
735 34763
Train len: 10739, Test len: 1197, Instance len: 275, Knocked len: 5538
896 34602
Train len: 10739, Test len: 1197, Instance len: 275, Knocked len: 5538
1048 34450
Train len: 10739, Test len: 1197, Instance len: 275, Knocked len: 5538
1292 34206
Train len: 10739, Test len: 1197, Instance len: 275, Knocked len: 5538
1515 33983
Train len: 10739, Test len: 1197, Instance len: 275, Knocked len: 5538
1698 33800
Train len: 10739, Test len: 1197, Instance len: 275, Knocked len: 5538
1912 33586
Train len: 10739, Test len: 1197, Instance len: 275, Knocked len: 5538
2139 33359
Train len: 10739, Test l

In [2]:
import re, glob
import urllib.request,  en_core_web_lg
import spacy, neuralcoref, itertools
from bs4 import BeautifulSoup
from bs4.element import Comment
from subject_verb_object_extract import findSVOs, nlp
from nltk.chunk.regexp import RegexpParser
from nltk import pos_tag, word_tokenize, sys
from nltk.tree import Tree

def getInstances(text):
    grammar = """
        PRE:   {<NNS|NNP|NN|NP|JJ|UH>+}
        INSTANCE:   {(<JJ+>)?<PRE>}
    """
    chunker = RegexpParser(grammar)
    taggedText = pos_tag(word_tokenize(text))
    textChunks = chunker.parse(taggedText)
    current_chunk = []
    for i in textChunks:
        if (type(i) == Tree and i.label() == "INSTANCE"):
            current_chunk.append(" ".join([token for token, pos in i.leaves()]))
    return current_chunk


nlp = en_core_web_lg.load()


# load NeuralCoref and add it to the pipe of SpaCy's model, for coreference resolution
coref = neuralcoref.NeuralCoref(nlp.vocab)
nlp.add_pipe(coref, name='neuralcoref')

for i,file in enumerate(glob.glob("../files/dataset/security*")):
    paras = [t.text for t in list(nlp(open(file).read()).sents)]
    print (file)
    sys.stdout.flush()
    paras = [nlp(para)._.coref_resolved for para in paras]
    print ("done")
    testData = []
    for para in paras:
        instances = getInstances(para)
        ls = list(set(instances))
        ls = list(set(list(itertools.combinations(ls, 2))))
        testData.extend(["\t".join([a,b]) for (a,b) in ls])
    
    testData = [el + "\tnone" for el in list(set(testData))]

    open("../files/dataset/instances" + str(i) + "_mini.tsv", "w+").write("\n".join(testData))


../files/dataset/security4.txt
done
../files/dataset/security3.txt
done
../files/dataset/security1.txt
done
../files/dataset/security2.txt
done


In [4]:
# from spacy.attrs import ORTH, LEMMA
# import re, glob
# import urllib.request,  en_core_web_lg
# import spacy, neuralcoref, itertools
# from bs4 import BeautifulSoup
# from bs4.element import Comment
# from subject_verb_object_extract import findSVOs, nlp
# from nltk.chunk.regexp import RegexpParser
# from nltk import pos_tag, word_tokenize, sys
# from nltk.tree import Tree

# def getInstances(text):
#     grammar = """
#         PRE:   {<NNS|NNP|NN|NP|JJ|UH>+}
#         INSTANCE:   {(<JJ+>)?<PRE>}
#     """
#     chunker = RegexpParser(grammar)
#     taggedText = pos_tag(word_tokenize(text))
#     textChunks = chunker.parse(taggedText)
#     current_chunk = []
#     for i in textChunks:
#         if (type(i) == Tree and i.label() == "INSTANCE"):
#             current_chunk.append(" ".join([token for token, pos in i.leaves()]))
#     return current_chunk


# nlp = en_core_web_lg.load()


# # load NeuralCoref and add it to the pipe of SpaCy's model, for coreference resolution
# coref = neuralcoref.NeuralCoref(nlp.vocab)
# nlp.add_pipe(coref, name='neuralcoref')
# nlp.tokenizer.add_special_case('Inc.', [{ORTH: 'Inc', LEMMA: 'Incorporated'}])


[(0, Fuzzing Image Parsing in Windows,),
 (1, Part One: Color Profiles
  
  Image parsing and rendering are basic features of any modern operating system (OS).),
 (2,
  Image parsing is an easily accessible attack surface, and a vulnerability that may lead to remote code execution or information disclosure in such a feature is valuable to attackers.),
 (3,
  In this multi-part blog series, I am reviewing Windows OS’ built-in image parsers and related file formats: specifically looking at creating a harness, hunting for corpus and fuzzing to find vulnerabilities.),
 (4,
  In part one of this series I am looking at color profiles—not an image format itself, but something which is regularly embedded within images. 
  ),
 (5, What is an ICC Color Profile?),
 (6,
  Wikipedia provides a more-than-adequate description of ICC color profiles: "In color management, an ICC profile is a set of data that characterizes a color input or output device, or a color space, according to standards promulga

In [6]:
# sentencizer = nlp.create_pipe("sentencizer")
# nlp.add_pipe(sentencizer)
# [(i,el) for (i,el) in enumerate(list(nlp(open("../files/dataset/security3.txt").read()).sents))]
files = sorted(glob.glob("../files/dataset/instances*.tsv"))
for i in range(0,len(files),2):
    l1 = open(files[i]).read().split("\n")
    l2 = open(files[i+1]).read().split("\n")
    print ([line for line in l2 if line not in l1])

['end-to-end production network security infrastructures\tenvironment drift\tnone', 'revenues\tfiscal year\tnone', 'enterprises\tgovernments\tnone', 'security information\tnumber\tnone', 'security posture\tvalue\tnone', 'domains\tThreatARMOR\tnone', 'ongoing security posture\ttime\tnone', 'network\tbotnet C\tnone', 'organizations\tgaps\tnone', 'security capabilities\tpoint\tnone', 'gain actionable insight\tgaps\tnone', 'capable products\tSecurity breaches\tnone', 'related group\tautomated security assessment\tnone', 'unused IP space /\tinternal devices\tnone', 'good security hygiene\tattack simulation\tnone', 'validation\tcost\tnone', 'someone\tIT\tnone', 'security capabilities\tmarket research firm Enterprise Management\tnone', 'Ixia Solutions Group\teffective Today\tnone', 'IT solutions provider Sayers\tsafe\tnone', 'organizations\tvalue\tnone', 'security operations teams\tattack simulation\tnone', 'alerts\t%\tnone', 'block traffic\tThreatARMOR\tnone', 'research director\tlimited vis

In [7]:
files

['../files/dataset/instances0.tsv',
 '../files/dataset/instances0_mini.tsv',
 '../files/dataset/instances1.tsv',
 '../files/dataset/instances1_mini.tsv',
 '../files/dataset/instances2.tsv',
 '../files/dataset/instances2_mini.tsv',
 '../files/dataset/instances3.tsv',
 '../files/dataset/instances3_mini.tsv']

In [34]:
import glob
glob.glob("../files/dataset/security*")

['../files/dataset/security4.txt',
 '../files/dataset/security3.txt',
 '../files/dataset/security1.txt',
 '../files/dataset/security2.txt']

In [121]:
paths_output[15][:6]

tensor([[[ 0.8237, -0.3493],
         [ 0.2487, -0.9980],
         [ 1.7283, -0.3358],
         ...,
         [ 0.5942, -0.1741],
         [-0.9849,  1.0004],
         [-0.2682,  1.6092]],

        [[-0.7238, -0.1685],
         [-0.5104, -1.3679],
         [ 0.5790,  0.3465],
         ...,
         [ 0.5371, -0.9790],
         [-0.8587, -0.3445],
         [-0.2285,  0.9683]],

        [[ 0.7231,  0.0178],
         [ 1.4549, -1.0254],
         [ 1.3443,  0.2672],
         ...,
         [-0.5941,  0.8811],
         [ 0.0183, -0.7739],
         [ 1.1137, -0.2095]],

        [[-0.5445,  0.8521],
         [-1.0216,  0.4876],
         [-0.2075,  0.6121],
         ...,
         [ 0.0787, -1.2297],
         [ 0.2751, -0.0413],
         [ 0.2628,  0.9497]],

        [[-3.2454,  0.8706],
         [ 1.8683,  0.2403],
         [-1.9614,  1.6072],
         ...,
         [ 1.5648,  0.6847],
         [-0.3047, -0.4647],
         [ 0.0309,  0.7784]],

        [[-0.6388, -0.7616],
         [-0.7573,  0

In [117]:
counts_train[:100].index([1, 2, 1, 1, 1, 1])

15