In [5]:
import pickle, pickledb
import numpy as np
from itertools import count
from collections import defaultdict
import tensorflow as tf
import tensorflow_hub as hub

train_file = "/data/Vivek/original/HypeNET/dataset/custom_train_0.0_0.05.tsv"
test_file =  "/data/Vivek/original/HypeNET/dataset/custom_test_0.0_0.05.tsv"
instances_file = '../files/dataset/test_instances.tsv'
knocked_file = '../files/dataset/test_knocked.tsv'

NULL_PATH = ((0, 0, 0, 0),)
relations = ["hypernym", "hyponym", "concept", "instance", "none"]
NUM_RELATIONS = len(relations)
prefix = "../junk/db_files/"

USE_link = "https://tfhub.dev/google/universal-sentence-encoder-large/5?tf-hub-format=compressed"
model = hub.load(USE_link)

f = open("../junk/resolved_use_unbracketed.pkl", "rb")
resolved = pickle.load(f)

def extractUSEEmbeddings(words):
    word_embeddings = model(words)
    return word_embeddings.numpy()

In [6]:
arrow_heads = {">": "up", "<":"down"}

def to_list(seq):
    for item in seq:
        if isinstance(item, tuple):
            yield list(to_list(item))
        elif isinstance(item, list):
            yield [list(to_list(elem)) for elem in item]
        else:
            yield item

def extract_direction(edge):

    if edge[0] == ">" or edge[0] == "<":
        direction = "start_" + arrow_heads[edge[0]]
        edge = edge[1:]
    elif edge[-1] == ">" or edge[-1] == "<":
        direction = "end_" + arrow_heads[edge[-1]]
        edge = edge[:-1]
    else:
        direction = ' '
    return direction, edge

def parse_path(path):
    parsed_path = []
    for edge in path.split("*##*"):
        direction, edge = extract_direction(edge)
        if edge.split("/"):
            try:
                embedding, pos, dependency = tuple([a[::-1] for a in edge[::-1].split("/",2)][::-1])
            except:
                print (edge, path)
                raise
            emb_idx, pos_idx, dep_idx, dir_idx = emb_indexer[embedding], pos_indexer[pos], dep_indexer[dependency], dir_indexer[direction]
            parsed_path.append(tuple([emb_idx, pos_idx, dep_idx, dir_idx]))
        else:
            return None
    return tuple(parsed_path)

def parse_tuple(tup):
    x, y = [entity_to_id(word2id_db, elem) for elem in tup]
    paths_x, paths_y = list(extract_paths(relations_db,x,y).items()), list(extract_paths(relations_db,y,x).items())
    path_count_dict_x = { id_to_path(id2path_db, path).replace("X/", tup[0]+"/").replace("Y/", tup[1]+"/") : freq for (path, freq) in paths_x }
    path_count_dict_y = { id_to_path(id2path_db, path).replace("Y/", tup[0]+"/").replace("X/", tup[1]+"/") : freq for (path, freq) in paths_y }
    path_count_dict = {**path_count_dict_x, **path_count_dict_y}
    return path_count_dict

def parse_dataset(dataset):
    parsed_dicts = [parse_tuple(tup) for tup in dataset.keys()]
    parsed_dicts = [{ parse_path(path) : path_count_dict[path] for path in path_count_dict } for path_count_dict in parsed_dicts]
    paths = [{ path : path_count_dict[path] for path in path_count_dict if path} for path_count_dict in parsed_dicts]
    paths = [{NULL_PATH: 1} if not path_list else path_list for i, path_list in enumerate(paths)]
    counts = [list(path_dict.values()) for path_dict in paths]
    paths = [list(path_dict.keys()) for path_dict in paths]
    targets = [rel_indexer[relation] for relation in dataset.values()]
    return list(to_list(paths)), counts, targets



def id_to_entity(db, entity_id):
    entity = db[str(entity_id)]
    return entity

def id_to_path(db, entity_id):
    entity = db[str(entity_id)]
    entity = "/".join(["*##*".join(e.split("_", 1)) for e in entity.split("/")])
    return entity

def entity_to_id(db, entity):
    global success, failed
    entity_id = db.get(entity)
    if entity_id:
        success.append(entity)
        return int(entity_id)
    closest_entity = resolved.get(entity, "")
    if closest_entity and closest_entity[0] and float(closest_entity[1]) > threshold:
        success.append(entity)
        return int(db[closest_entity[0]])
    failed.append(entity)
    return -1

def extract_paths(db, x, y):
    key = (str(x) + '###' + str(y))
    try:
        relation = db[key]
        return {int(path_count.split(":")[0]): int(path_count.split(":")[1]) for path_count in relation.split(",")}
    except Exception as e:
        return {}

word2id_db = pickledb.load(prefix + "w2i.db", False)
id2word_db = pickledb.load(prefix + "i2w.db", False)
path2id_db = pickledb.load(prefix + "p2i.db", False)
id2path_db = pickledb.load(prefix + "i2p.db", False)
relations_db = pickledb.load(prefix + "relations.db", False)


In [32]:

thresholds = [0.5, 0.59, 0.6, 0.65, 0.66, 0.67, 0.68, 0.69, 0.7, 0.71, 0.72, 0.73, 0.74, 0.75, 0.76, 0.77, 0.78, 0.79, 0.8, 0.81, 0.82, 0.83, 0.84, 0.85, 0.86, 0.87, 0.88, 0.89, 0.9, 0.91, 0.92, 0.93, 0.94, 0.95, 0.96, 0.97, 0.98, 0.99, 1.0]

for threshold in thresholds:
    
    failed, success = [], []

    emb_indexer, pos_indexer, dep_indexer, dir_indexer = [defaultdict(count(0).__next__) for i in range(4)]
    unk_emb, unk_pos, unk_dep, unk_dir = emb_indexer["<UNK>"], pos_indexer["<UNK>"], dep_indexer["<UNK>"], dir_indexer["<UNK>"]
    rel_indexer = {key: idx for (idx,key) in enumerate(relations)}

    train_dataset = {tuple(l.split("\t")[:2]): l.split("\t")[2] for l in open(train_file).read().split("\n")}
    test_dataset = {tuple(l.split("\t")[:2]): l.split("\t")[2] for l in open(test_file).read().split("\n")}
    test_instances = {tuple(l.split("\t")[:2]): l.split("\t")[2] for l in open(instances_file).read().split("\n")}
    test_knocked = {tuple(l.split("\t")[:2]): l.split("\t")[2] for l in open(knocked_file).read().split("\n")}

    paths_train, counts_train, targets_train = parse_dataset(train_dataset)
    paths_test, counts_test, targets_test  = parse_dataset(test_dataset)
    paths_instances, counts_instances, targets_instances  = parse_dataset(test_instances)
    paths_knocked, counts_knocked, targets_knocked  = parse_dataset(test_knocked)

    nodes_train = [[emb_indexer[tup[0]], emb_indexer[tup[1]]] for tup in train_dataset]
    nodes_test = [[emb_indexer[tup[0]], emb_indexer[tup[1]]] for tup in test_dataset]
    nodes_instances = [[emb_indexer[tup[0]], emb_indexer[tup[1]]] for tup in test_instances]
    nodes_knocked = [[emb_indexer[tup[0]], emb_indexer[tup[1]]] for tup in test_knocked]

    print ("Train len: {}, Test len: {}, Instance len: {}, Knocked len: {}".format(len(paths_train), len(paths_test),  len(paths_instances), len(paths_knocked)))
    print (len(failed), len(success))
    emb_indexer_inv = {emb_indexer[key]: key for key in emb_indexer}
    embeds = extractUSEEmbeddings(list(emb_indexer.keys())[1:])
    emb_vals = np.array(np.zeros((1, embeds.shape[1])).tolist() + embeds.tolist())


    output_file = "../Input/data_use_unbracketed_" + str(threshold) + ".pkl"
    f = open(output_file, "wb+")
    pickle.dump([nodes_train, paths_train, counts_train, targets_train, 
                 nodes_test, paths_test, counts_test, targets_test,
                 nodes_instances, paths_instances, counts_instances, targets_instances,
                 nodes_knocked, paths_knocked, counts_knocked, targets_knocked,
                 emb_indexer, emb_indexer_inv, emb_vals, 
                 pos_indexer, dep_indexer, dir_indexer, rel_indexer], f)
    f.close()



Train len: 10739, Test len: 1197, Instance len: 275, Knocked len: 5538
0 35498
Train len: 10739, Test len: 1197, Instance len: 275, Knocked len: 5538
122 35376
Train len: 10739, Test len: 1197, Instance len: 275, Knocked len: 5538
148 35350
Train len: 10739, Test len: 1197, Instance len: 275, Knocked len: 5538
579 34919
Train len: 10739, Test len: 1197, Instance len: 275, Knocked len: 5538
735 34763
Train len: 10739, Test len: 1197, Instance len: 275, Knocked len: 5538
896 34602
Train len: 10739, Test len: 1197, Instance len: 275, Knocked len: 5538
1048 34450
Train len: 10739, Test len: 1197, Instance len: 275, Knocked len: 5538
1292 34206
Train len: 10739, Test len: 1197, Instance len: 275, Knocked len: 5538
1515 33983
Train len: 10739, Test len: 1197, Instance len: 275, Knocked len: 5538
1698 33800
Train len: 10739, Test len: 1197, Instance len: 275, Knocked len: 5538
1912 33586
Train len: 10739, Test len: 1197, Instance len: 275, Knocked len: 5538
2139 33359
Train len: 10739, Test l

In [62]:
import re, glob
import urllib.request,  en_core_web_lg
import spacy, neuralcoref, itertools
from bs4 import BeautifulSoup
from bs4.element import Comment
from spacy.attrs import ORTH, LEMMA

def preprocess(noun_chunks):
    all_parsed_chunks = []
    filt_tokens = ["DET", "ADV", "PUNCT", "CCONJ"]
    for np in noun_chunks:
        start_index = [i for i,token in enumerate(np) if token.pos_ not in filt_tokens][0]
        np_filt = np[start_index:].text
        if "(" not in np_filt and ")" in np_filt:
            np_filt = np_filt.replace(")", "")
        elif "(" in np_filt and ")" not in np_filt:
            np_filt = np_filt.replace("(", "")
#         np_filt = np_filt.split("\n")[0]
#         np_filt = [(token, token.pos_) for token in np_filt]
#         print (np_filt)
        all_parsed_chunks.append(np_filt)
    return list(set(all_parsed_chunks))
# [([token.pos_ for token in elem], elem) for elem in list(nlp(open("../files/dataset/security1.txt").read()).noun_chunks)]

nlp = en_core_web_lg.load()


# load NeuralCoref and add it to the pipe of SpaCy's model, for coreference resolution
coref = neuralcoref.NeuralCoref(nlp.vocab)
nlp.add_pipe(coref, name='neuralcoref')
nlp.tokenizer.add_special_case('Inc.', [{ORTH: 'Inc', LEMMA: 'Incorporated'}])

for i,file in enumerate(sorted(glob.glob("../files/dataset/security*"))):
# file = "../files/dataset/security4.txt"
    paras = [t.text for t in list(nlp(open(file).read()).sents)]
    paras = [nlp(para)._.coref_resolved.replace("\n", " ").replace("  ", " ") for para in paras]
    instances = [preprocess(nlp(para).noun_chunks) for para in paras]
    instances_pairs = []
    for instances_sent in instances:
        instances_pairs.extend(list(set(list(itertools.combinations(instances_sent, 2)))))

#     print ("done")
#     testData = []
#     for para in paras:
#         instances = getInstances(para)
#         ls = list(set(instances))
#         ls = list(set(list(itertools.combinations(ls, 2))))
#         testData.extend(["\t".join([a,b]) for (a,b) in ls])
    
    instances_pairs = ["\t".join(list(pair) + ["none"]) for pair in instances_pairs if pair]

    open("../files/dataset/instances" + str(i) + ".tsv", "w+").write("\n".join(instances_pairs))


In [61]:
paras = [t.text for t in list(nlp(open("../files/dataset/security3.txt").read()).sents)]
paras = [nlp(para)._.coref_resolved.replace("\n", " ").replace("  ", " ") for para in paras]
instances = [preprocess(nlp(para).noun_chunks) for para in paras]
instances_pairs = []
for instances_sent in instances:
    instances_pairs.extend(list(set(list(itertools.combinations(instances_sent, 2)))))

#     print ("done")
#     testData = []
#     for para in paras:
#         instances = getInstances(para)
#         ls = list(set(instances))
#         ls = list(set(list(itertools.combinations(ls, 2))))
#         testData.extend(["\t".join([a,b]) for (a,b) in ls])

# instances_pairs = ["\t".join(list(pair) + ["none"]) for pair in instances_pairs if pair]
instances_pairs
# open("../files/dataset/instances" + str(i) + ".tsv", "w+").write("\n".join(instances_pairs))


[('YOU', 'ZERO-DAY THREATS'),
 ('zero-day malware', 'zero-day phishing and social engineering attacks'),
 ('zero-day malware', '8,3001 new, previously undiscovered cyber attacks'),
 ('zero-day phishing and social engineering attacks',
  '8,3001 new, previously undiscovered cyber attacks'),
 ('network', 'other core security solutions'),
 ('anti-virus, firewalls', 'network'),
 ('associated file signatures', 'anti-virus, firewalls'),
 ('associated file signatures', 'network'),
 ('associated file signatures', 'other core security solutions'),
 ('anti-virus, firewalls', 'other core security solutions'),
 ('half', 'fact'),
 ('best AV solutions', 'half'),
 ('best AV solutions', 'fact'),
 ('fact', 'malware strains'),
 ('best AV solutions', 'wild'),
 ('wild', 'half'),
 ('wild', 'malware strains'),
 ('wild', 'fact'),
 ('half', 'malware strains'),
 ('best AV solutions', 'malware strains'),
 ('compromise', 'existing indicators'),
 ('existing indicators', 'IOCs'),
 ('existing indicators', 'what'),


In [59]:
paras

['CAN YOU DEFEND AGAINST ZERO-DAY THREATS?\n',
 'Every day, 8,3001 new, previously undiscovered cyber attacks emerge, including\nzero-day malware, zero-day phishing and social engineering attacks.',
 'With no\nassociated file signatures, anti-virus, firewalls and other core security solutions\ncannot identify no\nassociated file signatures, anti-virus, firewalls and other core security solutions\n as malicious and block no\nassociated file signatures, anti-virus, firewalls and other core security solutions\n from entering the network.',
 'In\nfact, even the best AV solutions detect only half of malware strains in the wild.\n',
 'With no existing indicators of compromise (IOCs), how do you protect against what\nyou do not know?\nCOMMON NETWORK SECURITY APPROACHES HAVE LIMITATIONS\n',
 'To protect against zero-day threats, organizations use several approaches.\n',
 'These include:\n•',
 'Conventional sandboxing solutions, which are susceptible to malware evasion\ntechniques, and by defau

In [1]:
import en_core_web_lg
nlp = en_core_web_lg.load()
nlp(open("../files/dataset/security1.txt").read())

Fuzzing Image Parsing in Windows, Part One: Color Profiles

Image parsing and rendering are basic features of any modern operating system (OS). Image parsing is an easily accessible attack surface, and a vulnerability that may lead to remote code execution or information disclosure in such a feature is valuable to attackers. In this multi-part blog series, I am reviewing Windows OS’ built-in image parsers and related file formats: specifically looking at creating a harness, hunting for corpus and fuzzing to find vulnerabilities. In part one of this series I am looking at color profiles—not an image format itself, but something which is regularly embedded within images. 

What is an ICC Color Profile?
Wikipedia provides a more-than-adequate description of ICC color profiles: "In color management, an ICC profile is a set of data that characterizes a color input or output device, or a color space, according to standards promulgated by the International Color Consortium (ICC). Profiles des

In [20]:
from spacy.symbols import *
import en_core_web_lg, neuralcoref
from spacy.attrs import ORTH, LEMMA

nlp = en_core_web_lg.load()
coref = neuralcoref.NeuralCoref(nlp.vocab)
nlp.add_pipe(coref, name='neuralcoref')

file = "../files/dataset/security1.txt"
paras = [t.text for t in list(nlp(open(file).read()).sents)]
paras = [nlp(para)._.coref_resolved for para in paras]

noun_chunks_all = [list(nlp(para).noun_chunks) for para in paras]
[[([token.pos_ for token in noun_chunk], noun_chunk) for noun_chunk in noun_chunks] for noun_chunks in noun_chunks_all]


[[(['PROPN'], Windows)],
 [(['PROPN', 'PROPN'], Color Profiles),
  (['ADJ', 'NOUN'], basic features),
  (['DET', 'ADJ', 'NOUN', 'NOUN'], any modern operating system),
  (['PROPN'], OS)],
 [(['NOUN', 'NOUN'], Image parsing),
  (['DET', 'ADV', 'ADJ', 'NOUN', 'NOUN'], an easily accessible attack surface),
  (['DET', 'NOUN'], a vulnerability),
  (['ADJ', 'NOUN', 'NOUN'], remote code execution),
  (['NOUN', 'NOUN'], information disclosure),
  (['DET', 'DET', 'NOUN'], such a feature),
  (['NOUN'], attackers)],
 [(['DET', 'ADJ', 'ADJ', 'ADJ', 'NOUN', 'NOUN'], this multi-part blog series),
  (['PRON'], I),
  (['PROPN', 'PROPN', 'PUNCT', 'VERB', 'PUNCT', 'PART', 'NOUN', 'NOUN'],
   Windows OS’ built-in image parsers),
  (['ADJ', 'NOUN', 'NOUN'], related file formats),
  (['DET', 'NOUN'], a harness),
  (['NOUN'], corpus),
  (['NOUN'], fuzzing),
  (['NOUN'], vulnerabilities)],
 [(['NOUN'], part),
  (['DET', 'NOUN'], this series),
  (['PRON'], I),
  (['NOUN', 'NOUN'], color profiles),
  (['DET', '

In [44]:
[tok.lemma_ for tok in nlp("Windows")]

['window']