In [10]:
from bsddb3 import btopen
import bcolz, pickle, os, sys, shelve, time
import concurrent.futures
import numpy as np
from math import ceil
from itertools import count
from collections import defaultdict
from difflib import SequenceMatcher
import tensorflow as tf
import tensorflow_hub as hub
from scipy import spatial
from sparse_dot_topn import awesome_cossim_topn
from scipy.sparse import coo_matrix

train_file = "../files/dataset/train.tsv"
test_file = "../files/dataset/test.tsv"
instances_file = '../files/dataset/test_instances.tsv'
knocked_file = '../files/dataset/test_knocked.tsv'
output_folder = "../junk/Output/"
embeddings_folder = "../junk/Glove.dat"
USE_folder = "/home/vlead/USE"
embeddings_file = "/data/Vivek/glove.6B.300d.txt"
use_embeddings = "../files/embeddings.pt"

POS_DIM = 4
DEP_DIM = 5
DIR_DIM = 1
EMBEDDING_DIM = 300
NULL_PATH = ((0, 0, 0, 0),)
relations = ["hypernym", "hyponym", "concept", "instance", "none"]
# relations = ["True", "False"]
NUM_RELATIONS = len(relations)
prefix = "/data/Vivek/Final/SIREN-Research/OntoEnricher/junk/Files/security_threshold_7_10/security"
op_file = "dataset_parsed_wiki2vec.pkl"

In [11]:
success, failed = [], []
def id_to_entity(db, entity_id):
    try:
        entity = db[str(entity_id)]    
    except:
        entity = db[str(entity_id).decode("utf-8")]
    return entity

def id_to_path(db, entity_id):
    try:
        entity = db[str(entity_id)]
    except:
        entity = db[str(entity_id).decode("utf-8")]
    entity = "/".join(["*##*".join(e.split("_", 1)) for e in entity.split("/")])
    return entity

def entity_to_id(db, entity):
    if entity in db:
        success.append(entity)
        try:
            return int(db[entity])
        except:
            return int(db[entity.decode("utf-8")])
    closest_entity = resolved.get(entity, "")[0]
    if closest_entity:
        return int(db[closest_entity])
    return -1

def extract_paths(db, x, y):
    key = (str(x) + '###' + str(y))
    try:
        relation = db[key]
        return {int(path_count.split(":")[0]): int(path_count.split(":")[1]) for path_count in relation.split(",")}
    except Exception as e:
        return {}

def load_embeddings_from_disk():
    try:
        vectors = bcolz.open(embeddings_folder)[:]
        words = pickle.load(open(embeddings_folder + 'words.pkl', 'rb'))
        word2idx = pickle.load(open(embeddings_folder + 'words_index.pkl', 'rb'))

        embeddings = vectors
    except:
        embeddings, word2idx = create_embeddings()
    return embeddings, word2idx


def create_embeddings():
    words = ['_unk_']
    idx = 1
    word2idx = {"_unk_": 0}
    vectors = bcolz.carray(np.random.random(300), rootdir=embeddings_folder, mode='w')
    with open(embeddings_file, 'r') as f:
        for l in f:
            line = l.split()
            word, vector = line[0], line[1:]
            words.append(word)
            vectors.append(np.array(vector).astype(np.float))
            word2idx[word] = idx
            idx += 1
    vectors = vectors.reshape((-1, EMBEDDING_DIM))
    row_norm = np.sum(np.abs(vectors)**2, axis=-1)**(1./2)
    vectors /= row_norm[:, np.newaxis]
    vectors = bcolz.carray(vectors, rootdir=embeddings_folder, mode='w')
    vectors.flush()

    pickle.dump(words, open(embeddings_folder + 'words.pkl', 'wb'))
    pickle.dump(word2idx, open(embeddings_folder + 'words_index.pkl', 'wb'))

    return vectors, word2idx

try:
    word2id_db = shelve.open(prefix + "_word_to_id_dict.db", 'r')
except:
    print (prefix)
    raise
id2word_db = shelve.open(prefix + "_id_to_word_dict.db", "r")
path2id_db = shelve.open(prefix + "_path_to_id_dict.db", "r")
id2path_db = shelve.open(prefix + "_id_to_path_dict.db", "r")
relations_db = shelve.open(prefix + "_relations_map.db", "r")

embeddings, emb_indexer = load_embeddings_from_disk()

train_dataset = {tuple(l.split("\t")[:2]): l.split("\t")[2] for l in open(train_file).read().split("\n")}
test_dataset = {tuple(l.split("\t")[:2]): l.split("\t")[2] for l in open(test_file).read().split("\n")}
test_instances = {tuple(l.split("\t")[:2]): l.split("\t")[2] for l in open(instances_file).read().split("\n")}
test_knocked = {tuple(l.split("\t")[:2]): l.split("\t")[2] for l in open(knocked_file).read().split("\n")}

arrow_heads = {">": "up", "<":"down"}

def extract_direction(edge):

    if edge[0] == ">" or edge[0] == "<":
        direction = "start_" + arrow_heads[edge[0]]
        edge = edge[1:]
    elif edge[-1] == ">" or edge[-1] == "<":
        direction = "end_" + arrow_heads[edge[-1]]
        edge = edge[:-1]
    else:
        direction = ' '
    return direction, edge

def parse_path(path):
    parsed_path = []
    for edge in path.split("*##*"):
        direction, edge = extract_direction(edge)
        if edge.split("/"):
            try:
                embedding, pos, dependency = tuple([a[::-1] for a in edge[::-1].split("/",2)][::-1])
            except:
                print (edge, path)
                raise
            emb_idx, pos_idx, dep_idx, dir_idx = emb_indexer.get(embedding, 0), pos_indexer[pos], dep_indexer[dependency], dir_indexer[direction]
            parsed_path.append(tuple([emb_idx, pos_idx, dep_idx, dir_idx]))
        else:
            return None
    return tuple(parsed_path)

def parse_tuple(tup):
    x, y = entity_to_id(word2id_db, tup[0]), entity_to_id(word2id_db, tup[1])
    # paths = list(extract_paths(relations_db,x,y).items()) + list(extract_paths(relations_db,y,x).items())
    # x_word = id_to_entity(id2word_db, x) if x!=-1 else "X"
    # y_word = id_to_entity(id2word_db, y) if y!=-1 else "Y"
    # path_count_dict = { id_to_path(id2path_db, path).replace("X/", x_word+"/").replace("Y/", y_word+"/") : freq for (path, freq) in paths }
    paths_xy = list(extract_paths(relations_db,x,y).items())
    paths_yx = list(extract_paths(relations_db,y,x).items())
    path_count_dict = { id_to_path(id2path_db, path) : freq for (path, freq) in paths_xy }
    path_count_dict.update({ id_to_path(id2path_db, path).replace("X/", '@@@').replace('Y/', 'X/').replace('@@@', 'Y/') : freq for (path, freq) in paths_yx })
    return path_count_dict

def parse_dataset(dataset):
    print ("Parsing dataset for ", prefix)

    parsed_dicts = [parse_tuple(tup) for tup in dataset]
    parsed_dicts = [{ parse_path(path) : path_count_dict[path] for path in path_count_dict } for path_count_dict in parsed_dicts]
    paths = [{ path : path_count_dict[path] for path in path_count_dict if path} for path_count_dict in parsed_dicts]
    empty = [list(dataset)[i] for i, path_list in enumerate(paths) if len(list(path_list.keys())) == 0]
    embed_indices = [(emb_indexer.get(x,0), emb_indexer.get(y,0)) for (x,y) in dataset]

    return embed_indices, paths

pos_indexer, dep_indexer, dir_indexer = defaultdict(count(0).__next__), defaultdict(count(0).__next__), defaultdict(count(0).__next__)
unk_pos, unk_dep, unk_dir = pos_indexer["#UNKNOWN#"], dep_indexer["#UNKNOWN#"], dir_indexer["#UNKNOWN#"]

dataset_keys = list(train_dataset.keys()) + list(test_dataset.keys()) + list(test_instances.keys()) + list(test_knocked.keys())
dataset_vals = list(train_dataset.values()) + list(test_dataset.values()) + list(test_instances.values()) + list(test_knocked.values())

mappingDict = {key: idx for (idx,key) in enumerate(relations)}

embed_indices, x = parse_dataset(dataset_keys)
y = [mappingDict[relation] for relation in dataset_vals]


s1 = len(train_dataset)
s2 = len(train_dataset) + len(test_dataset)
s3 = len(train_dataset)+len(test_dataset)+len(test_instances)

parsed_train = (embed_indices[:s1], x[:s1], y[:s1], dataset_keys[:s1], dataset_vals[:s1])
parsed_test = (embed_indices[s1:s2], x[s1:s2], y[s1:s2], dataset_keys[s1:s2], dataset_vals[s1:s2])
parsed_instances = (embed_indices[s2:s3], x[s2:s3], y[s2:s3], dataset_keys[s2:s3], dataset_vals[s2:s3])
parsed_knocked = (embed_indices[s3:], x[s3:], y[s3:], dataset_keys[s3:], dataset_vals[s3:])

f = open(op_file, "wb+")
pickle.dump([parsed_train, parsed_test, parsed_instances, parsed_knocked, pos_indexer, dep_indexer, dir_indexer], f)
f.close()

print ("Successful hits: ", len(success), "Failed hits: ", len(failed))
print ("Parsed",prefix) 

Parsing dataset for  /data/Vivek/Final/SIREN-Research/OntoEnricher/junk/Files/security_threshold_7_10/security
Successful hits:  138959 Failed hits:  0
Parsed /data/Vivek/Final/SIREN-Research/OntoEnricher/junk/Files/security_threshold_7_10/security


In [None]:
embed_indices, x = parse_dataset(dataset_keys)
y = [mappingDict[relation] for relation in dataset_vals]

f = open(op_file, "wb+")

s1 = len(train_dataset)
s2 = len(train_dataset) + len(test_dataset)
s3 = len(train_dataset)+len(test_dataset)+len(test_instances)

parsed_train = (embed_indices[:s1], x[:s1], y[:s1], dataset_keys[:s1], dataset_vals[:s1])
parsed_test = (embed_indices[s1:s2], x[s1:s2], y[s1:s2], dataset_keys[s1:s2], dataset_vals[s1:s2])
parsed_instances = (embed_indices[s2:s3], x[s2:s3], y[s2:s3], dataset_keys[s2:s3], dataset_vals[s2:s3])
parsed_knocked = (embed_indices[s3:], x[s3:], y[s3:], dataset_keys[s3:], dataset_vals[s3:])
pickle.dump([parsed_train, parsed_test, parsed_instances, parsed_knocked, pos_indexer, dep_indexer, dir_indexer], f)
print ("Successful hits: ", len(success), "Failed hits: ", len(failed))
f.close()

print ("Parsed",prefix) 

In [None]:
class LSTM(nn.Module):

    def __init__(self):
        
        super(LSTM, self).__init__()
        self.cache = {}
        
        self.hidden_dim = HIDDEN_DIM + 2 * EMBEDDING_DIM
        self.input_dim = POS_DIM + DEP_DIM + EMBEDDING_DIM + DIR_DIM
        self.W = nn.Linear(NUM_RELATIONS, self.input_dim)
        self.dropout_layer = nn.Dropout(p=dropout)
        self.softmax = nn.LogSoftmax()
        
        self.word_embeddings = nn.Embedding(len(embeddings), EMBEDDING_DIM)
        self.word_embeddings.load_state_dict({'weight': torch.from_numpy(np.array(embeddings))})
        self.word_embeddings.require_grad = False
        
        self.pos_embeddings = nn.Embedding(len(pos_indexer), POS_DIM)
        self.dep_embeddings = nn.Embedding(len(dep_indexer), DEP_DIM)
        self.dir_embeddings = nn.Embedding(len(dir_indexer), DIR_DIM)
        
        self.lstm = nn.LSTM(self.input_dim, self.hidden_dim, NUM_LAYERS)
    
    def embed_path(self, elem):
        path, count = elem
        if path in self.cache:
            return cache[path] * count
        lstm_inp = []
        for edge in path:
            inputs = [torch.Tensor([[el]]) for el in edge]
            word_embed = self.dropout_layer(self.word_embeddings(inputs[0]))
            pos_embed = self.dropout_layer(self.pos_embeddings(inputs[1]))
            dep_embed = self.dropout_layer(self.dep_embeddings(inputs[2]))
            dir_embed = self.dropout_layer(self.dir_embeddings(inputs[3]))
            embeds = np.concatenate((word_embed, pos_embed, dep_embed, dir_embed))
            lstm_inp.append(embeds)
        output, _ = self.lstm(lstm_inp)
        cache[path] = output

        return output * count
    
    def forward(self, data, emb_indexer):
        if not data:
            data[NULL_PATH] = 1
        print ("Data: ", data)
        num_paths = [sum(list(paths.values())) for paths in data]
        print ("Number of paths: ", num_paths)
        path_embeddings = [np.sum([self.embed_path(path) for path in paths.items()]) for paths in data]
        print ("Path Embeddings: ", path_embeddings)
        
        h = np.divide(path_embeddings, num_paths)
        h = [np.concatenate((self.word_embeddings(elem[0]), h[i], self.word_embeddings(elem[1]))) for i,emb in enumerate(emb_indexer)]
        return self.softmax(self.W(h))

HIDDEN_DIM = 60
NUM_LAYERS = 2
num_epochs = 3
batch_size = 10

dataset_size = len(y_train)
batch_size = min(batch_size, dataset_size)
num_batches = int(ceil(dataset_size/batch_size))

lr = 0.001
dropout = 0.3
lstm = LSTM()
criterion = nn.NLLLoss()
optimizer = optim.Adam(lstm.parameters(), lr=lr)

for epoch in range(num_epochs):
    
    total_loss, epoch_idx = 0, np.random.permutation(dataset_size)
    
    for batch_idx in range(num_batches):
        batch_end = (batch_idx+1) * batch_size
        batch_start = batch_idx * batch_size
        batch = epoch_idx[batch_start:batch_end]
        
        data, labels, embeddings_idx = x_train[batch], y_train[batch], embed_indices_train[batch]
        
        # Run the forward pass
        outputs = lstm(data, embeddings_idx)
        loss = criterion(outputs, labels)

        # Backprop and perform Adam optimisation
        optimizer.zero_grad()
        loss.backward()
        optimizer.step()

        total_loss += loss.item()

    total_loss /= dataset_size
    print('Epoch [{}/{}] Loss: {:.4f}'.format(epoch + 1, num_epochs, total_loss))
    loss_list.append(loss.item())

lstm.eval()
with torch.no_grad():
    predictedLabels = []
    for batch_idx in range(num_batches):
        outputs = lstm(data)
        print (outputs)
        _, predicted = torch.max(outputs.data, 1)
        predictedLabels.extend(predicted)

In [None]:
x

In [None]:
from torch import nn
import torch
e = nn.Embedding(3, 3)
ls = [[0, 1, 2], [3,4,5], [6,7,8]]
ls = np.array([np.array(el) for el in ls])
e.load_state_dict({'weight': torch.from_numpy(ls)})

In [None]:
num_paths = [sum(list(paths.values())) for paths in data]
        print ("Number of paths: ", num_paths)
        path_embeddings = np.array([np.sum([self.embed_path(path) for path in paths.items()]) for paths in data])
        #print ("Path Embeddings: ", path_embeddings)
        
        h = np.divide(path_embeddings, num_paths)
        print (h.shape)
        h = [np.concatenate((self.word_embeddings(emb[0]), h[i], self.word_embeddings(emb[1]))) for i,emb in enumerate(emb_indexer)]

In [None]:
# t = torch.randn(1,4)
torch.Tensor([[1]]).shape
# torch.cat((h, t.view(1,-1)), 0)

In [None]:
loss = nn.CrossEntropyLoss()
inputt = torch.randn(3, 5, requires_grad=True)
target = torch.empty(3, dtype=torch.long).random_(5)
output = loss(inputt, target)
output.backward()
print (output, inputt, target)

In [5]:
import time 
word = "margherita pizza" 

def extractUSEEmbeddings(words):
    try:
        embed = hub.KerasLayer(USE_folder)
    except Exception as e:
        !mkdir $USE_folder
        !curl -L "https://tfhub.dev/google/universal-sentence-encoder-large/5?tf-hub-format=compressed" | tar -zxvC $USE_folder
        embed = hub.KerasLayer(USE_folder)
        pass
#     tf.compat.v1.logging.set_verbosity(tf.compat.v1.logging.ERROR)
    word_embeddings = embed(words)
    return word_embeddings.numpy()

def compare_sim(words, word_to_compare, max_sim=-1000, closest_word=""):
    word_embeddings = extractUSEEmbeddings(words)
    closest_word = ""
    with shelve.open(use_embeddings, 'c') as db:
        for i, w in enumerate(word_embeddings):
            db[words[i]] = w
        closest_word_idx = np.argmax(awesome_cossim_topn(coo_matrix(word_embeddings, dtype=np.float64), coo_matrix(word_to_compare.transpose(), dtype=np.float64), 10, 0.85, use_threads=True, n_jobs=250))
        sim = np.max(awesome_cossim_topn(coo_matrix(word_embeddings, dtype=np.float64), coo_matrix(word_to_compare.transpose(), dtype=np.float64), 10, 0.85, use_threads=True, n_jobs=250))
        if sim > max_sim:
            max_sim = sim
            closest_word = words[closest_word_idx]
        del word_embeddings
    del db
    return closest_word, max_sim

def closest_word_USE(word, method="USE"):

    word_to_compare = extractUSEEmbeddings([word])
    print ("Took me {} seconds to extract USE embeddings...".format(time.time()-a))
    if os.path.isfile(use_embeddings):
        with shelve.open(use_embeddings, 'r') as db:
            embeds = np.array(list(db.values()))
            words = np.array(list(db.keys()))
            print ("Values and keys obtained", time.time()-a)
            sim_mat = awesome_cossim_topn(coo_matrix(embeds, dtype=np.float64), coo_matrix(word_to_compare.T, dtype=np.float64), 10, 0.85, use_threads=True, n_jobs=250)
            print ("Sim mat calculated", time.time()-a)
            closest_word_idx = np.argmax(sim_mat)
            print ("idx gotten", time.time()-a)
            closest_word = words[closest_word_idx]
    else:
        words = list(word2id_db.keys())
        print ("Obtained list of words")
        len_part = 100000
        max_sim = -1000
        n_parts = ceil(len(words)/len_part)
        closest_word = ""
        for i in range(n_parts):
            words_part = words[i*len_part:(i+1)*len_part]
            closest_word, max_sim = compare_sim(words_part, word_to_compare, max_sim, closest_word)

    
    return closest_word

a = time.time()
closest_word = closest_word_USE("wansecure firewall")
print (time.time()-a)
closest_word



Took me 16.031342029571533 seconds to extract USE embeddings...


KeyboardInterrupt: 

In [None]:
### import os
os.getcwd()

In [None]:

def awesome_cossim_top(A, B, ntop, lower_bound=0):
    # force A and B as a CSR matrix.
    # If they have already been CSR, there is no overhead
    A = A.tocsr()
    B = B.tocsr()
    M, _ = A.shape
    _, N = B.shape
 
    idx_dtype = np.int32
 
    nnz_max = M*ntop
 
    indptr = np.zeros(M+1, dtype=idx_dtype)
    indices = np.zeros(nnz_max, dtype=idx_dtype)
    data = np.zeros(nnz_max, dtype=A.dtype)

    ct.sparse_dot_topn(
        M, N, np.asarray(A.indptr, dtype=idx_dtype),
        np.asarray(A.indices, dtype=idx_dtype),
        A.data,
        np.asarray(B.indptr, dtype=idx_dtype),
        np.asarray(B.indices, dtype=idx_dtype),
        B.data,
        ntop,
        lower_bound,
        indptr, indices, data)

    return csr_matrix((data,indices,indptr),shape=(M,N))



org_names = names['buyer'].unique()
vectorizer = TfidfVectorizer(min_df=1, analyzer=extract_ngrams)
tf_idf_matrix = vectorizer.fit_transform(org_names)

t1 = time.time()
matches = awesome_cossim_top(tf_idf_matrix, tf_idf_matrix.transpose(), 10, 0.85)
t = time.time()-t1


print('All 3-grams in "Department":')
print(extract_ngrams('Department'))

In [None]:
import re
from ftfy import fix_text
from sklearn.feature_extraction.text import TfidfVectorizer
from scipy.sparse import csr_matrix
import sparse_dot_topn.sparse_dot_topn as ct
from sparse_dot_topn import awesome_cossim_topn

chars_to_remove = [")","(",".","|","[","]","{","}","'"]

def extract_ngrams(string, n=3):
    string = fix_text(string).encode("ascii", errors="ignore").decode().lower() # fix text
    string = string.replace('&', 'and').replace(',', ' ').replace('-', ' ').title()
    string = re.sub('[' + re.escape(''.join(chars_to_remove)) + ']', '', string)
    string = ' ' + re.sub(' +',' ',string).strip() + ' '
    string = re.sub(r'[,-./]|\sBD',r'', string)
    ngrams = zip(*[string[i:] for i in range(n)])
    ngrams = [''.join(ngram) for ngram in ngrams]
    return ngrams

word_to_match = "margherita pizza"
words = list(word2id_db.keys())
vectorizer = TfidfVectorizer(min_df=1, analyzer=extract_ngrams)
tf_idf_matrix = vectorizer.fit_transform(words + [word_to_match])

# d = awesome_cossim_topn(tf_idf_matrix, tf_idf_matrix.transpose(), 10, 0.85, use_threads=True, n_jobs=256)



In [None]:
d = awesome_cossim_topn(tf_idf_matrix[:-1], tf_idf_matrix[-1].transpose(), 10, 0.85, use_threads=True, n_jobs=256)

In [None]:
def get_matches_df(sparse_matrix, name_vector, top=100):
    non_zeros = sparse_matrix.nonzero()
    
    sparserows = non_zeros[0]
    sparsecols = non_zeros[1]
    
    if top:
        nr_matches = top
    else:
        nr_matches = sparsecols.size
    
    left_side = np.empty([nr_matches], dtype=object)
    right_side = np.empty([nr_matches], dtype=object)
    similairity = np.zeros(nr_matches)
    print (sparserows)
    for index in range(0, nr_matches):
        left_side[index] = name_vector[sparserows[index]]
        right_side[index] = name_vector[sparsecols[index]]
        similairity[index] = sparse_matrix.data[index]
    
    return pd.DataFrame({'left_side': left_side,
                          'right_side': right_side,
                           'similairity': similairity})

In [None]:
import time
start = time.time()
d = awesome_cossim_topn(tf_idf_matrix[:-1], tf_idf_matrix[-1].transpose(), 10, 0.85, use_threads=True, n_jobs=256)
words[np.argmax(d)]
print ("time: ", start - time.time())

In [None]:
relations_db_new = shelve.open(prefix + "_relations_map.db", "c")
for k, v in relations_db.items():
    relations_db_new["###".join(k.split("_"))] = v
relations_db_new.close()

In [None]:
import torch
from torch.autograd import Variable
import torch.nn as nn
import torch.nn.functional as F
import torch.optim as optim
from torchvision import datasets, transforms


train_loader = torch.utils.data.DataLoader(
    datasets.MNIST('../data', train=True, download=True,
                   transform=transforms.Compose([
                       transforms.ToTensor(),
                       transforms.Normalize((0.1307,), (0.3081,))
                   ])),
    batch_size=batch_size, shuffle=True)
test_loader = torch.utils.data.DataLoader(
    datasets.MNIST('../data', train=False, transform=transforms.Compose([
        transforms.ToTensor(),
        transforms.Normalize((0.1307,), (0.3081,))
    ])),
    batch_size=batch_size, shuffle=True)

class Net(nn.Module):
    def __init__(self):
        super(Net, self).__init__()
        self.fc1 = nn.Linear(28 * 28, 200)
        self.fc2 = nn.Linear(200, 200)
        self.fc3 = nn.Linear(200, 10)

    def forward(self, x):
        x = F.relu(self.fc1(x))
        x = F.relu(self.fc2(x))
        x = self.fc3(x)
        return F.log_softmax(x)

net = Net()
print(net)

# create a stochastic gradient descent optimizer
optimizer = optim.SGD(net.parameters(), lr=learning_rate, momentum=0.9)
# create a loss function
criterion = nn.NLLLoss()

# run the main training loop
for epoch in range(epochs):
    for batch_idx, (data, target) in enumerate(train_loader):
        data, target = Variable(data), Variable(target)
        # resize data from (batch_size, 1, 28, 28) to (batch_size, 28*28)
        data = data.view(-1, 28*28)
        optimizer.zero_grad()
        net_out = net(data)
        loss = criterion(net_out, target)
        loss.backward()
        optimizer.step()
        if batch_idx % log_interval == 0:
            print('Train Epoch: {} [{}/{} ({:.0f}%)]\tLoss: {:.6f}'.format(
                epoch, batch_idx * len(data), len(train_loader.dataset),
                       100. * batch_idx / len(train_loader), loss.data[0]))

# run a test loop
test_loss = 0
correct = 0
for data, target in test_loader:
    data, target = Variable(data, volatile=True), Variable(target)
    data = data.view(-1, 28 * 28)
    net_out = net(data)
    # sum up batch loss
    test_loss += criterion(net_out, target).data[0]
    pred = net_out.data.max(1)[1]  # get the index of the max log-probability
    correct += pred.eq(target.data).sum()

test_loss /= len(test_loader.dataset)
print('\nTest set: Average loss: {:.4f}, Accuracy: {}/{} ({:.0f}%)\n'.format(
    test_loss, correct, len(test_loader.dataset),
    100. * correct / len(test_loader.dataset)))


In [None]:
data = ["\t".join(l.split("\t")[1:-1]) for l in open("../junk/security_dataset.tsv","r").read().split("\n")[1:]]
open("../files/dataset/dataset.tsv","w").write("\n".join(data))

In [None]:
with shelve.open(use_embeddings, 'r') as db:    
    allitems = list(db.items())
    emb = [el[1] for el in allitems]
    wds = [el[0] for el in allitems]
    file = open("../files/embeddings_list.pkl", "wb")
    pickle.dump(allitems, file)
            
        

In [None]:
import time 
word = "margherita pizza" 

def extractUSEEmbeddings(words):
    try:
        embed = hub.KerasLayer(USE_folder)
    except Exception as e:
        !mkdir $USE_folder
        !curl -L "https://tfhub.dev/google/universal-sentence-encoder-large/5?tf-hub-format=compressed" | tar -zxvC $USE_folder
        embed = hub.KerasLayer(USE_folder)
        pass
    word_embeddings = embed(words)
    return word_embeddings.numpy()

def compare_sim(args):
    words, word_to_compare, max_sim, closest_word = args
    t = time.time()
    word_embeddings = extractUSEEmbeddings(words)
    print ("Took me {} seconds to extract USE embeddings...".format(time.time()-t))
    sys.stdout.flush()
    closest_word_idx = np.argmax(awesome_cossim_topn(coo_matrix(word_embeddings, dtype=np.float64), coo_matrix(word_to_compare.transpose(), dtype=np.float64), 10, 0.85, use_threads=True, n_jobs=250))
    sim = np.max(awesome_cossim_topn(coo_matrix(word_embeddings, dtype=np.float64), coo_matrix(word_to_compare.transpose(), dtype=np.float64), 10, 0.85, use_threads=True, n_jobs=250))
    if sim > max_sim:
        max_sim = sim
        closest_word = words[closest_word_idx]
    del word_embeddings
    return (closest_word, max_sim)

def closest_word_USE(word, method="USE"):

    word_to_compare = extractUSEEmbeddings([word])
    print ("Took me {} seconds to extract USE embeddings...".format(time.time()-a))
#     words = list(word2id_db.keys())
    print ("Took me {} seconds to obtain words list...".format(time.time()-a))
    len_part = 100000
    max_sim = -1000
    n_parts = ceil(len(words)/len_part)
    closest_word = ""
    for i in range(n_parts):
        t = time.time()
        words_part = words[i*len_part:(i+1)*len_part]
        sub_arrays = np.array_split(words_part, 2)
        args = [(sub_array, word_to_compare, max_sim, closest_word) for sub_array in sub_arrays]
        results = []
        with concurrent.futures.ProcessPoolExecutor(max_workers=2) as executor:
            for res in executor.map(compare_sim, args):
                results.append(res)
        closest_word, max_sim = max(results, key=lambda l:l[-1])
        print ("Took me {} seconds to iteration of sim compare...".format(time.time()-t))

    
    return closest_word

a = time.time()
closest_word = closest_word_USE("wansecure firewall")
print (time.time()-a)
closest_word



In [3]:
import time 
word = "margherita pizza" 

def extractUSEEmbeddings(words):
    try:
        embed = hub.KerasLayer(USE_folder)
    except Exception as e:
        !mkdir $USE_folder
        !curl -L "https://tfhub.dev/google/universal-sentence-encoder-large/5?tf-hub-format=compressed" | tar -zxvC $USE_folder
        embed = hub.KerasLayer(USE_folder)
        pass
    word_embeddings = embed(words)
    return word_embeddings.numpy()

def compare_sim(args):
    words, word_to_compare, max_sim, closest_word = args
    t = time.time()
    word_embeddings = extractUSEEmbeddings(words)
    print ("Took me {} seconds to extract USE embeddings...".format(time.time()-t))
    sys.stdout.flush()
    closest_word_idx = np.argmax(awesome_cossim_topn(coo_matrix(word_embeddings, dtype=np.float64), coo_matrix(word_to_compare.transpose(), dtype=np.float64), 10, 0.85, use_threads=True, n_jobs=250))
    sim = np.max(awesome_cossim_topn(coo_matrix(word_embeddings, dtype=np.float64), coo_matrix(word_to_compare.transpose(), dtype=np.float64), 10, 0.85, use_threads=True, n_jobs=250))
    if sim > max_sim:
        max_sim = sim
        closest_word = words[closest_word_idx]
    del word_embeddings
    return (closest_word, max_sim)

def closest_word_USE(word, method="USE"):

    word_to_compare = extractUSEEmbeddings([word])
    print ("Took me {} seconds to extract USE embeddings...".format(time.time()-a))
#     words = list(word2id_db.keys())
    print ("Took me {} seconds to obtain words list...".format(time.time()-a))
    len_part = 100000
    max_sim = -1000
    n_parts = ceil(len(words)/len_part)
    closest_word = ""
    for i in range(n_parts):
        t = time.time()
        words_part = words[i*len_part:(i+1)*len_part]
        sub_arrays = np.array_split(words_part, 2)
        args = [(sub_array, word_to_compare, max_sim, closest_word) for sub_array in sub_arrays]
        results = []
        with concurrent.futures.ProcessPoolExecutor(max_workers=2) as executor:
            for res in executor.map(compare_sim, args):
                results.append(res)
        closest_word, max_sim = max(results, key=lambda l:l[-1])
        print ("Took me {} seconds to iteration of sim compare...".format(time.time()-t))

    
    return closest_word

a = time.time()
closest_word = closest_word_USE("wansecure firewall")
print (time.time()-a)
closest_word



22912765

In [7]:
# words_sample = ["pizza hut", "burger king", "south africa", "nasa"]
# del og_dict
def calculate_sim(words, word1, max_sim, closest_word):
    t = time.time()
    i = 0
    for word2 in words:
        try:
            sim = wiki2vec.similarity("_".join(word1.lower().split()), "_".join(word2.split()))
            if sim > max_sim:
                max_sim = sim
                closest_word = word2
            i += 1
        except Exception as e:
            continue
    print ("Original word: ", word1, "Closest Word: ", closest_word)
    print ("Took me {} seconds to iteration of sim compare...".format(time.time()-a))
    sys.stdout.flush()
    return (closest_word, max_sim)

def closest_word_w2v(word1):
    len_part = 100000
    max_sim = -1000
    n_parts = ceil(len(words)/len_part)
    closest_word = ""
    if word1 not in wiki2vec.wv.vocab:
        print ("Original word not in vocab", word1)
        return (closest_word, max_sim)
    for i in range(n_parts):
        words_part = words[i*len_part:(i+1)*len_part]
        closest_word, max_sim = calculate_sim(words_part, word1, max_sim, closest_word)
    return word1, closest_word          

a = time.time()

# closest_word = closest_word_w2v("margherita pizza")

# closest_word_w2v("nelson mandela")

resolved = dict()
with concurrent.futures.ProcessPoolExecutor(max_workers=5) as executor:
    for res in executor.map(closest_word_w2v, failed):
        resolved[res[0]] = res[1]


    



Original word not in vocab peter wyche (diplomat)




Original word not in vocab acoma-zuni section
Original word not in vocab madan-harini
Original word not in vocab trust no one (internet security)
Original word not in vocab international tibet independence movement
Original word not in vocab isobase
Original word not in vocab human computer interaction (security)
Original word not in vocab poetas de karaoke




Original word not in vocab ipa pulmonic consonant chart with audio
Original word not in vocab lego clutch powers: bad hair day
Original word not in vocab aed (non-profit)
Original word not in vocab quilmes airport
Original word not in vocab yendegaia airport
Original word not in vocab the pack a.d.
Original word not in vocab harvie-watt baronets
Original word not in vocab sharp actius rd3d notebook
Original word not in vocab big beach boutique ii - the movie
Original word not in vocab privacy by design
Original word not in vocab motorola devour
Original word not in vocab piracy act
Original word not in vocab starter ring gear
Original word not in vocab antonio sánchez (puerto rican host)
Original word not in vocab electronic logbook
Original word not in vocab greg burke (journalist)
Original word not in vocab deaths in november 2013
Original word not in vocab hp mini 311
Original word not in vocab confederation of indigenous nationalities of the ecuadorian amazon
Original word not in v

142 done
154 done
147 done
159 done
139 done
143 done
155 done
148 done
160 done
140 done
144 done
156 done
161 done
149 done
145 done
157 done
141 done
150 done
146 done
142 done
158 done
162 done
151 done
147 done
159 done
143 done
152 done
148 done
163 done
160 done
144 done
149 done
153 done
161 done
145 done
150 done
146 done
154 done
164 done
151 done
147 done
155 done
148 done
152 done
156 done
165 done
162 done
149 done
157 done
166 done
163 done
153 done
150 done
158 done
154 done
151 done
167 done
159 done
155 done
164 done
152 done
168 done
156 done
160 done
165 done
153 done
169 done
161 done
157 done
166 done
170 done
154 done
158 done
167 done
159 done
155 done
171 done
168 done
156 done
162 done
160 done
172 done
169 done
157 done
161 done
163 done
173 done
170 done
158 done
171 done
159 done
174 done
172 done
162 done
160 done
175 done
164 done
161 done
173 done
163 done
176 done
165 done
174 done
166 done
177 done
175 done
178 done
164 done
167 done
162 done
176 done
1

330 done
321 done
336 done
327 done
338 done
322 done
337 done
328 done
331 done
323 done
339 done
338 done
332 done
324 done
329 done
340 done
339 done
325 done
333 done
340 done
330 done
326 done
341 done
334 done
327 done
341 done
331 done
335 done
342 done
332 done
336 done
328 done
343 done
342 done
337 done
333 done
344 done
343 done
329 done
345 done
338 done
344 done
334 done
346 done
330 done
339 done
345 done
335 done
347 done
340 done
336 done
331 done
348 done
346 done
337 done
347 done
332 done
341 done
349 done
348 done
338 done
333 done
350 done
342 done
349 done
334 done
339 done
351 done
343 done
350 done
340 done
335 done
344 done
351 done
336 done
352 done
345 done
341 done
353 done
337 done
346 done
352 done
354 done
342 done
347 done
338 done
353 done
355 done
348 done
343 done
339 done
356 done
354 done
344 done
349 done
340 done
355 done
345 done
357 done
350 done
356 done
346 done
341 done
358 done
351 done
347 done
357 done
359 done
348 done
360 done
358 done
3

In [1]:
from gensim.models import KeyedVectors
# w2v = KeyedVectors.load_word2vec_format("~/GoogleNews-vectors-negative300.bin", binary=True)
wiki2vec = KeyedVectors.load_word2vec_format("/home/vlead/enwiki_20180420_win10_300d.txt")

In [4]:
from copy import deepcopy
og_dict = deepcopy(wiki2vec.wv.vocab)
for k in og_dict:
    if "/" in k:
        wiki2vec.wv.vocab[k.split("/")[1].lower()] = wiki2vec.wv.vocab[k]
        del wiki2vec.wv.vocab[k]
del og_dict
f = open("w2v_data", "wb")
pickle.dump([words, failed], f)

  
  """
  


In [44]:
a = time.time()
"abrkadabra" in w2v.wv
print (time.time()-a)
a = time.time()
try:
    w2v.similarity("margherita_pizza", "abrkadabra")
except:    
    print (time.time()-a)
    pass

0.0005288124084472656
0.00016236305236816406


  


In [3]:
words = list(word2id_db.keys())

In [3]:
wiki2vec["january"]

array([ 0.0884,  0.2092, -0.1895, -0.1527, -0.0978,  0.0378, -0.1611,
        0.0245,  0.0549, -0.2892,  0.0931, -0.3243, -0.2276, -0.0727,
        0.0521, -0.2883, -0.0754, -0.0059, -0.0705, -0.3562, -0.1019,
        0.0847,  0.111 ,  0.0049, -0.3304, -0.2235,  0.1369, -0.1037,
       -0.0751, -0.3887,  0.1092, -0.1504,  0.0167,  0.0217,  0.0204,
        0.064 , -0.2647,  0.3114, -0.0973,  0.1509, -0.2116, -0.0882,
        0.1436, -0.2557,  0.23  ,  0.1662,  0.04  , -0.1121,  0.0426,
       -0.179 , -0.0356, -0.1443, -0.2153, -0.1841, -0.2113, -0.1561,
        0.258 , -0.0593, -0.1704, -0.0394, -0.0992, -0.1615,  0.0623,
       -0.1708, -0.1204,  0.2041,  0.173 , -0.3095, -0.0589, -0.0366,
        0.0084, -0.2201, -0.3896, -0.2086,  0.323 , -0.0779, -0.1028,
        0.0626,  0.2596,  0.0631,  0.18  ,  0.1857,  0.3112,  0.0103,
        0.2184, -0.102 ,  0.0504,  0.0907,  0.2355,  0.2216,  0.0125,
        0.0075,  0.0846, -0.1534,  0.4137,  0.0309, -0.2167, -0.0785,
       -0.0552,  0.1

In [5]:
import pickle
f = open("resolved", "rb")
resolved = pickle.load(f)

{'peter wyche (diplomat)': ('', -1000),
 'acoma-zuni section': ('', -1000),
 'madan-harini': ('', -1000),
 'trust no one (internet security)': ('', -1000),
 'ipa pulmonic consonant chart with audio': ('', -1000),
 'isobase': ('', -1000),
 'international tibet independence movement': ('', -1000),
 'human computer interaction (security)': ('', -1000),
 'poetas de karaoke': ('', -1000),
 'lego clutch powers: bad hair day': ('', -1000),
 'yendegaia airport': ('', -1000),
 'aed (non-profit)': ('', -1000),
 'quilmes airport': ('', -1000),
 'the pack a.d.': ('', -1000),
 'harvie-watt baronets': ('', -1000),
 'sharp actius rd3d notebook': ('', -1000),
 'big beach boutique ii - the movie': ('', -1000),
 'privacy by design': ('', -1000),
 'motorola devour': ('', -1000),
 'piracy act': ('', -1000),
 'antonio sánchez (puerto rican host)': ('', -1000),
 'yinzcam': ('nanorex', 0.69111955),
 'starter ring gear': ('', -1000),
 'electronic logbook': ('', -1000),
 'greg burke (journalist)': ('', -1000),