In [1]:
import bcolz, pickle, os, sys, pickledb, time
import concurrent.futures
import numpy as np
from math import ceil
from itertools import count
from collections import defaultdict
from difflib import SequenceMatcher
import tensorflow as tf
import tensorflow_hub as hub
from scipy import spatial
from gensim.models.keyedvectors import KeyedVectors
from copy import deepcopy

train_file = "/data/Vivek/original/HypeNET/dataset/custom_train_0.0_0.05.tsv"
test_file =  "/data/Vivek/original/HypeNET/dataset/custom_test_0.0_0.05.tsv"
instances_file = '../files/dataset/test_instances.tsv'
knocked_file = '../files/dataset/test_knocked.tsv'
output_folder = "../junk/Output/"
embeddings_folder = "../junk/Wiki2Vec.dat/"
USE_folder = "/home/vlead/USE"
embeddings_file = "/home/vlead/wiki2vec_glove.txt"
use_embeddings = "../files/embeddings.pt"

POS_DIM = 4
DEP_DIM = 5
DIR_DIM = 1
EMBEDDING_DIM = 512
NULL_VECTOR = np.random.rand(512)
NULL_PATH = ((tuple(NULL_VECTOR), 0, 0, 0),)
# NULL_PATH = ((0, 0, 0, 0), )
relations = ["hypernym", "hyponym", "concept", "instance", "none"]
NUM_RELATIONS = len(relations)
prefix = "../junk/db_files/"
# op_file = "../junk/wiki2vec_input.pkl"

# model = KeyedVectors.load_word2vec_format("/data/Vivek/glove_tmp")
 
# wiki2vec = KeyedVectors.load_word2vec_format("/home/vlead/enwiki_20180420_win10_300d.txt")

# og_dict = deepcopy(wiki2vec.wv.vocab)
# for k in og_dict:
#     if "/" in k:
#         wiki2vec.wv.vocab[k.split("/")[1].lower()] = wiki2vec.wv.vocab[k]
#         del wiki2vec.wv.vocab[k]
# del og_dict

In [38]:
def load_embeddings_from_disk():
    try:
        embeddings = bcolz.open(embeddings_folder)[:]
        word2idx = pickle.load(open(embeddings_folder + 'words_index.pkl', 'rb'))
        og = deepcopy(word2idx)
        for k in og:
            if "/" in k:
                word2idx[k.split("/")[1].lower()] = og[k]
                del word2idx[k]
        row_norm = np.sum(np.abs(embeddings)**2, axis=-1)**(1./2)
        embeddings /= row_norm[:, np.newaxis]
    except:
        embeddings, word2idx = create_embeddings()
    return embeddings, word2idx


def create_embeddings():
#     vocab = set([a for a in success + dataset_keys if a])
    idx = 1
    word2idx = {"_unk_": 0}
    vectors = bcolz.carray(NULL_VECTOR.reshape(1, EMBEDDING_DIM), rootdir=embeddings_folder, mode='w')
    with open(embeddings_file, 'r') as f:
        for l in f:
            line = [a[::-1] for a in l[::-1].split(" ", 300)[::-1]]
            word, vector = line[0], [float(s) for s in line[1:]]
            vectors.append(np.resize(np.array(vector), (1, 300)).astype(np.float))
            word2idx[word] = idx
            idx += 1


    vectors = bcolz.carray(vectors, rootdir=embeddings_folder, mode='w')
    vectors.flush()

    pickle.dump(word2idx, open(embeddings_folder + 'words_index.pkl', 'wb'))
    row_norm = np.sum(np.abs(vectors)**2, axis=-1)**(1./2)
    vectors /= row_norm[:, np.newaxis]
    return vectors, word2idx



arrow_heads = {">": "up", "<":"down"}


def extract_direction(edge):

    if edge[0] == ">" or edge[0] == "<":
        direction = "start_" + arrow_heads[edge[0]]
        edge = edge[1:]
    elif edge[-1] == ">" or edge[-1] == "<":
        direction = "end_" + arrow_heads[edge[-1]]
        edge = edge[:-1]
    else:
        direction = ' '
    return direction, edge

def parse_path(path):
    parsed_path = []
    for edge in path.split("*##*"):
        direction, edge = extract_direction(edge)
        if edge.split("/"):
            try:
                embedding, pos, dependency = tuple([a[::-1] for a in edge[::-1].split("/",2)][::-1])
            except:
                print (edge, path)
                raise
            emb_idx, pos_idx, dep_idx, dir_idx = tuple(emb_indexer[embedding]), pos_indexer[pos], dep_indexer[dependency], dir_indexer[direction]
            parsed_path.append(tuple([emb_idx, pos_idx, dep_idx, dir_idx]))
        else:
            return None
    return tuple(parsed_path)

def parse_tuple(tup):
    x, y = tup
    paths_x = list(extract_paths(relations_db,x,y).items()) 
    paths_y = list(extract_paths(relations_db,y,x).items())
    x_word = id_to_entity(id2word_db, x) if x!=-1 else "X"
    y_word = id_to_entity(id2word_db, y) if y!=-1 else "Y"
    path_count_dict_x = { id_to_path(id2path_db, path).replace("X/", x_word+"/").replace("Y/", y_word+"/") : freq for (path, freq) in paths_x }
    path_count_dict_y = { id_to_path(id2path_db, path).replace("Y/", x_word+"/").replace("X/", y_word+"/") : freq for (path, freq) in paths_y }
    path_count_dict = {**path_count_dict_x, **path_count_dict_y}

#     paths_xy = list(extract_paths(relations_db,x,y).items())
#     paths_yx = list(extract_paths(relations_db,y,x).items())
#     path_count_dict = { id_to_path(id2path_db, path) : freq for (path, freq) in paths_xy }
#     path_count_dict.update({ id_to_path(id2path_db, path).replace("X/", '@@@').replace('Y/', 'X/').replace('@@@', 'Y/') : freq for (path, freq) in paths_yx })
    return path_count_dict

def parse_dataset(dataset):
    print ("Parsing dataset for ", prefix)

    dataset_ids = [(entity_to_id(word2id_db, tup[0]), entity_to_id(word2id_db, tup[1])) for tup in dataset]
#     global words_list
#     for (x,y) in dataset:
#         words_list.append(x)
#         words_list.append(y)
    parsed_dicts = [parse_tuple(tup) for tup in dataset_ids]
    parsed_dicts = [{ parse_path(path) : path_count_dict[path] for path in path_count_dict } for path_count_dict in parsed_dicts]
    paths = [{ path : path_count_dict[path] for path in path_count_dict if path} for path_count_dict in parsed_dicts]
    empty = [list(dataset_ids)[i] for i, path_list in enumerate(paths) if len(list(path_list.keys())) == 0]
    paths = [{NULL_PATH: 1} if not path_list else path_list for i, path_list in enumerate(paths)]
#     embed_indices = [(return_sim(x), return_sim(y)) for (x,y) in dataset]
#     embed_indices = []
    embed_indices = [(tuple(emb_indexer[x]), tuple(emb_indexer[y])) for (x,y) in dataset]
    
    return embed_indices, paths

pos_indexer, dep_indexer, dir_indexer = defaultdict(count(0).__next__), defaultdict(count(0).__next__), defaultdict(count(0).__next__)
unk_pos, unk_dep, unk_dir = pos_indexer["#UNKNOWN#"], dep_indexer["#UNKNOWN#"], dir_indexer["#UNKNOWN#"]

train_dataset = {tuple(l.split("\t")[:2]): l.split("\t")[2] for l in open(train_file).read().split("\n")}
test_dataset = {tuple(l.split("\t")[:2]): l.split("\t")[2] for l in open(test_file).read().split("\n")}
test_instances = {tuple(l.split("\t")[:2]): l.split("\t")[2] for l in open(instances_file).read().split("\n")}
# test_instances_2 = {tuple(l.split("\t")[:2]): l.split("\t")[2] for l in open(instances_file).read().split("\n")[141:]}
test_knocked = {tuple(l.split("\t")[:2]): l.split("\t")[2] for l in open(knocked_file).read().split("\n")}


dataset_keys = list(train_dataset.keys()) + list(test_dataset.keys()) + list(test_instances.keys()) + list(test_knocked.keys())
dataset_vals = list(train_dataset.values()) + list(test_dataset.values()) + list(test_instances.values()) + list(test_knocked.values())
# # dataset_keys = list(test_instances.keys())
# # dataset_vals = list(test_instances.values())


# embeddings, emb_indexer = load_embeddings_from_disk()

mappingDict = {key: idx for (idx,key) in enumerate(relations)}

word2id_db = pickledb.load(prefix + "w2i.db", False)
id2word_db = pickledb.load(prefix + "i2w.db", False)
path2id_db = pickledb.load(prefix + "p2i.db", False)
id2path_db = pickledb.load(prefix + "i2p.db", False)
relations_db = pickledb.load(prefix + "relations.db", False)




# for threshold in [0] + np.arange(0.5,1,0.05):
for threshold in [0]+ list(np.arange(0.5,1,0.05)):
    op_file = "../junk/use_input_" + str(threshold) + ".pkl"
    failed, success = [], []
    # embeddings_file = "/data/Vivek/glove.6B.300d.txt"
    def id_to_entity(db, entity_id):
        entity = db.get(str(entity_id))
        return entity

    def id_to_path(db, entity_id):
        entity = db.get(str(entity_id))
        entity = "/".join(["*##*".join(e.split("_", 1)) for e in entity.split("/")])
        return entity

    def entity_to_id(db, entity):
        global success, failed
        entity_id = db.get(entity)
        if entity_id:
            success.append(entity)
            return int(entity_id)
        closest_entity = resolved.get(entity, "")
        if closest_entity and closest_entity[0] and float(closest_entity[1]) > threshold:
            success.append(entity)
            return int(db.get(closest_entity[0]))
        failed.append(entity)
        return -1

    def extract_paths(db, x, y):
        key = (str(x) + '###' + str(y))
        try:
            relation = db.get(key)
            return {int(path_count.split(":")[0]): int(path_count.split(":")[1]) for path_count in relation.split(",")}
        except Exception as e:
            return {}

    def return_sim(model, word):
        try:
            return tuple(model["_".join(word.split())])
        except:
            return tuple(NULL_VECTOR)
    # def load_embeddings_from_disk():
    #     try:
    #         vectors = bcolz.open(embeddings_folder)[:]
    #         word2idx = pickle.load(open(embeddings_folder + 'words_index.pkl', 'rb'))

    #         vocab = set([a for a in success + dataset_keys if a])
    #         word2idx_lite = {"_unk_": 0}
    #         embeddings_lite = bcolz.carray(np.random.uniform(-1, 1, (1, 300)), rootdir=embeddings_folder, mode='w')

    #         for word in word2idx:
    #             if word not in vocab:
    #                 continue
    #             embeddings_lite.append(embeddings[word2idx[word]])
    #             word2idx_lite[word] = idx
    #             idx += 1

    #         embeddings = vectors
    #     except:
    #         embeddings_lite, word2idx_lite = create_embeddings()
    #     return embeddings_lite, word2idx_lite


    
    

    embed_indices, x = parse_dataset(dataset_keys)
    y = [mappingDict[relation] for relation in dataset_vals]


    s1 = len(train_dataset)
    s2 = len(train_dataset) + len(test_dataset)
    s3 = len(train_dataset)+len(test_dataset)+len(test_instances)

    parsed_train = (embed_indices[:s1], x[:s1], y[:s1], dataset_keys[:s1], dataset_vals[:s1])
    parsed_test = (embed_indices[s1:s2], x[s1:s2], y[s1:s2], dataset_keys[s1:s2], dataset_vals[s1:s2])
    parsed_instances = (embed_indices[s2:s3], x[s2:s3], y[s2:s3], dataset_keys[s2:s3], dataset_vals[s2:s3])
    parsed_knocked = (embed_indices[s3:], x[s3:], y[s3:], dataset_keys[s3:], dataset_vals[s3:])

    f = open(op_file, "wb+")
    pickle.dump([parsed_train, parsed_test, parsed_instances, parsed_knocked, pos_indexer, dep_indexer, dir_indexer], f)
    f.close()

    print ("Successful hits: ", len(success), "Failed hits: ", len(failed))
    print ("Parsed",prefix) 

KeyboardInterrupt: 

In [41]:
def id_to_entity(db, entity_id):
    entity = db.get(str(entity_id))
    return entity

def id_to_path(db, entity_id):
    entity = db.get(str(entity_id))
    entity = "/".join(["*##*".join(e.split("_", 1)) for e in entity.split("/")])
    return entity

def entity_to_id(db, entity):
    global success, failed
    entity_id = db.get(entity)
    if entity_id:
#         success.append(entity)
        return int(entity_id)
#     closest_entity = resolved.get(entity, "")
#     if closest_entity and closest_entity[0] and float(closest_entity[1]) > threshold:
#         success.append(entity)
#         return int(db.get(closest_entity[0]))
#     failed.append(entity)
    return -1

def extract_paths(db, x, y):
    key = (str(x) + '###' + str(y))
    try:
        relation = db.get(key)
        print (relation)
        return {int(path_count.split(":")[0]): int(path_count.split(":")[1]) for path_count in relation.split(",")}
    except Exception as e:
        print (e)
        return {}

parse_tuple((entity_to_id(word2id_db, "cipher"), entity_to_id(word2id_db, "feal")))

False
'bool' object has no attribute 'split'
2862967:1,4788466:1


{'feal/PROPN/nsubj>*##*be/VERB/ROOT*##*<cipher/NOUN/attr*##*>propose/VERB/acl': 1,
 'feal/PROPN/nsubj>*##*be/VERB/ROOT*##*<cipher/NOUN/attr': 1}

In [7]:
emb_indexer

{'_unk_': 0,
 'the': 769585,
 'in': 1501724,
 'of': 3,
 'a': 4181521,
 'and': 3754231,
 'is': 6,
 'to': 736573,
 'was': 8,
 'by': 9,
 'for': 10,
 'on': 2153702,
 'as': 4181319,
 'at': 13,
 'from': 14,
 'with': 15,
 'an': 4367177,
 'it': 17,
 'that': 2961895,
 'also': 19,
 'which': 20,
 'first': 21,
 'this': 22,
 'has': 23,
 'he': 1571821,
 'one': 3968824,
 'his': 26,
 'are': 27,
 'after': 28,
 'who': 29,
 'were': 1306507,
 'two': 2968491,
 'its': 32,
 'new': 33,
 'be': 3814830,
 'or': 35,
 'but': 1243094,
 'had': 37,
 'their': 38,
 'been': 39,
 'born': 40,
 'not': 41,
 'other': 42,
 'all': 3293660,
 'have': 44,
 'during': 45,
 'time': 4186961,
 'when': 47,
 'may': 358946,
 'they': 2960908,
 'into': 50,
 'category': 51,
 'known': 52,
 'united': 2785692,
 'up': 2332954,
 'where': 55,
 'years': 56,
 'only': 57,
 'over': 58,
 'there': 59,
 'three': 60,
 'american': 61,
 'year': 68014,
 'more': 63,
 'part': 64,
 'most': 65,
 'later': 66,
 'between': 67,
 '1': 3981100,
 'national': 69,
 'mad

In [28]:
dataset_ids = [(entity_to_id(word2id_db, tup[0]), entity_to_id(word2id_db, tup[1])) for tup in dataset_keys]

parsed_dicts = [parse_tuple(tup) for tup in dataset_ids]


False
'bool' object has no attribute 'split'
False
'bool' object has no attribute 'split'
False
'bool' object has no attribute 'split'
False
'bool' object has no attribute 'split'
False
'bool' object has no attribute 'split'
False
'bool' object has no attribute 'split'
False
'bool' object has no attribute 'split'
False
'bool' object has no attribute 'split'
439431:1
1513242:1,4788466:1
False
'bool' object has no attribute 'split'
False
'bool' object has no attribute 'split'
False
'bool' object has no attribute 'split'
False
'bool' object has no attribute 'split'
False
'bool' object has no attribute 'split'
False
'bool' object has no attribute 'split'
False
'bool' object has no attribute 'split'
False
'bool' object has no attribute 'split'
False
'bool' object has no attribute 'split'
False
'bool' object has no attribute 'split'
1054871:1,1103884:1,1111918:1,1127970:1,1197544:2,1204440:1,1237694:1,1271646:1,1292660:2,1360121:1,1499340:1,1564195:1,1605444:1,1620340:1,1683924:1,1693463:1,1

False
'bool' object has no attribute 'split'
False
'bool' object has no attribute 'split'
False
'bool' object has no attribute 'split'
False
'bool' object has no attribute 'split'
False
'bool' object has no attribute 'split'
False
'bool' object has no attribute 'split'
False
'bool' object has no attribute 'split'
False
'bool' object has no attribute 'split'
False
'bool' object has no attribute 'split'
False
'bool' object has no attribute 'split'
False
'bool' object has no attribute 'split'
False
'bool' object has no attribute 'split'
False
'bool' object has no attribute 'split'
False
'bool' object has no attribute 'split'
False
'bool' object has no attribute 'split'
4788466:1
False
'bool' object has no attribute 'split'
False
'bool' object has no attribute 'split'
False
'bool' object has no attribute 'split'
False
'bool' object has no attribute 'split'
4788466:1,87220:1
False
'bool' object has no attribute 'split'
False
'bool' object has no attribute 'split'
False
'bool' object has no 

False
'bool' object has no attribute 'split'
False
'bool' object has no attribute 'split'
False
'bool' object has no attribute 'split'
False
'bool' object has no attribute 'split'
False
'bool' object has no attribute 'split'
False
'bool' object has no attribute 'split'
False
'bool' object has no attribute 'split'
False
'bool' object has no attribute 'split'
False
'bool' object has no attribute 'split'
False
'bool' object has no attribute 'split'
False
'bool' object has no attribute 'split'
False
'bool' object has no attribute 'split'
False
'bool' object has no attribute 'split'
False
'bool' object has no attribute 'split'
False
'bool' object has no attribute 'split'
False
'bool' object has no attribute 'split'
1671418:1,4788466:1
False
'bool' object has no attribute 'split'
False
'bool' object has no attribute 'split'
False
'bool' object has no attribute 'split'
False
'bool' object has no attribute 'split'
False
'bool' object has no attribute 'split'
False
'bool' object has no attribut

'bool' object has no attribute 'split'
False
'bool' object has no attribute 'split'
False
'bool' object has no attribute 'split'
False
'bool' object has no attribute 'split'
False
'bool' object has no attribute 'split'
False
'bool' object has no attribute 'split'
False
'bool' object has no attribute 'split'
False
'bool' object has no attribute 'split'
False
'bool' object has no attribute 'split'
False
'bool' object has no attribute 'split'
False
'bool' object has no attribute 'split'
False
'bool' object has no attribute 'split'
False
'bool' object has no attribute 'split'
False
'bool' object has no attribute 'split'
1111918:1,2357175:1,4788466:1,848043:1
False
'bool' object has no attribute 'split'
False
'bool' object has no attribute 'split'
False
'bool' object has no attribute 'split'
False
'bool' object has no attribute 'split'
False
'bool' object has no attribute 'split'
False
'bool' object has no attribute 'split'
4654369:1,4788466:1
False
'bool' object has no attribute 'split'
Fa

406826:1,4169938:1,4562032:1
1675288:4,1680810:1,207132:1,2511053:2,3578799:1,3901418:1,3914107:2,4044541:1,4605980:1
False
'bool' object has no attribute 'split'
False
'bool' object has no attribute 'split'
False
'bool' object has no attribute 'split'
False
'bool' object has no attribute 'split'
False
'bool' object has no attribute 'split'
False
'bool' object has no attribute 'split'
False
'bool' object has no attribute 'split'
False
'bool' object has no attribute 'split'
False
'bool' object has no attribute 'split'
False
'bool' object has no attribute 'split'
False
'bool' object has no attribute 'split'
False
'bool' object has no attribute 'split'
False
'bool' object has no attribute 'split'
False
'bool' object has no attribute 'split'
False
'bool' object has no attribute 'split'
False
'bool' object has no attribute 'split'
False
'bool' object has no attribute 'split'
False
'bool' object has no attribute 'split'
False
'bool' object has no attribute 'split'
False
'bool' object has no 

'bool' object has no attribute 'split'
False
'bool' object has no attribute 'split'
False
'bool' object has no attribute 'split'
False
'bool' object has no attribute 'split'
False
'bool' object has no attribute 'split'
False
'bool' object has no attribute 'split'
False
'bool' object has no attribute 'split'
False
'bool' object has no attribute 'split'
False
'bool' object has no attribute 'split'
False
'bool' object has no attribute 'split'
False
'bool' object has no attribute 'split'
False
'bool' object has no attribute 'split'
False
'bool' object has no attribute 'split'
False
'bool' object has no attribute 'split'
False
'bool' object has no attribute 'split'
False
'bool' object has no attribute 'split'
False
'bool' object has no attribute 'split'
False
'bool' object has no attribute 'split'
False
'bool' object has no attribute 'split'
False
'bool' object has no attribute 'split'
False
'bool' object has no attribute 'split'
False
'bool' object has no attribute 'split'
False
'bool' obj

'bool' object has no attribute 'split'
False
'bool' object has no attribute 'split'
False
'bool' object has no attribute 'split'
1965752:1,2693745:1
False
'bool' object has no attribute 'split'
False
'bool' object has no attribute 'split'
1111918:1,4788466:1
False
'bool' object has no attribute 'split'
False
'bool' object has no attribute 'split'
False
'bool' object has no attribute 'split'
False
'bool' object has no attribute 'split'
False
'bool' object has no attribute 'split'
False
'bool' object has no attribute 'split'
False
'bool' object has no attribute 'split'
False
'bool' object has no attribute 'split'
False
'bool' object has no attribute 'split'
False
'bool' object has no attribute 'split'
False
'bool' object has no attribute 'split'
False
'bool' object has no attribute 'split'
False
'bool' object has no attribute 'split'
False
'bool' object has no attribute 'split'
False
'bool' object has no attribute 'split'
False
'bool' object has no attribute 'split'
1111918:1,4788466:1
F

'bool' object has no attribute 'split'
False
'bool' object has no attribute 'split'
False
'bool' object has no attribute 'split'
False
'bool' object has no attribute 'split'
False
'bool' object has no attribute 'split'
False
'bool' object has no attribute 'split'
False
'bool' object has no attribute 'split'
False
'bool' object has no attribute 'split'
False
'bool' object has no attribute 'split'
False
'bool' object has no attribute 'split'
False
'bool' object has no attribute 'split'
False
'bool' object has no attribute 'split'
False
'bool' object has no attribute 'split'
False
'bool' object has no attribute 'split'
False
'bool' object has no attribute 'split'
False
'bool' object has no attribute 'split'
False
'bool' object has no attribute 'split'
False
'bool' object has no attribute 'split'
False
'bool' object has no attribute 'split'
False
'bool' object has no attribute 'split'
False
'bool' object has no attribute 'split'
False
'bool' object has no attribute 'split'
False
'bool' obj

'bool' object has no attribute 'split'
False
'bool' object has no attribute 'split'
False
'bool' object has no attribute 'split'
False
'bool' object has no attribute 'split'
False
'bool' object has no attribute 'split'
False
'bool' object has no attribute 'split'
False
'bool' object has no attribute 'split'
False
'bool' object has no attribute 'split'
False
'bool' object has no attribute 'split'
False
'bool' object has no attribute 'split'
False
'bool' object has no attribute 'split'
False
'bool' object has no attribute 'split'
False
'bool' object has no attribute 'split'
False
'bool' object has no attribute 'split'
False
'bool' object has no attribute 'split'
False
'bool' object has no attribute 'split'
False
'bool' object has no attribute 'split'
False
'bool' object has no attribute 'split'
2135587:1,4431273:1
4788466:1,4803658:1
1008201:1,1236429:1,4788466:1,70950:1
1608783:1
False
'bool' object has no attribute 'split'
False
'bool' object has no attribute 'split'
False
'bool' objec

False
'bool' object has no attribute 'split'
False
'bool' object has no attribute 'split'
False
'bool' object has no attribute 'split'
192100:1,4788466:1
False
'bool' object has no attribute 'split'
False
'bool' object has no attribute 'split'
False
'bool' object has no attribute 'split'
False
'bool' object has no attribute 'split'
False
'bool' object has no attribute 'split'
False
'bool' object has no attribute 'split'
4788466:1,87220:1
False
'bool' object has no attribute 'split'
False
'bool' object has no attribute 'split'
False
'bool' object has no attribute 'split'
False
'bool' object has no attribute 'split'
False
'bool' object has no attribute 'split'
False
'bool' object has no attribute 'split'
False
'bool' object has no attribute 'split'
False
'bool' object has no attribute 'split'
False
'bool' object has no attribute 'split'
False
'bool' object has no attribute 'split'
False
'bool' object has no attribute 'split'
False
'bool' object has no attribute 'split'
False
'bool' objec

False
'bool' object has no attribute 'split'
False
'bool' object has no attribute 'split'
False
'bool' object has no attribute 'split'
False
'bool' object has no attribute 'split'
False
'bool' object has no attribute 'split'
False
'bool' object has no attribute 'split'
False
'bool' object has no attribute 'split'
False
'bool' object has no attribute 'split'
False
'bool' object has no attribute 'split'
False
'bool' object has no attribute 'split'
False
'bool' object has no attribute 'split'
False
'bool' object has no attribute 'split'
False
'bool' object has no attribute 'split'
False
'bool' object has no attribute 'split'
False
'bool' object has no attribute 'split'
False
'bool' object has no attribute 'split'
False
'bool' object has no attribute 'split'
False
'bool' object has no attribute 'split'
False
'bool' object has no attribute 'split'
False
'bool' object has no attribute 'split'
False
'bool' object has no attribute 'split'
False
'bool' object has no attribute 'split'
False
'boo

False
'bool' object has no attribute 'split'
False
'bool' object has no attribute 'split'
False
'bool' object has no attribute 'split'
False
'bool' object has no attribute 'split'
False
'bool' object has no attribute 'split'
False
'bool' object has no attribute 'split'
False
'bool' object has no attribute 'split'
False
'bool' object has no attribute 'split'
False
'bool' object has no attribute 'split'
False
'bool' object has no attribute 'split'
False
'bool' object has no attribute 'split'
False
'bool' object has no attribute 'split'
False
'bool' object has no attribute 'split'
False
'bool' object has no attribute 'split'
False
'bool' object has no attribute 'split'
False
'bool' object has no attribute 'split'
False
'bool' object has no attribute 'split'
False
'bool' object has no attribute 'split'
False
'bool' object has no attribute 'split'
False
'bool' object has no attribute 'split'
False
'bool' object has no attribute 'split'
False
'bool' object has no attribute 'split'
False
'boo

False
'bool' object has no attribute 'split'
False
'bool' object has no attribute 'split'
False
'bool' object has no attribute 'split'
False
'bool' object has no attribute 'split'
False
'bool' object has no attribute 'split'
False
'bool' object has no attribute 'split'
False
'bool' object has no attribute 'split'
False
'bool' object has no attribute 'split'
False
'bool' object has no attribute 'split'
False
'bool' object has no attribute 'split'
False
'bool' object has no attribute 'split'
False
'bool' object has no attribute 'split'
False
'bool' object has no attribute 'split'
False
'bool' object has no attribute 'split'
False
'bool' object has no attribute 'split'
False
'bool' object has no attribute 'split'
False
'bool' object has no attribute 'split'
False
'bool' object has no attribute 'split'
False
'bool' object has no attribute 'split'
False
'bool' object has no attribute 'split'
False
'bool' object has no attribute 'split'
False
'bool' object has no attribute 'split'
False
'boo

'bool' object has no attribute 'split'
False
'bool' object has no attribute 'split'
False
'bool' object has no attribute 'split'
False
'bool' object has no attribute 'split'
False
'bool' object has no attribute 'split'
False
'bool' object has no attribute 'split'
False
'bool' object has no attribute 'split'
False
'bool' object has no attribute 'split'
1111918:1,2949121:1,4788466:1
False
'bool' object has no attribute 'split'
False
'bool' object has no attribute 'split'
False
'bool' object has no attribute 'split'
False
'bool' object has no attribute 'split'
2590066:1
538977:1
3395245:1,4581212:1
False
'bool' object has no attribute 'split'
False
'bool' object has no attribute 'split'
False
'bool' object has no attribute 'split'
False
'bool' object has no attribute 'split'
False
'bool' object has no attribute 'split'
False
'bool' object has no attribute 'split'
False
'bool' object has no attribute 'split'
False
'bool' object has no attribute 'split'
False
'bool' object has no attribute 

False
'bool' object has no attribute 'split'
False
'bool' object has no attribute 'split'
False
'bool' object has no attribute 'split'
False
'bool' object has no attribute 'split'
False
'bool' object has no attribute 'split'
False
'bool' object has no attribute 'split'
False
'bool' object has no attribute 'split'
False
'bool' object has no attribute 'split'
False
'bool' object has no attribute 'split'
False
'bool' object has no attribute 'split'
False
'bool' object has no attribute 'split'
False
'bool' object has no attribute 'split'
False
'bool' object has no attribute 'split'
False
'bool' object has no attribute 'split'
False
'bool' object has no attribute 'split'
False
'bool' object has no attribute 'split'
False
'bool' object has no attribute 'split'
False
'bool' object has no attribute 'split'
False
'bool' object has no attribute 'split'
False
'bool' object has no attribute 'split'
False
'bool' object has no attribute 'split'
False
'bool' object has no attribute 'split'
False
'boo

'bool' object has no attribute 'split'
False
'bool' object has no attribute 'split'
False
'bool' object has no attribute 'split'
False
'bool' object has no attribute 'split'
False
'bool' object has no attribute 'split'
False
'bool' object has no attribute 'split'
False
'bool' object has no attribute 'split'
False
'bool' object has no attribute 'split'
False
'bool' object has no attribute 'split'
False
'bool' object has no attribute 'split'
False
'bool' object has no attribute 'split'
False
'bool' object has no attribute 'split'
False
'bool' object has no attribute 'split'
False
'bool' object has no attribute 'split'
False
'bool' object has no attribute 'split'
False
'bool' object has no attribute 'split'
False
'bool' object has no attribute 'split'
False
'bool' object has no attribute 'split'
False
'bool' object has no attribute 'split'
False
'bool' object has no attribute 'split'
False
'bool' object has no attribute 'split'
False
'bool' object has no attribute 'split'
False
'bool' obj

False
'bool' object has no attribute 'split'
False
'bool' object has no attribute 'split'
False
'bool' object has no attribute 'split'
False
'bool' object has no attribute 'split'
False
'bool' object has no attribute 'split'
False
'bool' object has no attribute 'split'
False
'bool' object has no attribute 'split'
False
'bool' object has no attribute 'split'
False
'bool' object has no attribute 'split'
False
'bool' object has no attribute 'split'
False
'bool' object has no attribute 'split'
False
'bool' object has no attribute 'split'
False
'bool' object has no attribute 'split'
False
'bool' object has no attribute 'split'
False
'bool' object has no attribute 'split'
False
'bool' object has no attribute 'split'
False
'bool' object has no attribute 'split'
False
'bool' object has no attribute 'split'
False
'bool' object has no attribute 'split'
False
'bool' object has no attribute 'split'
False
'bool' object has no attribute 'split'
False
'bool' object has no attribute 'split'
False
'boo

False
'bool' object has no attribute 'split'
False
'bool' object has no attribute 'split'
False
'bool' object has no attribute 'split'
False
'bool' object has no attribute 'split'
False
'bool' object has no attribute 'split'
False
'bool' object has no attribute 'split'
False
'bool' object has no attribute 'split'
False
'bool' object has no attribute 'split'
False
'bool' object has no attribute 'split'
False
'bool' object has no attribute 'split'
False
'bool' object has no attribute 'split'
False
'bool' object has no attribute 'split'
False
'bool' object has no attribute 'split'
False
'bool' object has no attribute 'split'
False
'bool' object has no attribute 'split'
False
'bool' object has no attribute 'split'
False
'bool' object has no attribute 'split'
False
'bool' object has no attribute 'split'
False
'bool' object has no attribute 'split'
False
'bool' object has no attribute 'split'
False
'bool' object has no attribute 'split'
False
'bool' object has no attribute 'split'
False
'boo

False
'bool' object has no attribute 'split'
False
'bool' object has no attribute 'split'
False
'bool' object has no attribute 'split'
False
'bool' object has no attribute 'split'
False
'bool' object has no attribute 'split'
False
'bool' object has no attribute 'split'
False
'bool' object has no attribute 'split'
False
'bool' object has no attribute 'split'
False
'bool' object has no attribute 'split'
False
'bool' object has no attribute 'split'
False
'bool' object has no attribute 'split'
False
'bool' object has no attribute 'split'
False
'bool' object has no attribute 'split'
False
'bool' object has no attribute 'split'
False
'bool' object has no attribute 'split'
False
'bool' object has no attribute 'split'
False
'bool' object has no attribute 'split'
False
'bool' object has no attribute 'split'
False
'bool' object has no attribute 'split'
False
'bool' object has no attribute 'split'
False
'bool' object has no attribute 'split'
False
'bool' object has no attribute 'split'
False
'boo

In [40]:
len(word2id_db.getall())

22912765

In [8]:
import tensorflow as tf
import tensorflow_hub as hub

USE_folder = "/home/vlead/USE"

def extractUSEEmbeddings(words):
    model = hub.load("https://tfhub.dev/google/universal-sentence-encoder-large/5?tf-hub-format=compressed")
    word_embeddings = model(words)
    return word_embeddings.numpy()

extractUSEEmbeddings(["words_list"])

In [5]:
words_list = list(set(words_list))

In [4]:
from scipy import spatial
words = ["river", "stream", "brook", "rivet", "rangerover", "pig", "ravish", "river/stream", "river or stream", "rivers and rivulets"]
embeds = extractUSEEmbeddings(words)
for i,word in enumerate(embeds[1:]):
    print ("river", words[i+1], 1 - spatial.distance.cosine(embeds[0], word))

river stream 0.5462131500244141
river brook 0.540290117263794
river rivet 0.27812615036964417
river rangerover 0.30641186237335205
river pig 0.41914620995521545
river ravish 0.34227073192596436
river river/stream 0.4524398148059845
river river or stream 0.7695298790931702
river rivers and rivulets 0.5320086479187012


In [11]:
f = open("../junk/use_embeddings.pkl", "wb")
pickle.dump(emb_indexer, f)

In [2]:
import time 
word = "margherita pizza" 

def extractUSEEmbeddings(words):
    try:
        embed = hub.KerasLayer(USE_folder)
    except Exception as e:
        !mkdir $USE_folder
        !curl -L "https://tfhub.dev/google/universal-sentence-encoder-large/5?tf-hub-format=compressed" | tar -zxvC $USE_folder
        embed = hub.KerasLayer(USE_folder)
        pass
#     tf.compat.v1.logging.set_verbosity(tf.compat.v1.logging.ERROR)
    word_embeddings = embed(words)
    return word_embeddings.numpy()

def compare_sim(words, word_to_compare, max_sim=-1000, closest_word=""):
    word_embeddings = extractUSEEmbeddings(words)
    closest_word = ""
    with shelve.open(use_embeddings, 'c') as db:
        for i, w in enumerate(word_embeddings):
            db[words[i]] = w
        closest_word_idx = np.argmax(awesome_cossim_topn(coo_matrix(word_embeddings, dtype=np.float64), coo_matrix(word_to_compare.transpose(), dtype=np.float64), 10, 0.85, use_threads=True, n_jobs=250))
        sim = np.max(awesome_cossim_topn(coo_matrix(word_embeddings, dtype=np.float64), coo_matrix(word_to_compare.transpose(), dtype=np.float64), 10, 0.85, use_threads=True, n_jobs=250))
        if sim > max_sim:
            max_sim = sim
            closest_word = words[closest_word_idx]
        del word_embeddings
    del db
    return closest_word, max_sim

def closest_word_USE(word, method="USE"):

    word_to_compare = extractUSEEmbeddings([word])
    print ("Took me {} seconds to extract USE embeddings...".format(time.time()-a))
    if os.path.isfile(use_embeddings):
        with shelve.open(use_embeddings, 'r') as db:
            embeds = np.array(list(db.values()))
            words = np.array(list(db.keys()))
            print ("Values and keys obtained", time.time()-a)
            sim_mat = awesome_cossim_topn(coo_matrix(embeds, dtype=np.float64), coo_matrix(word_to_compare.T, dtype=np.float64), 10, 0.85, use_threads=True, n_jobs=250)
            print ("Sim mat calculated", time.time()-a)
            closest_word_idx = np.argmax(sim_mat)
            print ("idx gotten", time.time()-a)
            closest_word = words[closest_word_idx]
    else:
        words = list(word2id_db.keys())
        print ("Obtained list of words")
        len_part = 100000
        max_sim = -1000
        n_parts = ceil(len(words)/len_part)
        closest_word = ""
        for i in range(n_parts):
            words_part = words[i*len_part:(i+1)*len_part]
            closest_word, max_sim = compare_sim(words_part, word_to_compare, max_sim, closest_word)

    
    return closest_word

a = time.time()
closest_word = closest_word_USE("wansecure firewall")
print (time.time()-a)
closest_word



In [6]:
word2id_db = pickledb.load(prefix + "w2i.db", False)
words = list(word2id_db.getall())

In [None]:

def awesome_cossim_top(A, B, ntop, lower_bound=0):
    # force A and B as a CSR matrix.
    # If they have already been CSR, there is no overhead
    A = A.tocsr()
    B = B.tocsr()
    M, _ = A.shape
    _, N = B.shape
 
    idx_dtype = np.int32
 
    nnz_max = M*ntop
 
    indptr = np.zeros(M+1, dtype=idx_dtype)
    indices = np.zeros(nnz_max, dtype=idx_dtype)
    data = np.zeros(nnz_max, dtype=A.dtype)

    ct.sparse_dot_topn(
        M, N, np.asarray(A.indptr, dtype=idx_dtype),
        np.asarray(A.indices, dtype=idx_dtype),
        A.data,
        np.asarray(B.indptr, dtype=idx_dtype),
        np.asarray(B.indices, dtype=idx_dtype),
        B.data,
        ntop,
        lower_bound,
        indptr, indices, data)

    return csr_matrix((data,indices,indptr),shape=(M,N))



org_names = names['buyer'].unique()
vectorizer = TfidfVectorizer(min_df=1, analyzer=extract_ngrams)
tf_idf_matrix = vectorizer.fit_transform(org_names)

t1 = time.time()
matches = awesome_cossim_top(tf_idf_matrix, tf_idf_matrix.transpose(), 10, 0.85)
t = time.time()-t1


print('All 3-grams in "Department":')
print(extract_ngrams('Department'))

In [4]:
para = """Accuracy:0.7961570593149541 Precision:0.7648686030428768 Recall: 0.7402945113788487F1-score: 0.7523809523809524
Accuracy:0.14909090909090908 Precision:0.041666666666666664 Recall: 0.10101010101010101F1-score: 0.058997050147492625
Accuracy:0.7531599855543517 Precision:0.9236049601417183 Recall: 0.7531599855543517F1-score: 0.8297195146210463
Accuracy:0.7953216374269005 Precision:0.7613168724279835 Recall: 0.7429718875502008F1-score: 0.7520325203252033
Accuracy:0.14181818181818182 Precision:0.04132231404958678 Recall: 0.10101010101010101F1-score: 0.058651026392961866
Accuracy:0.7583965330444203 Precision:0.9247027741083224 Recall: 0.7583965330444203F1-score: 0.8333333333333334
Accuracy:0.7986633249791144 Precision:0.7664835164835165 Recall: 0.7469879518072289F1-score: 0.7566101694915256
Accuracy:0.14545454545454545 Precision:0.03765690376569038 Recall: 0.09090909090909091F1-score: 0.05325443786982249
Accuracy:0.7582159624413145 Precision:0.9255014326647565 Recall: 0.7582159624413145F1-score: 0.8335483870967741
Accuracy:0.8003341687552213 Precision:0.7678571428571429 Recall: 0.7483266398929049F1-score: 0.7579661016949153
Accuracy:0.16363636363636364 Precision:0.046413502109704644 Recall: 0.1111111111111111F1-score: 0.0654761904761905
Accuracy:0.7542434091729866 Precision:0.9230939226519337 Recall: 0.7542434091729866F1-score: 0.8301699294444996
Accuracy:0.7961570593149541 Precision:0.7638121546961326 Recall: 0.7402945113788487F1-score: 0.7518694765465669
Accuracy:0.14909090909090908 Precision:0.045454545454545456 Recall: 0.1111111111111111F1-score: 0.06451612903225805
Accuracy:0.7602022390754785 Precision:0.9234481245887256 Recall: 0.7602022390754785F1-score: 0.8339110626918886
Accuracy:0.797827903091061 Precision:0.7661623108665749 Recall: 0.7456492637215528F1-score: 0.7557666214382632
Accuracy:0.15636363636363637 Precision:0.04201680672268908 Recall: 0.10101010101010101F1-score: 0.059347181008902086
Accuracy:0.7609245214879018 Precision:0.9239201929401447 Recall: 0.7609245214879018F1-score: 0.8345380730765423
Accuracy:0.8020050125313283 Precision:0.7734806629834254 Recall: 0.749665327978581F1-score: 0.761386811692726
Accuracy:0.16363636363636364 Precision:0.0423728813559322 Recall: 0.10101010101010101F1-score: 0.05970149253731344
Accuracy:0.7605633802816901 Precision:0.9259177841283799 Recall: 0.7605633802816901F1-score: 0.8351343313175374
Accuracy:0.7969924812030075 Precision:0.7651933701657458 Recall: 0.7416331994645248F1-score: 0.7532290958531611
Accuracy:0.15636363636363637 Precision:0.04201680672268908 Recall: 0.10101010101010101F1-score: 0.059347181008902086
Accuracy:0.7657999277717588 Precision:0.9259825327510917 Recall: 0.7657999277717588F1-score: 0.838307966001186
Accuracy:0.7986633249791144 Precision:0.7682758620689655 Recall: 0.7456492637215528F1-score: 0.7567934782608694
Accuracy:0.16 Precision:0.046413502109704644 Recall: 0.1111111111111111F1-score: 0.0654761904761905
Accuracy:0.760743950884796 Precision:0.9214785651793526 Recall: 0.760743950884796F1-score: 0.8334322453016815
Accuracy:0.7986633249791144 Precision:0.766896551724138 Recall: 0.7443105756358769F1-score: 0.7554347826086958
Accuracy:0.14909090909090908 Precision:0.041666666666666664 Recall: 0.10101010101010101F1-score: 0.058997050147492625
Accuracy:0.7670639219934995 Precision:0.9246843709185895 Recall: 0.7670639219934995F1-score: 0.8385313857086459"""

import numpy as np
ls = np.array_split(para.split("\n"), int(len(para.split("\n"))/3))
print ("\n".join([el[2].split(" ")[-1].split("F")[0] for el in ls]))



0.8297195146210463
0.8333333333333334
0.8335483870967741
0.8301699294444996
0.8339110626918886
0.8345380730765423
0.8351343313175374
0.838307966001186
0.8334322453016815
0.8385313857086459


In [17]:
import torch.nn.functional as F
F.softmax(torch.DoubleTensor([2,1]))

  


tensor([0.7311, 0.2689], dtype=torch.float64)

In [None]:
import re
from ftfy import fix_text
from sklearn.feature_extraction.text import TfidfVectorizer
from scipy.sparse import csr_matrix
import sparse_dot_topn.sparse_dot_topn as ct
from sparse_dot_topn import awesome_cossim_topn

chars_to_remove = [")","(",".","|","[","]","{","}","'"]

def extract_ngrams(string, n=3):
    string = fix_text(string).encode("ascii", errors="ignore").decode().lower() # fix text
    string = string.replace('&', 'and').replace(',', ' ').replace('-', ' ').title()
    string = re.sub('[' + re.escape(''.join(chars_to_remove)) + ']', '', string)
    string = ' ' + re.sub(' +',' ',string).strip() + ' '
    string = re.sub(r'[,-./]|\sBD',r'', string)
    ngrams = zip(*[string[i:] for i in range(n)])
    ngrams = [''.join(ngram) for ngram in ngrams]
    return ngrams

word_to_match = "margherita pizza"
words = list(word2id_db.keys())
vectorizer = TfidfVectorizer(min_df=1, analyzer=extract_ngrams)
tf_idf_matrix = vectorizer.fit_transform(words + [word_to_match])

# d = awesome_cossim_topn(tf_idf_matrix, tf_idf_matrix.transpose(), 10, 0.85, use_threads=True, n_jobs=256)



In [None]:
d = awesome_cossim_topn(tf_idf_matrix[:-1], tf_idf_matrix[-1].transpose(), 10, 0.85, use_threads=True, n_jobs=256)

In [None]:
def get_matches_df(sparse_matrix, name_vector, top=100):
    non_zeros = sparse_matrix.nonzero()
    
    sparserows = non_zeros[0]
    sparsecols = non_zeros[1]
    
    if top:
        nr_matches = top
    else:
        nr_matches = sparsecols.size
    
    left_side = np.empty([nr_matches], dtype=object)
    right_side = np.empty([nr_matches], dtype=object)
    similairity = np.zeros(nr_matches)
    print (sparserows)
    for index in range(0, nr_matches):
        left_side[index] = name_vector[sparserows[index]]
        right_side[index] = name_vector[sparsecols[index]]
        similairity[index] = sparse_matrix.data[index]
    
    return pd.DataFrame({'left_side': left_side,
                          'right_side': right_side,
                           'similairity': similairity})

In [6]:
import pickledb
prefix= "../junk/db_files/"
word2id_db = pickledb.load(prefix + "w2i.db", False)
len(word2id_db.getall())

22912765

In [None]:
relations_db_new = shelve.open(prefix + "_relations_map.db", "c")
for k, v in relations_db.items():
    relations_db_new["###".join(k.split("_"))] = v
relations_db_new.close()

In [None]:
import torch
from torch.autograd import Variable
import torch.nn as nn
import torch.nn.functional as F
import torch.optim as optim
from torchvision import datasets, transforms


train_loader = torch.utils.data.DataLoader(
    datasets.MNIST('../data', train=True, download=True,
                   transform=transforms.Compose([
                       transforms.ToTensor(),
                       transforms.Normalize((0.1307,), (0.3081,))
                   ])),
    batch_size=batch_size, shuffle=True)
test_loader = torch.utils.data.DataLoader(
    datasets.MNIST('../data', train=False, transform=transforms.Compose([
        transforms.ToTensor(),
        transforms.Normalize((0.1307,), (0.3081,))
    ])),
    batch_size=batch_size, shuffle=True)

class Net(nn.Module):
    def __init__(self):
        super(Net, self).__init__()
        self.fc1 = nn.Linear(28 * 28, 200)
        self.fc2 = nn.Linear(200, 200)
        self.fc3 = nn.Linear(200, 10)

    def forward(self, x):
        x = F.relu(self.fc1(x))
        x = F.relu(self.fc2(x))
        x = self.fc3(x)
        return F.log_softmax(x)

net = Net()
print(net)

# create a stochastic gradient descent optimizer
optimizer = optim.SGD(net.parameters(), lr=learning_rate, momentum=0.9)
# create a loss function
criterion = nn.NLLLoss()

# run the main training loop
for epoch in range(epochs):
    for batch_idx, (data, target) in enumerate(train_loader):
        data, target = Variable(data), Variable(target)
        # resize data from (batch_size, 1, 28, 28) to (batch_size, 28*28)
        data = data.view(-1, 28*28)
        optimizer.zero_grad()
        net_out = net(data)
        loss = criterion(net_out, target)
        loss.backward()
        optimizer.step()
        if batch_idx % log_interval == 0:
            print('Train Epoch: {} [{}/{} ({:.0f}%)]\tLoss: {:.6f}'.format(
                epoch, batch_idx * len(data), len(train_loader.dataset),
                       100. * batch_idx / len(train_loader), loss.data[0]))

# run a test loop
test_loss = 0
correct = 0
for data, target in test_loader:
    data, target = Variable(data, volatile=True), Variable(target)
    data = data.view(-1, 28 * 28)
    net_out = net(data)
    # sum up batch loss
    test_loss += criterion(net_out, target).data[0]
    pred = net_out.data.max(1)[1]  # get the index of the max log-probability
    correct += pred.eq(target.data).sum()

test_loss /= len(test_loader.dataset)
print('\nTest set: Average loss: {:.4f}, Accuracy: {}/{} ({:.0f}%)\n'.format(
    test_loss, correct, len(test_loader.dataset),
    100. * correct / len(test_loader.dataset)))


In [None]:
data = ["\t".join(l.split("\t")[1:-1]) for l in open("../junk/security_dataset.tsv","r").read().split("\n")[1:]]
open("../files/dataset/dataset.tsv","w").write("\n".join(data))

In [None]:
with shelve.open(use_embeddings, 'r') as db:    
    allitems = list(db.items())
    emb = [el[1] for el in allitems]
    wds = [el[0] for el in allitems]
    file = open("../files/embeddings_list.pkl", "wb")
    pickle.dump(allitems, file)
            
        

In [None]:
import time 
word = "margherita pizza" 

def extractUSEEmbeddings(words):
    try:
        embed = hub.KerasLayer(USE_folder)
    except Exception as e:
        !mkdir $USE_folder
        !curl -L "https://tfhub.dev/google/universal-sentence-encoder-large/5?tf-hub-format=compressed" | tar -zxvC $USE_folder
        embed = hub.KerasLayer(USE_folder)
        pass
    word_embeddings = embed(words)
    return word_embeddings.numpy()

def compare_sim(args):
    words, word_to_compare, max_sim, closest_word = args
    t = time.time()
    word_embeddings = extractUSEEmbeddings(words)
    print ("Took me {} seconds to extract USE embeddings...".format(time.time()-t))
    sys.stdout.flush()
    closest_word_idx = np.argmax(awesome_cossim_topn(coo_matrix(word_embeddings, dtype=np.float64), coo_matrix(word_to_compare.transpose(), dtype=np.float64), 10, 0.85, use_threads=True, n_jobs=250))
    sim = np.max(awesome_cossim_topn(coo_matrix(word_embeddings, dtype=np.float64), coo_matrix(word_to_compare.transpose(), dtype=np.float64), 10, 0.85, use_threads=True, n_jobs=250))
    if sim > max_sim:
        max_sim = sim
        closest_word = words[closest_word_idx]
    del word_embeddings
    return (closest_word, max_sim)

def closest_word_USE(word, method="USE"):

    word_to_compare = extractUSEEmbeddings([word])
    print ("Took me {} seconds to extract USE embeddings...".format(time.time()-a))
#     words = list(word2id_db.keys())
    print ("Took me {} seconds to obtain words list...".format(time.time()-a))
    len_part = 100000
    max_sim = -1000
    n_parts = ceil(len(words)/len_part)
    closest_word = ""
            
    for i in range(n_parts):
        words_part = words[i*len_part:(i+1)*len_part]
        closest_word, max_sim = compare_sim(words_part, word_to_compare, max_sim, closest_word)
        print ("Took me {} seconds to iteration of sim compare...".format(time.time()-t))

    
    return closest_word

a = time.time()
closest_word = closest_word_USE("wansecure firewall")
print (time.time()-a)
closest_word



In [1]:
import time 
word = "margherita pizza" 

def extractUSEEmbeddings(words):
    try:
        embed = hub.KerasLayer(USE_folder)
    except Exception as e:
        !mkdir $USE_folder
        !curl -L "https://tfhub.dev/google/universal-sentence-encoder-large/5?tf-hub-format=compressed" | tar -zxvC $USE_folder
        embed = hub.KerasLayer(USE_folder)
        pass
    word_embeddings = embed(words)
    return word_embeddings.numpy()

def compare_sim(args):
    words, word_to_compare, max_sim, closest_word = args
    t = time.time()
    word_embeddings = extractUSEEmbeddings(words)
    print ("Took me {} seconds to extract USE embeddings...".format(time.time()-t))
    sys.stdout.flush()
    closest_word_idx = np.argmax(awesome_cossim_topn(coo_matrix(word_embeddings, dtype=np.float64), coo_matrix(word_to_compare.transpose(), dtype=np.float64), 10, 0.85, use_threads=True, n_jobs=250))
    sim = np.max(awesome_cossim_topn(coo_matrix(word_embeddings, dtype=np.float64), coo_matrix(word_to_compare.transpose(), dtype=np.float64), 10, 0.85, use_threads=True, n_jobs=250))
    if sim > max_sim:
        max_sim = sim
        closest_word = words[closest_word_idx]
    del word_embeddings
    return (closest_word, max_sim)

def closest_word_USE(word, method="USE"):

    word_to_compare = extractUSEEmbeddings([word])
    print ("Took me {} seconds to extract USE embeddings...".format(time.time()-a))
#     words = list(word2id_db.keys())
    print ("Took me {} seconds to obtain words list...".format(time.time()-a))
    len_part = 100000
    max_sim = -1000
    n_parts = ceil(len(words)/len_part)
    closest_word = ""
    for i in range(n_parts):
        t = time.time()
        words_part = words[i*len_part:(i+1)*len_part]
        sub_arrays = np.array_split(words_part, 2)
        args = [(sub_array, word_to_compare, max_sim, closest_word) for sub_array in sub_arrays]
        results = []
        with concurrent.futures.ProcessPoolExecutor(max_workers=2) as executor:
            for res in executor.map(compare_sim, args):
                results.append(res)
        closest_word, max_sim = max(results, key=lambda l:l[-1])
        print ("Took me {} seconds to iteration of sim compare...".format(time.time()-t))

    
    return closest_word

a = time.time()
closest_word = closest_word_USE("wansecure firewall")
print (time.time()-a)
closest_word



In [4]:
messages = ["has Related Synonym", "has Approximate Synonym", "has Exact Synonym"]
extractUSEEmbeddings(messages)

array([[-0.00992627,  0.04524262,  0.01897666, ...,  0.09815302,
        -0.03195954, -0.01154737],
       [-0.02741307,  0.02852679,  0.03163458, ...,  0.06687995,
        -0.0338574 ,  0.02150605],
       [-0.03083228,  0.05407378,  0.01047603, ...,  0.08310957,
        -0.03368025,  0.03365675]], dtype=float32)

In [7]:
# words_sample = ["pizza hut", "burger king", "south africa", "nasa"]
# del og_dict
def calculate_sim(words, word1, max_sim, closest_word):
    t = time.time()
    i = 0
    for word2 in words:
        try:
            sim = wiki2vec.similarity("_".join(word1.lower().split()), "_".join(word2.split()))
            if sim > max_sim:
                max_sim = sim
                closest_word = word2
            i += 1
        except Exception as e:
            continue
    print ("Original word: ", word1, "Closest Word: ", closest_word)
    print ("Took me {} seconds to iteration of sim compare...".format(time.time()-a))
    sys.stdout.flush()
    return (closest_word, max_sim)

def closest_word_w2v(word1):
    len_part = 100000
    max_sim = -1000
    n_parts = ceil(len(words)/len_part)
    closest_word = ""
    if word1 not in wiki2vec.wv.vocab:
        print ("Original word not in vocab", word1)
        return (closest_word, max_sim)
    for i in range(n_parts):
        words_part = words[i*len_part:(i+1)*len_part]
        closest_word, max_sim = calculate_sim(words_part, word1, max_sim, closest_word)
    return word1, closest_word          

a = time.time()

# closest_word = closest_word_w2v("margherita pizza")

# closest_word_w2v("nelson mandela")

resolved = dict()
with concurrent.futures.ProcessPoolExecutor(max_workers=5) as executor:
    for res in executor.map(closest_word_w2v, failed):
        resolved[res[0]] = res[1]


    



Original word not in vocab peter wyche (diplomat)




Original word not in vocab acoma-zuni section
Original word not in vocab madan-harini
Original word not in vocab trust no one (internet security)
Original word not in vocab international tibet independence movement
Original word not in vocab isobase
Original word not in vocab human computer interaction (security)
Original word not in vocab poetas de karaoke




Original word not in vocab ipa pulmonic consonant chart with audio
Original word not in vocab lego clutch powers: bad hair day
Original word not in vocab aed (non-profit)
Original word not in vocab quilmes airport
Original word not in vocab yendegaia airport
Original word not in vocab the pack a.d.
Original word not in vocab harvie-watt baronets
Original word not in vocab sharp actius rd3d notebook
Original word not in vocab big beach boutique ii - the movie
Original word not in vocab privacy by design
Original word not in vocab motorola devour
Original word not in vocab piracy act
Original word not in vocab starter ring gear
Original word not in vocab antonio sánchez (puerto rican host)
Original word not in vocab electronic logbook
Original word not in vocab greg burke (journalist)
Original word not in vocab deaths in november 2013
Original word not in vocab hp mini 311
Original word not in vocab confederation of indigenous nationalities of the ecuadorian amazon
Original word not in v

142 done
154 done
147 done
159 done
139 done
143 done
155 done
148 done
160 done
140 done
144 done
156 done
161 done
149 done
145 done
157 done
141 done
150 done
146 done
142 done
158 done
162 done
151 done
147 done
159 done
143 done
152 done
148 done
163 done
160 done
144 done
149 done
153 done
161 done
145 done
150 done
146 done
154 done
164 done
151 done
147 done
155 done
148 done
152 done
156 done
165 done
162 done
149 done
157 done
166 done
163 done
153 done
150 done
158 done
154 done
151 done
167 done
159 done
155 done
164 done
152 done
168 done
156 done
160 done
165 done
153 done
169 done
161 done
157 done
166 done
170 done
154 done
158 done
167 done
159 done
155 done
171 done
168 done
156 done
162 done
160 done
172 done
169 done
157 done
161 done
163 done
173 done
170 done
158 done
171 done
159 done
174 done
172 done
162 done
160 done
175 done
164 done
161 done
173 done
163 done
176 done
165 done
174 done
166 done
177 done
175 done
178 done
164 done
167 done
162 done
176 done
1

330 done
321 done
336 done
327 done
338 done
322 done
337 done
328 done
331 done
323 done
339 done
338 done
332 done
324 done
329 done
340 done
339 done
325 done
333 done
340 done
330 done
326 done
341 done
334 done
327 done
341 done
331 done
335 done
342 done
332 done
336 done
328 done
343 done
342 done
337 done
333 done
344 done
343 done
329 done
345 done
338 done
344 done
334 done
346 done
330 done
339 done
345 done
335 done
347 done
340 done
336 done
331 done
348 done
346 done
337 done
347 done
332 done
341 done
349 done
348 done
338 done
333 done
350 done
342 done
349 done
334 done
339 done
351 done
343 done
350 done
340 done
335 done
344 done
351 done
336 done
352 done
345 done
341 done
353 done
337 done
346 done
352 done
354 done
342 done
347 done
338 done
353 done
355 done
348 done
343 done
339 done
356 done
354 done
344 done
349 done
340 done
355 done
345 done
357 done
350 done
356 done
346 done
341 done
358 done
351 done
347 done
357 done
359 done
348 done
360 done
358 done
3

In [7]:
f = open("../junk/failed_words", "rb") 
failed, words = pickle.load(f)

def extractUSEEmbeddings(words):

    embed = hub.KerasLayer(USE_folder)






    word_embeddings = embed(words)
    return word_embeddings.numpy()

def compare_sim(words, word_to_compare, max_sim, closest_word):
    print ("Extracting for", len(words))
    word_embeddings = extractUSEEmbeddings(words)
    print ("emb extracted for ", word_to_compare)
    for i,w in enumerate(word_embeddings):
        sim = np.dot(word_to_compare, w)
        if sim > max_sim:
            max_sim = sim
            closest_word = words[i]
    print ("Original word: ", word, "Closest Word: ", closest_word, "Sim: ", max_sim)
    return (closest_word, max_sim)

def closest_word_USE(argument):
    
    word, embed = argument
    len_part = 10000
    max_sim = -1000
    n_parts = ceil(len(words)/len_part)
    closest_word = ""
    for i in range(n_parts):
        words_part = words[i*len_part:(i+1)*len_part]
        closest_word, max_sim = compare_sim(words_part, embed, max_sim, closest_word)
    with counter.get_lock():
        counter.value += 1
    print ("RESOLVED: Original word: ", word, "Closest Word: ", closest_word, "Sim: ", max_sim)
    print ("Percentage done: ", float(counter.value*100/len(failed)))
    return word1, closest_word, max_sim

def run():
    resolved = dict()
    print ("Working on it...")
    counter = Value('i', 0)
    a = time.time()
    failed_embeddings = extractUSEEmbeddings(failed)
    print ("Took me {} seconds to extract USE embeddings...".format(time.time()-a))
    with concurrent.futures.ProcessPoolExecutor() as executor:
        for res in executor.map(closest_word_USE, zip(failed, failed_embeddings)):
            resolved[res[0]] = (res[1], res[2])

    f = open("resolved", "wb")
    pickle.dump(resolved, f)



    
if __name__ == '__main__':
    run()

Working on it...
Took me 31.98935627937317 seconds to extract USE embeddings...
Extracting for 10000
Extracting for 10000
Extracting for 10000
Extracting for 10000
Extracting for 10000
Extracting for 10000
Extracting for 10000
Extracting for 10000
Extracting for 10000
Extracting for 10000
Extracting for 10000
Extracting for 10000
Extracting for 10000
Extracting for 10000
Extracting for 10000
Extracting for 10000
Extracting for 10000
Extracting for 10000
Extracting for 10000
Extracting for 10000
Extracting for 10000
Extracting for 10000
Extracting for 10000
Extracting for 10000
Extracting for 10000
Extracting for 10000
Extracting for 10000
Extracting for 10000
Extracting for 10000
Extracting for 10000
Extracting for 10000
Extracting for 10000
Extracting for 10000
Extracting for 10000
Extracting for 10000
Extracting for 10000
Extracting for 10000
Extracting for 10000
Extracting for 10000
Extracting for 10000
Extracting for 10000
Extracting for 10000
Extracting for 10000
Extracting for 10

BrokenProcessPool: A process in the process pool was terminated abruptly while the future was running or pending.

In [12]:
resolved["approach"]

KeyError: 'approach'

In [44]:
a = time.time()
"abrkadabra" in w2v.wv
print (time.time()-a)
a = time.time()
try:
    w2v.similarity("margherita_pizza", "abrkadabra")
except:    
    print (time.time()-a)
    pass

0.0005288124084472656
0.00016236305236816406


  


In [37]:
words = list(word2id_db.keys())

In [3]:
extractUSEEmbeddings(["river", ])

array([ 0.0884,  0.2092, -0.1895, -0.1527, -0.0978,  0.0378, -0.1611,
        0.0245,  0.0549, -0.2892,  0.0931, -0.3243, -0.2276, -0.0727,
        0.0521, -0.2883, -0.0754, -0.0059, -0.0705, -0.3562, -0.1019,
        0.0847,  0.111 ,  0.0049, -0.3304, -0.2235,  0.1369, -0.1037,
       -0.0751, -0.3887,  0.1092, -0.1504,  0.0167,  0.0217,  0.0204,
        0.064 , -0.2647,  0.3114, -0.0973,  0.1509, -0.2116, -0.0882,
        0.1436, -0.2557,  0.23  ,  0.1662,  0.04  , -0.1121,  0.0426,
       -0.179 , -0.0356, -0.1443, -0.2153, -0.1841, -0.2113, -0.1561,
        0.258 , -0.0593, -0.1704, -0.0394, -0.0992, -0.1615,  0.0623,
       -0.1708, -0.1204,  0.2041,  0.173 , -0.3095, -0.0589, -0.0366,
        0.0084, -0.2201, -0.3896, -0.2086,  0.323 , -0.0779, -0.1028,
        0.0626,  0.2596,  0.0631,  0.18  ,  0.1857,  0.3112,  0.0103,
        0.2184, -0.102 ,  0.0504,  0.0907,  0.2355,  0.2216,  0.0125,
        0.0075,  0.0846, -0.1534,  0.4137,  0.0309, -0.2167, -0.0785,
       -0.0552,  0.1

In [6]:
import pickle
f = open("../junk/resolved_use.pkl", "rb")
resolved = pickle.load(f)
resolved

{'trust no one (internet security)': ('trust no one', 0.6982047),
 'human computer interaction (security)': ('computer human interaction',
  0.81223464),
 '(isc)²': ('narinskiy', 1.0),
 '1260 (computer virus)': ('his 1260 virus', 0.75306094),
 '1964 (emulator)': ('64 emulator', 0.8074869),
 'skipjack (cipher)': ('skipjack cipher', 0.99999976),
 'arp spoofing': ('arp spoofing attacks', 0.89317155),
 'list of government mass surveillance projects': ('a mass surveillance government project',
  0.71680576),
 '1legcall': ('donauhort', 0.99999994),
 '3 skypephone s2': ('the 3 skypephone s2', 0.892357),
 'certificate-based encryption': ('unforgeable encryption', 0.9158317),
 '3cx phone system': ('pbx telephone system', 0.72289276),
 '3wplayer': ('brscn', 0.99999976),
 '4k (computer virus)': ('virus infected computers', 0.6718956),
 'salvatore d. morgera': ('salvatore catalanotte', 0.8625345),
 'completeness (cryptography)': ('pure cryptography', 0.8610318),
 'haystack (software)': ('haystack'

In [15]:

f = open("../junk/use_embeddings.pkl", "rb")
emb_indexer = pickle.load(f)

In [None]:
ls = resolved.keys()
ls

In [41]:
def extractUSEEmbeddings(words):
    try:
        embed = hub.KerasLayer(USE_folder)
    except Exception as e:
        !mkdir $USE_folder
        !curl -L "https://tfhub.dev/google/universal-sentence-encoder-large/5?tf-hub-format=compressed" | tar -zxvC $USE_folder
        embed = hub.KerasLayer(USE_folder)
        pass
    word_embeddings = embed(words)
    return word_embeddings.numpy()

embed = extractUSEEmbeddings(["orange", "apple", "elon musk"])

In [32]:
not {}

True

In [10]:
from gensim.test.utils import datapath, get_tmpfile
from gensim.scripts.glove2word2vec import glove2word2vec
from gensim.models import KeyedVectors
glove_file = datapath('/data/Vivek/glove.6B.300d.txt')
tmp_file = get_tmpfile("/data/Vivek/glove_tmp")

_ = glove2word2vec(glove_file, tmp_file)
model = KeyedVectors.load_word2vec_format(tmp_file)

In [44]:
vocab = set(success)
words = ['_unk_']
idx = 1
word2idx = {"_unk_": 0}
vectors = bcolz.carray(np.random.random(300), rootdir=embeddings_folder, mode='w')
with open(embeddings_file, 'r') as f:
    for l in f:
        line = [a[::-1] for a in l[::-1].split(" ", 300)[::-1]]
        word, vector = line[0], [float(s) for s in line[1:]]
        if len(vector) != 300:
            print (len(vector))
        if word not in vocab:
            continue
        words.append(word)
        vectors.append(np.array(vector).astype(np.float))
        word2idx[word] = idx
        idx += 1

In [78]:
v = bcolz.carray(np.random.rand(1, 300), rootdir=embeddings_folder, mode='w')
v.append(np.resize(np.array(vector), (1, 300)).astype(np.float))

In [3]:
import pickle
f = open("../junk/failed_words", "rb")
failed, words = pickle.load(f)
words

22912765

In [None]:
import time, sys, pickle
from multiprocessing import Value
import numpy as np
from math import ceil
import concurrent.futures
import tensorflow as tf
import tensorflow_hub as hub

USE_folder = "/home/vlead/USE"

f = open("../junk/failed_words", "rb") 
failed, words = pickle.load(f)

def extractUSEEmbeddings(words):

    embed = hub.KerasLayer(USE_folder)






    word_embeddings = embed(words)
    return word_embeddings.numpy()

def compare_sim(words, word_to_compare, max_sim, closest_word):
    print ("Extracting for", len(words))
    word_embeddings = extractUSEEmbeddings(words)
    print ("emb extracted for ", word_to_compare)
    for i,w in enumerate(word_embeddings):
        sim = np.dot(word_to_compare, w)
        if sim > max_sim:
            max_sim = sim
            closest_word = words[i]
    print ("Original word: ", word, "Closest Word: ", closest_word, "Sim: ", max_sim)
    return (closest_word, max_sim)

def closest_word_USE(argument):
    
    word, embed = argument
    len_part = 10000
    max_sim = -1000
    n_parts = ceil(len(words)/len_part)
    closest_word = ""
    for i in range(n_parts):
        words_part = words[i*len_part:(i+1)*len_part]
        closest_word, max_sim = compare_sim(words_part, embed, max_sim, closest_word)
    with counter.get_lock():
        counter.value += 1
    print ("RESOLVED: Original word: ", word, "Closest Word: ", closest_word, "Sim: ", max_sim)
    print ("Percentage done: ", float(counter.value*100/len(failed)))
    return word1, closest_word, max_sim


def run():
    # resolved = dict()
    # print ("Working on it...")
    # counter = Value('i', 0)
    # a = time.time()
    # failed_embeddings = extractUSEEmbeddings(failed)
    # print ("Took me {} seconds to extract USE embeddings...".format(time.time()-a))

    with concurrent.futures.ProcessPoolExecutor() as executor:
        for res in executor.map(closest_word_USE, zip(failed, failed_embeddings)):
            resolved[res[0]] = (res[1], res[2])

    f = open("resolved_use.pkl", "wb")
    pickle.dump(resolved, f)
    

if __name__ == '__main__':
    run()

In [9]:
import subprocess
bashCommand = "ps -ef | grep ' python3'"
process = subprocess.Popen(bashCommand.split(), stdout=subprocess.PIPE)
output, error = process.communicate()
print (output)

b'UID         PID   PPID  C STIME TTY          TIME CMD\nroot          1      0  0 Mar15 ?        00:43:37 /sbin/init\nroot          2      0  0 Mar15 ?        00:01:11 [kthreadd]\nroot          4      2  0 Mar15 ?        00:00:00 [kworker/0:0H]\nroot          5      2  0 Mar15 ?        00:00:00 [kworker/u256:0]\nroot          7      2  0 Mar15 ?        00:00:00 [mm_percpu_wq]\nroot          8      2  0 Mar15 ?        00:04:42 [ksoftirqd/0]\nroot          9      2  0 Mar15 ?        00:43:05 [rcu_sched]\nroot         10      2  0 Mar15 ?        00:00:00 [rcu_bh]\nroot         11      2  0 Mar15 ?        00:01:34 [migration/0]\nroot         12      2  0 Mar15 ?        00:00:15 [watchdog/0]\nroot         13      2  0 Mar15 ?        00:00:00 [cpuhp/0]\nroot         14      2  0 Mar15 ?        00:00:00 [cpuhp/1]\nroot         15      2  0 Mar15 ?        00:00:10 [watchdog/1]\nroot         16      2  0 Mar15 ?        00:01:26 [migration/1]\nroot         17      2  0 Mar15 ?        00:02:20 [

In [11]:
f = open("../junk/failed_instances", "wb")
pickle.dump(list(set(failed)) ,f)

In [30]:
import os, pickle
import tensorflow as tf
import tensorflow_hub as hub
import numpy as np

f = open("../junk/failed_words", "rb")
failed, words = pickle.load(f)
results = {i: ("", -1000) for i in failed}

def compare_sim(word_str, word, embeds):
    max_sim = -1000
    closest_word = ""
    for i,w in enumerate(embeds):
        sim = np.dot(word, w[1])
        if sim > max_sim:
            max_sim = sim
            closest_word = w[0]
    print ("Original word: ", word_str, "Closest Word: ", closest_word, "Sim: ", max_sim)
    return (closest_word, max_sim)

def extractUSEEmbeddings(words):
    embed = hub.KerasLayer(USE_folder)
    word_embeddings = embed(words)
    return word_embeddings.numpy()

failed_embeds = extractUSEEmbeddings([elem.split("(")[0].strip() for elem in failed])


for file in ["../junk/" + s for s in os.listdir("../junk/") if s.startswith("use_embeddings_")]:
    emb_file = open(file, "rb")
    use_embeds = pickle.load(emb_file)
    output = {word: compare_sim(failed[i], failed_embeds[i], use_embeds) for i,word in enumerate(failed)}
    results = {word: results[i] if results[i][1] > output[i][1] else output[i] for i in results}

resolved_file = open("../junk/resolved_use_unbracketed.pkl", "wb")
pickle.dump(results, resolved_file)

Original word:  trust no one (internet security) Closest Word:  never trust Sim:  0.7894043
Original word:  human computer interaction (security) Closest Word:  latterly human computer interaction Sim:  0.9285768
Original word:  (isc)² Closest Word:  bongmu Sim:  0.64656067
Original word:  1260 (computer virus) Closest Word:  westerstraatmarkt Sim:  0.6974803
Original word:  1964 (emulator) Closest Word:  1967 Sim:  0.8550332
Original word:  skipjack (cipher) Closest Word:  fungiform Sim:  0.9999998
Original word:  arp spoofing Closest Word:  inverse arp Sim:  0.782235
Original word:  list of government mass surveillance projects Closest Word:  a most comprehensive domestic covert surveillance project Sim:  0.5988555
Original word:  1legcall Closest Word:  patlahuatzin Sim:  0.64279723
Original word:  3 skypephone s2 Closest Word:  s2 s3 Sim:  0.67851406
Original word:  certificate-based encryption Closest Word:  homomorphic encryption Sim:  0.84442616
Original word:  3cx phone system 

KeyboardInterrupt: 

In [28]:
pickle.load(open("../junk/use_embeddings_0", "rb"))[:10]

[('gregory feist',
  array([-0.00809772,  0.01843606, -0.02493137, -0.03261997,  0.01658829,
         -0.01495019,  0.00918457,  0.00727165,  0.03706673, -0.11205372,
          0.02929727,  0.01354519,  0.05471265, -0.01608668, -0.03687082,
          0.00999278, -0.00103143,  0.00315072, -0.01755182, -0.09822135,
         -0.01678401,  0.03329777,  0.08278999, -0.07331081,  0.03356961,
         -0.10130599,  0.05421298,  0.04093452, -0.01322428,  0.02143382,
          0.01269278,  0.04487535, -0.03097309,  0.02998142,  0.0450017 ,
         -0.06459187, -0.01139481, -0.00014227,  0.01801517, -0.01368487,
          0.01699944, -0.06480575, -0.03736628, -0.01619588,  0.01431554,
          0.02359258, -0.05890146,  0.03295605,  0.0091161 , -0.02759184,
         -0.00935134,  0.04138604, -0.02473297,  0.06366778, -0.0451021 ,
          0.07964347,  0.02356339,  0.00293936,  0.00107426, -0.06866623,
         -0.0059583 ,  0.02780963,  0.00734548,  0.01936775, -0.08502334,
         -0.0243488

In [32]:
nn.LSTM(5, 4, 1)(torch.zeros(3,4,5))

(tensor([[[ 0.1371,  0.0265,  0.0091,  0.0975],
          [ 0.1371,  0.0265,  0.0091,  0.0975],
          [ 0.1371,  0.0265,  0.0091,  0.0975],
          [ 0.1371,  0.0265,  0.0091,  0.0975]],
 
         [[ 0.2076,  0.0507,  0.0038,  0.1409],
          [ 0.2076,  0.0507,  0.0038,  0.1409],
          [ 0.2076,  0.0507,  0.0038,  0.1409],
          [ 0.2076,  0.0507,  0.0038,  0.1409]],
 
         [[ 0.2435,  0.0692, -0.0032,  0.1587],
          [ 0.2435,  0.0692, -0.0032,  0.1587],
          [ 0.2435,  0.0692, -0.0032,  0.1587],
          [ 0.2435,  0.0692, -0.0032,  0.1587]]], grad_fn=<StackBackward>),
 (tensor([[[ 0.2435,  0.0692, -0.0032,  0.1587],
           [ 0.2435,  0.0692, -0.0032,  0.1587],
           [ 0.2435,  0.0692, -0.0032,  0.1587],
           [ 0.2435,  0.0692, -0.0032,  0.1587]]], grad_fn=<StackBackward>),
  tensor([[[ 0.5494,  0.1317, -0.0058,  0.3090],
           [ 0.5494,  0.1317, -0.0058,  0.3090],
           [ 0.5494,  0.1317, -0.0058,  0.3090],
           [ 0.5494