In [76]:
import numpy as np
from models.enc_dec_dis import ParaphraseGenerator
import torch
import torch.nn as nn
from glob import glob
import os
from misc import net_utils, utils
import re
import pickle
import spacy
from scipy import spatial



def find_closest_word(word, vocab_embed, tree):
    word = nlp(str(word))
    wordembedding = word[0].vector
    
    
    
    index_embeddings = tree.query(wordembedding)[1]
    
    # embedded_words = {"word": embedding vector}
    index_embedded_words = list(vocab_embed.keys())[index_embeddings]
    
    return vocab[index_embedded_words]


def tokenize(sentence):
    return [i for i in re.split(r"([-.\"',:? !\$#@~()*&\^%;\[\]/\\\+<>\n=])", sentence) if i!='' and i!=' ' and i!='\n'];

def expandToMax(sent, max_sent_size):
    if len(sent) > max_sent_size:
        print("Removing phrase, too long:   "+" ".join(sent))
        return None
    else:
        while len(sent) < max_sent_size:
            sent.append(" ")
        return sent

    
def inputData(sents, max_sent_size, vocab, vocab_embed, tree):

    # Remove 
    tokens = [tokenize(sent) for sent in sents]

    # sentencesT of shape (batch_size, max_seq_size)
    sentencesT = np.array([expandToMax(x, max_sent_size) for x in tokens])
    # Sentences of shape (max, batch)
    sentences = sentencesT.transpose()
    
    out_set = np.zeros(sentences.shape)

    reverse_vocab = {x[1]:x[0] for x in list(vocab.items())}
   
    
    for i in range(sentences.shape[0]):
        for j in range(sentences.shape[1]):
            try:
                out_set[i][j] = reverse_vocab[sentences[i][j]]
            except KeyError:
                closest_word = find_closest_word(sentences[i][j],vocab, tree)
                out_set[i][j] = reverse_vocab[closest_word]
    
    return out_set

def main(sents):

    parser = utils.make_parser()
    args = parser.parse_args()

    # build model

    # # get data
    #data = Dataloader(args.input_json, args.input_ques_h5)

    # # make op
    op = {
        "vocab_sz": 27699,#data.getVocabSize(),
        "max_seq_len": 28,#data.getSeqLength(),
        "emb_hid_dim": 256,#args.emb_hid_dim,
        "emb_dim": 512,#args.emb_dim,
        "enc_dim": 512,#args.enc_dim,
        "enc_dropout": 0.5,#args.enc_dropout,
        "enc_rnn_dim": 512,#args.enc_rnn_dim,
        "gen_rnn_dim": 512,#args.gen_rnn_dim,
        #"gen_dropout": 0.5,#args.gen_dropout,
        #"lr": 0.0008,#args.learning_rate,
        #"epochs": 1,#args.n_epoch
    }

    files = glob("save/*")
    files.sort(key=os.path.getmtime)
    WEIGHT_PATH = files[-1]
    print("### Loading weights from {} ###".format(WEIGHT_PATH))

    model = ParaphraseGenerator(op)
    model.load_state_dict(torch.load(WEIGHT_PATH))

    print("Maximum sequence length = {}".format(28))
    
    with open('data/Vocab_Extra','rb') as f:
        vocab = pickle.load(f)
        
    
    sents = inputData(sents, 28, vocab, embed_model)

    out, _, _ = model.forward(sents)

In [77]:
sents = ["This is a test sentence, used to test the model.", "The second sentence is a bit trickier, as it is less straight forwards.", "Hello friends, here is a real sounding made up word: hallo"]
inputData(sents, 28, vocab, embeds, tree)

array([[13591.,  9816.,  1243.],
       [10751., 20745., 17241.],
       [23290., 24430., 24193.],
       [ 2119., 10751., 20758.],
       [24430., 23290., 10751.],
       [24193., 25340., 23290.],
       [ 9910., 25212.,  9502.],
       [14713., 24193., 17457.],
       [ 2119.,  7815., 20687.],
       [ 9021., 10753., 15113.],
       [12893., 10751., 22446.],
       [  258., 22266., 21893.],
       [18154., 14984., 25433.],
       [18154., 21687., 18154.],
       [18154.,   258., 18154.],
       [18154., 18154., 18154.],
       [18154., 18154., 18154.],
       [18154., 18154., 18154.],
       [18154., 18154., 18154.],
       [18154., 18154., 18154.],
       [18154., 18154., 18154.],
       [18154., 18154., 18154.],
       [18154., 18154., 18154.],
       [18154., 18154., 18154.],
       [18154., 18154., 18154.],
       [18154., 18154., 18154.],
       [18154., 18154., 18154.],
       [18154., 18154., 18154.]])

In [78]:
main(sents)

usage: ipykernel_launcher.py [-h] [--input_ques_h5 INPUT_QUES_H5]
                             [--input_json INPUT_JSON]
                             [--start_from START_FROM] [--model MODEL]
                             [--batch_size BATCH_SIZE]
                             [--input_encoding_size INPUT_ENCODING_SIZE]
                             [--att_size ATT_SIZE] [--emb_size EMB_SIZE]
                             [--rnn_layers RNN_LAYERS]
                             [--train_dataset_len TRAIN_DATASET_LEN]
                             [--val_dataset_len VAL_DATASET_LEN]
                             [--optim OPTIM] [--learning_rate LEARNING_RATE]
                             [--learning_rate_decay_start LEARNING_RATE_DECAY_START]
                             [--learning_rate_decay_every LEARNING_RATE_DECAY_EVERY]
                             [--momentum MOMENTUM] [--optim_alpha OPTIM_ALPHA]
                             [--optim_beta OPTIM_BETA]
                             [--optim

SystemExit: 2

  warn("To exit: use 'exit', 'quit', or Ctrl-D.", stacklevel=1)


In [28]:
with open('data/Vocab_Extra','rb') as f:
    vocab = pickle.load(f)

embeds = {}
import en_core_web_sm
nlp = en_core_web_sm.load()
    
for word in list(vocab.values()):
    doc = nlp(word)
    embedding = doc[0].vector
    embeds[word] = embedding
    
with open('embeddings.p', 'wb') as fp:
    pickle.dump(embeds, fp, protocol=pickle.HIGHEST_PROTOCOL)

In [35]:
embeds

{'Szczecin': array([ 4.8972487e-01, -8.3100158e-01,  4.6033053e+00, -5.4619920e-01,
         5.0455470e+00,  1.2321994e+00,  4.3238983e+00,  1.9927826e+00,
        -2.1602297e+00, -1.3079751e+00, -6.9146115e-01,  5.3015381e-02,
        -4.8268044e-01, -9.3153745e-02, -1.4138623e+00, -7.3381865e-01,
         1.0861731e+00,  2.0909905e-02,  3.7457809e+00, -6.2870812e-01,
         2.5332193e+00,  1.3575262e-01, -2.2240503e+00, -2.2520411e+00,
         1.7353342e+00, -3.0237112e+00, -3.5660359e-01, -1.1621103e+00,
         1.0180223e+00, -1.0643258e+00, -5.1586980e-01,  2.5556726e+00,
        -1.4970391e+00,  4.1568580e+00,  2.5496909e-01, -4.2628989e+00,
         1.1833129e+00,  9.8969126e-01,  1.0735711e+00, -5.0657594e-01,
         5.5927300e-01, -1.2371132e+00, -7.8065711e-01, -3.2571149e+00,
         6.4695233e-01,  1.2356906e+00,  7.2818124e-01, -6.9766361e-01,
         1.8464003e+00,  1.5882987e-01,  2.6280200e+00,  1.0425026e+00,
        -3.3224704e+00, -1.9793093e-02, -2.9081538e+

In [36]:
vocab

{1: 'Szczecin',
 2: 'woods',
 3: 'spiders',
 4: 'hanging',
 5: 'localized',
 6: 'Rs500',
 7: 'opener',
 8: 'Western',
 9: 'alphabetic',
 10: 'Euro',
 11: '1800–243–0019',
 12: 'eugenics',
 13: 'appropriation',
 14: 'politician',
 15: 'bringing',
 16: 'advices',
 17: 'wednesday',
 18: 'unblurred',
 19: 'Practicum',
 20: 'crotch',
 21: 'Transitions',
 22: 'Omegle',
 23: 'rebuilding',
 24: '-the-',
 25: 'ffor',
 26: '270',
 27: 'Iphone',
 28: 'periodontitis',
 29: 'scraped',
 30: 'inanimate',
 31: 'errors',
 32: 'deferred',
 33: 'cooking',
 34: 'Kilimanjaro',
 35: 'Hamilton',
 36: 'designing',
 37: 'College',
 38: 'shocks',
 39: 'widget',
 40: 'Foundation',
 41: 'brainwashed',
 42: 'affiliates',
 43: 'china',
 44: 'affiliated',
 45: 'chino',
 46: 'kids',
 47: 'controversy',
 48: 'neurologist',
 49: 'Hafiz',
 50: 'golden',
 51: 'Awakens',
 52: 'projection',
 53: 'Harvey',
 54: '2000/-',
 55: 'ProActiv',
 56: 'dna',
 57: 'insecurity',
 58: 'abbreviations',
 59: 'music',
 60: 'therefore',
 6

In [56]:
tree = spatial.KDTree(np.array(list(embeds.values())))