In [23]:
import sys
sys.path.append('../generate_dataset')
sys.path.append('../Siamese')

from pytorch_fast_elmo import FastElmo, batch_to_char_ids
import sys
from scipy.cluster.hierarchy import dendrogram, linkage
import scipy
import matplotlib.pyplot as plt
import random
import model
import torch
from typing import List
import numpy as np
import utils

from typing import NewType

In [46]:
class Vector(object):
    
    def __init__(self, vec, sentence, index):
        
        self.vec = vec
        self.sentence = sentence
        self.index = index
        self.size = np.linalg.norm(self.vec)
    
    def get_word(self):
        
        return self.sentence[self.index]
    
    def get_vector(self): 
        
        return self.vec
    
    def get_sentence(self):
        
        return self.sentence
    
    def get_index(self):
        
        return self.index
    
    def get_size(self):
        
        return self.size
    
    def __str__(self):
        
        words = self.get_sentence()
        i = self.get_index()
        sent = '"' + " ".join(words[:i]) + color.BOLD + color.BLUE + " " + words[i] + color.END + " " + " ".join(words[i + 1:]) + '"'
        return sent
    
    def similarity(self, other):
        
        if other is self: return -np.inf
        
        return self.get_vector().dot(other.get_vector())/(self.get_size() * other.get_size())
    
    @staticmethod
    def get_closest_vector(vec, vecs):
    
        closest = max(vecs, key = lambda vector: vector.similarity(vec))
        return closest

In [40]:
def transform(model, vector: Vector):
    
    vec_pytorch = torch.from_numpy(vector.get_vector()).float()
    vector.vec = model._represent(vec_pytorch).detach().numpy()
    
    
class color:
   PURPLE = '\033[95m'
   CYAN = '\033[96m'
   DARKCYAN = '\033[36m'
   BLUE = '\033[94m'
   GREEN = '\033[92m'
   YELLOW = '\033[93m'
   RED = '\033[91m'
   BOLD = '\033[1m'
   UNDERLINE = '\033[4m'
   END = '\033[0m'
    
def print_closest_vectors(sample, all_vecs):
    
    for vector in sample:
        closest = Vector.get_closest_vector(vector, all_vecs)
        print("The closest vector to\n{}\nIs\n{} \n ==========================================\n".format(vector, closest))
    
def load_sents(fname = "sents_f", max_length = 15):
    
    with open(fname, "r") as f:
              
        lines = f.readlines()
    
    lines =  [line.strip().split(" ") for line in lines]
    if max_length is not None:
        lines = list(filter(lambda sentence: len(sentence) < max_length, lines))
    return lines

def load_model(name = "model.pt"):
    net = model.SiameseNet()
    net.load_state_dict(torch.load(name))
    net.eval()
    return net

def list_vectors(vecs, sents: List[List[str]]) -> List[Vector]:
    
    num_sentences = len(sents)
    sents_indices_and_vecs =  zip(range(num_sentences), vecs)
    all_vectors = []

    for sent_index, sent_vectors in sents_indices_and_vecs:
        
        for i, (w,vec) in enumerate(zip(sents[sent_index], vecs[sent_index])):   
            
            v = Vector(vec.detach().numpy(), sents[sent_index], i)
            all_vectors.append(v)
            
    return all_vectors

Load pretrained ELMO and a collection of Wikipedia sentences.

In [34]:
all_sentences = load_sents(max_length = 15)
options_file = "../generate_dataset/" + utils.DEFAULT_PARAMS["elmo_options"]
weight_file = "../generate_dataset/" + utils.DEFAULT_PARAMS["elmo_weights"]
elmo = FastElmo(options_file, weight_file)

In [43]:
N = 1000
subset_size = 150
random.seed(0)
sentences = all_sentences[:N]

Collect ELMO states over N sentences, and load pretrained Siamese network.

In [36]:
character_ids = batch_to_char_ids(sentences)
embeddings = elmo(character_ids) # collect elmo states
network = load_model()

Find the closest vectors to a collection of vectors.

In [47]:
vecs = list_vectors(embeddings["elmo_representations"][0], sentences)
subset = np.random.choice(vecs, size = subset_size)
print_closest_vectors(subset, vecs)

The closest vector to
"pierce died of complications from an abdominal infection[1m[94m in[0m los angeles , california ."
Is
"he died of cancer at a veterans administration hospital[1m[94m in[0m houston , texas ." 

The closest vector to
"my main reason for[1m[94m asking[0m the question and inviting debate was to learn ."
Is
"if another administrator blocked the user , i would[1m[94m ask[0m for their view ." 

The closest vector to
"he was elected lieutenant governor in 1982 as running[1m[94m mate[0m of richard f ."
Is
"joe is a technician aboard the toronto and a good[1m[94m friend[0m of tom ." 

The closest vector to
"the[1m[94m other[0m six temples are located in villages , largely in remote locations ."
Is
"[1m[94m other[0m complete sentences consist of two or more clauses ( see below ) ." 

The closest vector to
"looks like it was split from censorship with just brazil and venezuela ([1m[94m ![0m "
Is
"i trust we have an admin here who knows what to do ne

The closest vector to
"unreferenced and unverifiable music album that even does n't contain its artist[1m[94m name[0m !"
Is
"sul for this name , they have the greatest priority for this[1m[94m name[0m ." 

The closest vector to
"strong support[1m[94m experienced[0m member , often reverts vandalism and beats me to it ."
Is
"they had come from places where they had already[1m[94m experienced[0m and known humiliation ." 

The closest vector to
"in november 2007 , sears holdings[1m[94m corporation[0m announced the purchase of a 13 ."
Is
"the play was adapted for tv by the australian broadcasting[1m[94m corporation[0m in 1973 ." 

The closest vector to
"the library also[1m[94m has[0m video games , computer games , and internet facilities ."
Is
"it was founded over 100 years ago and[1m[94m has[0m more than 700 employees ." 

The closest vector to
"the human condition refers to the experience of existence and[1m[94m life[0m as humans ."
Is
"he was terrified of death

The closest vector to
"notable enough for[1m[94m our[0m purposes means receiving significant coverage in reliable secondary sources ."
Is
"on the main topic of discussion here ,[1m[94m our[0m practice is well established ." 

The closest vector to
"the tang chinese had recruited many central asian turks into[1m[94m their[0m armed forces ."
Is
"i know people that within[1m[94m their[0m family that are not really well off ." 

The closest vector to
"iraqi soldiers engaged in running battles with insurgents[1m[94m up[0m and down haifa street ."
Is
"iraqi soldiers engaged in running battles with insurgents up and[1m[94m down[0m haifa street ." 

The closest vector to
"if it did exist in the mid-1990s , that[1m[94m version[0m is certainly notable ."
Is
"his[1m[94m version[0m of the song peaked at number 44 on hot country songs ." 

The closest vector to
"it appears to have[1m[94m had[0m a lavish binding decorated with a roman cameo ."
Is
"on trials ,[1m[94m had

The closest vector to
"there is also a plan on having food service in the[1m[94m near[0m future ."
Is
"founder of petty enterprises , level cross , north carolina ,[1m[94m near[0m greensboro ." 

The closest vector to
"it think it also needs to be expanded[1m[94m ,[0m based on the sources ."
Is
"that very well may happen[1m[94m ,[0m but that is for later to decide ." 

The closest vector to
"that quote[1m[94m could[0m be used as an example for what wp is not ."
Is
"it[1m[94m could[0m just be like friendly , but have the different types available ." 

The closest vector to
"income tax ( building societies ) ( audit powers ) regulations 1992 s[1m[94m .[0m "
Is
"relevant to sole traders , it also somewhat amended individual voluntary arrangements procedures[1m[94m .[0m " 

The closest vector to
"there is no need to explicitly point out[1m[94m the[0m error in the article ."
Is
"i think the whole approach is making too much out of[1m[94m the[0m problem ." 

The

The closest vector to
"this implementation has a few nice[1m[94m extensions[0m and outputs many different image formats ."
Is
"on february 28 2013 , the lions signed smith to a contract[1m[94m extension[0m ." 

The closest vector to
"the 3rd attack wing and group operated out of fort crockett[1m[94m ,[0m texas ."
Is
"pierce died of complications from an abdominal infection in los angeles[1m[94m ,[0m california ." 

The closest vector to
"she studied classical guitar techniques and began composing from the age of[1m[94m 11[0m ."
Is
"he made his debut in the chinese professional baseball league at age[1m[94m 23[0m ." 

The closest vector to
"the latter applies to[1m[94m all[0m 27 current member states of the european union ."
Is
"this resolved catalan 's conjecture for[1m[94m all[0m but a finite number of cases ." 

The closest vector to
"[1m[94m former[0m president salad appointed seven people to his cabinet on 4 august 2007 ."
Is
"former[1m[94m president[0

Now, first apply the trained Siamese network to each vector, and recalculate closest vectors.

In [48]:
for v in vecs:
    transform(network, v)

random.seed(0)
subset = np.random.choice(vecs, size = subset_size)
print_closest_vectors(subset, vecs)

The closest vector to
"notable enough for our purposes means[1m[94m receiving[0m significant coverage in reliable secondary sources ."
Is
"dont delete - you guys[1m[94m have[0m any idea how popular this man is ." 

The closest vector to
"[1m[94m in[0m july of the same year , he was elevated to chief justice ."
Is
"the baltimore colts ran a version of the play[1m[94m in[0m december , 1970 ." 

The closest vector to
"now if only we can get someone to[1m[94m write[0m the australian literature article ."
Is
"i am ready to[1m[94m make[0m an introduction at the category page to explain ." 

The closest vector to
"[1m[94m the[0m notion that you lock people up for smoking marijuana is pretty silly ."
Is
"that is why[1m[94m the[0m 50 members should not directly represent the organizations notability ." 

The closest vector to
"the effects in local government is[1m[94m immediate[0m and an awareness factor is critical ."
Is
"keep the information is[1m[94m accurate[0m

The closest vector to
"june 18 , 2004[1m[94m [[0m 2 ] whilst others implement many other strategies ."
Is
"in 1959 all[1m[94m of[0m galway 's competing hurling teams transferred provinces to munster ." 

The closest vector to
"different[1m[94m schools[0m of painting are shown from the sixteenth to the twentieth centuries ."
Is
"the most depressed comic book[1m[94m characters[0m '' on the best week ever blog ." 

The closest vector to
"the piece in the pocket can be put back[1m[94m on[0m the board later ."
Is
"a hand made by hitting two consecutive cards[1m[94m on[0m the turn and river ." 

The closest vector to
"the blocks were also[1m[94m supposedly[0m used for selling slaves during the slave period ."
Is
"even if married in another state , it is[1m[94m not[0m recognized within missouri ." 

The closest vector to
"fringe , would have to[1m[94m be[0m proved , and covered to be notable ."
Is
"wp concerned with vaughan politics , if the other afds[1m[94m are

The closest vector to
"that is , concerning the[1m[94m offices[0m and majesty of christ the mediator ] ."
Is
"rock and roll[1m[94m people[0m '' which is from the mind games sessions ) ." 

The closest vector to
"peter and wendy free her , and peter is honored by[1m[94m the[0m tribe ."
Is
"[1m[94m the[0m community support for the project was another key aspect of its success ." 

The closest vector to
"the fifth volume is scheduled to be released on april 22 , 2009[1m[94m .[0m "
Is
"they had come from places where they had already experienced and known humiliation[1m[94m .[0m " 

The closest vector to
"bureaucrats that are active regularly[1m[94m are[0m the bureaucrats that were promoted more recently ."
Is
"i have been doing it manually , but it[1m[94m is[0m pretty time consuming ." 

The closest vector to
"he[1m[94m served[0m in the vermont senate from 1851 to 1852 was senate president ."
Is
"da silva then moved to italy in 1991 where[1m[94m he[0m played 

The closest vector to
"national archives and records administration ( talk ) stub-class ( no-class[1m[94m )[0m added ."
Is
"income tax ( building societies ) ( audit powers[1m[94m )[0m regulations 1992 s ." 

The closest vector to
"would someone please commit on the above question which relates[1m[94m to[0m reference 8 ."
Is
"you go the other way[1m[94m to[0m turn those characters into the original data ." 

The closest vector to
"wikipedia article into an example[1m[94m of[0m a point merely to illustrate that point ."
Is
"i do n't believe you are permitted[1m[94m to[0m have a say in it ." 

The closest vector to
"or perhaps the prose should be touched[1m[94m up[0m first and then cut down ."
Is
"i do n't believe you are permitted to have a say[1m[94m in[0m it ." 

The closest vector to
"if merge , then delete[1m[94m the[0m 2 redirects as they are rather irrelevant ."
Is
"that is why[1m[94m the[0m 50 members should not directly represent the organizations n

The closest vector to
"[1m[94m one[0m of five players in nba history to win consecutive finals mvp awards ."
Is
"january 12 , 1931 ) was an[1m[94m indian[0m freedom fighter , a revolutionary ." 

