In [14]:
import torch
import torch.nn as nn
import pandas as pd
import numpy as np
import re

class DenseNet(nn.Module):
    """Fully connected network mapping a window of word embeddings to one embedding.

    The input is a batch of ``context_length * 2`` embedding vectors per sample.
    They are flattened into a single feature vector, passed through two
    ReLU-activated hidden layers (2048 and 512 units) and projected to
    ``embed_size`` values squashed into (-1, 1) by ``tanh``.

    Args:
        context_length: Half of the window size; the network expects
            ``context_length * 2`` embedding vectors per sample.
        embed_size: Dimensionality of each input embedding and of the output
            vector. Defaults to 100.
    """

    def __init__(self,context_length,embed_size=100):
        super().__init__()
        self.n = context_length*2
        # Honour the constructor argument (it used to be overwritten with a
        # hard-coded 100, so any other embedding size was silently ignored).
        self.embed_size = embed_size
        self.act = nn.ReLU()
        self.out = nn.Tanh()
        # Attribute names are kept as-is: they are the state-dict keys that
        # saved checkpoints are matched against.
        self.hidden1 = nn.Linear(self.n*self.embed_size,2048)
        self.hidden2 = nn.Linear(2048,512)
        self.hidden3 = nn.Linear(512,self.embed_size)

    def forward(self,x):
        """Run the network on a batch.

        Args:
            x: Tensor whose first dimension is the batch; the remaining
                dimensions must hold ``n * embed_size`` values in total.

        Returns:
            Tensor of shape ``(batch, embed_size)`` with values in (-1, 1).
        """
        # flatten() also accepts non-contiguous inputs, which view() rejects.
        x = torch.flatten(x, start_dim=1)
        x = self.act(self.hidden1(x))
        x = self.act(self.hidden2(x))
        x = self.out(self.hidden3(x))
        return x
    


# Instantiate the network and restore its weights from the saved checkpoint.
model_to_test =  DenseNet(context_length = 10)
# map_location="cpu" lets a checkpoint that was saved from a GPU load on a
# CPU-only machine; the cells below call .numpy() on the model outputs, which
# requires the model (and therefore its weights) to live on the CPU anyway.
model_to_test.load_state_dict(
    torch.load("../outputs/2021-09_wiki_only_lucky-shape-35.pt", map_location="cpu")
)

<All keys matched successfully>

In [2]:
from util import *

# Load the GloVe data. `load_glove` is not defined in this notebook, so it is
# expected to come from the star import of `util` above.
# Presumably W_norm is the (normalised) embedding matrix and vocab / ivocab are
# the word->index / index->word lookups -- TODO confirm against util.
# Later cells index ivocab with string keys (ivocab[str(i)]).
W_norm,vocab,ivocab = load_glove()

In [17]:
# Hand-written probe: a 20-token context window (padding included) and the word
# whose embedding the model output is compared against.
random_sent = '<pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> hotel singapore officially reopened line singapore governments safety measures <pad> '
target_word = 'tekong'
target_label = np.array(get_glove_vec(target_word,W_norm,vocab))
# Collapse runs of whitespace to single spaces. A raw string is used so the
# pattern no longer relies on the invalid "\ " string escape.
random_sent = re.sub(r'[\n\r ]+',' ',random_sent).strip()

# Stack the per-token vectors into one ndarray first: building a tensor from a
# nested list of arrays is slow, and torch.Tensor(...) is the legacy constructor.
# Resulting shape is (1, tokens, embed), assuming get_glove_vec returns a
# fixed-length vector for every token.
sample_tensor = torch.tensor(
    np.array([[get_glove_vec(word,W_norm,vocab) for word in random_sent.split(' ')]]),
    dtype=torch.float32,
)
# Inference only: skip building the autograd graph.
with torch.no_grad():
    sample_output = model_to_test(sample_tensor)
# The model returns (batch, embed_size); squeeze(1) only removes that axis if it
# has size 1, so with a 100-dim output this keeps the (1, embed_size) shape that
# __distance expects.
output = sample_output.squeeze(1)
vec_output = output.numpy()
print(f"Target word: {target_word}")
def __distance(W, vocab, ivocab, vec_output, topn=10):
    """Print the vocabulary entries whose rows in ``W`` score highest against a query.

    The score is the plain dot product between each row of ``W`` and the query
    vector. It equals the cosine similarity only when both the rows of ``W``
    and the query have unit length.

    Args:
        W: 2-D array with one embedding per row.
        vocab: Unused; kept so existing call sites keep working.
        ivocab: Mapping from the *string* form of a row index (``str(i)``) to
            the word stored in that row.
        vec_output: Query vector, either 1-D ``(dim,)`` or 2-D ``(1, dim)``.
        topn: Number of best-scoring rows to print. Defaults to 10.

    Returns:
        None. The ranking is written to stdout.
    """
    # Flatten the query so both (dim,) and (1, dim) inputs are accepted; the
    # previous squeeze(1) raised on 1-D vectors.
    query = np.asarray(vec_output).reshape(-1)
    dist = np.dot(W, query)
    # Negate so that argsort's ascending order yields the highest scores first.
    a = np.argsort(-dist)[:topn]

    print("\n                               Word       Unnormalized Cosine distance\n")
    print("---------------------------------------------------------\n")
    for i,x in enumerate(a):
        print("%d%35s\t\t%f" % (i,ivocab[str(x)], dist[x]))

def cosim(v1,v2):
    """Return the cosine similarity of ``v1`` and ``v2``.

    Computed as ``dot(v1, v2)`` divided by the product of the two norms, so the
    result carries whatever shape ``np.dot`` yields for the given inputs
    (e.g. a one-element array when ``v1`` is ``(1, dim)`` and ``v2`` is ``(dim,)``).
    """
    numerator = np.dot(v1,v2)
    denominator = np.linalg.norm(v1)*np.linalg.norm(v2)
    return numerator/denominator

# Report for the single-sentence probe: print the best-scoring vocabulary words
# for the model output, then the cosine similarity between that output and the
# target word's vector. vec_output is 2-D here, so the score prints as a
# one-element array.
__distance(W_norm,vocab,ivocab,vec_output)
print(f"\n\n\t\tCosim score: {cosim(vec_output,target_label)}")

Target word: tekong

                               Word       Unnormalized Cosine distance

---------------------------------------------------------

0                          president		0.615233
1                           national		0.493375
2                               vice		0.492262
3                                the		0.483846
4                         government		0.482385
5                            general		0.479991
6                                  ,		0.477759
7                                 as		0.477570
8                     administration		0.472556
9                              state		0.471898


		Cosim score: [-0.25220658]


In [10]:
# Batch inference over a file of context windows (one window per line): embed
# every token, run all windows through the model, average the predictions and
# compare the mean vector with the vocabulary and with `target_label`.
# NOTE: `__distance`, `cosim` and `target_label` are defined in an earlier cell
# of this notebook, which must be run first.
infer_batch = []
with open('../processed_data/cat_corpus_c10.txt','r',encoding='utf-8') as f:
    for line in f:
        # Raw string avoids the invalid "\ " escape; the pattern is unchanged.
        x = re.sub(r'[\r\n ]+',' ',line).strip()
        if not x:
            # A blank line would produce a window of a different length and
            # make the stacked batch ragged.
            continue
        infer_batch.append(np.array([get_glove_vec(w,W_norm,vocab) for w in x.split(' ')]))

sample_infer = torch.tensor(np.array(infer_batch)).float()

# Inference only: without no_grad the autograd graph for the whole corpus
# would be kept in memory.
with torch.no_grad():
    sample_output = model_to_test(sample_infer)

# Mean prediction over all windows, kept 2-D as (1, embed_size); -1 avoids
# hard-coding the embedding size.
a = torch.mean(sample_output,dim=0).numpy().reshape(1,-1)
__distance(W_norm,vocab,ivocab,a)
print(f"\n\n\t\tCosim score: {cosim(a,target_label)}")

(400000,)

                               Word       Unnormalized Cosine distance

---------------------------------------------------------

0                            disease		0.955879
1                           diseases		0.848173
2                          infection		0.794759
3                              virus		0.761556
4                            illness		0.761492
5                             cancer		0.744749
6                           epidemic		0.742815
7                         infections		0.736916
8                       tuberculosis		0.726880
9                                flu		0.724777


		Cosim score: [0.71008273]
