In [22]:
import torch
from torch import nn
from torch.autograd import Variable

from data_loader import DataLoader
from model import UniSkip
from config import *
from datetime import datetime, timedelta

from tensorboardX import SummaryWriter
import nltk

from vocab import load_dictionary
import gather
from sacremoses import MosesDetokenizer
detokenizer = MosesDetokenizer()

import numpy as np

In [23]:
lr = 3e-4
batch_size = 256

language = "english"
categories = list(gather.datasets[language].keys())
n_categories = len(categories)
print("Categories: ", categories)



save_loc = "./saved_models/skip-best-{}".format(VOCAB_SIZE)
mod = UniSkip(n_categories=len(categories))
if USE_CUDA:
    mod.cuda(CUDA_DEVICE)
mod.load_state_dict(torch.load(save_loc))

encoder = mod.encoder

d = DataLoader(sentences=[''], word_dict=load_dictionary('./dataset/'+language+'/corpus.txt.pkl'))

Categories:  ['common', 'austin', 'dickens', 'shakespeare', 'wilde', 'songs']


  "PyTorch was compiled without cuDNN support. To use cuDNN, rebuild "


Making reverse dictionary


In [48]:
sentences_vectors = {}
pre_sentence = {}
embedding_size = 1200

print("Pre-computing vectors")

for c in ["songs"]:
    cat_index = categories.index(c)
    cat_tensor = torch.Tensor([1 if c == cat_index else 0 for c in range(n_categories)]).cuda(CUDA_DEVICE)
    
    path = gather.get_corpus_location(language, c)
    author_sentences = DataLoader(path)
    n_sent = len(author_sentences.sentences)
    
    sentences_vectors[c] = np.empty((n_sent, embedding_size))
    pre_sentence[c] = []
    print(c)
    for i in tqdm.tqdm(range(0, n_sent-batch_size, batch_size)):
        batch = []
        for j in range(i, min(i + batch_size, n_sent)):
            sent = author_sentences.sentences[j]
            pre_sentence[c].append(sent)
            ind = d.convert_sentence_to_indices(sent)
            batch.append(ind)
        output, _ = encoder(torch.stack(batch), cat_tensor)
        sentences_vectors[c][i:min(i+batch_size, n_sent)] = output.cpu().data.numpy()
    

  0%|          | 0/141 [00:00<?, ?it/s]

Pre-computing vectors
STD: OK
Loading text file at /jet/prs/workspace/DL-NLP-Transfer/dataset/english/austin/31100.txt.std
Making dictionary for these words
Using cached dictionary at /jet/prs/workspace/DL-NLP-Transfer/dataset/english/austin/31100.txt.std.pkl
Making reverse dictionary
austin


  "PyTorch was compiled without cuDNN support. To use cuDNN, rebuild "
100%|██████████| 141/141 [00:14<00:00,  9.92it/s]


In [29]:
import pickle
pickle.dump(sentences_vectors["shakespeare"], open( "dataset/"+language+"/"+"embeddings_shak.p", "wb" ) )

In [58]:
import gather
import tqdm
from scipy.spatial import distance
from torch.nn import functional as F

def prepare_test(sentence):
    return " ".join(nltk.word_tokenize(sentence))

def get_vector(sentence, category_from):
    cat_index = categories.index(category_from)
    cat_tensor = torch.Tensor([1 if c == cat_index else 0 for c in range(n_categories)]).cuda(CUDA_DEVICE)
    
    indices = d.convert_sentence_to_indices(sentence)
    output, _ = encoder(torch.stack([indices]), cat_tensor)
    return output
    
def get_closest_sentence(sentence, source_author, target_author):
    
    path = gather.get_corpus_location(language, target_author)
    author_sentences = DataLoader(path)
    
    target_vector = get_vector(prepare_test(sentence), source_author).cpu().data.numpy()
    
    sentences = sentences_vectors[target_author]
    
    max_sim = 0
    
    for i, vector in tqdm.tqdm(enumerate(sentences)):
        sim = 1 - distance.cosine(vector, target_vector)

        if sim > max_sim:
            max_sim = sim
            max_sent = pre_sentence[target_author][i]
    return max_sent, max_sim

In [89]:
test_sentence = "This is not what I expected."
target_author = "austin"

In [90]:
res, max_sim = get_closest_sentence(test_sentence, "songs", target_author)
print(max_sim)
print(detokenizer.detokenize(res.split(" "), return_str=True))

  "PyTorch was compiled without cuDNN support. To use cuDNN, rebuild "
1710it [00:00, 17097.86it/s]

STD: OK
Loading text file at /jet/prs/workspace/DL-NLP-Transfer/dataset/english/austin/31100.txt.std
Making dictionary for these words
Using cached dictionary at /jet/prs/workspace/DL-NLP-Transfer/dataset/english/austin/31100.txt.std.pkl
Making reverse dictionary


  dist = 1.0 - uv / np.sqrt(uu * vv)
36346it [00:02, 18092.30it/s]

0.389588867141
This is what the world does.





In [12]:
source_author = "common"
sentence = "Hi, I'm happy."
print(get_vector(prepare_test(sentence), source_author).shape)

torch.Size([1, 1200])


  "PyTorch was compiled without cuDNN support. To use cuDNN, rebuild "
