In [36]:
import torch
from torch import nn
from torch.autograd import Variable

from data_loader import DataLoader
from model import UniSkip
from config import *
from datetime import datetime, timedelta

from tensorboardX import SummaryWriter
import nltk

from vocab import load_dictionary
import gather
from sacremoses import MosesDetokenizer
detokenizer = MosesDetokenizer()

In [5]:
lr = 3e-4

language = "english"
categories = list(gather.datasets[language].keys())
n_categories = len(categories)
print("Categories: ", categories)



save_loc = "./saved_models/skip-best-{}".format(VOCAB_SIZE)
mod = UniSkip(n_categories=len(categories))
if USE_CUDA:
    mod.cuda(CUDA_DEVICE)
mod.load_state_dict(torch.load(save_loc))

encoder = mod.encoder

d = DataLoader(sentences=[''], word_dict=load_dictionary('./dataset/'+language+'/corpus.txt.pkl'))

Categories:  ['common', 'austin', 'dickens', 'shakespeare', 'wilde', 'songs']


  "PyTorch was compiled without cuDNN support. To use cuDNN, rebuild "


Making reverse dictionary


In [26]:
import gather
import tqdm
from torch.nn import functional as F

def prepare_test(sentence):
    return " ".join(nltk.word_tokenize(sentence))

def get_vector(sentence, category_from):
    cat_index = categories.index(category_from)
    cat_tensor = torch.Tensor([1 if c == cat_index else 0 for c in range(n_categories)]).cuda(CUDA_DEVICE)
    
    indices = d.convert_sentence_to_indices(sentence)
    output, _ = encoder(torch.stack([indices]), cat_tensor)
    return output
    
def get_closest_sentence(sentence, source_author, target_author):
    cat_index = categories.index(target_author)
    cat_tensor = torch.Tensor([1 if c == cat_index else 0 for c in range(n_categories)]).cuda(CUDA_DEVICE)
    
    
    path = gather.get_corpus_location(language, target_author)
    author_sentences = DataLoader(path)
    
    target_vector = get_vector(prepare_test(sentence), source_author)
    
    batch_size = 256
    max_sim = 0
    max_sent = []
    n_sent = len(author_sentences.sentences)
    
    for i in tqdm.tqdm(range(0, n_sent-batch_size, batch_size)):
        batch = []
        for j in range(i, min(i + batch_size, n_sent)):
            sent = author_sentences.sentences[j]
            ind = d.convert_sentence_to_indices(sent)
            batch.append(ind)
        output, _ = encoder(torch.stack(batch), cat_tensor)
        
        sim = F.cosine_similarity(output, target_vector)
        val, ind = sim.max(0)

        if val > max_sim:
            max_sim = val
            max_sent = batch[ind]
    return max_sent, max_sim

In [46]:
test_sentence = "This is odd."
target_author = "wilde"

In [47]:
res, max_sim = get_closest_sentence(test_sentence, "wilde", target_author)
print(res.cpu().data.numpy(), max_sim)
sent = [x for x in res if x != 0]
sent = d.convert_indices_to_sentences(sent)


print(detokenizer.detokenize(sent.split(" "), return_str=True))

  "PyTorch was compiled without cuDNN support. To use cuDNN, rebuild "
  4%|▎         | 1/27 [00:00<00:03,  7.63it/s]

STD: OK
Loading text file at /jet/prs/workspace/DL-NLP-Transfer/dataset/english/wilde/cu31924103377051_djvu.txt.std
Making dictionary for these words
Using cached dictionary at /jet/prs/workspace/DL-NLP-Transfer/dataset/english/wilde/cu31924103377051_djvu.txt.std.pkl
Making reverse dictionary


100%|██████████| 27/27 [00:02<00:00, 10.79it/s]

[ 145   14 3180 1272    3    0    0    0    0    0    0    0    0    0    0
    0    0    0    0    0    0    0    0    0    0    0    0    0    0    0] tensor(0.6097, device='cuda:0', grad_fn=<MaxBackward0>)
This is Uncle Jack.





In [21]:
source_author = "common"
sentence = "Hi, I'm happy."
print(get_vector(prepare_test(sentence), source_author))

tensor([[-2.9509e-01, -8.9650e-02, -1.1045e-01,  ..., -3.3493e-02,
         -2.6543e-03, -1.3911e-01]],
       device='cuda:0', grad_fn=<SelectBackward>)


  "PyTorch was compiled without cuDNN support. To use cuDNN, rebuild "
