In [1]:
import time
import os
import numpy as np
import random

import torch
import torch.nn as nn
from torchtext import data

from args import get_args
from model import SmPlusPlus
from trec_dataset import TrecDataset
from wiki_dataset import WikiDataset
from evaluate import evaluate

In [2]:
# TEXT = data.Field(batch_first=True, tokenize=clean_str_sst)
# LABEL = data.Field(sequential=False)
def set_vectors(field, vector_path):
    """Attach pre-trained word vectors to ``field.vocab``.

    Loads a ``(stoi, vectors, dim)`` triple from ``vector_path`` via
    ``torch.load`` and rebuilds ``field.vocab.vectors`` with one row per entry
    of ``field.vocab.itos``. Tokens present in ``stoi`` receive the stored
    vector; every other token receives a fresh sample from U(-0.25, 0.25).

    Args:
        field: object exposing ``vocab.itos`` and ``len(field.vocab)``; its
            ``vocab.vectors`` attribute is overwritten.
        vector_path: path of the serialized embedding file.

    Returns:
        The same ``field`` object that was passed in.

    Raises:
        SystemExit: with status 1 when ``vector_path`` is not an existing file
            (the error message is printed once beforehand).
    """
    if not os.path.isfile(vector_path):
        print("Error: Need word embedding pt file")
        # Raise SystemExit directly: the `exit()` helper is an interactive
        # convenience and is not guaranteed to exist in every interpreter.
        raise SystemExit(1)

    # NOTE(review): torch.load unpickles the file, so only load caches that
    # come from a trusted source.
    stoi, vectors, dim = torch.load(vector_path)
    field.vocab.vectors = torch.Tensor(len(field.vocab), dim)

    for i, token in enumerate(field.vocab.itos):
        wv_index = stoi.get(token)
        if wv_index is not None:
            field.vocab.vectors[i] = vectors[wv_index]
        else:
            # Tokens missing from the cache get U(-0.25, 0.25) vectors.
            field.vocab.vectors[i] = torch.FloatTensor(dim).uniform_(-0.25, 0.25)
    return field

# One torchtext field per column of the dataset.
QID = data.Field(sequential=False)
AID = data.Field(sequential=False)
QUESTION = data.Field(batch_first=True)
ANSWER = data.Field(batch_first=True)
LABEL = data.Field(sequential=False)
# External features: each raw value is split on whitespace during
# preprocessing and converted to a list of floats during postprocessing.
EXTERNAL = data.Field(
    sequential=False,
    tensor_type=torch.FloatTensor,
    batch_first=True,
    use_vocab=False,
    preprocessing=data.Pipeline(lambda raw: raw.split()),
    postprocessing=data.Pipeline(lambda feats, train: [float(value) for value in feats]),
)

# NOTE(review): the output saved with this cell is
# "TypeError: must be str, not Field" -- presumably raised by this call;
# confirm the positional argument order against TrecDataset.splits.
train, dev, test = TrecDataset.splits(QID, QUESTION, AID, ANSWER, EXTERNAL, LABEL)

# Build each vocabulary from all three splits.
for vocab_field in (QID, AID, QUESTION, ANSWER, LABEL):
    vocab_field.build_vocab(train, dev, test)

# Load cached word vectors into the question and answer vocabularies.
vector_cache = "data/word2vec.trecqa.pt"
QUESTION = set_vectors(QUESTION, vector_cache)
ANSWER = set_vectors(ANSWER, vector_cache)

TypeError: must be str, not Field

In [3]:
# Exploration cell: inspect one training example, then pull a single batch
# out of a shuffled iterator for use in the cells that follow.
# print(type(train.splits()))
print(dir(train.examples[1]))
print(train[1].question)
# NOTE(review): device=-1 presumably selects the CPU in this torchtext
# version -- confirm against the installed release.
train_iter = data.Iterator(train, batch_size=1000, device=-1, train = True, repeat=False,
                                   sort=False, shuffle=True)
train_iter.init_epoch()
# Only the first batch is needed: the loop breaks after one iteration and
# leaves `batch` (and `batch_idx`) bound for later cells, which read `batch`.
for batch_idx, batch in enumerate(train_iter):
    print(dir(batch))
    print(dir(batch.dataset))
    print(batch.fromvars)
    break

NameError: name 'train' is not defined

In [4]:
# Restore a saved model snapshot and run it on the batch captured earlier.
resume_snapshot = "saves/TREC/static_best_model.pt"
# The map_location callable returns each storage unchanged.
# NOTE(review): presumably this is to load a GPU-trained snapshot on a
# CPU-only machine -- confirm. torch.load unpickles the file, so only load
# snapshots from a trusted source.
model = torch.load(resume_snapshot, map_location=lambda storage, location: storage)

# NOTE(review): train() switches the model to training mode; if the model's
# dropout layer is active in that mode, `features` will differ between runs.
# Confirm whether model.eval() was intended for this inspection.
model.train();
# Relies on `batch` left over from the iterator cell.
features = model(batch)



In [5]:
features.data.cpu().numpy().shape

(1000, 204)

In [7]:
import torch.nn.functional as F
F.tanh(model.combined_feature_vector(features))

Variable containing:
-1.2799e-01 -3.0379e-02 -4.7480e-01  ...   1.2660e-01  6.1756e-02 -3.1299e-01
-2.5744e-01 -2.1584e-02 -5.4953e-01  ...   1.9152e-01 -1.8904e-02 -4.0735e-01
-1.5847e-01  1.1012e-01 -4.7973e-01  ...   1.6229e-01  2.5392e-01 -4.0335e-01
                ...                   ⋱                   ...                
-2.9572e-01 -1.9766e-02 -5.5622e-01  ...   2.1145e-01 -3.4282e-02 -4.2156e-01
-2.1236e-01 -2.4452e-02 -5.3466e-01  ...   1.2155e-01 -3.6664e-02 -3.5795e-01
-1.3215e-01 -2.3809e-02 -5.6484e-01  ...   1.6987e-01  2.4750e-02 -4.1851e-01
[torch.FloatTensor of size 1000x204]

In [8]:
model.dropout(features)

Variable containing:
-0.3792 -0.0000 -0.9834  ...   0.2067  0.0000 -0.0000
-0.0000 -0.0571 -1.0969  ...   0.0000  0.0000 -0.0000
-0.3504  0.0563 -0.0000  ...   0.0000  0.0191 -0.0000
          ...             ⋱             ...          
-0.3124 -0.0000 -0.9964  ...   0.2556  0.0101 -0.6769
-0.0000 -0.0000 -1.0005  ...   0.3454  0.0000 -0.5915
-0.0000  0.1076 -0.0000  ...   0.0000  0.3264 -0.6988
[torch.FloatTensor of size 1000x204]

In [13]:
features.size()

torch.Size([1000, 204])

In [14]:
model.hidden(features)

Variable containing:
-3.8244e+00  2.6957e+00 -7.2175e-02
-3.9288e+00  2.2850e+00 -1.1860e-01
-4.0627e+00  2.1671e+00  3.0250e-01
                 ⋮                  
-4.5525e+00  2.4905e+00 -8.6415e-02
-3.8286e+00  1.7874e+00  5.5805e-01
-4.4491e+00  2.3969e+00 -2.4469e-01
[torch.FloatTensor of size 1000x3]

In [24]:
# batch.aid.data.numpy()
# Collect the feature rows at batch positions 5 and 7 as NumPy arrays.
# NOTE(review): `test` is also the name bound to the test split in the
# data-loading cell; this assignment rebinds it.
test = [features.data.cpu().numpy()[row] for row in (5, 7)]

In [25]:
np.array(test).shape

(2, 204)

In [21]:
batch.label

Variable containing:
 1
 1
 1
⋮ 
 1
 2
 1
[torch.LongTensor of size 1000]

In [83]:
if batch.label[1].data.numpy()[0] == 3:
    print("OK")

In [79]:
print(dir(batch.label[1].data.numpy()))

['T', '__abs__', '__add__', '__and__', '__array__', '__array_finalize__', '__array_interface__', '__array_prepare__', '__array_priority__', '__array_struct__', '__array_ufunc__', '__array_wrap__', '__bool__', '__class__', '__complex__', '__contains__', '__copy__', '__deepcopy__', '__delattr__', '__delitem__', '__dir__', '__divmod__', '__doc__', '__eq__', '__float__', '__floordiv__', '__format__', '__ge__', '__getattribute__', '__getitem__', '__gt__', '__hash__', '__iadd__', '__iand__', '__ifloordiv__', '__ilshift__', '__imatmul__', '__imod__', '__imul__', '__index__', '__init__', '__init_subclass__', '__int__', '__invert__', '__ior__', '__ipow__', '__irshift__', '__isub__', '__iter__', '__itruediv__', '__ixor__', '__le__', '__len__', '__lshift__', '__lt__', '__matmul__', '__mod__', '__mul__', '__ne__', '__neg__', '__new__', '__or__', '__pos__', '__pow__', '__radd__', '__rand__', '__rdivmod__', '__reduce__', '__reduce_ex__', '__repr__', '__rfloordiv__', '__rlshift__', '__rmatmul__', '__

In [81]:
batch.label[1].data.numpy()[0]

2

In [19]:
features.data.cpu().numpy()

array([[-0.        ,  0.07533404, -0.        , ...,  0.33050776,
        -0.03647172, -0.        ],
       [-0.50200146, -0.        , -0.        , ...,  0.        ,
        -0.12017281, -0.        ],
       [-0.        , -0.00975599, -0.        , ...,  0.3870208 ,
        -0.        , -0.        ],
       ..., 
       [-0.18306208,  0.16188723, -0.        , ...,  0.16567066,
         0.        , -0.        ],
       [-0.        ,  0.        , -0.        , ...,  0.36261645,
         0.        , -0.        ],
       [-0.38052365, -0.09780853, -0.        , ...,  0.        ,
        -0.        , -0.79321569]], dtype=float32)

In [84]:
batch.label

Variable containing:
 2
 2
 2
⋮ 
 1
 1
 1
[torch.LongTensor of size 1000]

In [99]:
import numpy as np

# Euclidean distance from each row of `features` to `target_feature`.
# NOTE(review): this rebinds `features` to a plain list of lists, replacing
# whatever the name held from earlier cells.
target_feature = [1, 2, 3]
features = [[-1, -1, 1], [-2, 2, -2]]
np.sqrt(((np.array(features) - target_feature) ** 2).sum(axis=1))


array([ 4.12310563,  5.83095189])

In [20]:
print(dir(batch))

['__class__', '__delattr__', '__dict__', '__dir__', '__doc__', '__eq__', '__format__', '__ge__', '__getattribute__', '__gt__', '__hash__', '__init__', '__init_subclass__', '__le__', '__lt__', '__module__', '__ne__', '__new__', '__reduce__', '__reduce_ex__', '__repr__', '__setattr__', '__sizeof__', '__str__', '__subclasshook__', '__weakref__', 'aid', 'answer', 'batch_size', 'dataset', 'ext_feat', 'fromvars', 'label', 'qid', 'question', 'train']


In [110]:
print(type(train))

<class 'SST1.SubjDataset'>


In [111]:
train.fields

{'label': <torchtext.data.field.Field at 0x7f322a56ada0>,
 'text': <torchtext.data.field.Field at 0x7f322a56ad68>}

In [113]:
len(batch.label)

1000

In [114]:
batch.dataset.examples[1]

<torchtext.data.example.Example at 0x7f3222fbaac8>

In [115]:
len(batch.dataset.examples)

7999

In [116]:
print(dir(batch))

['__class__', '__delattr__', '__dict__', '__dir__', '__doc__', '__eq__', '__format__', '__ge__', '__getattribute__', '__gt__', '__hash__', '__init__', '__init_subclass__', '__le__', '__lt__', '__module__', '__ne__', '__new__', '__reduce__', '__reduce_ex__', '__repr__', '__setattr__', '__sizeof__', '__str__', '__subclasshook__', '__weakref__', 'batch_size', 'dataset', 'fromvars', 'label', 'text', 'train']


In [117]:
batch.batch_size

1000

In [120]:
batch.text

Variable containing:
     2    808     21  ...       1      1      1
     4      0      5  ...       1      1      1
  9685      0     46  ...       1      1      1
        ...            ⋱           ...         
  4230   2062      9  ...       1      1      1
    15      9     63  ...       1      1      1
   332     82    908  ...       1      1      1
[torch.LongTensor of size 1000x81]

In [123]:
batch.dataset.examples[1].text

['emerging',
 'from',
 'the',
 'human',
 'psyche',
 'and',
 'showing',
 'characteristics',
 'of',
 'abstract',
 'expressionism',
 ',',
 'minimalism',
 'and',
 'russian',
 'constructivism',
 ',',
 'graffiti',
 'removal',
 'has',
 'secured',
 'its',
 'place',
 'in',
 'the',
 'history',
 'of',
 'modern',
 'art',
 'while',
 'being',
 'created',
 'by',
 'artists',
 'who',
 'are',
 'unconscious',
 'of',
 'their',
 'artistic',
 'achievements']

In [124]:
batch.text

Variable containing:
     2    808     21  ...       1      1      1
     4      0      5  ...       1      1      1
  9685      0     46  ...       1      1      1
        ...            ⋱           ...         
  4230   2062      9  ...       1      1      1
    15      9     63  ...       1      1      1
   332     82    908  ...       1      1      1
[torch.LongTensor of size 1000x81]

In [126]:
print(dir(batch))

['__class__', '__delattr__', '__dict__', '__dir__', '__doc__', '__eq__', '__format__', '__ge__', '__getattribute__', '__gt__', '__hash__', '__init__', '__init_subclass__', '__le__', '__lt__', '__module__', '__ne__', '__new__', '__reduce__', '__reduce_ex__', '__repr__', '__setattr__', '__sizeof__', '__str__', '__subclasshook__', '__weakref__', 'batch_size', 'dataset', 'fromvars', 'label', 'text', 'train']


In [131]:
type(batch.dataset.examples[1])

torchtext.data.example.Example

In [132]:
len(batch.dataset.examples)

7999

In [133]:
dir(batch)

['__class__',
 '__delattr__',
 '__dict__',
 '__dir__',
 '__doc__',
 '__eq__',
 '__format__',
 '__ge__',
 '__getattribute__',
 '__gt__',
 '__hash__',
 '__init__',
 '__init_subclass__',
 '__le__',
 '__lt__',
 '__module__',
 '__ne__',
 '__new__',
 '__reduce__',
 '__reduce_ex__',
 '__repr__',
 '__setattr__',
 '__sizeof__',
 '__str__',
 '__subclasshook__',
 '__weakref__',
 'batch_size',
 'dataset',
 'fromvars',
 'label',
 'text',
 'train']

In [135]:
batch.dataset[5]

<torchtext.data.example.Example at 0x7f32230d8f28>

In [139]:
batch.text[1].data.numpy()

array([   4,    0,    5, 2274,  699, 1016,    8, 6534,  612,    0,    5,
         18, 3489, 8614,    1,    1,    1,    1,    1,    1,    1,    1,
          1,    1,    1,    1,    1,    1,    1,    1,    1,    1,    1,
          1,    1,    1,    1,    1,    1,    1,    1,    1,    1,    1,
          1,    1,    1,    1,    1,    1,    1,    1,    1,    1,    1,
          1,    1,    1,    1,    1,    1,    1,    1,    1,    1,    1,
          1,    1,    1,    1,    1,    1,    1,    1,    1,    1,    1,
          1,    1,    1,    1])

In [141]:
type(batch.label)

torch.autograd.variable.Variable

In [142]:
batch.label

Variable containing:
 2
 2
 2
⋮ 
 1
 1
 1
[torch.LongTensor of size 1000]

In [143]:
optimizer

NameError: name 'optimizer' is not defined

In [146]:
# Adadelta optimizer over the model parameters that require gradients.
lr, weight_decay = 1.0, 0
parameter = (param for param in model.parameters() if param.requires_grad)
optimizer = torch.optim.Adadelta(parameter, lr=lr, weight_decay=weight_decay)

In [147]:
optimizer

<torch.optim.adadelta.Adadelta at 0x7f3218094588>

In [148]:
batch.text

Variable containing:
     2    808     21  ...       1      1      1
     4      0      5  ...       1      1      1
  9685      0     46  ...       1      1      1
        ...            ⋱           ...         
  4230   2062      9  ...       1      1      1
    15      9     63  ...       1      1      1
   332     82    908  ...       1      1      1
[torch.LongTensor of size 1000x81]

In [151]:
features = model(batch)


array([[ 0.        ,  0.        ,  0.        , ...,  0.21150009,
         0.        ,  0.19599402],
       [ 0.        ,  0.        ,  0.        , ...,  0.22643675,
         0.2998307 ,  0.15882176],
       [ 0.51109791,  0.43724236,  0.        , ...,  0.        ,
         0.38846686,  0.        ],
       ..., 
       [ 0.28448257,  0.53172576,  0.        , ...,  0.        ,
         0.        ,  0.18477558],
       [ 0.3036153 ,  0.46967146,  0.        , ...,  0.23388135,
         0.39685246,  0.        ],
       [ 0.        ,  0.50386667,  0.        , ...,  0.29729113,
         0.41291639,  0.        ]], dtype=float32)

In [153]:
features.data.cpu().numpy()[2].shape

(300,)

In [155]:
batch.text

Variable containing:
     2    808     21  ...       1      1      1
     4      0      5  ...       1      1      1
  9685      0     46  ...       1      1      1
        ...            ⋱           ...         
  4230   2062      9  ...       1      1      1
    15      9     63  ...       1      1      1
   332     82    908  ...       1      1      1
[torch.LongTensor of size 1000x81]

In [156]:
a = batch.text[2]
b = batch.text[3]

In [158]:
torch.stack([a,b])

Variable containing:

Columns 0 to 12 
 9685     0    46     4  1888  1176  1168     5  2051     0  6811   675    13
    2  9789     0   205  4349    43     8    16   104   153     2    35     9

Columns 13 to 25 
  414   902     0  8597     1     1     1     1     1     1     1     1     1
    1     1     1     1     1     1     1     1     1     1     1     1     1

Columns 26 to 38 
    1     1     1     1     1     1     1     1     1     1     1     1     1
    1     1     1     1     1     1     1     1     1     1     1     1     1

Columns 39 to 51 
    1     1     1     1     1     1     1     1     1     1     1     1     1
    1     1     1     1     1     1     1     1     1     1     1     1     1

Columns 52 to 64 
    1     1     1     1     1     1     1     1     1     1     1     1     1
    1     1     1     1     1     1     1     1     1     1     1     1     1

Columns 65 to 77 
    1     1     1     1     1     1     1     1     1     1     1     1     1
    1   

In [163]:
a = np.array([[1,2,3]])

In [165]:
a.repeat(1000,axis = 0)

array([[1, 2, 3],
       [1, 2, 3],
       [1, 2, 3],
       ..., 
       [1, 2, 3],
       [1, 2, 3],
       [1, 2, 3]])

In [166]:
batch.text

Variable containing:
     2    808     21  ...       1      1      1
     4      0      5  ...       1      1      1
  9685      0     46  ...       1      1      1
        ...            ⋱           ...         
  4230   2062      9  ...       1      1      1
    15      9     63  ...       1      1      1
   332     82    908  ...       1      1      1
[torch.LongTensor of size 1000x81]

In [179]:
batch.label[4]

Variable containing:
 1
[torch.LongTensor of size 1]

In [181]:
type(TEXT)

torchtext.data.field.Field

In [33]:

# Index -> token lookup tables, one NumPy array per vocabulary.
index2label, index2qid, index2answer = (
    np.array(field.vocab.itos) for field in (LABEL, QID, ANSWER)
)

In [34]:
index2answer

array(['<unk>', '<pad>', 'the', ..., 'zvyagintsev', 'zx', 'ø'],
      dtype='<U63')

In [29]:
index2qid

array(['<unk>', '63', '3', ..., '7.1', '9', '94'],
      dtype='<U5')

In [30]:
index2label

array(['<unk>', '0', '1'],
      dtype='<U5')

In [31]:
index2question = np.array(QUESTION.vocab.itos)

In [41]:
index2question  # bare expression in mid-cell: evaluated, but nothing is displayed
# Decode the first answer in the batch back to text by indexing the lookup
# table with the whole id array at once; padding tokens are kept.
sen = " ".join(index2answer[batch.answer[0].data.cpu().numpy()])

# Displayed value of the cell: the first question, decoded the same way.
" ".join(index2question[batch.question[0].data.cpu().numpy()])

'who discovered radium <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad>'

In [39]:
sen

'a generation later , with the explosions over japan , frederic and irene joliot-curie felt betrayed by their american colleagues . <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad>'

In [46]:
batch.answer[0][batch.answer[0] != 1]

Variable containing:
     9
  1522
   275
     3
    18
     2
  5866
    89
   210
     3
 13156
     7
 18836
 22567
  1972
 15727
    21
    56
   107
  2958
     4
[torch.LongTensor of size 21]

In [48]:
batch.answer[0]

Variable containing:
     9
  1522
   275
     3
    18
     2
  5866
    89
   210
     3
 13156
     7
 18836
 22567
  1972
 15727
    21
    56
   107
  2958
     4
     1
     1
     1
     1
     1
     1
     1
     1
     1
     1
     1
     1
     1
     1
     1
     1
     1
     1
     1
     1
     1
     1
     1
     1
     1
     1
     1
     1
     1
     1
     1
     1
     1
     1
     1
     1
     1
     1
     1
     1
     1
     1
     1
     1
     1
     1
     1
     1
     1
     1
     1
     1
     1
     1
     1
     1
     1
     1
     1
     1
     1
     1
     1
     1
     1
     1
     1
     1
     1
     1
     1
     1
     1
     1
     1
     1
     1
     1
     1
     1
     1
     1
     1
     1
     1
     1
     1
     1
     1
     1
[torch.LongTensor of size 111]