In [20]:
import numpy as np
from numpy import float32
import torch
from torch.utils import data
import torch
from torch.utils.data import DataLoader

class Sentence(object):
    """docstring for sentence"""
    def __init__(self, content, target, rating, grained):
        # 小写化句子
        self.content, self.target = content.lower(), target
        # grained是细粒度情感，共3种 positive，negative，neutral 1，-1， 0
        self.solution = np.zeros((grained, ),dtype=np.int32)
        # 句子长度
        self.senlength = len(self.content.split(' '))
        try:
            # solution的index对应0,1,2的位置分别对应 negative -1, neutral 0, positive 1
            # 如 rating = 1 即 positive，则 solution = [0, 0, 1]
            # solution为真实的polarity vector
            self.solution[int(rating)+1] = 1
        except:
            exit()

    def stat(self, target_dict, wordlist, grained=3):
        data, data_target, i = [], [], 0
        # solution.shape = (senlength, 3), 输入一句话，预测出三种情感的数值
        # 其实此处的solution没有用，它并非self.solution，因为它根本没有被调用的地方
        solution = list(np.zeros((self.senlength, grained)))
        # 把一句话的每个单词在wordlist里的映射添加到data列表里
        for word in self.content.split(' '):
            data.append(wordlist[word])
            try:
                # Lexicons_dict应该包含真实的 aspect-level-word 的 polarity
                # 用 try 来 找出该句的 aspect word，pol 为该 aspect 在该句中的polarity
                # pol = -1 / 0 / 1
                # 不过有的句子并不包含真的 aspect level word 比如 It's cheap. 没有对应的aspect 'price' 这个词
                pol = Lexicons_dict[word]
                # 预测结果中句子的第i个单词的polarity = 1
                solution[i][pol+1] = 1
            except:
                pass
            i = i+1
        # target就是aspect level word
        for word in self.target.split(' '):
            # 与映射句子的单词类似，添加到data_target列表
            data_target.append(wordlist[word])
        return {'seqs': data, 'target': data_target,
                'solution': list(self.solution), 'target_index': self.get_target(target_dict)}

    def get_target(self, dict_target):
        return dict_target[self.target]

class DataManager(object):

    def __init__(self, dataset, grained=3, train=True, val=False, test=False):
        self.fileList = ['train', 'test', 'dev']
        self.origin = {}
        self.train = train
        self.val = val
        self.test = test

        for fname in self.fileList:
            data = []
            # 读取数据
            with open('%s/%s.cor' % (dataset, fname)) as f:
                sentences = f.readlines()
                # 因为每三行为一个数据，第一行为sentence，第二行为aspect level word，第三行为polarity
                for i in range(len(sentences)//3):
                    content, target, rating = sentences[i*3].strip(), sentences[i*3+1].strip(), sentences[i*3+2].strip()
                    # 每个 Sentecne 其实就是一个数据的实例，包含着评论，aspect level word 和对应的 polarity
                    sentence = Sentence(content, target, rating, grained)
                    data.append(sentence)
            # origin字典保存着 train, test, dev 为键的预处理过的数据，
            # 例如 train 数据保存着训练集的每一句话的单词在wordlist中对应的 index 组成的 vector，
            # 以及该句话的 aspect level word 即 target，以及该 aspect 的 polarity 保存在 solution 里
            self.origin[fname] = data
        # 这里生成了 dict_target, 保存在了self.dict_target里
        self.dict_target = self.gen_target()
        self.gen_word()
        # self.train_data, self.dev_data, self.test_data = self.gen_data()
        self.gen_data()
    def __getitem__(self, index):
        return self.data['train'][index]

    def gen_target(self, threshold=5):
        self.dict_target = {}
        # 遍历每个 train, dev, test 数据集
        for fname in self.fileList:
            # 遍历每一个 Sentence
            for sent in self.origin[fname]:
                # 如果dict_target 中包含了该 aspect level word，则该单词对应的值 +1
                if self.dict_target.get(sent.target)is not None:
                    self.dict_target[sent.target] = self.dict_target[sent.target] + 1
                else:
                    self.dict_target[sent.target] = 1
        i = 0
        # key = aspect level word, val = 出现次数
        for (key,val) in self.dict_target.items():
            # 如果出现次数少于5，就将对应的index设为0，不过如果第一个单词出现大于5次，不也被设为0了吗？
            if val < threshold:
                self.dict_target[key] = 0
            else:
                self.dict_target[key] = i
                i = i + 1
        return self.dict_target

    def gen_word(self):
        wordcount = {}
        def sta(sentence):
            for word in sentence.content.split(' '):
                try:
                    wordcount[word] = wordcount.get(word, 0) + 1
                except:
                    wordcount[word] = 1
            for word in sentence.target.split(' '):
                try:
                    wordcount[word] = wordcount.get(word, 0) + 1
                except:
                    wordcount[word] = 1

        for fname in self.fileList:
            for sent in self.origin[fname]:
                sta(sent)
        # words = [(word1, count1), (word2, count2), ...] (其实是生成器，这里为了方便理解写成list)
        words = wordcount.items()
        # 按出现次数降序排列
        words = sorted(words, key=lambda x:x[1], reverse=True)
        # wordlist = {word1:1, word2:2, ...} index越小的单词出现次数越多
        self.wordlist = {item[0]:index+1 for index, item in enumerate(words)}
        return self.wordlist


    def gen_data(self, grained=3):
        self.data = {}
        for fname in self.fileList:
            # data字典保存着以 train, dev, test为键的数据列表
            # 例如 train列表包含着一句sentence的数据，
            # 包含了未对单词embedding的句向量seq，target为用来aspect embedding的单词，solution为情感极性的one-hot编码，target_index为aspect所属类别
            self.data[fname] = []
            for sent in self.origin[fname]:
                self.data[fname].append(sent.stat(self.dict_target, self.wordlist))
        return self.data['train'], self.data['dev'], self.data['test']

    def word2vec_pre_select(self, mdict, word2vec_file_path, save_vec_file_path):
        list_seledted = ['']
        line = ''
        with open(word2vec_file_path) as f:
            for line in f:
                tmp = line.strip().split(' ', 1)
                if mdict.has_key(tmp[0]):
                    list_seledted.append(line.strip())
        list_seledted[0] = str(len(list_seledted)-1) + ' ' + str(len(line.strip().split())-1)
        open(save_vec_file_path, 'w').write('\n'.join(list_seledted))


if __name__ == '__main__':
    datamanager = DataManager('data', train=True)
    wordlist = datamanager.gen_word()
    # dataloader = DataLoader(datamanager, batch_size=8, num_workers=4)
    train, dev, test = datamanager.gen_data()

In [2]:
import pandas as pd

In [34]:
wordlist

{'the': 1,
 '.': 2,
 ',': 3,
 'and': 4,
 'food': 5,
 'a': 6,
 'is': 7,
 'to': 8,
 'miscellaneous': 9,
 'i': 10,
 'service': 11,
 'was': 12,
 'for': 13,
 'of': 14,
 'it': 15,
 'in': 16,
 'you': 17,
 'this': 18,
 'we': 19,
 'great': 20,
 'ambience': 21,
 '!': 22,
 'with': 23,
 'price': 24,
 'but': 25,
 'place': 26,
 'that': 27,
 'are': 28,
 'good': 29,
 'not': 30,
 'have': 31,
 'at': 32,
 "'s": 33,
 'on': 34,
 "n't": 35,
 'they': 36,
 'my': 37,
 'very': 38,
 'were': 39,
 'be': 40,
 'restaurant': 41,
 'had': 42,
 '(': 43,
 ')': 44,
 'so': 45,
 'there': 46,
 'as': 47,
 'go': 48,
 'if': 49,
 'all': 50,
 'been': 51,
 'like': 52,
 '-': 53,
 'from': 54,
 'here': 55,
 'do': 56,
 'best': 57,
 'one': 58,
 'an': 59,
 'out': 60,
 'their': 61,
 'prices': 62,
 'get': 63,
 'or': 64,
 'excellent': 65,
 'has': 66,
 'our': 67,
 'would': 68,
 'your': 69,
 'delicious': 70,
 'by': 71,
 '$': 72,
 'staff': 73,
 'atmosphere': 74,
 "'ve": 75,
 'will': 76,
 'always': 77,
 'menu': 78,
 'just': 79,
 'about': 80,
 

In [21]:
df = pd.DataFrame(train)
df.head()

Unnamed: 0,seqs,solution,target,target_index
0,"[4, 162, 22]","[0, 0, 1]",[24],0
1,"[1, 73, 7, 35, 1, 1573, 64, 115, 1303, 3, 4, 1...","[1, 0, 0]",[11],1
2,"[1, 11, 7, 77, 213, 168, 3, 56, 35, 413, 149, ...","[1, 0, 0]",[11],1
3,"[10, 399, 131, 18, 26, 22, 22, 22]","[0, 0, 1]",[9],2
4,"[6, 41, 27, 253, 35, 96, 8, 56, 384, 556, 670,...","[0, 0, 1]",[21],3


In [22]:
df.to_csv('train.csv',index=False)

In [19]:
train = pd.read_csv('train.csv')
train.head()
seqs_data = []
solutions_data = []
targets_data = []
target_index_data = []

for seqs in train['seqs']:
    seqs_data.append(eval(seqs))
for solution in train['solution']:
    solutions_data.append(eval(solution))
for target in train['target']:
    targets_data.append(eval(target))
for target_index in train['target_index']:
    target_index_data.append(target_index)
# seqs_data[:1], solutions_data[:1], targets_data[:1], target_index_data[:1]

In [20]:
import numpy as np
import torch

In [21]:
seqs_data

[[4, 162, 22],
 [1,
  73,
  7,
  35,
  1,
  1573,
  64,
  115,
  1303,
  3,
  4,
  10,
  234,
  2052,
  13,
  11,
  3,
  25,
  138,
  367,
  80,
  18,
  26,
  285,
  103,
  13,
  15,
  2],
 [1,
  11,
  7,
  77,
  213,
  168,
  3,
  56,
  35,
  413,
  149,
  14,
  384,
  54,
  69,
  704,
  3,
  4,
  10,
  68,
  30,
  107,
  1304,
  6,
  270,
  55,
  588,
  2],
 [10, 399, 131, 18, 26, 22, 22, 22],
 [6,
  41,
  27,
  253,
  35,
  96,
  8,
  56,
  384,
  556,
  670,
  20,
  5,
  23,
  20,
  11,
  16,
  6,
  400,
  74,
  2],
 [18, 26, 12, 30, 50, 27, 22],
 [1, 5, 7, 20, 2],
 [165, 385, 22, 22, 22, 22, 22],
 [1, 5, 12, 91, 29, 3, 10, 42, 1, 1574, 296, 4, 15, 12, 58, 14, 1, 57, 129, 2],
 [3006, 1, 11, 12, 29, 8, 65, 807, 23, 1, 887, 2],
 [19,
  42,
  6,
  459,
  368,
  3007,
  36,
  136,
  56,
  30,
  705,
  3008,
  4,
  36,
  1305,
  1,
  1306,
  3009,
  3010,
  1,
  460,
  87,
  2],
 [10,
  75,
  51,
  276,
  32,
  888,
  996,
  13,
  133,
  1123,
  218,
  4,
  31,
  277,
  177,
  8,
  40,


In [23]:
seqs_data = np.array(seqs_data)
seqs_data = np.array(seqs_data)


array([list([4, 162, 22]),
       list([1, 73, 7, 35, 1, 1573, 64, 115, 1303, 3, 4, 10, 234, 2052, 13, 11, 3, 25, 138, 367, 80, 18, 26, 285, 103, 13, 15, 2]),
       list([1, 11, 7, 77, 213, 168, 3, 56, 35, 413, 149, 14, 384, 54, 69, 704, 3, 4, 10, 68, 30, 107, 1304, 6, 270, 55, 588, 2]),
       ..., list([1, 78, 7, 241, 52, 1, 58, 34, 1, 1805, 2]),
       list([10, 68, 107, 1179, 33, 13, 27, 3, 25, 30, 13, 61, 5, 2]),
       list([18, 7, 59, 194, 26, 8, 96, 105, 1712, 391, 2])], dtype=object)

In [25]:
import torchtext

In [26]:
TEXT = torchtext.data.Field(sequential=True)

In [28]:
TEXT.build_vocab(seqs_data)

In [32]:
len(TEXT.vocab)

4176

In [33]:
import torch
for seq in train['target']:
    seq = torch.Tensor(eval(seq)).int()
    print(seq)

tensor([24], dtype=torch.int32)
tensor([11], dtype=torch.int32)
tensor([11], dtype=torch.int32)
tensor([9], dtype=torch.int32)
tensor([21], dtype=torch.int32)
tensor([9], dtype=torch.int32)
tensor([5], dtype=torch.int32)
tensor([9], dtype=torch.int32)
tensor([5], dtype=torch.int32)
tensor([11], dtype=torch.int32)
tensor([11], dtype=torch.int32)
tensor([9], dtype=torch.int32)
tensor([24], dtype=torch.int32)
tensor([5], dtype=torch.int32)
tensor([5], dtype=torch.int32)
tensor([9], dtype=torch.int32)
tensor([5], dtype=torch.int32)
tensor([9], dtype=torch.int32)
tensor([5], dtype=torch.int32)
tensor([9], dtype=torch.int32)
tensor([21], dtype=torch.int32)
tensor([9], dtype=torch.int32)
tensor([5], dtype=torch.int32)
tensor([5], dtype=torch.int32)
tensor([11], dtype=torch.int32)
tensor([5], dtype=torch.int32)
tensor([24], dtype=torch.int32)
tensor([5], dtype=torch.int32)
tensor([9], dtype=torch.int32)
tensor([9], dtype=torch.int32)
tensor([5], dtype=torch.int32)
tensor([9], dtype=torch.int32

tensor([9], dtype=torch.int32)
tensor([9], dtype=torch.int32)
tensor([21], dtype=torch.int32)
tensor([9], dtype=torch.int32)
tensor([5], dtype=torch.int32)
tensor([5], dtype=torch.int32)
tensor([9], dtype=torch.int32)
tensor([11], dtype=torch.int32)
tensor([5], dtype=torch.int32)
tensor([9], dtype=torch.int32)
tensor([24], dtype=torch.int32)
tensor([9], dtype=torch.int32)
tensor([21], dtype=torch.int32)
tensor([11], dtype=torch.int32)
tensor([21], dtype=torch.int32)
tensor([21], dtype=torch.int32)
tensor([5], dtype=torch.int32)
tensor([9], dtype=torch.int32)
tensor([9], dtype=torch.int32)
tensor([5], dtype=torch.int32)
tensor([21], dtype=torch.int32)
tensor([5], dtype=torch.int32)
tensor([5], dtype=torch.int32)
tensor([5], dtype=torch.int32)
tensor([9], dtype=torch.int32)
tensor([9], dtype=torch.int32)
tensor([9], dtype=torch.int32)
tensor([21], dtype=torch.int32)
tensor([24], dtype=torch.int32)
tensor([11], dtype=torch.int32)
tensor([5], dtype=torch.int32)
tensor([5], dtype=torch.int3

tensor([9], dtype=torch.int32)
tensor([9], dtype=torch.int32)
tensor([9], dtype=torch.int32)
tensor([5], dtype=torch.int32)
tensor([11], dtype=torch.int32)
tensor([11], dtype=torch.int32)
tensor([11], dtype=torch.int32)
tensor([9], dtype=torch.int32)
tensor([9], dtype=torch.int32)
tensor([9], dtype=torch.int32)
tensor([9], dtype=torch.int32)
tensor([9], dtype=torch.int32)
tensor([9], dtype=torch.int32)
tensor([5], dtype=torch.int32)
tensor([5], dtype=torch.int32)
tensor([11], dtype=torch.int32)
tensor([21], dtype=torch.int32)
tensor([11], dtype=torch.int32)
tensor([9], dtype=torch.int32)
tensor([21], dtype=torch.int32)
tensor([5], dtype=torch.int32)
tensor([9], dtype=torch.int32)
tensor([9], dtype=torch.int32)
tensor([24], dtype=torch.int32)
tensor([9], dtype=torch.int32)
tensor([5], dtype=torch.int32)
tensor([24], dtype=torch.int32)
tensor([9], dtype=torch.int32)
tensor([11], dtype=torch.int32)
tensor([5], dtype=torch.int32)
tensor([5], dtype=torch.int32)
tensor([21], dtype=torch.int3

tensor([11], dtype=torch.int32)
tensor([9], dtype=torch.int32)
tensor([21], dtype=torch.int32)
tensor([9], dtype=torch.int32)
tensor([5], dtype=torch.int32)
tensor([9], dtype=torch.int32)
tensor([9], dtype=torch.int32)
tensor([5], dtype=torch.int32)
tensor([5], dtype=torch.int32)
tensor([5], dtype=torch.int32)
tensor([9], dtype=torch.int32)
tensor([5], dtype=torch.int32)
tensor([21], dtype=torch.int32)
tensor([5], dtype=torch.int32)
tensor([5], dtype=torch.int32)
tensor([9], dtype=torch.int32)
tensor([5], dtype=torch.int32)
tensor([9], dtype=torch.int32)
tensor([5], dtype=torch.int32)
tensor([24], dtype=torch.int32)
tensor([24], dtype=torch.int32)
tensor([5], dtype=torch.int32)
tensor([5], dtype=torch.int32)
tensor([9], dtype=torch.int32)
tensor([24], dtype=torch.int32)
tensor([9], dtype=torch.int32)
tensor([9], dtype=torch.int32)
tensor([5], dtype=torch.int32)
tensor([9], dtype=torch.int32)
tensor([5], dtype=torch.int32)
tensor([9], dtype=torch.int32)
tensor([9], dtype=torch.int32)
te

In [37]:
Ws = torch.rand((3, 3)).uniform_(-0.01, 0.01)
Ws

tensor([[-0.0011,  0.0025,  0.0047],
        [-0.0039, -0.0002,  0.0027],
        [ 0.0001, -0.0078,  0.0041]])

In [40]:
Ws.requires_grad_(True)

tensor([[-0.0011,  0.0025,  0.0047],
        [-0.0039, -0.0002,  0.0027],
        [ 0.0001, -0.0078,  0.0041]], requires_grad=True)

In [41]:
Ws.requires_grad

True

In [42]:
ss = torch.rand((3, 3)).uniform_(-1, 1)
result = torch.matmul(Ws, ss)

In [44]:
result.backward

<bound method Tensor.backward of tensor([[-6.3306e-04, -4.6255e-03, -3.3278e-05],
        [ 9.3019e-04, -4.6338e-03, -2.2384e-03],
        [ 3.4747e-03, -2.2516e-03, -8.7778e-04]], grad_fn=<MmBackward>)>

In [46]:
torch.nn.LSTM

Attributes:
        weight_ih_l[k] : the learnable input-hidden weights of the :math:`\text{k}^{th}` layer
            `(W_ii|W_if|W_ig|W_io)`, of shape `(4*hidden_size x input_size)`
        weight_hh_l[k] : the learnable hidden-hidden weights of the :math:`\text{k}^{th}` layer
            `(W_hi|W_hf|W_hg|W_ho)`, of shape `(4*hidden_size x hidden_size)`
        bias_ih_l[k] : the learnable input-hidden bias of the :math:`\text{k}^{th}` layer
            `(b_ii|b_if|b_ig|b_io)`, of shape `(4*hidden_size)`
        bias_hh_l[k] : the learnable hidden-hidden bias of the :math:`\text{k}^{th}` layer
            `(b_hi|b_hf|b_hg|b_ho)`, of shape `(4*hidden_size)`

.. math::
        \begin{array}{ll} \\
            i_t = \sigma(W_{ii} x_t + b_{ii} + W_{hi} h_{(t-1)} + b_{hi}) \\
            f_t = \sigma(W_{if} x_t + b_{if} + W_{hf} h_{(t-1)} + b_{hf}) \\
            g_t = \tanh(W_{ig} x_t + b_{ig} + W_{hg} h_{(t-1)} + b_{hg}) \\
            o_t = \sigma(W_{io} x_t + b_{io} + W_{ho} h_{(t-1)} + b_{ho}) \\
            c_t = f_t c_{(t-1)} + i_t g_t \\
            h_t = o_t \tanh(c_t) \\
        \end{array}

        \begin{array}{ll} \\
        
        
                  v = \rho * v + g \\
                  p = p - lr * v

        where p, g, v and :math:`\rho` denote the parameters, gradient,
        velocity, and momentum respectively.

        This is in contrast to Sutskever et. al. and
        other frameworks which employ an update of the form

        .. math::
             v = \rho * v + lr * g \\
             p = p - v

        The Nesterov version is analogously modified.
        
        \end{array}

SyntaxError: invalid syntax (<ipython-input-45-2d254ff22b6c>, line 7)

# 把txt写到csv里

In [83]:
f = open('data/test.cor')
data = []
example = []


In [84]:
for ids, line in enumerate(f.readlines()):
    global example
    example.append(line.replace('\n', ''))
    if len(example) == 3:
        data.append(example)
        example = []

In [71]:
data

[['and cheap !', 'price', '1'],
 ["the staff is n't the friendliest or most competent , and i am stickler for service , but everything else about this place makes up for it .",
  'service',
  '-1'],
 ["the service is always bad though , do n't expect much of anything from your server , and i would not recommend bringing a date here either .",
  'service',
  '-1'],
 ['i absolutely love this place ! ! !', 'miscellaneous', '1'],
 ["a restaurant that does n't try to do anything except serve great food with great service in a pleasant atmosphere .",
  'ambience',
  '1'],
 ['this place was not all that !', 'miscellaneous', '-1'],
 ['the food is great .', 'food', '1'],
 ['highly recommended ! ! ! ! !', 'miscellaneous', '1'],
 ['the food was really good , i had the onion soup and it was one of the best ever .',
  'food',
  '1'],
 ['a++ the service was good to excellent along with the attitude .',
  'service',
  '1'],
 ['we had a 3 hour brunch- they definitely do not rush you- and they kept the

In [72]:
import pandas as pd

In [85]:
dataf = pd.DataFrame(data, index=None)

In [86]:
dataf.to_csv('test_data.csv', index=False, header=False)

Unnamed: 0,0,1,2
0,and cheap !,price,1
1,the staff is n't the friendliest or most compe...,service,-1
2,"the service is always bad though , do n't expe...",service,-1
3,i absolutely love this place ! ! !,miscellaneous,1
4,a restaurant that does n't try to do anything ...,ambience,1
5,this place was not all that !,miscellaneous,-1
6,the food is great .,food,1
7,highly recommended ! ! ! ! !,miscellaneous,1
8,"the food was really good , i had the onion sou...",food,1
9,a++ the service was good to excellent along wi...,service,1


# 用torchtext加载数据

In [156]:
from torchtext import data
from torchtext import datasets
import torch

REVIEW = data.Field(tokenize='spacy', include_lengths=True)
ASPECT = data.Field()
POLARITY = data.LabelField(dtype = torch.float)

In [157]:
fields = [('review', REVIEW), ('aspect', ASPECT), ('polarity', POLARITY)]

In [158]:
train_data, valid_data, test_data = data.TabularDataset.splits(
                                        path = '.',
                                        train = 'train_data.csv',
                                        validation = 'valid_data.csv',
                                        test = 'test_data.csv',
                                        format = 'csv',
                                        fields = fields,
                                        skip_header = False
)

In [168]:
print(vars(train_data.examples[2]))
print(f'Number of training examples: {len(train_data)}')

{'review': ['the', 'service', 'is', 'always', 'bad', 'though', ',', 'do', "n't", 'expect', 'much', 'of', 'anything', 'from', 'your', 'server', ',', 'and', 'i', 'would', 'not', 'recommend', 'bringing', 'a', 'date', 'here', 'either', '.'], 'aspect': ['service'], 'polarity': '-1'}
Number of training examples: 2990


In [160]:
REVIEW.build_vocab(train_data, 
                   max_size=10000,
                  vectors='glove.6B.300d')
ASPECT.build_vocab(train_data)
POLARITY.build_vocab(train_data)

In [161]:
print(f"Unique tokens in REVIEW vocabulary: {len(REVIEW.vocab)}")
print(f"Unique tokens in ASPECT vocabulary: {len(ASPECT.vocab)}")

Unique tokens in REVIEW vocabulary: 4084
Unique tokens in ASPECT vocabulary: 7


In [162]:
print(REVIEW.vocab.freqs.most_common(10))

[('the', 2655), ('.', 2651), (',', 1865), ('and', 1727), ('a', 1117), ('is', 930), ('to', 883), ('i', 869), ('was', 643), ('of', 622)]


In [163]:
print(REVIEW.vocab.stoi)



In [164]:
print(POLARITY.vocab.stoi)
print(ASPECT.vocab.stoi)

defaultdict(<function _default_unk_index at 0x11c87e840>, {'1': 0, '-1': 1, '0': 2})
defaultdict(<function _default_unk_index at 0x11c87e840>, {'<unk>': 0, '<pad>': 1, 'food': 2, 'miscellaneous': 3, 'service': 4, 'ambience': 5, 'price': 6})


In [165]:
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

In [171]:
train_iterator, valid_iterator, test_iterator = data.BucketIterator.splits(
    (train_data, valid_data, test_data), 
    batch_size = 32,
    sort_within_batch = False,
    device = device)

In [185]:
print('Train:')
for batch in train_iterator:
    x, x_seq_len = batch.review
    aspect = batch.aspect
    aspect = torch.LongTensor(np.tile(aspect, (x.size(0), 1)))
    self.embedding = nn.Embedding()
        # wordlist的单词索引从1开始，将0设为pad的索引，该 pad vector 全为0
        self.embedding.padding_idx = 0
        # aspect一共五个词，创建一个5个单词表的embedding，通过target_index得到aspect vector
        # {'price': 0, 'service': 1, 'miscellaneous': 2, 'ambience': 3, 'food': 4}
        self.aspect_embedding =nn.Embedding.from_pretrained(self.Va, freeze=False)
    break


Train:
tensor([[ 102,   22,   30,  ...,    2, 3208,  150],
        [1770,   12,  308,  ...,  628, 3484,   53],
        [  26,    6,  546,  ...,    7, 3501,   67],
        ...,
        [   1,    1,    1,  ...,    1,    1,    1],
        [   1,    1,    1,  ...,    1,    1,    1],
        [   1,    1,    1,  ...,    1,    1,    1]])
tensor([[4, 5, 2,  ..., 2, 2, 5],
        [4, 5, 2,  ..., 2, 2, 5],
        [4, 5, 2,  ..., 2, 2, 5],
        ...,
        [4, 5, 2,  ..., 2, 2, 5],
        [4, 5, 2,  ..., 2, 2, 5],
        [4, 5, 2,  ..., 2, 2, 5]])


In [240]:
import torch
import torch.nn as nn
from torch.nn import functional as F
import time
import torch.random
import numpy as np
from torch.nn.utils.rnn import pack_padded_sequence

class AttentionLstm(nn.Module):
    def __init__(self, vocab_size,aspect_num, embedding_dim, hidden_dim, aspect_dim, output_dim, n_layers,
                bidirectional, dropout, pad_idx):
        super(AttentionLstm, self).__init__()
        self.model_name = 'atae-lstm'
        self.embedding = nn.Embedding(vocab_size, embedding_dim, padding_idx = pad_idx)
        self.aspect_embedding = nn.Embedding(aspect_num+2, aspect_dim)
        self.aspect_dim = aspect_dim
        # 输入是 aspect vector 和 word vector 堆起来的
        self.lstm = nn.LSTM(embedding_dim+aspect_dim, 
                            hidden_dim,
                           num_layers=n_layers,
                           bidirectional=bidirectional,
                           dropout=dropout)
        # 情感细粒度
        self.grained = 3
        # dropout参数
        self.dropout = nn.Dropout(dropout)
        

        # 给 lstm 传的 input 是 PackedSequence 实例,
        # 他可以通过把 batch 的 sequence 输给 pack_padded_sequence()函数，然后返回一个 PackedSequence 实例
        # pack_padded_sequence() 接受 input, length，
        # input can be of size T x B x * where T is the length of the longest sequence
        # 对长度未排序的 sequences , 设定enforce_sorted = False.
        # 如果 enforce_sorted = True, 这一个 batch 的 sequences 应该按序列长度降序
        # 然后把 PackedSequence 给 lstm，它就能返回所有时间步上的 hidden 了，
        # 所以就可以把它们堆起来，得到论文里的所有 hidden 组成的 H
        u = lambda x : 1 / np.sqrt(x)
        self.Ws = nn.Parameter(torch.rand((aspect_num, hidden_dim)).uniform_(-u(hidden_dim), u(hidden_dim)))
        self.bs = nn.Parameter(torch.zeros((aspect_num, 1)))
        self.Wh = nn.Parameter(torch.rand((hidden_dim, hidden_dim)).uniform_(-u(hidden_dim), u(hidden_dim)))
        self.Wv = nn.Parameter(torch.rand((aspect_dim, aspect_dim)).uniform_(-u(hidden_dim), u(hidden_dim)))
        self.w = nn.Parameter(torch.zeros((hidden_dim+aspect_dim, 1)))
        self.Wp = nn.Parameter(torch.rand((hidden_dim, hidden_dim)).uniform_(-u(hidden_dim), u(hidden_dim)))
        self.Wx = nn.Parameter(torch.rand((hidden_dim, hidden_dim)).uniform_(-u(hidden_dim), u(hidden_dim)))

#         self.Va = torch.rand((aspect_dim, aspect_dim)).uniform_(-0.01, 0.01)
#         self.Vw = torch.rand((self.num, embedding_dim)).uniform_(-0.01, 0.01)
#         self.load_word_vector(self.word_vector, self.wordlist)

#         self.embedding = nn.Embedding.from_pretrained(self.Vw, freeze=False)
        # wordlist的单词索引从1开始，将0设为pad的索引，该 pad vector 全为0
#         self.embedding.padding_idx = 0
        # aspect一共五个词，创建一个5个单词表的embedding，通过target_index得到aspect vector
        # {'price': 0, 'service': 1, 'miscellaneous': 2, 'ambience': 3, 'food': 4}
#         self.aspect_embedding =nn.Embedding.from_pretrained(self.Va, freeze=False)
        self.params = nn.ParameterList([self.Wv, self.Wh, self.Ws, self.bs, self.w, self.Wp, self.Wx])


    def forward(self, x, x_len, aspect):
        # x size = (N, 1, 300)  即 time_step * batch_size * dim_word
        x = self.embedding(x)
        aspect = torch.LongTensor(np.tile(aspect, (x.size(0), 1)))
        aspect = self.aspect_embedding(aspect)
        x_len, indices = torch.sort(x_len, descending=True)
        x = x[:, indices]
        aspect = aspect[:, indices]
        x = torch.cat((x, aspect), dim=2)

        x = pack_padded_sequence(x, x_len)
        
#         packed_seqs = pack_padded_sequence(lstm_input, lengths=x_len)
        output, (h_n, c_n) = self.lstm(x)
        # H.size = (300, N)
        H = torch.t(output.data)
        # h_n size = (300, 1)
        h_n = h_n.view(300, -1)
        # Wh_H.size = (300, N)
        Wh_H = torch.matmul(self.Wh, H)
        # Wv_a.size = (100, N)
        Wv_a = torch.matmul(self.Wv, aspect.view(self.aspect_dim, -1))
        # M.size = (400, N)
        M = torch.tanh(torch.cat((Wh_H, Wv_a), dim=0))
        # alpha.size = (1, N)
        alpha = F.softmax(torch.matmul(torch.t(self.w), M), dim=1)
        # r.size = (300, 1)
        r = torch.matmul(H, torch.t(alpha))
        Wp_r = torch.matmul(self.Wp, r)
        Wx_h = torch.matmul(self.Wx, h_n)
        # h_star.size = (300, 1)
        h_star = torch.tanh(Wp_r + Wx_h)

        h_star = self.dropout(h_star)

        # y.size = (self.grained, 1)
        y = F.softmax(torch.matmul(self.Ws, h_star) + self.bs, dim=0)
        return torch.t(y)

In [241]:
INPUT_DIM = len(REVIEW.vocab)
EMBEDDING_DIM = 300
ASPECT_DIM = 300
ASPECT_NUM = 5
HIDDEN_DIM = 300
OUTPUT_DIM = 3
N_LAYERS = 1
BIDIRECTIONAL = False
DROPOUT = 0
PAD_IDX = REVIEW.vocab.stoi[TEXT.pad_token]

model = AttentionLstm(INPUT_DIM,
            ASPECT_NUM,
            EMBEDDING_DIM, 
            HIDDEN_DIM, 
            ASPECT_DIM,
            OUTPUT_DIM, 
            N_LAYERS, 
            BIDIRECTIONAL, 
            DROPOUT, 
            PAD_IDX)

In [242]:
pretrained_embeddings = REVIEW.vocab.vectors

print(pretrained_embeddings.shape)

torch.Size([4084, 300])


In [243]:
model.embedding.weight.data.copy_(pretrained_embeddings)

tensor([[ 0.0000,  0.0000,  0.0000,  ...,  0.0000,  0.0000,  0.0000],
        [ 0.0000,  0.0000,  0.0000,  ...,  0.0000,  0.0000,  0.0000],
        [ 0.0466,  0.2132, -0.0074,  ...,  0.0091, -0.2099,  0.0539],
        ...,
        [ 0.2215,  0.2320,  0.4953,  ...,  0.1988,  0.0092, -0.3601],
        [-0.0189, -0.5264, -0.1337,  ...,  0.6352, -0.2163, -0.5310],
        [-0.3967,  0.2623, -0.0905,  ...,  0.2497, -0.1803, -0.0729]])

In [244]:
import torch.optim as optim

optimizer = optim.Adam(model.parameters())

In [245]:
criterion = nn.CrossEntropyLoss()

model = model.to(device)
criterion = criterion.to(device)

In [246]:
def train(model, iterator, optimizer, criterion):
    
    epoch_loss = 0
    epoch_acc = 0
    
    model.train()
    
    for batch in iterator:
        
        optimizer.zero_grad()
        
        review, review_lengths = batch.review
        
        aspect = batch.aspect
        
        predictions = model(review, review_lengths, aspect).squeeze(1)
        
        loss = criterion(predictions, batch.polarity)
        
#         acc = binary_accuracy(predictions, batch.label)
        
        loss.backward()
        
        optimizer.step()
        
        epoch_loss += loss.item()
        epoch_acc += acc.item()
        
    return epoch_loss / len(iterator), epoch_acc / len(iterator)

In [247]:
N_EPOCHS = 5

best_valid_loss = float('inf')

for epoch in range(N_EPOCHS):

    start_time = time.time()
    
    train_loss, train_acc = train(model, train_iterator, optimizer, criterion)
#     valid_loss, valid_acc = evaluate(model, valid_iterator, criterion)
    
    end_time = time.time()

#     epoch_mins, epoch_secs = epoch_time(start_time, end_time)
    
#     if valid_loss < best_valid_loss:
#         best_valid_loss = valid_loss
#         torch.save(model.state_dict(), 'tut2-model.pt')
    
#     print(f'Epoch: {epoch+1:02} | Epoch Time: {epoch_mins}m {epoch_secs}s')
    print(f'\tTrain Loss: {train_loss:.3f} | Train Acc: {train_acc*100:.2f}%')
#     print(f'\t Val. Loss: {valid_loss:.3f} |  Val. Acc: {valid_acc*100:.2f}%')

RuntimeError: invalid argument 0: Sizes of tensors must match except in dimension 0. Got 241 and 518 in dimension 1 at /Users/soumith/mc3build/conda-bld/pytorch_1549593514549/work/aten/src/TH/generic/THTensorMoreMath.cpp:1307