In [1]:
import os
from lxml import etree
from collections import defaultdict, Counter
import argparse
import datetime
import torch
import torchtext.data as data
import torchtext
from torch.utils.data import Dataset
import json
import re
import numpy as np
from torchtext.vocab import Vectors
from torch.nn import init
from torchtext.vocab import GloVe
import torch.nn as nn

In [2]:
ds_atsa_laptop = {'train': 'data/atsa-laptop/atsa_train.json','test': 'data/atsa-laptop/atsa_test.json','hard_test':'data/atsa-laptop/atsa_hard_test.json'}
ds_atsa_restaurant = {'train': 'data/atsa-restaurant/atsa_train.json','test': 'data/atsa-restaurant/atsa_test.json','hard_test':'data/atsa-restaurant/atsa_hard_test.json'}
ds_atsa = {'laptop':ds_atsa_laptop,'rest':ds_atsa_restaurant}

ds_acsa_large = {'train': 'data/acsa-restaurant-large/acsa_train.json','test':'data/acsa-restaurant-large/acsa_test.json','hard_test':'data/acsa-restaurant-large/acsa_hard_test.json'}
ds_acsa_2014 = {'train': 'data/acsa-restaurant-2014/acsa_train.json','test':'data/acsa-restaurant-2014/acsa_test.json','hard_test':'data/acsa-restaurant-2014/acsa_hard_test.json'}
ds_acsa = {'large':ds_acsa_large,'2014':ds_acsa_2014}
ds_files ={'atsa':ds_atsa,'acsa':ds_acsa}

In [3]:
def clean_str(string):
    """
    Tokenization/string cleaning for all datasets except for SST.
    Original taken from https://github.com/yoonkim/CNN_sentence/blob/master/process_data.py
    """
    string = re.sub(r"[^A-Za-z0-9(),!?\'\`]", " ", string)
    string = re.sub(r"\'s", " \'s", string)
    string = re.sub(r"\'ve", " \'ve", string)
    string = re.sub(r"n\'t", " n\'t", string)
    string = re.sub(r"\'re", " \'re", string)
    string = re.sub(r"\'d", " \'d", string)
    string = re.sub(r"\'ll", " \'ll", string)
    string = re.sub(r",", " , ", string)
    string = re.sub(r"!", " ! ", string)
    string = re.sub(r"\(", " \( ", string)
    string = re.sub(r"\)", " \) ", string)
    string = re.sub(r"\?", " \? ", string)
    string = re.sub(r"\s{2,}", " ", string)
    return string.strip()

def get_data_from_json(file_path):
        with open(file_path,'r') as load_f:
            dataset = json.load(load_f)
            return dataset
def load_semeval_data(text_field,aspect_field,sentiment_field,dataset_file):
    semeval_train = get_data_from_json(dataset_file['train'])
    semeval_test = get_data_from_json(dataset_file['test'])
    semeval_hard_test = get_data_from_json(dataset_file['hard_test'])
    
    train_data = SemEval(text_field,aspect_field,sentiment_field,semeval_train)
    test_data = SemEval(text_field,aspect_field,sentiment_field,semeval_test)
    hard_test_data = SemEval(text_field,aspect_field,sentiment_field,semeval_hard_test)
    return train_data,test_data,hard_test_data

class SemEval(data.Dataset):

    @staticmethod
    def sort_key(ex):
        return len(ex.text)
    
    def __init__(self,text_field,aspect_field,sentiment_field,input_data,**kwargs):
        """ Create an SemEval Dataset instance given a path and fields.
        
        Arguments:
            text_field: The field that will be used for text data.
            aspect_field:  The field that will be used for aspect data.
            sentiment_field: The field that will be used for sentiment data.
            input_data: The examples contain all the data.
            Remaining keyword arguments: Passed to the constructor of data.Dataset.
        
        """
        text_field.preprocessing = data.Pipeline(clean_str)
        fields = [('text',text_field),('aspect',aspect_field),('sentiment',sentiment_field)]
        examples = []
        for e in input_data:
            if 'pp.' in e['sentence']:
                continue
            examples.append(data.Example.fromlist([e['sentence'],e['aspect'],e['sentiment']],fields))
        super(SemEval,self).__init__(examples,fields,**kwargs)
        


In [4]:

text_field = data.Field(lower=True,tokenize='moses')
aspect_field = data.Field(sequential=False)
sentiment_field = data.Field(sequential=False)
train_data, test_data,hard_test_data = load_semeval_data(text_field,aspect_field,sentiment_field,ds_files['acsa']['2014'])

In [5]:
text_field.build_vocab(train_data,test_data)
aspect_field.build_vocab(train_data,test_data)
sentiment_field.build_vocab(train_data,test_data)
print(len(text_field.vocab))
print(len(aspect_field.vocab))
print(len(sentiment_field.vocab))

5095
6
5


In [6]:
sentiment_field.vocab.itos

['<unk>', 'positive', 'negative', 'neutral', 'conflict']

In [7]:
aspect_field.vocab.itos

['<unk>', 'food', 'misc', 'service', 'ambience', 'price']

In [8]:
# text_field.vocab.itos
aspect_field.vocab.stoi

defaultdict(<bound method Vocab._default_unk_index of <torchtext.vocab.Vocab object at 0x00000219D3970088>>,
            {'<unk>': 0,
             'food': 1,
             'misc': 2,
             'service': 3,
             'ambience': 4,
             'price': 5})

In [None]:
def load_glove(word_list,uniform_scale,dimension_size):
    glove_words = {}
    file_path = 'data/glove.6B.50d_test.txt'
    with open(file_path,'r',encoding='UTF-8') as fopen:
        for line in fopen:
            tokens = line.rstrip().split()
            glove_words[tokens[0]] = np.array(tokens[1:],dtype='float32')
    word_vectors = []
    for word in word_list:
        if word in glove_words:
            word_vectors.append(glove_words[word])
        elif word == '<pad>':
            word_vectors.append(np.zeros(dimension_size,dtype=np.float32))
        else:
            word_vectors.append(np.random.uniform(-uniform_scale,uniform_scale,dimension_size))
#     word_vectors = np.asarray(word_vectors,dtype=np.float32)
    return word_vectors
word_vectors = load_glove(text_field.vocab.itos,0.25,300)
def load_aspect_embedding_from_w2v(aspect_list,word_stoi,w2v):
    aspect_vectors = []
#     print(word_stoi)
    for w in aspect_list:
        print(w)
        print(word_stoi[w])
        aspect_vectors.append(w2v[word_stoi[w]])
    return aspect_vectors
load_aspect_embedding_from_w2v(aspect_field.vocab.itos,text_field.vocab.stoi,word_vectors)

In [10]:
def _load_word_vec(path, word2idx=None):
    fin = open(path, 'r', encoding='utf-8', newline='\n', errors='ignore')
    word_vec = {}
    for line in fin:
        tokens = line.rstrip().split() # rstrip()，删除string末尾的指定字符（默认为空格）
        # 如果预训练词向量的包含的单词在语料中出现，就把单词及对应向量添加到字典中
        if word2idx is None or tokens[0] in word2idx.keys():
            word_vec[tokens[0]] = np.asarray(tokens[1:], dtype='float32')
    return word_vec

In [10]:
batch_size = 32
train_iter,test_iter, hard_test_iter = data.Iterator.splits((train_data,test_data,hard_test_data),batch_sizes=(batch_size,len(test_data),len(hard_test_data)))

In [11]:
train_data

<__main__.SemEval at 0x219d3b50d48>

In [12]:
epochs = 1
for epoch in range(1, epochs+1):
        for batch in train_iter:
            feature, aspect, target = batch.text, batch.aspect, batch.sentiment
#             print(feature.t_())
#             print(batch.text)
            print(feature.shape)
            print(aspect.shape)
            aspect.unsqueeze_(0)
            print(aspect.shape)
            break

torch.Size([36, 32])
torch.Size([32])
torch.Size([1, 32])


In [12]:
batch = next(iter(train_iter))
print("batch text: ", batch.text) # 对应 Fileld 的 name
print("batch aspect: ", batch.aspect)
print("batch sentiment: ", batch.sentiment)

batch text:  tensor([[   5,   12, 2072,  ...,    9,    3,  584],
        [  82,  220,    7,  ...,   74,  414,   18],
        [ 217,    4,    6,  ...,   37,  394,    4],
        ...,
        [   1,    1,    1,  ...,    1,    1,    1],
        [   1,    1,    1,  ...,    1,    1,    1],
        [   1,    1,    1,  ...,    1,    1,    1]])
batch aspect:  tensor([1, 1, 3, 1, 1, 3, 1, 5, 1, 2, 1, 2, 3, 3, 1, 3, 1, 1, 2, 1, 1, 1, 2, 4,
        2, 2, 5, 1, 2, 2, 1, 3])
batch sentiment:  tensor([1, 1, 1, 1, 1, 3, 2, 2, 4, 1, 1, 1, 2, 1, 3, 1, 4, 1, 1, 1, 1, 1, 1, 2,
        2, 1, 1, 1, 1, 1, 3, 1])


In [18]:
# TEXT = data.Field(sequential=True)
# vectors = Vectors(name='data/glove.6B.50d_test.txt')
# TEXT.build_vocab(train_data,test_data, vectors=vectors)
# vectors.unk_init = init.xavier_uniform_

In [13]:
batch


[torchtext.data.batch.Batch of size 32]
	[.text]:[torch.LongTensor of size 38x32]
	[.aspect]:[torch.LongTensor of size 32]
	[.sentiment]:[torch.LongTensor of size 32]

In [14]:
# ACSA
V = len(text_field.vocab) # 词表长度
D = 300
A = len(aspect_field.vocab)
Co = 128 # kernel num
Ks = [2,3,4]
embed = nn.Embedding(V,D)
aspect_embed = nn.Embedding(A,D)
convs1 = nn.ModuleList([nn.Conv1d(D,Co,K) for K in Ks])
convs2 = nn.ModuleList([nn.Conv1d(D,Co,K) for K in Ks])

In [15]:
nn.ModuleList([nn.Conv1d(D,Co,K) for K in Ks])

ModuleList(
  (0): Conv1d(300, 128, kernel_size=(2,), stride=(1,))
  (1): Conv1d(300, 128, kernel_size=(3,), stride=(1,))
  (2): Conv1d(300, 128, kernel_size=(4,), stride=(1,))
)

In [16]:
feature,aspect = batch.text,batch.aspect

In [17]:
feature.shape

torch.Size([38, 32])

In [18]:
aspect.shape

torch.Size([32])

In [19]:
embed

Embedding(5095, 300)

In [22]:
feature.t_()
print(feature.shape)
feature_embed = embed(feature)
feature_embed.shape

torch.Size([32, 38])


torch.Size([32, 38, 300])

In [23]:
aspect.shape

torch.Size([32])

In [24]:
print(aspect.shape)
aspect.unsqueeze_(0)
print(aspect.shape)

torch.Size([32])
torch.Size([1, 32])


In [27]:
aspect.t_()
print(aspect.shape)
aspect_v = aspect_embed(aspect)
aspect_v.shape

torch.Size([32, 1])


torch.Size([32, 1, 300])

In [28]:
aspect_v.sum(dim=1).shape

torch.Size([32, 300])

In [29]:
aspect_v.size(1)

1

In [30]:
aspect_v = aspect_v.sum(dim=1)/aspect_v.size(dim=1)
print(aspect_v.shape)

torch.Size([32, 300])


In [31]:
feature_embed.shape

torch.Size([32, 38, 300])

In [32]:
x = [torch.tanh(conv(feature_embed.transpose(1,2))) for conv in convs1]

In [33]:
len(x)

3

In [34]:
x[0].shape

torch.Size([32, 128, 37])

In [35]:
x[1].shape

torch.Size([32, 128, 36])

In [36]:
x[2].shape

torch.Size([32, 128, 35])

In [96]:
feature_embed.shape

torch.Size([32, 44, 300])

In [38]:
fc_aspect = nn.Linear(D,Co)
fc_aspect

Linear(in_features=300, out_features=128, bias=True)

In [40]:
fc_aspect(aspect_v).unsqueeze(2).shape

torch.Size([32, 128, 1])

In [41]:
feature_embed.shape

torch.Size([32, 38, 300])

In [42]:
y = [torch.relu(conv(feature_embed.transpose(1,2))+fc_aspect(aspect_v).unsqueeze(2)) for conv in convs2]

In [44]:
len(y)

3

In [46]:
y[0].shape

torch.Size([32, 128, 37])

In [47]:
y[1].shape

torch.Size([32, 128, 36])

In [48]:
y[2].shape

torch.Size([32, 128, 35])

In [49]:
z = [i*j for i,j in zip(x,y)]

In [60]:
torch.max_pool1d

<function _VariableFunctions.max_pool1d>

In [61]:
z[0].shape

torch.Size([32, 128, 37])

In [64]:
torch.max_pool1d(z[0],z[0].size(2)).shape

torch.Size([32, 128, 1])

In [65]:
z1 = [torch.max_pool1d(i, i.size(2)).squeeze(2) for i in z]

In [67]:
z1[0].shape

torch.Size([32, 128])

In [70]:
z1 = [i.view(i.size(0),-1) for i in z1]

In [71]:
z1[0].shape

torch.Size([32, 128])

In [72]:
z2 = torch.cat(z1,1)

In [74]:
z2.shape

torch.Size([32, 384])

In [76]:
fc1 = nn.Linear(len(Ks)*Co, 4)

In [77]:
fc1

Linear(in_features=384, out_features=4, bias=True)

In [78]:
for k in [3]:
    print(k)

3


In [109]:
# ATSA
text_field = data.Field(lower=True,tokenize='moses')
aspect_field = data.Field(lower=True, tokenize='moses')
sentiment_field = data.Field(sequential=False)
train_data, test_data,hard_test_data = load_semeval_data(text_field,aspect_field,sentiment_field,ds_files['atsa']['laptop'])

In [110]:
text_field.build_vocab(train_data,test_data)
aspect_field.build_vocab(train_data,test_data)
sentiment_field.build_vocab(train_data,test_data)
print(len(text_field.vocab))
print(len(aspect_field.vocab))
print(len(sentiment_field.vocab))

3563
988
5


In [111]:
sentiment_field.vocab.itos

['<unk>', 'positive', 'negative', 'neutral', 'conflict']

In [None]:
aspect_field.vocab.itos

In [113]:
batch_size = 32
train_iter,test_iter, hard_test_iter = data.Iterator.splits((train_data,test_data,hard_test_data),batch_sizes=(batch_size,len(test_data),len(hard_test_data)))

In [114]:
batch = next(iter(train_iter))
print("batch text: ", batch.text) # 对应 Fileld 的 name
print("batch aspect: ", batch.aspect)
print("batch sentiment: ", batch.sentiment)

batch text:  tensor([[   8,  115,   15,  ...,   66,   15,    2],
        [  10,   88,   35,  ..., 1006,  295,  387],
        [  55, 1670,  118,  ...,   16,   82,   10],
        ...,
        [   1,    1,    1,  ...,    1,    1,    1],
        [   1,    1,    1,  ...,    1,    1,    1],
        [   1,    1,    1,  ...,    1,    1,    1]])
batch aspect:  tensor([[  6, 124,   2, 209,   2,  53,  16,  79,  18,  32,   5, 470,  55, 440,
           2,  24, 248, 779,   9,  61,  56, 201,   4,  85,  15, 253, 442,   3,
          19, 138,  71, 345],
        [  1, 319,   1,  27,   8,   1,  88,   1,   1,   1,   1,   1,  25,  10,
           8,   1,   1, 297,   1,   1, 572,   1, 172,   1,  42,  34,   1,   1,
           1, 371,  11,   1],
        [  1,   1,   1, 188,   1,   1,   1,   1,   1,   1,   1,   1,   1,   1,
           1,   1,   1, 624,   1,   1,   1,   1,   1,   1,   1,  25,   1,   1,
           1,   1,   1,   1],
        [  1,   1,   1,   3,   1,   1,   1,   1,   1,   1,   1,   1,   1,   1,
   

In [115]:
aspect = batch.aspect
aspect.shape

torch.Size([4, 32])

In [116]:
aspect

tensor([[  6, 124,   2, 209,   2,  53,  16,  79,  18,  32,   5, 470,  55, 440,
           2,  24, 248, 779,   9,  61,  56, 201,   4,  85,  15, 253, 442,   3,
          19, 138,  71, 345],
        [  1, 319,   1,  27,   8,   1,  88,   1,   1,   1,   1,   1,  25,  10,
           8,   1,   1, 297,   1,   1, 572,   1, 172,   1,  42,  34,   1,   1,
           1, 371,  11,   1],
        [  1,   1,   1, 188,   1,   1,   1,   1,   1,   1,   1,   1,   1,   1,
           1,   1,   1, 624,   1,   1,   1,   1,   1,   1,   1,  25,   1,   1,
           1,   1,   1,   1],
        [  1,   1,   1,   3,   1,   1,   1,   1,   1,   1,   1,   1,   1,   1,
           1,   1,   1, 662,   1,   1,   1,   1,   1,   1,   1,   1,   1,   1,
           1,   1,   1,   1]])

In [119]:
A = len(aspect_field.vocab) 
aspect_embed = nn.Embedding(A,D)

In [120]:
convs3 = nn.ModuleList([nn.Conv1d(D, Co, K, padding=K-2) for K in [3]])
convs3

ModuleList(
  (0): Conv1d(300, 128, kernel_size=(3,), stride=(1,), padding=(1,))
)

In [121]:
fc_aspect1 = nn.Linear(100, Co)
fc_aspect1

Linear(in_features=100, out_features=128, bias=True)

In [123]:
aspect_v1 = aspect_embed(aspect.t_())
aspect_v1.shape

torch.Size([32, 4, 300])

In [124]:
aa = [torch.relu(conv(aspect_v1.transpose(1,2))) for conv in convs3]

In [125]:
len(aa)

1

In [126]:
aa[0].shape

torch.Size([32, 128, 4])

In [127]:
torch.max_pool1d(aa[0],aa[0].size(2)).shape

torch.Size([32, 128, 1])

In [128]:
aa = [torch.max_pool1d(a,a.size(2)).squeeze(2) for a in aa]

In [129]:
aa[0].shape

torch.Size([32, 128])

In [130]:
aspect_v1 = torch.cat(aa,1)
aspect_v1.shape

torch.Size([32, 128])

In [131]:
fc_aspect1(aspect_v1)

RuntimeError: size mismatch, m1: [32 x 128], m2: [100 x 128] at C:\w\1\s\tmp_conda_3.7_100118\conda\conda-bld\pytorch_1579082551706\work\aten\src\TH/generic/THTensorMath.cpp:136