In [1]:

import os
import pickle
import numpy as np
import torch
from torch.utils.data import Dataset
# from pytorch_transformers import BertTokenizer
from transformers import BertTokenizer

def insert(original, new, pos):
# '''Inserts new inside original at pos.'''
    return original[:pos] + new + original[pos:]

def build_tokenizer(fnames, max_seq_len, dat_fname):
    if os.path.exists(dat_fname):
        print('loading tokenizer:', dat_fname)
        tokenizer = pickle.load(open(dat_fname, 'rb'))
    else:
        text = ''
        for fname in fnames:
            fin = open(fname, 'r', encoding='utf-8', newline='\n', errors='ignore')
            lines = fin.readlines()
            fin.close()
            for i in range(0, len(lines), 3):
                text_left, _, text_right = [s.lower().strip() for s in lines[i].partition("$T$")]
                aspect = lines[i + 1].lower().strip()
                text_raw = text_left + " " + aspect + " " + text_right
                text += text_raw + " "

        tokenizer = Tokenizer(max_seq_len)
        tokenizer.fit_on_text(text)
        pickle.dump(tokenizer, open(dat_fname, 'wb'))
    return tokenizer


def _load_word_vec(path, word2idx=None):
    fin = open(path, 'r', encoding='utf-8', newline='\n', errors='ignore')
    word_vec = {}
    for line in fin:
        tokens = line.rstrip().split()
        if word2idx is None or tokens[0] in word2idx.keys():
            word_vec[tokens[0]] = np.asarray(tokens[1:], dtype='float32')
    return word_vec


def build_embedding_matrix(word2idx, embed_dim, dat_fname):
    if os.path.exists(dat_fname):
        print('loading embedding_matrix:', dat_fname)
        embedding_matrix = pickle.load(open(dat_fname, 'rb'))
    else:
        print('loading word vectors...')
        embedding_matrix = np.zeros((len(word2idx) + 2, embed_dim))  # idx 0 and len(word2idx)+1 are all-zeros
        fname = './glove.twitter.27B/glove.twitter.27B.' + str(embed_dim) + 'd.txt' \
            if embed_dim != 300 else './glove.42B.300d.txt'
        word_vec = _load_word_vec(fname, word2idx=word2idx)
        print('building embedding_matrix:', dat_fname)
        for word, i in word2idx.items():
            vec = word_vec.get(word)
            if vec is not None:
                # words not found in embedding index will be all-zeros.
                embedding_matrix[i] = vec
        pickle.dump(embedding_matrix, open(dat_fname, 'wb'))
    return embedding_matrix


def pad_and_truncate(sequence, maxlen, dtype='int64', padding='post', truncating='post', value=0):
    x = (np.ones(maxlen) * value).astype(dtype)
    if truncating == 'pre':
        trunc = sequence[-maxlen:]
    else:
        trunc = sequence[:maxlen]
    trunc = np.asarray(trunc, dtype=dtype)
    if padding == 'post':
        x[:len(trunc)] = trunc
    else:
        x[-len(trunc):] = trunc
    return x


def pad(a,maxlen):
    B = np.pad(a, (0, maxlen - len(a)%maxlen), 'constant')
    return B


class Tokenizer(object):
    def __init__(self, max_seq_len, lower=True):
        self.lower = lower
        self.max_seq_len = max_seq_len
        self.word2idx = {}
        self.idx2word = {}
        self.idx = 1

    def fit_on_text(self, text):
        if self.lower:
            text = text.lower()
        words = text.split()
        for word in words:
            if word not in self.word2idx:
                self.word2idx[word] = self.idx
                self.idx2word[self.idx] = word
                self.idx += 1

    def text_to_sequence(self, text, reverse=False, padding='post', truncating='post'):
        if self.lower:
            text = text.lower()
        words = text.split()
        unknownidx = len(self.word2idx)+1
        sequence = [self.word2idx[w] if w in self.word2idx else unknownidx for w in words]
        if len(sequence) == 0:
            sequence = [0]
        if reverse:
            sequence = sequence[::-1]
        return pad_and_truncate(sequence, self.max_seq_len, padding=padding, truncating=truncating)


class Tokenizer4Bert:
    def __init__(self, max_seq_len, pretrained_bert_name):
        self.tokenizer = BertTokenizer.from_pretrained(pretrained_bert_name)
        self.max_seq_len = max_seq_len

    def text_to_sequence(self, text, reverse=False, padding='post', truncating='post'):
        sequence = self.tokenizer.convert_tokens_to_ids(self.tokenizer.tokenize(text))
        if len(sequence) == 0:
            sequence = [0]
        if reverse:
            sequence = sequence[::-1]
        return pad_and_truncate(sequence, self.max_seq_len, padding=padding, truncating=truncating)
    
    def add_tokens(self,params):
        self.tokenizer.add_tokens(params)

        



In [2]:
import logging
import argparse
import math
import os
import sys
from time import strftime, localtime
import random
import numpy

from pytorch_transformers import BertModel,BertForTokenClassification,BertConfig
# from transformers import BertModel,BertForTokenClassification,BertConfig

# from models.knowledge_bert import BertForTokenClassification

from sklearn import metrics
import torch
import torch.nn as nn
from torch.utils.data import DataLoader, random_split

from data_utils import build_tokenizer, build_embedding_matrix, Tokenizer4Bert, ABSADataset

from models import LSTM, IAN, MemNet, RAM, TD_LSTM, Cabasc, ATAE_LSTM, TNet_LF, AOA, MGAN, LCF_BERT
from models.aen import CrossEntropyLoss_LSR, AEN_BERT
from models.bert_spc import BERT_SPC
from models.bert_raw import BERT_RAW
from models.bert_label import BERT_LABEL
from models.bert_aspect import BERT_ASPECT
from models.bert_target import BERT_TARGET
from models.bert_multi_target import BERT_MULTI_TARGET
from models.bert_kg import BERT_KG
from models.bert_compete import BERT_COMPETE


logger = logging.getLogger()
logger.setLevel(logging.INFO)
logger.addHandler(logging.StreamHandler(sys.stdout))

In [3]:
parser = argparse.ArgumentParser()
parser.add_argument('--model_name', default='bert_spc', type=str)
parser.add_argument('--dataset', default='laptop', type=str, help='twitter, restaurant, laptop')
parser.add_argument('--optimizer', default='adam', type=str)
parser.add_argument('--initializer', default='xavier_uniform_', type=str)
parser.add_argument('--learning_rate', default=2e-5, type=float, help='try 5e-5, 2e-5 for BERT, 1e-3 for others')
parser.add_argument('--dropout', default=0.1, type=float)
parser.add_argument('--l2reg', default=0.01, type=float)
parser.add_argument('--num_epoch', default=10, type=int, help='try larger number for non-BERT models')
parser.add_argument('--batch_size', default=16, type=int, help='try 16, 32, 64 for BERT models')
parser.add_argument('--log_step', default=5, type=int)
parser.add_argument('--embed_dim', default=300, type=int)
parser.add_argument('--hidden_dim', default=300, type=int)
parser.add_argument('--bert_dim', default=768, type=int)
parser.add_argument('--pretrained_bert_name', default='bert-base-uncased', type=str)
parser.add_argument('--max_seq_len', default=128, type=int)
parser.add_argument('--polarities_dim', default=3, type=int)
# parser.add_argument('--hops', default=3, type=int)
parser.add_argument('--device', default=None, type=str, help='e.g. cuda:0')
parser.add_argument('--seed', default=None, type=int, help='set seed for reproducibility')
parser.add_argument('--valset_ratio', default=0, type=float, help='set ratio between 0 and 1 for validation support')
parser.add_argument('--load_mode', default=0, type=int, help='load existed model')

# The following parameters are only valid for the lcf-bert model
parser.add_argument('--local_context_focus', default='cdm', type=str, help='local context focus mode, cdw or cdm')
parser.add_argument('--SRD', default=3, type=int, help='semantic-relative-distance, see the paper of LCF-BERT model')
opt = parser.parse_args([])

In [4]:
tokenizer = Tokenizer4Bert(opt.max_seq_len, opt.pretrained_bert_name)
config = BertConfig.from_pretrained(opt.pretrained_bert_name, output_attentions=True)
bert = BertModel.from_pretrained(opt.pretrained_bert_name,config=config)
num_added_tokens = tokenizer.add_tokens(['[aspect_b]','[aspect_e]'])
bert.resize_token_embeddings(len(tokenizer.tokenizer))

loading file https://s3.amazonaws.com/models.huggingface.co/bert/bert-base-uncased-vocab.txt from cache at /home/xiangpan/.cache/torch/transformers/26bc1ad6c0ac742e9b52263248f6d0f00068293b33709fae12320c0e35ccfbbb.542ce4285a40d23a559526243235df47c5f75c197f04f37d1a0c124c32c9a084
loading configuration file https://s3.amazonaws.com/models.huggingface.co/bert/bert-base-uncased-config.json from cache at /home/xiangpan/.cache/torch/pytorch_transformers/4dad0251492946e18ac39290fcfe91b89d370fee250efe9521476438fe8ca185.8f56353af4a709bf5ff0fbc915d8f5b42bfff892cbb6ac98c3c45f481a03c685
Model config {
  "architectures": [
    "BertForMaskedLM"
  ],
  "attention_probs_dropout_prob": 0.1,
  "finetuning_task": null,
  "hidden_act": "gelu",
  "hidden_dropout_prob": 0.1,
  "hidden_size": 768,
  "initializer_range": 0.02,
  "intermediate_size": 3072,
  "layer_norm_eps": 1e-12,
  "max_position_embeddings": 512,
  "num_attention_heads": 12,
  "num_hidden_layers": 12,
  "num_labels": 2,
  "output_attentions"

Embedding(30524, 768)

In [5]:
fin = open('./datasets/semeval14/Restaurants_Train.xml.seg', 'r', encoding='utf-8', newline='\n', errors='ignore')
lines = fin.readlines()
fin.close()

all_data = []

for i in range(0, len(lines), 3):
    text_left, _, text_right = [s.lower().strip() for s in lines[i].partition("$T$")]
    aspect = lines[i + 1].lower().strip()
    polarity = lines[i + 2].strip()

    text_raw="[CLS] " + text_left + " " + aspect + " " + text_right + " [SEP]"
    text_spc='[CLS] ' + text_left + " " + aspect + " " + text_right + ' [SEP] ' + aspect + " [SEP]"
    text_target='[CLS] ' + text_left + ' [aspect_b] '+aspect + ' [aspect_e] '+ text_right + ' [SEP] '

    text_without_cls=text_left + " " + aspect + " " + text_right + ' [SEP] ' + aspect + " [SEP]"


    text_raw_indices = tokenizer.text_to_sequence(text_left + " " + aspect + " " + text_right)


    text_target_indics = tokenizer.text_to_sequence(text_target)
    text_target_segments_ids=np.asarray([0] * (np.sum(text_target_indics != 0)))
    text_target_segments_ids = pad_and_truncate(text_target_segments_ids, tokenizer.max_seq_len)


    text_raw_without_aspect_indices = tokenizer.text_to_sequence(text_left + " " + text_right)
    text_left_indices = tokenizer.text_to_sequence(text_left)
    text_left_with_aspect_indices = tokenizer.text_to_sequence(text_left + " " + aspect)
    text_right_indices = tokenizer.text_to_sequence(text_right, reverse=True)
    text_right_with_aspect_indices = tokenizer.text_to_sequence(" " + aspect + " " + text_right, reverse=True)
    aspect_indices = tokenizer.text_to_sequence(aspect)
    left_context_len = np.sum(text_left_indices != 0)
    aspect_len = np.sum(aspect_indices != 0)
    aspect_pos = left_context_len+1
    target_begin=left_context_len+1
    aspect_in_text = torch.tensor([left_context_len.item(), (left_context_len + aspect_len - 1).item()])
    # aspect_range = torch.LongTensor(range(left_context_len.item()+1, (left_context_len + aspect_len).item()+1))# plus [cls]
    polarity = int(polarity) + 1

    text_bert_indices = tokenizer.text_to_sequence('[CLS] ' + text_left + " " + aspect + " " + text_right + ' [SEP] ' + aspect + " [SEP]")
    # text_bert_indices = tokenizer.text_to_sequence('[CLS] '+ text_left + " " + aspect + " " + text_right + ' [SEP] '+ aspect + " [SEP] ")

    bert_segments_ids = np.asarray([0] * (np.sum(text_raw_indices != 0)+2) + [1] * (aspect_len + 1))
    # bert_segments_ids = np.asarray([1] * (aspect_len + 1)+[0] * (np.sum(text_raw_indices != 0) + 2))
    bert_raw_segments_ids=np.asarray([0] * (np.sum(text_raw_indices != 0)+2))
    bert_segments_ids = pad_and_truncate(bert_segments_ids, tokenizer.max_seq_len)
    bert_raw_segments_ids = pad_and_truncate(bert_raw_segments_ids, tokenizer.max_seq_len)
    text_raw_bert_indices = tokenizer.text_to_sequence("[CLS] " + text_left + " " + aspect + " " + text_right + " [SEP]")
    aspect_bert_indices = tokenizer.text_to_sequence("[CLS] " + aspect + " [SEP]")
    input_mask=torch.tensor([1]*len(text_bert_indices))
    # print(aspect_indices)
    data = {
        'text_target_indics':text_target_indics,
        'text_target_segments_ids':text_target_segments_ids,
        'aspect_pos':aspect_pos,
        'aspect_len':aspect_len,
        'target_begin':target_begin,
        'text_raw': text_raw,
        'text_spc': text_spc,
        'text_without_cls': text_without_cls,
        'text_aspect':aspect,
        'left_context_len': left_context_len,
        'text_bert_indices': text_bert_indices,
        'bert_segments_ids': bert_segments_ids,
        'text_raw_bert_indices': text_raw_bert_indices,
        'aspect_bert_indices': aspect_bert_indices,
        'text_raw_indices': text_raw_indices,
        'bert_raw_segments_ids':bert_raw_segments_ids,
        'text_raw_without_aspect_indices': text_raw_without_aspect_indices,
        'text_left_indices': text_left_indices,
        'text_left_with_aspect_indices': text_left_with_aspect_indices,
        'text_right_indices': text_right_indices,
        'text_right_with_aspect_indices': text_right_with_aspect_indices,
        'aspect_indices': aspect_indices,
        'aspect_in_text': aspect_in_text,
        'polarity': polarity,
        # 'polaritys':polaritys,
        'input_mask':input_mask,
    }

    all_data.append(data)

In [6]:
begin_token=tokenizer.text_to_sequence('[aspect_b]')[0]
end_token=tokenizer.text_to_sequence('[aspect_e]')[0]

idx=0       
while idx in range(len(all_data)):
    data=all_data[idx]
    text_raw=data['text_raw']
    flag = True
    count=0
    while flag:
        count=count+1
        if idx+count not in range(len(all_data)):
            break
        text_raw_next=all_data[idx+count]['text_raw']
        if (text_raw_next!=text_raw):
            flag=False
    aspect_list=[]
    for i in range(0,count):
        text_aspect=all_data[idx+i]['text_aspect']
        aspect_list.append(text_aspect)
    for i in range(0,count):
        all_data[idx+i]['aspect_list']=aspect_list
    idx=idx+count
a=np.array(all_data)
np.save("all_data.npy",a)

In [7]:
num_added_tokens = tokenizer.add_tokens(['[aspect_b]','[aspect_e]'])
num_added_tokens = tokenizer.add_tokens(['[target_b]','[target_e]'])
bert.resize_token_embeddings(len(tokenizer.tokenizer))

Adding [target_b] to the vocabulary
Adding [target_e] to the vocabulary


Embedding(30526, 768)

In [8]:
target_b=tokenizer.text_to_sequence('[target_b]')[0]
target_e=tokenizer.text_to_sequence('[target_e]')[0]
aspect_b=tokenizer.text_to_sequence('[aspect_b]')[0]
aspect_e=tokenizer.text_to_sequence('[aspect_e]')[0]

In [9]:
target_b

30524

In [10]:
i=2

In [14]:
all_data=np.load('all_data.npy',allow_pickle=True).tolist()

In [13]:
for i in range(len(all_data)):
    all_data[i]['text_multi']=all_data[i]['text_raw']
    for aspect in all_data[i]['aspect_list']:
        aspect_len=len(aspect)
        text_multi=all_data[i]['text_multi']
        if aspect == all_data[i]['text_aspect']:
            text_multi=insert(text_multi,' [target_b] ',text_multi.find(aspect))
            text_multi=insert(text_multi,' [target_e] ',text_multi.find(aspect)+len(aspect))
        else:
            text_multi=insert(text_multi,' [aspect_b] ',text_multi.find(aspect))
            text_multi=insert(text_multi,' [aspect_e] ',text_multi.find(aspect)+len(aspect))
        all_data[i]['text_multi']=text_multi
    multi_target_indics = tokenizer.text_to_sequence(all_data[i]['text_multi'])
    all_data[i]['multi_target_indics']=multi_target_indics
    multi_target_segments_ids=np.asarray([0] * (np.sum(multi_target_indics != 0)))
    multi_target_segments_ids = pad_and_truncate(multi_target_segments_ids, tokenizer.max_seq_len)
    all_data[i]['multi_target_segments_ids']=multi_target_segments_ids
    pos=np.argwhere(all_data[i]['multi_target_indics']==target_b)[0][0]
#     print(pos,all_data[i]['multi_target_indics'])
    all_data[i]['target_pos']=pos

In [15]:
for i in range(len(all_data)):
            all_data[i]['text_multi']=all_data[i]['text_raw']
            now=0
            print(len(all_data[i]['aspect_list']),all_data[i]['aspect_list'])
            for aspect in all_data[i]['aspect_list']:
                aspect_len=len(aspect)
                text_multi=all_data[i]['text_multi']
                if aspect == all_data[i]['text_aspect']:
                    text_multi=insert(text_multi,' [target_b] ',text_multi.find(aspect))
                    text_multi=insert(text_multi,' [target_e] ',text_multi.find(aspect)+len(aspect))
                else:
                    aspect_b_now=aspect_b[0]
                    aspect_e_now=aspect_e[0]
                    text_multi=insert(text_multi,' '+aspect_b_now+' ',text_multi.find(aspect))
                    text_multi=insert(text_multi,' '+aspect_e_now+' ',text_multi.find(aspect)+len(aspect))
                    now=now+1
                    # text_multi=insert(text_multi,' [aspect_b] ',text_multi.find(aspect))
                    # text_multi=insert(text_multi,' [aspect_e] ',text_multi.find(aspect)+len(aspect))
                all_data[i]['text_multi']=text_multi
            multi_target_indices = tokenizer.text_to_sequence(all_data[i]['text_multi'])
            all_data[i]['multi_target_indices']=multi_target_indices
            multi_target_segments_ids=np.asarray([0] * (np.sum(multi_target_indices != 0)))
            multi_target_segments_ids = pad_and_truncate(multi_target_segments_ids, tokenizer.max_seq_len)
            all_data[i]['multi_target_segments_ids']=multi_target_segments_ids
            pos=np.argwhere(all_data[i]['multi_target_indices']==target_b)[0][0]
            aspect_poss=np.argwhere(all_data[i]['multi_target_indices']==target_b)[0][0]
            # print(pos,all_data[i]['multi_target_indices'])
            all_data[i]['target_pos']=pos

1 ['staff']
1 ['food']
3 ['food', 'kitchen', 'menu']


IndexError: invalid index to scalar variable.

In [30]:
text_raw.insert()

AttributeError: 'str' object has no attribute 'insert'

In [2]:
i=1

In [3]:
a='[target_'+str(i)+'b]'

In [4]:
a

'[target_1b]'