In [1]:
import logging
import math
import os
import sys
from time import strftime, localtime
import random
import numpy
from sklearn import metrics
import torch
import torch.nn as nn
import torch.nn.functional as F
from torch.utils.data import DataLoader, random_split
from data_utils import build_tokenizer, build_embedding_matrix, ABSADataset
from layers.dynamic_rnn import DynamicLSTM

# Root logger shared by every cell: INFO and above is echoed to stdout so that
# training progress shows up in the notebook output.
logger = logging.getLogger()
logger.setLevel(logging.INFO)
# Re-running this cell must not stack a second stdout handler on the root
# logger, otherwise every message is printed once per execution of the cell.
if not any(
    isinstance(handler, logging.StreamHandler) and getattr(handler, 'stream', None) is sys.stdout
    for handler in logger.handlers
):
    logger.addHandler(logging.StreamHandler(sys.stdout))

In [2]:
# -*- coding: utf-8 -*-
# adapted from ram.py
# author: songyouwei <youwei0314@gmail.com>
# Copyright (C) 2018. All Rights Reserved.

class RAM(nn.Module):
    """Multi-hop attention classifier over a position-weighted BiLSTM memory.

    Adapted from ``ram.py`` (see the header comment of this cell). The sentence
    is embedded, encoded by a bidirectional ``DynamicLSTM`` and re-weighted by
    distance to the aspect (``locationed_memory``). ``forward`` then attends
    over that memory ``opt.hops`` times, feeding each attended summary to a GRU
    cell whose final state is projected to ``opt.polarities_dim`` logits.
    """

    def __init__(self, embedding_matrix, opt):
        """Create the layers.

        Args:
            embedding_matrix: pretrained word vectors; converted to a float
                tensor and wrapped with ``nn.Embedding.from_pretrained``.
                Presumably shaped (vocab_size, opt.embed_dim) -- confirm
                against ``build_embedding_matrix``.
            opt: settings object. This class reads ``embed_dim``,
                ``hidden_dim``, ``polarities_dim`` and ``hops``.
        """
        super(RAM, self).__init__()
        self.opt = opt
        self.embed = nn.Embedding.from_pretrained(torch.tensor(embedding_matrix, dtype=torch.float))
        self.bi_lstm_context = DynamicLSTM(opt.embed_dim, opt.hidden_dim, num_layers=1, batch_first=True, bidirectional=True)
        # Per position the attention scorer sees: one memory slice
        # (2*hidden_dim BiLSTM features + 1 offset feature), the current
        # episode vector (embed_dim) and the aspect vector (embed_dim).
        self.att_linear = nn.Linear(opt.hidden_dim*2 + 1 + opt.embed_dim*2, 1)
        self.gru_cell = nn.GRUCell(opt.hidden_dim*2 + 1, opt.embed_dim)
        self.dense = nn.Linear(opt.embed_dim, opt.polarities_dim)

    def locationed_memory(self, memory, memory_len, left_len, aspect_len):
        """Scale memory slices by their distance to the aspect and append the offset.

        For every row, positions left of the aspect get weight
        ``1 - distance / memory_len`` and a negative offset, aspect positions
        and positions at or beyond ``memory_len`` get weight 1 and offset 0,
        and positions right of the aspect get ``1 - distance / memory_len``
        and a positive offset (distance counted from 1).

        Args:
            memory: float tensor of shape (batch, seq_len, dim).
            memory_len: integer tensor (batch,), real tokens per sentence.
            left_len: integer tensor (batch,), tokens before the aspect.
            aspect_len: integer tensor (batch,), tokens in the aspect.

        Returns:
            Tensor of shape (batch, seq_len, dim + 1) with the dtype and device
            of ``memory``: the weighted memory with the signed offset appended
            as the last feature.
        """
        seq_len = memory.shape[1]
        # Plain Python ints keep the arithmetic below free of numpy scalars.
        lengths = zip(memory_len.cpu().tolist(), left_len.cpu().tolist(), aspect_len.cpu().tolist())
        weights, offsets = [], []
        for n_tokens, n_left, n_aspect in lengths:
            aspect_end = n_left + n_aspect
            row_weights, row_offsets = [], []
            # One entry per position, so every row has exactly seq_len items
            # even if left_len + aspect_len happens to exceed memory_len.
            for idx in range(seq_len):
                if idx < n_left:
                    distance = n_left - idx
                    row_weights.append(1 - distance / n_tokens)
                    row_offsets.append(-distance)
                elif idx < aspect_end or idx >= n_tokens:
                    # Aspect tokens and padding are left unscaled.
                    row_weights.append(1.0)
                    row_offsets.append(0)
                else:
                    distance = idx - aspect_end + 1
                    row_weights.append(1 - distance / n_tokens)
                    row_offsets.append(distance)
            weights.append(row_weights)
            offsets.append(row_offsets)

        # Pin dtype/device to those of `memory`: without an explicit dtype the
        # weights can be inferred as float64, which promotes the whole memory
        # and then clashes with the float32 layers used in forward().
        u = torch.tensor(offsets, dtype=memory.dtype, device=memory.device).unsqueeze(2)
        weight = torch.tensor(weights, dtype=memory.dtype, device=memory.device).unsqueeze(2)
        return torch.cat([memory*weight, u], dim=2)

    def forward(self, inputs):
        """Return polarity logits of shape (batch, opt.polarities_dim).

        Args:
            inputs: sequence whose first three items are the index tensors
                ``text_raw_indices``, ``aspect_indices`` and
                ``text_left_indices``; index 0 is counted as padding.
        """
        text_raw_indices, aspect_indices, text_left_indices = inputs[0], inputs[1], inputs[2]
        left_len = torch.sum(text_left_indices != 0, dim=-1)
        memory_len = torch.sum(text_raw_indices != 0, dim=-1)
        aspect_len = torch.sum(aspect_indices != 0, dim=-1)

        memory = self.embed(text_raw_indices)
        memory, (_, _) = self.bi_lstm_context(memory, memory_len)
        memory = self.locationed_memory(memory, memory_len, left_len, aspect_len)

        # Aspect vector: embeddings summed over the aspect axis and divided by
        # the number of non-padding aspect tokens.
        aspect = self.embed(aspect_indices)
        aspect = torch.sum(aspect, dim=1)
        aspect = torch.div(aspect, aspect_len.float().unsqueeze(-1))
        # Episode vector, refined once per hop; starts at zero.
        et = torch.zeros_like(aspect)

        seq_len = memory.size(1)
        # expand() broadcasts along the sequence axis without allocating the
        # zero tensors the addition-based broadcast needed on every hop.
        aspect_per_position = aspect.unsqueeze(1).expand(-1, seq_len, -1)
        for _ in range(self.opt.hops):
            et_per_position = et.unsqueeze(1).expand(-1, seq_len, -1)
            g = self.att_linear(torch.cat([memory, et_per_position, aspect_per_position], dim=-1))
            alpha = F.softmax(g, dim=1)
            # Attention-weighted sum of the memory slices: (batch, dim + 1).
            i = torch.bmm(alpha.transpose(1, 2), memory).squeeze(1)
            et = self.gru_cell(i, et)
        return self.dense(et)


In [3]:
# -*- coding: utf-8 -*-
# adapted from train.py
# author: songyouwei <youwei0314@gmail.com>
# Copyright (C) 2018. All Rights Reserved.

class Instructor:
    """Builds tokenizer, embeddings, datasets and model from ``opt`` and trains it.

    Nothing is trained at construction time; call :meth:`run` to train,
    reload the best checkpoint and report test accuracy / macro-F1.
    """

    def __init__(self, opt):
        """Prepare the model and the train / validation / test datasets.

        Reads ``opt.dataset_file`` (dict with 'train' and 'test' paths),
        ``opt.max_seq_len``, ``opt.dataset``, ``opt.embed_dim``,
        ``opt.model_class``, ``opt.device`` and ``opt.valset_ratio``.
        """
        self.opt = opt

        tokenizer = build_tokenizer(
            fnames=[opt.dataset_file['train'], opt.dataset_file['test']],
            max_seq_len=opt.max_seq_len,
            dat_fname='{0}_tokenizer.dat'.format(opt.dataset))
        embedding_matrix = build_embedding_matrix(
            word2idx=tokenizer.word2idx,
            embed_dim=opt.embed_dim,
            dat_fname='{0}_{1}_embedding_matrix.dat'.format(str(opt.embed_dim), opt.dataset))
        self.model = opt.model_class(embedding_matrix, opt).to(opt.device)
        self.trainset = ABSADataset(opt.dataset_file['train'], tokenizer)
        self.testset = ABSADataset(opt.dataset_file['test'], tokenizer)
        assert 0 <= opt.valset_ratio < 1
        if opt.valset_ratio > 0:
            valset_len = int(len(self.trainset) * opt.valset_ratio)
            self.trainset, self.valset = random_split(self.trainset, (len(self.trainset)-valset_len, valset_len))
        else:
            # NOTE(review): without a validation split the test set is used for
            # checkpoint selection, so the reported test score is optimistic.
            self.valset = self.testset

        if opt.device.type == 'cuda':
            logger.info('cuda memory allocated: {}'.format(torch.cuda.memory_allocated(device=opt.device.index)))
        self._print_args()

    def _print_args(self):
        """Log the model's parameter counts followed by every attribute of ``opt``."""
        n_trainable_params, n_nontrainable_params = 0, 0
        for p in self.model.parameters():
            if p.requires_grad:
                n_trainable_params += p.numel()
            else:
                n_nontrainable_params += p.numel()
        # The counts used to be computed and then dropped; report them.
        logger.info('n_trainable_params: {0}, n_nontrainable_params: {1}'.format(n_trainable_params, n_nontrainable_params))
        logger.info('> training parameters:')
        for arg in vars(self.opt):
            logger.info('> {0}: {1}'.format(arg, getattr(self.opt, arg)))

    def _reset_params(self):
        """Re-initialise every trainable parameter of the model's child modules.

        Parameters with more than one dimension go through ``opt.initializer``;
        one-dimensional ones are drawn uniformly from
        ``[-1/sqrt(n), 1/sqrt(n)]`` with ``n`` their length. Parameters with
        ``requires_grad == False`` are left untouched.
        """
        for child in self.model.children():
            for p in child.parameters():
                if not p.requires_grad:
                    continue
                if len(p.shape) > 1:
                    self.opt.initializer(p)
                else:
                    stdv = 1. / math.sqrt(p.shape[0])
                    torch.nn.init.uniform_(p, a=-stdv, b=stdv)

    def _train(self, criterion, optimizer, train_data_loader, val_data_loader):
        """Train for ``opt.num_epoch`` epochs, checkpointing on validation accuracy.

        Returns:
            Path of the most recently saved checkpoint (the one with the best
            validation accuracy), or ``None`` when no epoch was run.
        """
        max_val_acc = 0
        global_step = 0
        path = None
        for epoch in range(self.opt.num_epoch):
            logger.info(' ' * 100)
            logger.info('epoch: {}'.format(epoch))
            n_correct, n_total, loss_total = 0, 0, 0
            # switch model to training mode
            self.model.train()
            for sample_batched in train_data_loader:
                global_step += 1
                # clear gradient accumulators
                optimizer.zero_grad()

                inputs = [sample_batched[col].to(self.opt.device) for col in self.opt.inputs_cols]
                outputs = self.model(inputs)
                targets = sample_batched['polarity'].to(self.opt.device)

                loss = criterion(outputs, targets)
                loss.backward()
                optimizer.step()

                # Running totals for the epoch; the loss is weighted by batch
                # size so the logged value is a per-sample average.
                n_correct += (torch.argmax(outputs, -1) == targets).sum().item()
                n_total += len(outputs)
                loss_total += loss.item() * len(outputs)
                if global_step % self.opt.log_step == 0:
                    train_acc = n_correct / n_total
                    train_loss = loss_total / n_total
                    logger.info('loss: {:.4f}, acc: {:.4f}'.format(train_loss, train_acc))

            val_acc, val_f1 = self._evaluate_acc_f1(val_data_loader)
            logger.info('> val_acc: {:.4f}, val_f1: {:.4f}'.format(val_acc, val_f1))
            # Always checkpoint the first evaluated epoch: with a strict
            # "val_acc > 0" test a model stuck at 0 accuracy left `path` as
            # None and run() then crashed in torch.load(None).
            if path is None or val_acc > max_val_acc:
                max_val_acc = val_acc
                os.makedirs('state_dict', exist_ok=True)
                path = 'state_dict/{0}_{1}_val_acc{2}'.format(self.opt.model_name, self.opt.dataset, round(val_acc, 4))
                torch.save(self.model.state_dict(), path)
                logger.info('>> saved: {}'.format(path))

        return path

    def _evaluate_acc_f1(self, data_loader):
        """Return ``(accuracy, macro_f1)`` of the model over ``data_loader``.

        The macro-F1 is computed over the labels ``0 .. opt.polarities_dim - 1``.
        """
        n_correct, n_total = 0, 0
        targets_per_batch, outputs_per_batch = [], []
        # switch model to evaluation mode
        self.model.eval()
        with torch.no_grad():
            for t_sample_batched in data_loader:
                t_inputs = [t_sample_batched[col].to(self.opt.device) for col in self.opt.inputs_cols]
                t_targets = t_sample_batched['polarity'].to(self.opt.device)
                t_outputs = self.model(t_inputs)

                n_correct += (torch.argmax(t_outputs, -1) == t_targets).sum().item()
                n_total += len(t_outputs)

                targets_per_batch.append(t_targets)
                outputs_per_batch.append(t_outputs)

        acc = n_correct / n_total
        # Concatenate once instead of re-copying the running tensor per batch.
        t_targets_all = torch.cat(targets_per_batch, dim=0)
        t_outputs_all = torch.cat(outputs_per_batch, dim=0)
        labels = list(range(self.opt.polarities_dim))
        f1 = metrics.f1_score(t_targets_all.cpu(), torch.argmax(t_outputs_all, -1).cpu(), labels=labels, average='macro')
        return acc, f1

    def run(self):
        """Train the model, reload the best checkpoint and log test accuracy / F1."""
        # Loss and Optimizer
        criterion = nn.CrossEntropyLoss()
        _params = filter(lambda p: p.requires_grad, self.model.parameters())
        optimizer = self.opt.optimizer(_params, lr=self.opt.learning_rate, weight_decay=self.opt.l2reg)

        train_data_loader = DataLoader(dataset=self.trainset, batch_size=self.opt.batch_size, shuffle=True)
        test_data_loader = DataLoader(dataset=self.testset, batch_size=self.opt.batch_size, shuffle=False)
        val_data_loader = DataLoader(dataset=self.valset, batch_size=self.opt.batch_size, shuffle=False)

        self._reset_params()
        best_model_path = self._train(criterion, optimizer, train_data_loader, val_data_loader)
        if best_model_path is not None:
            self.model.load_state_dict(torch.load(best_model_path))
        else:
            # Only reachable when no epoch was run (e.g. num_epoch == 0).
            logger.info('>> no checkpoint was saved; evaluating the current weights')
        self.model.eval()
        test_acc, test_f1 = self._evaluate_acc_f1(test_data_loader)
        logger.info('>> test_acc: {:.4f}, test_f1: {:.4f}'.format(test_acc, test_f1))

In [4]:
# Lookup tables that map the short names chosen in the run cell to the
# classes, file paths, input columns and torch callables they stand for.
model_classes = dict(ram=RAM)
dataset_files = dict(
    twitter=dict(
        train='./datasets/acl-14-short-data/train.raw',
        test='./datasets/acl-14-short-data/test.raw',
    ),
    restaurant=dict(
        train='./datasets/semeval14/Restaurants_Train.xml.seg',
        test='./datasets/semeval14/Restaurants_Test_Gold.xml.seg',
    ),
    laptop=dict(
        train='./datasets/semeval14/Laptops_Train.xml.seg',
        test='./datasets/semeval14/Laptops_Test_Gold.xml.seg',
    ),
)
# Dataset columns fed to each model, in the order its forward() unpacks them.
input_colses = dict(
    ram=['text_raw_indices', 'aspect_indices', 'text_left_indices'],
)
initializers = dict(
    xavier_uniform_=torch.nn.init.xavier_uniform_,
)
optimizers = dict(
    adagrad=torch.optim.Adagrad,  # default lr=0.01
    adam=torch.optim.Adam,  # default lr=0.001
    asgd=torch.optim.ASGD,  # default lr=0.01
    sgd=torch.optim.SGD,
)


In [5]:
class Parameter:
    """Plain settings container handed to ``Instructor`` and the model class.

    The nine positional arguments are required. The remaining hyperparameters
    are keyword-only and default to the values that used to be hard-coded, so
    existing positional calls behave exactly as before.
    """

    def __init__(self, model_class, dataset_file, inputs_cols, initializer,
                 optimizer, model_name, dataset, learning_rate, num_epoch,
                 *, dropout=0.1, l2reg=0.01, batch_size=64, log_step=5,
                 embed_dim=300, hidden_dim=300, max_seq_len=80,
                 polarities_dim=3, valset_ratio=0, hops=3, device=None):
        """Store the settings as attributes.

        Args:
            model_class: class instantiated as ``model_class(embedding_matrix, opt)``.
            dataset_file: dict with 'train' and 'test' file paths.
            inputs_cols: names of the dataset columns passed to the model.
            initializer: callable applied to multi-dimensional parameters.
            optimizer: optimizer class (called with params, lr, weight_decay).
            model_name: short model name, used in checkpoint file names.
            dataset: short dataset name, used in cache and checkpoint names.
            learning_rate: optimizer learning rate.
            num_epoch: number of training epochs.
            dropout: stored on the instance; not read by the code in this notebook.
            l2reg: weight decay passed to the optimizer.
            batch_size: batch size of every DataLoader.
            log_step: training loss/accuracy is logged every ``log_step`` steps.
            embed_dim: word embedding size.
            hidden_dim: LSTM hidden size.
            max_seq_len: maximum sequence length given to the tokenizer.
            polarities_dim: number of output classes.
            valset_ratio: fraction of the training set held out for
                validation; 0 makes the Instructor validate on the test set.
            hops: number of attention hops.
            device: ``torch.device`` or device string; ``None`` means CPU.
        """
        # Attribute order is kept as-is because it is the order in which the
        # Instructor logs the settings.
        self.model_class = model_class
        self.dataset_file = dataset_file
        self.inputs_cols = inputs_cols
        self.initializer = initializer
        self.optimizer = optimizer
        self.model_name = model_name
        self.dataset = dataset
        self.learning_rate = learning_rate
        self.num_epoch = num_epoch

        self.dropout = dropout
        self.l2reg = l2reg
        self.batch_size = batch_size
        self.log_step = log_step
        self.embed_dim = embed_dim
        self.hidden_dim = hidden_dim
        self.max_seq_len = max_seq_len
        self.polarities_dim = polarities_dim
        self.valset_ratio = valset_ratio
        self.hops = hops
        self.device = torch.device('cpu') if device is None else torch.device(device)

In [8]:
# Experiment configuration: everything a reader might want to tune.
model_name = 'ram'
dataset = 'twitter'  # twitter, laptop, restaurant
optimizer = 'adam'
initializer = 'xavier_uniform_'
learning_rate = 1e-3
num_epoch = 20
seed = 1234  # set to None to leave the random number generators unseeded

# Seed every RNG in play so that initialisation and batch shuffling are
# repeatable across "Restart & Run All".
if seed is not None:
    random.seed(seed)
    numpy.random.seed(seed)
    torch.manual_seed(seed)

log_file = '{}-{}-{}.log'.format(model_name, dataset, strftime("%y%m%d-%H%M", localtime()))
# Detach file handlers left behind by earlier executions of this cell;
# otherwise each re-run keeps appending to all previous log files as well.
for stale_handler in [h for h in logger.handlers if isinstance(h, logging.FileHandler)]:
    logger.removeHandler(stale_handler)
    stale_handler.close()
logger.addHandler(logging.FileHandler(log_file))

opt_lstm = Parameter(model_classes[model_name], dataset_files[dataset], input_colses[model_name],
                     initializers[initializer], optimizers[optimizer], model_name, dataset,
                     learning_rate, num_epoch)

ins = Instructor(opt_lstm)
ins.run()

loading tokenizer: twitter_tokenizer.dat
loading embedding_matrix: 300_twitter_embedding_matrix.dat
> training parameters:
> model_class: <class '__main__.RAM'>
> dataset_file: {'train': './datasets/acl-14-short-data/train.raw', 'test': './datasets/acl-14-short-data/test.raw'}
> inputs_cols: ['text_raw_indices', 'aspect_indices', 'text_left_indices']
> initializer: <function xavier_uniform_ at 0x1a14988ae8>
> optimizer: <class 'torch.optim.adam.Adam'>
> model_name: ram
> dataset: twitter
> learning_rate: 0.001
> num_epoch: 20
> dropout: 0.1
> l2reg: 0.01
> batch_size: 64
> log_step: 5
> embed_dim: 300
> hidden_dim: 300
> max_seq_len: 80
> polarities_dim: 3
> valset_ratio: 0
> hops: 3
> device: cpu
                                                                                                    
epoch: 0
loss: 1.1028, acc: 0.3937
loss: 1.1147, acc: 0.4328
loss: 1.0888, acc: 0.4604
loss: 1.0790, acc: 0.4570
loss: 1.0708, acc: 0.4594
loss: 1.0633, acc: 0.4630
loss: 1.0584, acc: 0.4692
l