In [1]:
# coding: utf-8
import os

# Pin the process to a single GPU before any CUDA library is initialised.
os.environ.update({
    "CUDA_DEVICE_ORDER": "PCI_BUS_ID",   # enumerate devices in PCI bus order (see issue #152)
    "CUDA_VISIBLE_DEVICES": "1",         # GPU index to expose; can be changed to any of 0-3
})

"""
Train a model on Yelp Review.
"""

import os
from datetime import datetime
import time
import numpy as np
import random
import argparse
from shutil import copyfile
import torch
import torch.nn as nn
import torch.optim as optim
from sklearn.metrics import classification_report, accuracy_score

from global_random_seed import RANDOM_SEED

# this doesn't seem to work any better than what we have
from torch.optim.lr_scheduler import ReduceLROnPlateau
from data.loader import DataLoader, KnowledgeLoader
from model.model import SA_Model
from utils import scorer, constant, helper
from utils.vocab import Vocab

# Report the PyTorch / cuDNN / CUDA versions in use, framed by blank lines.
print()
for version_label, version_value in (
    ("Current PyTorch Version:", torch.__version__),
    ("Current CuDNN Version:", torch.backends.cudnn.version()),
    ("Current Cuda Version:", torch.version.cuda),
):
    print(version_label, version_value)
print()

# do this if you run this code in a Notebook
import sys; sys.argv=['']; del sys  # this has to be done if argparse is used in the notebook


def _str2bool(value):
    """Convert a command-line token into a real boolean.

    ``type=bool`` does not work with argparse: ``bool('False')`` is ``True``
    because every non-empty string is truthy. This converter accepts the usual
    spellings instead and rejects anything else with a proper argparse error.
    The empty string maps to ``False``, as it did with ``bool('')``.
    """
    if isinstance(value, bool):
        return value
    token = value.strip().lower()
    if token in ('true', 't', 'yes', 'y', '1'):
        return True
    if token in ('false', 'f', 'no', 'n', '0', ''):
        return False
    raise argparse.ArgumentTypeError("expected a boolean value, got {!r}".format(value))


# Command-line / notebook configuration. Because sys.argv is blanked above, every
# option takes its default here; later cells override individual entries of `opt`.
parser = argparse.ArgumentParser()

parser.add_argument('--data_dir', type=str, default='dataset/yelp_review_small')
parser.add_argument('--vocab_dir', type=str, default='dataset/yelp_review_small')
parser.add_argument('--emb_dim', type=int, default=300, help='Word embedding dimension.')
parser.add_argument('--hidden_dim', type=int, default=360, help='RNN hidden state size.')       # 200 original
parser.add_argument('--hidden_self', type=int, default=130,
                    help='Hidden size for self-attention.')         # n_model*2 in the paper  # used to be 720
parser.add_argument('--num_layers', type=int, default=2, help='Num of lstm layers.')
parser.add_argument('--dense_dim', type=int, default=200, help='Dense layer size before Softmax.')

# encoder layers
parser.add_argument('--n_head', type=int, default=3, help='Number of self-attention heads.')
parser.add_argument('--num_layers_encoder', type=int, default=1, help='Num of self-attention encoders.')
parser.add_argument('--dropout', type=float, default=0.4, help='Input and attn dropout rate.')            # 0.1 original
parser.add_argument('--scaled_dropout', type=float, default=0.1, help='Input and scaled dropout rate.')   # 0.1 original
parser.add_argument('--temper_value', type=float, default=0.5, help='Temper value for Scaled Attention.') # 0.5 original

parser.add_argument('--word_dropout', type=float, default=0.06,                                      # 0.04
    help='The rate at which randomly set a word to UNK.')
parser.add_argument('--lstm_dropout', type=float, default=0.5, help='Input and RNN dropout rate.')
# argparse does not run non-string defaults through `type`, so the default must
# already be an int (1e10 is a float literal).
parser.add_argument('--topn', type=int, default=int(1e10), help='Only finetune top N embeddings.')
parser.add_argument('--lower', dest='lower', action='store_true', help='Lowercase all words.',
                   default=False)
parser.add_argument('--no-lower', dest='lower', action='store_false')
parser.set_defaults(lower=False)
parser.add_argument('--weight_no_rel', type=float, default=1.0, help='Weight for no_relation class.')
parser.add_argument('--weight_rest', type=float, default=1.0, help='Weight for other classes.')
parser.add_argument(
    '--self-attn', dest='self_att', action='store_true', help='Use self-attention layer instead of LSTM.', default=True)
# parser.add_argument('--no_self_att', dest='self_att', action='store_false',
#     help='Use self-attention layer instead of LSTM.')
parser.set_defaults(self_att=True)

# batch norm
# Both flags write to `use_batch_norm`; the effective default is the one given to
# set_defaults below, so no per-argument default is passed.
parser.add_argument('--use_batch_norm', dest='use_batch_norm', action='store_true',
    help='BatchNorm if true, else LayerNorm in self-attention.')
parser.add_argument('--use_layer_norm', dest='use_batch_norm', action='store_false',
    help='BatchNorm if true, else LayerNorm in self-attention.')
parser.set_defaults(use_batch_norm=True)

# dpa
parser.add_argument('--diagonal_positional_attention', dest='diagonal_positional_attention', action='store_true',
    help='Use diagonal attention positional encoding instead of sinusoidal position encoding.', default=False)
parser.add_argument('--no_diagonal_positional_attention', dest='diagonal_positional_attention', action='store_false')
parser.set_defaults(diagonal_positional_attention=False)
parser.add_argument('--relative_pos_dim', type=int, default=50, help='relative position embedding dimension in self-attention.')

# relative positional vectors
parser.add_argument('--relative_positions', dest='relative_positions', action='store_true',
    help='Use relative positions (with position binning) for subj/obj positional vectors.', default=True)
parser.add_argument('--no_relative_positions', dest='relative_positions', action='store_false')
parser.set_defaults(relative_positions=True)

# how to use residual connections
parser.add_argument('--new_residual', dest='new_residual', action='store_true',
    help='Use a different residual connection than in usual self-attention.', default=True)
parser.add_argument('--old_residual', dest='new_residual', action='store_false')
parser.set_defaults(new_residual=True)

# use positional attention from stanford paper
# The default is the boolean True (it used to be the string "true").
parser.add_argument('--attn', dest='attn', action='store_true', help='Use attention layer.', default=True)
parser.add_argument('--no-attn', dest='attn', action='store_false')
parser.set_defaults(attn=True)
parser.add_argument('--attn_dim', type=int, default=200, help='Attention size.')                    # 200 original
parser.add_argument('--pe_dim', type=int, default=30, help='Position encoding dimension.')

parser.add_argument('--lr', type=float, default=0.1, help='Applies to SGD and Adagrad.')            # lr 1.0 orig
parser.add_argument('--lr_decay', type=float, default=0.9)
parser.add_argument('--decay_epoch', type=int, default=15, help='Start LR decay from this epoch.')

parser.add_argument('--optim', type=str, default='sgd', help='sgd, asgd, adagrad, adam, nadam or adamax.')
parser.add_argument('--num_epoch', type=int, default=70)
parser.add_argument('--batch_size', type=int, default=50)
parser.add_argument('--max_grad_norm', type=float, default=1.0, help='Gradient clipping.')

# info for model saving
parser.add_argument('--log_step', type=int, default=1000, help='Print log every k steps.')
parser.add_argument('--log', type=str, default='logs.txt', help='Write training log to file.')
parser.add_argument('--save_epoch', type=int, default=1, help='Save model checkpoints every k epochs.')
parser.add_argument('--save_dir', type=str, default='./saved_models', help='Root dir for saving models.')

parser.add_argument('--id', type=str, default='tmp_model',  # change model folder output
    help='Model ID under which to save models.')

parser.add_argument('--info', type=str, default='', help='Optional info for the experiment.')

# We want to set random seed for all files
# so instead set the random seed in the global_random_seed.py file
parser.add_argument('--seed', type=int, default=1234)
# `_str2bool` instead of `bool`, so that "--cuda False" really disables CUDA.
parser.add_argument('--cuda', type=_str2bool, default=torch.cuda.is_available())
parser.add_argument('--cpu', action='store_true', help='Ignore CUDA.')

args = parser.parse_args()


Current PyTorch Version: 1.0.0
Current CuDNN Version: 7102
Current Cuda Version: 8.0.61



In [3]:
# Persist the seed as a tiny Python module so that other files can import it.
# NOTE(review): `global_random_seed` is already imported at the top of this notebook,
# so a changed seed presumably only reaches importers on the next kernel start - confirm.
with open('global_random_seed.py', 'w') as seed_module:
    seed_module.write('RANDOM_SEED = {}'.format(args.seed))

# cuDNN switches: `fastest` is turned on, the benchmark autotuner is left off.
# NOTE(review): `fastest` does not appear among the documented torch.backends.cudnn
# flags - confirm that it has any effect.
torch.backends.cudnn.benchmark = False
torch.backends.cudnn.fastest = True
# torch.set_num_threads(8)   # TODO: this doesn't seem to do anything

In [4]:
# Seed the PyTorch, NumPy and Python RNGs (in that order) with the same value.
for seed_rng in (torch.manual_seed, np.random.seed, random.seed):
    seed_rng(args.seed)

# --cpu wins over --cuda.
if args.cpu:
    args.cuda = False

if args.cuda:
    # Reproducibility on the GPU: deterministic cuDNN kernels plus seeded CUDA
    # generators (current device first, then all devices).
    torch.backends.cudnn.deterministic = True
    torch.cuda.manual_seed(args.seed)
    torch.cuda.manual_seed_all(args.seed)

# `opt` is the argparse namespace viewed as a dict; later cells read and modify it.
opt = vars(args)
# opt['num_class'] = len(constant.LABEL_TO_ID)

# Vocabulary and embedding matrix are both read from the vocab directory.
vocab_file = '{}/vocab.pkl'.format(opt['vocab_dir'])
emb_file = '{}/embedding.npy'.format(opt['vocab_dir'])

vocab = Vocab(vocab_file, load=True)

# A smaller vocabulary improved results in an earlier, unrelated project on the
# same data, but it has been much harder to make that work here.
# Uncomment the following line once this is solved:
# new_vocab_size = 30000

opt['vocab_size'] = vocab.size
emb_matrix = np.load(emb_file)

# One embedding row per vocabulary entry, `emb_dim` columns each.
assert emb_matrix.shape[0] == vocab.size
assert emb_matrix.shape[1] == opt['emb_dim']

Vocab size 230902 loaded from file


In [5]:
# Per-experiment overrides, applied on top of the parser defaults in one go.
opt.update({
    # task and optimisation
    'num_class': 5,
    'num_epoch': 70,
    'batch_size': 100,
    'lr': 0.01,
    'decay_epoch': 10,
    # 'lr_decay': 0.95,
    # 'num_layers': 1,  # LSTM layers original 2

    # transformer encoder
    'num_layers_encoder': 1,
    'n_head': 6,
    'diagonal_positional_attention': False,
    'relative_pos_dim': 50,
    'use_batch_norm': False,
    'new_residual': True,
    # 'hidden_self': 130,   # default 130

    # position-aware attention
    'pe_dim': 30,      # 30 original
    'attn_dim': 200,   # 200 original

    # decision-level attn dim or linear dim before softmax (the parser default is 200)
    'dense_dim': 100,
    # 0: do not fine tune word embeddings, 21: fine tune entities masks and UNK_TOKEN token
    'topn': 0,
    'lower': False,

    # folder name under save_dir
    'id': 'tmp_model',
})

In [7]:
# Build the training batches from the processed training split.
train_path = opt['data_dir'] + '/train_processed.json'
print("Loading data from {} with batch size {}...".format(opt['data_dir'], opt['batch_size']))
train_batch = DataLoader(train_path, opt['batch_size'], opt, vocab, evaluation=False)

Loading data from dataset/yelp_review_small with batch size 100...


100%|██████████| 650000/650000 [00:35<00:00, 18354.76it/s]


6500 batches created for dataset/yelp_review_small/train_processed.json


In [8]:
# Debug aid: pull one example out of `train_batch.data` for manual inspection.
test_i = 16
test_d = train_batch.data[0][test_i]   # first batch, ith data

# Commented-out dump of the example's fields.
# NOTE(review): the field names below look like relation-extraction leftovers;
# verify the actual layout against data.loader before relying on them.
# print('tokens: ', test_d[0], 'length:', len(test_d[0]))
# print('pos: ', test_d[1])
# print('ner: ', test_d[2])
# print('entity markers: ', test_d[3])
# print('subj_positions: ', test_d[4])
# print('obj_positions: ', test_d[5])
# # print('relative_positions: ', test_d[6], 'len:', len(test_d[6]))
# print('inst_position: ', test_d[6])
# print('relation: ', test_d[7])

In [9]:
# Debug aid: fetch the first batch through the loader's indexing interface.
# Note that this rebinds `test_i`, which the previous cell set to 16.
test_i = 2
test_d_torch = train_batch[0]

# Commented-out dump of row `test_i` of each element of the batch.
# NOTE(review): the field names below look like relation-extraction leftovers;
# verify the actual layout against data.loader before relying on them.
# print('tokens: ', test_d_torch[0][test_i], 'length:', len(test_d_torch[0][test_i]))
# print('mask: ', test_d_torch[1][test_i])
# print('pos: ', test_d_torch[2][test_i])
# print('ner: ', test_d_torch[3][test_i])
# print('deprel: ', test_d_torch[4][test_i])
# print('subj_positions: ', test_d_torch[5][test_i])
# print('obj_positions: ', test_d_torch[6][test_i])
# print('relative_positions_dpa: ', test_d_torch[7][test_i], 'len:', len(test_d_torch[7][test_i]))
# print('src_position: ', test_d_torch[8][test_i])
# print('subj_type: ', test_d_torch[9][test_i])
# print('obj_type: ', test_d_torch[10][test_i])
# print('rels: ', test_d_torch[11][test_i])
# print('orig_idx: ', test_d_torch[12][test_i])

In [10]:
# The "dev" batches are read from the processed *test* split, in evaluation mode.
dev_path = opt['data_dir'] + '/test_processed.json'
dev_batch = DataLoader(dev_path, opt['batch_size'], opt, vocab, evaluation=True)

# Ids of at most one character get a leading '0'.
model_id = opt['id']
if len(model_id) <= 1:
    model_id = '0' + model_id

# Everything for this run is written below <save_dir>/<model_id>.
model_save_dir = '/'.join([opt['save_dir'], model_id])
opt['model_save_dir'] = model_save_dir
helper.ensure_dir(model_save_dir, verbose=True)

# Snapshot the configuration and the vocabulary next to the (future) checkpoints.
helper.save_config(opt, model_save_dir + '/config.json', verbose=True)
vocab.save(model_save_dir + '/vocab.pkl')

# Tab-separated training log; the header names the expected columns.
log_path = '/'.join([model_save_dir, opt['log']])
file_logger = helper.FileLogger(
    log_path,
    header="# epoch\ttrain_loss\tdev_loss\tdev_p\tdev_r\tdev_f1"
)

# Echo the final configuration.
helper.print_config(opt)

100%|██████████| 50000/50000 [00:02<00:00, 24221.60it/s]

1000 batches created for dataset/yelp_review_small/test_processed.json
Config saved to file ./saved_models/tmp_model/config.json
Overwriting old vocab file at ./saved_models/tmp_model/vocab.pkl

Running with the following configs:
	data_dir : dataset/yelp_review_small
	vocab_dir : dataset/yelp_review_small
	emb_dim : 300
	ner_dim : 0
	pos_dim : 0
	entity_marker_dim : 0
	hidden_dim : 360
	hidden_self : 130
	query_size_attn : 360
	num_layers : 2
	ner_dim_subj_obj : 0
	dense_dim : 100
	num_layers_encoder : 1
	dropout : 0.4
	scaled_dropout : 0.1
	temper_value : 0.5
	word_dropout : 0.06
	lstm_dropout : 0.5
	topn : 0
	lower : False
	weight_no_rel : 1.0
	weight_rest : 1.0
	self_att : True
	self_att_and_rnn : False
	use_lemmas : False
	preload_lemmas : False
	obj_sub_pos : False
	use_batch_norm : False
	diagonal_positional_attention : False
	relative_pos_dim : 50
	relative_positions : True
	new_residual : False
	n_head : 6
	n_head_CNN : 6
	attn : True
	attn_dim : 200
	pe_dim : 30
	lr : 0.01
	l




In [11]:
# model
# Build the classifier from the final `opt` dict and the embedding matrix loaded
# from embedding.npy earlier in the notebook.
model = SA_Model(opt, emb_matrix=emb_matrix)

Self-attn input dim: 300
Knowledge-attn input dim: 300
Number of self-attn heads:  6
d_v and d_k:  50.0
Do not fine-tune word embedding layer.


In [12]:
# Bookkeeping shared with the training loop below.
global_step = 0                                     # incremented once per training batch
max_steps = opt['num_epoch'] * len(train_batch)     # total number of batches over all epochs
current_lr = opt['lr']                              # learning rate reported in the step log
dev_f1_history = []                                 # one dev score appended per finished epoch

# Template for the periodic step log line:
# timestamp, step, max steps, epoch, number of epochs, loss, seconds per batch, lr
format_str = '{}: step {}/{} (epoch {}/{}), loss = {:.6f} ({:.3f} sec/batch), lr: {:.6f}'

In [13]:
# Training loop: one pass over `train_batch` per epoch, followed by an evaluation
# on `dev_batch` and a learning-rate update driven by the dev score history.
#
# ReduceLROnPlateau was tried as the LR scheduler, but it did not work as well as
# the manual decay implemented at the end of each epoch:
# scheduler = ReduceLROnPlateau(model.optimizer, mode='min', factor=opt['lr_decay'], patience=1)

# Fixed switch between the two LR strategies; it never changes, so it is set once
# here rather than on every epoch. (do warm-up for sgd only instead of adam)
do_warmup_trick = False

# start training
for epoch in range(1, opt['num_epoch']+1):

    # Per-epoch banner with the main hyper-parameters.
    print(
        "Current params: " + " heads-" + str(opt["n_head"]) + " enc_layers-" + str(opt["num_layers_encoder"]),
        " drop-" + str(opt["dropout"]) + " scaled_drop-" + str(opt["scaled_dropout"]) + " lr-" + str(opt["lr"]),
        " lr_decay-" + str(opt["lr_decay"]) + " max_grad_norm-" + str(opt["max_grad_norm"])
    )
    print(
        " attn-" + str(opt["attn"]) + " attn_dim-" + str(opt["attn_dim"]),
        " new_residual-" + str(opt["new_residual"])
    )
    print(
        " use_batch_norm-"+str(opt["use_batch_norm"]) + " relative_positions-"+str(opt["relative_positions"]),
        # 'use_lemmas' is not defined by the argument parser in this notebook, so read
        # it defensively instead of raising a KeyError on the first epoch.
        " decay_epoch-"+str(opt["decay_epoch"]) + " use_lemmas-"+str(opt.get("use_lemmas", False)),
        " hidden_self-"+str(opt["hidden_self"])
    )

    train_loss = 0
    for i, batch in enumerate(train_batch):

        start_time = time.time()
        global_step += 1

        loss = model.update(batch)
        train_loss += float(loss)

        if global_step % opt['log_step'] == 0:
            # duration covers only the batch that was just processed
            duration = time.time() - start_time
            print(
                format_str.format(datetime.now(), global_step, max_steps, epoch,
                opt['num_epoch'], loss, duration, current_lr)
            )
        # do garbage collection,
        # as per https://discuss.pytorch.org/t/best-practices-for-maximum-gpu-utilization/13863/6
        del loss

    # eval on dev
    print("Evaluating on dev set...")
    predictions = []
    dev_loss = 0
    for i, batch in enumerate(dev_batch):
        preds, _, loss,_ = model.predict(batch)
        predictions += preds
        dev_loss += float(loss)
        del loss

    print(classification_report(dev_batch.gold(), predictions))
    # NOTE: despite its name, `dev_f1` holds the *accuracy* on the dev batches; the
    # name is kept because `dev_f1_history` and the log strings are used as-is below.
    dev_f1 = accuracy_score(dev_batch.gold(), predictions)
    print('accuracy:',dev_f1)

    train_loss = train_loss / train_batch.num_examples * opt['batch_size'] # avg loss per batch
    dev_loss = dev_loss / dev_batch.num_examples * opt['batch_size']
    print(
        "epoch {}: train_loss = {:.6f}, dev_loss = {:.6f}, dev_f1 = {:.4f}".format(epoch,\
            train_loss, dev_loss, dev_f1)
        )
    # File logging and checkpointing are currently disabled. Note that `dev_p` and
    # `dev_r` are not computed anywhere in this loop.
#     file_logger.log("{}\t{:.6f}\t{:.6f}\t{:.4f}\t{:.4f}\t{:.4f}".format(
#         epoch, train_loss, dev_loss, dev_p, dev_r, dev_f1)
#     )

#     # save
#     model_file = model_save_dir + '/checkpoint_epoch_{}.pt'.format(epoch)
#     model.save(model_file, epoch)
#     if epoch == 1 or dev_f1 > max(dev_f1_history):
#         copyfile(model_file, model_save_dir + '/best_model.pt')
#         print("new best model saved.")
#     if epoch % opt['save_epoch'] != 0:
#         os.remove(model_file)

    # reduce learning rate if it stagnates by a certain decay rate and within given epoch patience
    # this for some reason works worse than the implementation we have afterwards
    # scheduler.step(dev_loss)

    if opt["optim"] != "noopt_adam" and opt["optim"] != "noopt_nadam":

        if not do_warmup_trick:
            # decay schedule # 15 is best!
            # simulate patience of x epochs: once more than `decay_epoch` scores have
            # been recorded, decay whenever the score fails to beat the previous epoch
            if len(dev_f1_history) > opt['decay_epoch'] and dev_f1 <= dev_f1_history[-1]:
                current_lr *= opt['lr_decay']
                model.update_lr(current_lr)
        else:
            # print("do_warmup_trick")
            # 1 and 5 first worked kind of
            # 10 and 15
            # Transformer-style warm-up: 10 * 360^-0.5 * min(epoch^-0.5, epoch * 15^-1.5),
            # i.e. a linear ramp over the first 15 epochs followed by inverse-sqrt decay.
            current_lr = 10 * (360 ** (-0.5) * min(epoch ** (-0.5), epoch * 15 ** (-1.5)))
            print("current_lr", current_lr)
            model.update_lr(current_lr)
    # else, update the learning rate in torch_utils.py

    dev_f1_history += [dev_f1]
    print("")

# Report the configured epoch count instead of the loop variable: after a complete
# run the two are equal, and `epoch` is unbound when num_epoch is 0.
print("Training ended with {} epochs.".format(opt['num_epoch']))

Current params:  heads-6 enc_layers-1  drop-0.4 scaled_drop-0.1 lr-0.01  lr_decay-0.9 max_grad_norm-1.0
 weight_no_rel-1.0 weight_rest-1.0 attn-True attn_dim-200  obj_sub_pos-False new_residual-False
 use_batch_norm-False relative_positions-True  decay_epoch-10 use_lemmas-False  hidden_self-130
2019-09-29 15:48:38.885525: step 1000/455000 (epoch 1/70), loss = 1.090372 (0.449 sec/batch), lr: 0.010000
2019-09-29 15:56:08.656363: step 2000/455000 (epoch 1/70), loss = 1.156877 (0.476 sec/batch), lr: 0.010000
2019-09-29 16:03:38.758173: step 3000/455000 (epoch 1/70), loss = 1.015465 (0.316 sec/batch), lr: 0.010000
2019-09-29 16:11:04.470483: step 4000/455000 (epoch 1/70), loss = 0.990872 (0.568 sec/batch), lr: 0.010000
2019-09-29 16:18:36.098254: step 5000/455000 (epoch 1/70), loss = 1.095273 (0.324 sec/batch), lr: 0.010000
2019-09-29 16:26:09.206407: step 6000/455000 (epoch 1/70), loss = 0.884808 (0.440 sec/batch), lr: 0.010000
Evaluating on dev set...
             precision    recall  f1-

In [14]:
# Summarise the run: the 1-based epoch with the best dev score, the score itself,
# and then the full per-epoch history as the cell's displayed value.
best_epoch = np.argmax(dev_f1_history) + 1
print(best_epoch)
print(max(dev_f1_history))
dev_f1_history

64
0.67164


[0.577,
 0.59698,
 0.61206,
 0.6144,
 0.61594,
 0.62218,
 0.62954,
 0.63412,
 0.63878,
 0.63834,
 0.63826,
 0.6422,
 0.64044,
 0.64792,
 0.6457,
 0.64996,
 0.64882,
 0.6519,
 0.65096,
 0.65066,
 0.65426,
 0.65054,
 0.65484,
 0.65238,
 0.65468,
 0.65468,
 0.6557,
 0.65754,
 0.65854,
 0.65844,
 0.6598,
 0.65978,
 0.66018,
 0.65952,
 0.66094,
 0.66292,
 0.66358,
 0.6655,
 0.6607,
 0.66222,
 0.66666,
 0.6642,
 0.666,
 0.6672,
 0.66906,
 0.66526,
 0.66664,
 0.66752,
 0.66804,
 0.66814,
 0.66682,
 0.67012,
 0.66782,
 0.6683,
 0.6686,
 0.66834,
 0.66928,
 0.6687,
 0.671,
 0.67068,
 0.66984,
 0.66994,
 0.66968,
 0.67164,
 0.67132,
 0.67072,
 0.67146,
 0.6698,
 0.6716,
 0.67094]