In [None]:
# PeerHTC, with BlurbGenreCollection as an example

In [None]:
from mxnet.contrib import text
import pandas as pd
from mxnet import nd, autograd, init, gluon
import collections
from sklearn.model_selection import train_test_split
from mxnet.gluon import data as gdata, loss as gloss, utils as gutils, nn, rnn
import numpy as np
import d2lzh as d2l
from tqdm import tqdm
import mxnet as mx
import time
import gluonnlp as nlp

In [None]:
# project-local modules: the model definition and the training / evaluation helpers
from model import PeerHTC
# The name list is parenthesised because a trailing comma is only legal inside
# parentheses; a bare `from x import a, b,` is a SyntaxError.
from utilities import (
    binary_cross_entropy as BCE,
    get_batch,
    F1,
    train,
    evaluate,
    init_log,
)

In [None]:
# load in training and evaluating data
# each .npy file is read with NumPy and converted to an MXNet NDArray
features_train, features_val = (
    nd.array(np.load(npy_path))
    for npy_path in ('data/train_X_demo.npy', 'data/val_X_demo.npy')
)

labels_train, labels_val = (
    nd.array(np.load(npy_path))
    for npy_path in ('data/train_y_demo.npy', 'data/val_y_demo.npy')
)

In [None]:
# keep the first three layers, i.e. only the first 130 label columns
labels_train, labels_val = (
    label_matrix[:, 0:130] for label_matrix in (labels_train, labels_val)
)

In [None]:
# convert the data into proper format: pair features with labels, then batch them
dataset_train = gdata.ArrayDataset(features_train, labels_train)
dataset_val = gdata.ArrayDataset(features_val, labels_val)

# training loader: batches of 32 with shuffling; validation loader: batches of 10, no shuffling
iter_train = gdata.DataLoader(dataset_train, batch_size=32, shuffle=True)
iter_val = gdata.DataLoader(dataset_val, batch_size=10, shuffle=False)

In [None]:
# number of categories in each layer (layer 1, layer 2, layer 3)
c1, c2, c3 = 7, 46, 77
# total number of categories over the three layers
K = c1 + c2 + c3

In [None]:
# class names
# NOTE(review): allow_pickle=True unpickles arbitrary objects; only load
# 'data/classes.npy' from a trusted source.
classes = np.load('data/classes.npy', allow_pickle=True)
# the stored object is indexed by layer key; one entry per hierarchy layer
classes1, classes2, classes3 = (
    classes.item()[layer_key] for layer_key in ('genre1', 'genre2', 'genre3')
)

In [None]:
# category hierarchy
# Read every line of the hierarchy file. The `with` block closes the file
# handle as soon as the lines are in memory (it was previously left open).
with open("data/hierarchy.txt", encoding = "utf-8") as f:
    lines = f.readlines()

In [None]:
# get co-occurrence matrices
# w12[i, j] is set to 1 when a hierarchy line lists level-1 class i followed by
# level-2 class j; w23 does the same for level-2 -> level-3 pairs.
w12 = nd.zeros((c1, c2))
w23 = nd.zeros((c2, c3))

for line in tqdm(lines):
    # Each line is a tab-separated record. Only the trailing newline is
    # stripped, so a final line without '\n' is handled as well.
    cats = line.rstrip('\n').split('\t')
    if len(cats) < 2:
        # no second field on this line -> nothing to record
        continue
    parent, child = cats[0], cats[1]

    # Both branches are checked independently and only fire when both names
    # are known, so an unknown child can no longer raise from `.index`.
    if parent in classes1 and child in classes2:
        w12[classes1.index(parent), classes2.index(child)] = 1
    if parent in classes2 and child in classes3:
        w23[classes2.index(parent), classes3.index(child)] = 1

# Use the same device selection as the training context (d2l.try_gpu), instead
# of a hard-coded mx.gpu().
# NOTE(review): expected to fall back to CPU when no GPU is usable - confirm in d2lzh.
target_ctx = d2l.try_gpu()
w12 = w12.copyto(target_ctx)
w23 = w23.copyto(target_ctx)

In [None]:
# get frequency matrices
def _child_frequencies(parent_child_mask, child_label_sums):
    """Return a (num_children, num_parents) matrix of normalised child frequencies.

    Args:
        parent_child_mask: 2-D NDArray of shape (num_parents, num_children);
            a positive entry marks the child as belonging to the parent.
        child_label_sums: 1-D NDArray of length num_children holding the
            column sums of the training labels for the child layer
            (assumes non-negative multi-hot labels - TODO confirm).

    Returns:
        NDArray whose column p holds the masked label sums of parent p's
        children divided by their total. Columns stay all-zero for parents
        without children, and for parents whose children never occur in the
        training labels (dividing there would be 0/0 and yield NaN).
    """
    num_parents, num_children = parent_child_mask.shape
    frequencies = nd.zeros((num_children, num_parents))
    for parent in range(num_parents):
        mask = parent_child_mask[parent, :]
        if mask.sum().asscalar() <= 0:
            continue  # this parent has no children in the hierarchy
        freq = child_label_sums * mask
        total = freq.sum()
        if total.asscalar() == 0:
            continue  # avoid 0/0 -> NaN entries
        frequencies[:, parent] = freq / total
    return frequencies


# The label sums are computed on CPU, so bring the masks over once instead of
# copying one row per loop iteration.
w12_cpu = w12.copyto(mx.cpu())
w23_cpu = w23.copyto(mx.cpu())

fre21_init = _child_frequencies(w12_cpu, labels_train[:, c1:(c1 + c2)].sum(axis=0))
fre32_init = _child_frequencies(w23_cpu, labels_train[:, (c1 + c2):].sum(axis=0))

# Use the same device selection as the training context (d2l.try_gpu), instead
# of a hard-coded mx.gpu().
# NOTE(review): expected to fall back to CPU when no GPU is usable - confirm in d2lzh.
target_ctx = d2l.try_gpu()
fre32_init = fre32_init.copyto(target_ctx)
fre21_init = fre21_init.copyto(target_ctx)

In [None]:
# network and training hyperparameters

# --- label / structure encoder sizes ---
dv = 256  # dimension of original label embedding
dh = 256  # dimension of hidden states in the structure encoder

# --- global classifier layer widths (1st, 2nd, 3rd layer) ---
d1, d2, d3 = 5000, 2000, 1000

# --- tagging ---
prob_thr = 0.5  # tagging threshold

# --- text encoder ---
ctx = d2l.try_gpu()  # training context
embed_size = 300  # dimension of word embeddings
num_hiddens = 256  # number of LSTM output units
num_layers = 1  # number of LSTM hidden layers

# --- optimisation ---
feature_dim = 512  # dimension of label-wise text features
dropout_rate = 0.1  # dropout rate in BERT
learning_rate = 1e-5  # learning rate
decay_rate = 1e-7  # tuning weight of weight decay

In [None]:
# Alias for the `binary_cross_entropy` helper imported from `utilities` as BCE.
# NOTE(review): `loss1` is not passed to `train(...)` in the cells visible here -
# confirm whether it is still used.
loss1 = BCE # BCE loss function

In [None]:
# keep validation results
# NOTE(review): presumably `init_log()` returns an empty container that the
# training helpers fill in - confirm against `utilities`.
log = init_log()

In [None]:
# load in your adjacency matrix (the file on disk is named 'adjacent_matrix.npy')
## whole-hierarchy: a single matrix over all categories
## (presumably K x K, ordered layer 1, layer 2, layer 3 - TODO confirm)
Ad = np.load('adjacent_matrix.npy')

## levelwise alternative: one diagonal block of Ad per layer.
## Uncomment together with the matching `net1.Ad2` / `net1.Ad3` lines in the
## network-initialization cell.
#Ad1 = Ad[0:c1,0:c1]
#Ad2 = Ad[c1:(c1+c2),c1:(c1+c2)]
#Ad3 = Ad[(c1+c2):,(c1+c2):]

In [None]:
# load in your training weights and cut them column-wise into one block per layer:
# columns [0, c1) -> layer 1, [c1, c1+c2) -> layer 2, [c1+c2, K) -> layer 3
weights = np.load('weights.npy')
weights_1, weights_2, weights_3 = (
    weights[:, start:stop]
    for start, stop in ((0, c1), (c1, c1 + c2), (c1 + c2, K))
)

In [None]:
# network initialization
# Order matters in this cell: the seed is set before `initialize`, and every
# `set_data` call comes after it.
net1 = PeerHTC(embed_size, num_hiddens, num_layers)
# NOTE(review): only MXNet's RNG is seeded here; NumPy's RNG is not. If any
# component (e.g. data shuffling) draws from NumPy, runs will still differ - confirm.
mx.random.seed(817) # for replicability
net1.initialize(init.Xavier(), ctx=ctx) # random initialization with Xavier

# initialize the frequency matrices in structure encoder
# (overwrites the Xavier-initialized values with the frequencies computed from the training labels)
net1.fre32.set_data(fre32_init)
net1.fre21.set_data(fre21_init)

# initialize GCN
## whole-hierarchy
net1.Ad.set_data(nd.array(Ad))
## levelwise (alternative; requires the Ad2 / Ad3 slices from the adjacency cell)
#net1.Ad2.set_data(nd.array(Ad2))
#net1.Ad3.set_data(nd.array(Ad3))

# initialize the trainer: Adam over all network parameters, with weight decay `wd`
trainer = gluon.Trainer(net1.collect_params(), 'adam', {'learning_rate': learning_rate, 'wd':decay_rate})

In [None]:
# network training and evaluation
# `train` comes from the project-local `utilities` module.
# NOTE(review): the final positional argument (30) is presumably the number of
# epochs - confirm against the signature of `utilities.train`.
train(iter_train, iter_val, net1, trainer, ctx, 30) 