# Imports

In [1]:
import os
os.environ['TRANSFORMERS_CACHE'] = '/scratch/project_2001083/nasib/Cache/'
import sys
import random
import numpy as np
#from apex import amp
from model import LightXML

from sklearn.model_selection import train_test_split

from torch.utils.data import DataLoader

from transformers import AdamW

import torch

from torch.utils.data import DataLoader
from dataset import MDataset, createDataCSV
from log import Logger

# Utility Functions

In [2]:
def load_group(dataset, group_tree=0):
    if dataset == 'wiki500k':
        return np.load(f'/scratch/project_2001083/nasib/XMC/LightXML/data/Wiki-500K/label_group{group_tree}.npy', allow_pickle=True)
    if dataset == 'amazon670k':
        return np.load(f'/scratch/project_2001083/nasib/XMC/LightXML/data/Amazon-670K/label_group{group_tree}.npy', allow_pickle=True)
    if dataset == 'amazontitles300k':
        return np.load(f'/scratch/project_2001083/nasib/XMC/LightXML/data/AmazonTitles-300K/label_group{group_tree}.npy', allow_pickle=True)
    

In [None]:
grp_y = load_group('amazontitles300k')
len(grp_y),len(grp_y[0]),len(grp_y[-1]),sum([len(y) for y in grp_y])

# Train Function

In [3]:
def train(model, df, label_map):
    tokenizer = model.get_tokenizer()

    if args.dataset in ['wiki500k', 'amazon670k','amazontitles300k']:
        group_y = load_group(args.dataset, args.group_y_group)
        train_d = MDataset(df, 'train', tokenizer, label_map, args.max_len, group_y=group_y,
                           candidates_num=args.group_y_candidate_num)
        test_d = MDataset(df, 'test', tokenizer, label_map, args.max_len, group_y=group_y,
                           candidates_num=args.group_y_candidate_num)
        
        #print(len(train_d[0]),train_d[0])
        print(len(test_d[0]),test_d[0])

        #train_d.tokenizer = model.get_fast_tokenizer()
        #test_d.tokenizer = model.get_fast_tokenizer()

        trainloader = DataLoader(train_d, batch_size=args.batch, num_workers=5,
                                 shuffle=True)
        testloader = DataLoader(test_d, batch_size=args.batch, num_workers=5,
                                shuffle=False)
        if args.valid:
            valid_d = MDataset(df, 'valid', tokenizer, label_map, args.max_len, group_y=group_y,
                               candidates_num=args.group_y_candidate_num)
            validloader = DataLoader(valid_d, batch_size=args.batch, num_workers=0, 
                                     shuffle=False)
    else:
        train_d = MDataset(df, 'train', tokenizer, label_map, args.max_len)
        test_d = MDataset(df, 'test', tokenizer, label_map, args.max_len)
        trainloader = DataLoader(train_d, batch_size=args.batch, num_workers=2,
                                 shuffle=True)
        testloader = DataLoader(test_d, batch_size=args.batch, num_workers=1,
                                shuffle=False)

    model.cuda()
    no_decay = ['bias', 'LayerNorm.weight']
    optimizer_grouped_parameters = [
        {'params': [p for n, p in model.named_parameters() if not any(nd in n for nd in no_decay)], 'weight_decay': 0.01},
        {'params': [p for n, p in model.named_parameters() if any(nd in n for nd in no_decay)], 'weight_decay': 0.0}
        ]
    optimizer = AdamW(optimizer_grouped_parameters, lr=args.lr)#, eps=1e-8)
        
    #model, optimizer = amp.initialize(model, optimizer, opt_level="O1")

    max_only_p5 = 0
    for epoch in range(0, 1):
        train_loss = model.one_epoch(epoch, trainloader, optimizer, mode='train',
                                     eval_loader=validloader if args.valid else testloader,
                                     eval_step=args.eval_step, log=LOG)

        if args.valid:
            ev_result = model.one_epoch(epoch, validloader, optimizer, mode='eval')
        else:
            ev_result = model.one_epoch(epoch, testloader, optimizer, mode='eval')

        g_p1, g_p3, g_p5, p1, p3, p5 = ev_result

        log_str = f'{epoch:>2}: {p1:.4f}, {p3:.4f}, {p5:.4f}, train_loss:{train_loss}'
        if args.dataset in ['wiki500k', 'amazon670k','amazontitles300k']:
            log_str += f' {g_p1:.4f}, {g_p3:.4f}, {g_p5:.4f}'
        if args.valid:
            log_str += ' valid'
        LOG.log(log_str)

        if max_only_p5 < p5:
            max_only_p5 = p5
            model.save_model(f'models/model-{get_exp_name()}.bin')

        if epoch >= args.epoch + 5 and max_only_p5 != p5:
            break


# Runing Configurations

In [4]:
class Config:
    def __init__(self):
        
        self.batch = 200
        self.update_count = 1
        self.lr = .0001
        self.seed = 6088
        self.epoch = 20
        self.dataset = 'amazontitles300k' #amazontitles300k
        self.bert = 'bert-base'
        self.max_len = 128
        self.valid = False #make validation split
        self.swa = True
        self.swa_warmup = 4
        self.swa_step = 50
        
        self.group_y_group = 0
        self.group_y_candidate_num = 2000
        self.group_y_candidate_topk = 75
        self.eval_step = 3000
        self.hidden_dim = 400
        self.eval_model =False #
        
args = Config()

def get_exp_name():
    name = [args.dataset, '' if args.bert == 'bert-base' else args.bert]
    if args.dataset in ['wiki500k', 'amazon670k','amazontitles300k']:
        name.append('t'+str(args.group_y_group))

    return '_'.join([i for i in name if i != ''])


def init_seed(seed):
    np.random.seed(seed)
    random.seed(seed)
    torch.manual_seed(seed)

# Data Loading

In [None]:
init_seed(args.seed)

print(get_exp_name())

#log_name = 'log_'+str(args.run_name)
#print('Log file name: ',log_name)

LOG = Logger('test')

args.valid = False

print('Running model for the Configuration:',args)

print(f'load {args.dataset} dataset...')
df, label_map = createDataCSV(args.dataset)
#args.valid = False
if args.valid:
    train_df, valid_df = train_test_split(df[df['dataType'] == 'train'],
                                            test_size=4000,
                                            random_state=1240)
    df.iloc[valid_df.index.values, 2] = 'valid'
    print('valid size', len(df[df['dataType'] == 'valid']))

print(f'load {args.dataset} dataset with '
        f'{len(df[df.dataType =="train"])} train {len(df[df.dataType =="test"])} test with {len(label_map)} labels done')

if args.dataset in ['wiki500k', 'amazon670k','amazontitles300k']:
    group_y = load_group(args.dataset, args.group_y_group)
    _group_y = []
    for idx, labels in enumerate(group_y):
        _group_y.append([])
        for label in labels:
            _group_y[-1].append(label_map[label])
        _group_y[-1] = np.array(_group_y[-1])
    group_y = np.array(_group_y)

    model = LightXML(n_labels=len(label_map), group_y=group_y, bert=args.bert,
                        update_count=args.update_count,
                        use_swa=args.swa, swa_warmup_epoch=args.swa_warmup, swa_update_step=args.swa_step,
                        candidates_topk=args.group_y_candidate_topk,
                        hidden_dim=args.hidden_dim)
else:
    model = LightXML(n_labels=len(label_map), bert=args.bert,
                        update_count=args.update_count,
                        use_swa=args.swa, swa_warmup_epoch=args.swa_warmup, swa_update_step=args.swa_step)

if args.eval_model and args.dataset in ['wiki500k', 'amazon670k','amazontitles300k']:
    print(f'load models/model-{get_exp_name()}.bin')
    testloader = DataLoader(MDataset(df, 'test', model.get_fast_tokenizer(), label_map, args.max_len, 
                                        candidates_num=args.group_y_candidate_num),
                            batch_size=256, num_workers=0, 
                            shuffle=False)

    group_y = load_group(args.dataset, args.group_y_group)
    validloader = DataLoader(MDataset(df, 'valid', model.get_fast_tokenizer(), label_map, args.max_len, group_y=group_y,
                                        candidates_num=args.group_y_candidate_num),
                                batch_size=256, num_workers=0, 
                        shuffle=False)
    model.load_state_dict(torch.load(f'models/model-{get_exp_name()}.bin'))
    model = model.cuda()

    print(len(df[df.dataType == 'test']))
    model.one_epoch(0, validloader, None, mode='eval')

    pred_scores, pred_labels = model.one_epoch(0, testloader, None, mode='test')
    np.save(f'results/{get_exp_name()}-labels.npy', np.array(pred_labels))
    np.save(f'results/{get_exp_name()}-scores.npy', np.array(pred_scores))
    sys.exit(0)

train(model, df, label_map)

amazontitles300k_t0
Running model for the Configuration: <__main__.Config object at 0x7f452ed2fc70>
load amazontitles300k dataset...


586781it [00:00, 1675429.94it/s]
260536it [00:00, 1710583.91it/s]
586781it [00:01, 567122.71it/s]


303296


260536it [00:00, 601700.97it/s]


303296
label map 303296
load amazontitles300k dataset with 586781 train 260536 test with 303296 labels done


  group_y = np.array(_group_y)


swa True 4 50 {}
update_count 1
load bert-base-uncased


Some weights of the model checkpoint at bert-base-uncased were not used when initializing BertModel: ['cls.seq_relationship.bias', 'cls.seq_relationship.weight', 'cls.predictions.transform.LayerNorm.weight', 'cls.predictions.decoder.weight', 'cls.predictions.transform.dense.weight', 'cls.predictions.bias', 'cls.predictions.transform.LayerNorm.bias', 'cls.predictions.transform.dense.bias']
- This IS expected if you are initializing BertModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


hidden dim: 400
label group numbers: 4096
load bert-base-uncased tokenizer


  self.group_y = np.array(self.group_y)
Truncation was not explicitly activated but `max_length` is provided a specific value, please use `truncation=True` to explicitly truncate examples to max length. Defaulting to 'longest_first' truncation strategy. If you encode pairs of sequences (GLUE-style) with the tokenizer you can select this strategy more precisely by providing a specific strategy to `truncation`.


6 (tensor([  101,  9004,  2865, 15550,  2098,  6887, 12356,  2015,  1996,  3733,
         2126,  2000,  6570,  2115,  4743,  2000,  3713,  3872,  1015,  1024,
         5986,  2616,  1998, 15672,   102,     0,     0,     0,     0,     0,
            0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
            0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
            0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
            0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
            0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
            0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
            0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
            0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
            0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
            0,     0,     0,     0,     0,     0,     0,     

train-0:  46%|████▌     | 1342/2934 [32:19<38:18,  1.44s/it, loss=0.0161] 

# Making the Cluster labels for each Cluster and Creation of Model

In [None]:
if args.dataset in ['wiki500k', 'amazon670k','amazontitles300k']:
    group_y = load_group(args.dataset, args.group_y_group)
    _group_y = []
    for idx, labels in enumerate(group_y):
        _group_y.append([])
        for label in labels:
            _group_y[-1].append(label_map[label])
        _group_y[-1] = np.array(_group_y[-1])
    group_y = np.array(_group_y)
    
    
    model = LightXML(n_labels=len(label_map), group_y=group_y, bert=args.bert,
                  update_count=args.update_count,
                  use_swa=args.swa, swa_warmup_epoch=args.swa_warmup, swa_update_step=args.swa_step,
                  candidates_topk=args.group_y_candidate_topk,
                  hidden_dim=args.hidden_dim)
    
train(model, df, label_map)