In [1]:
import os
import logging
import argparse
from tqdm import tqdm, trange
import numpy as np
import torch
from torch.utils.data import TensorDataset, DataLoader, SequentialSampler
from utils import init_logger, load_tokenizer, get_intent_labels, get_slot_labels, MODEL_CLASSES, MODEL_PATH_MAP

logger = logging.getLogger(__name__)

  from .autonotebook import tqdm as notebook_tqdm


In [2]:
def get_device(pred_config):
    return "cuda" if torch.cuda.is_available() and not pred_config.no_cuda else "cpu"

In [3]:
def get_args(pred_config):
    return torch.load(os.path.join(pred_config.model_dir, 'training_args.pt'))
    # return torch.load(os.path.join(pred_config.model_dir, 'training_args.bin'))

In [4]:
def load_model(args, device):
    # Check whether model exists
    if not os.path.exists(pred_config.model_dir):
        raise Exception("Model doesn't exists! Train first!")
    try:
        intent_label_lst = get_intent_labels(args)
        slot_label_lst=get_slot_labels(args)
        model = MODEL_CLASSES[args.model_type][1].from_pretrained(args.model_dir,
                                                                  intent_label_lst=intent_label_lst,
                                                                  slot_label_lst=slot_label_lst)
        model.to(device)
        model.eval()
        logger.info("***** Model Loaded *****")
    except:
        raise Exception("Some model files might be missing...")
    return model



In [8]:
def read_input_file(pred_config):
    lines = []
    with open(pred_config.input_file, "r", encoding="utf-8") as f:
        for line in f:
            line = line.strip()
            words = line.split()
            lines.append(words)
    return lines

In [5]:
def convert_input_file_to_tensor_dataset(lines,
                                         pred_config,
                                         tokenizer,
                                         pad_token_label_id,
                                         cls_token_segment_id=0,
                                         pad_token_segment_id=0,
                                         sequence_a_segment_id=0,
                                         mask_padding_with_zero=True):

    max_seq_len = 32
    pro_lst = ['him','her','it','its']
    # Setting based on the current model type
    cls_token = tokenizer.cls_token
    sep_token = tokenizer.sep_token
    unk_token = tokenizer.unk_token
    pad_token_id = tokenizer.pad_token_id

    all_input_ids = []
    all_attention_mask = []
    all_token_type_ids = []
    all_slot_label_mask = []
    all_pro_labels_ids = []
    for words in lines:
        tokens = []
        slot_label_mask = []
        pro_labels_ids = []
        for word in words:
            word_tokens = tokenizer.tokenize(word)
            if not word_tokens:
                word_tokens = [unk_token]  # For handling the bad-encoded word
            if word in pro_lst:
                pro_label = 1
            else:
                pro_label = 0
            tokens.extend(word_tokens)
            # Use the real label id for the first token of the word, and padding ids for the remaining tokens
            slot_label_mask.extend([pad_token_label_id + 1] + [pad_token_label_id] * (len(word_tokens) - 1))
            pro_labels_ids.extend([pro_label] + [pad_token_label_id] * (len(word_tokens) - 1)) #!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!


        # Account for [CLS] and [SEP]
        special_tokens_count = 2
        if len(tokens) > max_seq_len - special_tokens_count:
            tokens = tokens[: (max_seq_len - special_tokens_count)]
            slot_label_mask = slot_label_mask[:(max_seq_len - special_tokens_count)]
            pro_labels_ids = pro_labels_ids[:(max_seq_len - special_tokens_count)] #!!!!!!!!!!!!!!!!!!!!!!!!!!!

        # Add [SEP] token
        tokens += [sep_token]
        token_type_ids = [sequence_a_segment_id] * len(tokens)
        slot_label_mask += [pad_token_label_id]
        pro_labels_ids += [pad_token_label_id]#!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!

        # Add [CLS] token
        tokens = [cls_token] + tokens
        token_type_ids = [cls_token_segment_id] + token_type_ids
        slot_label_mask = [pad_token_label_id] + slot_label_mask
        pro_labels_ids = [pad_token_label_id] + pro_labels_ids#!!!!!!!!!!!!!!!!!!!!!!!
        input_ids = tokenizer.convert_tokens_to_ids(tokens)

        # The mask has 1 for real tokens and 0 for padding tokens. Only real tokens are attended to.
        attention_mask = [1 if mask_padding_with_zero else 0] * len(input_ids)

        # Zero-pad up to the sequence length.
        padding_length = max_seq_len - len(input_ids)
        input_ids = input_ids + ([pad_token_id] * padding_length)
        attention_mask = attention_mask + ([0 if mask_padding_with_zero else 1] * padding_length)
        token_type_ids = token_type_ids + ([pad_token_segment_id] * padding_length)
        slot_label_mask = slot_label_mask + ([pad_token_label_id] * padding_length)
        pro_labels_ids = pro_labels_ids + ([pad_token_label_id] * padding_length) #!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!
        all_input_ids.append(input_ids)
        all_attention_mask.append(attention_mask)
        all_token_type_ids.append(token_type_ids)
        all_slot_label_mask.append(slot_label_mask)
        all_pro_labels_ids.append(pro_labels_ids) #!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!

        # print('padding_length: \n',padding_length,'\n')
        # print('input_ids: \n',input_ids,'\n')
        # print('slot_label_mask: \n',slot_label_mask,'\n')
        # print(f'attention_mask: \n{attention_mask}\n')

    # Change to Tensor
    all_input_ids = torch.tensor(all_input_ids, dtype=torch.long)
    all_attention_mask = torch.tensor(all_attention_mask, dtype=torch.long)
    all_token_type_ids = torch.tensor(all_token_type_ids, dtype=torch.long)
    all_slot_label_mask = torch.tensor(all_slot_label_mask, dtype=torch.long)
    all_pro_labels_ids = torch.tensor(all_pro_labels_ids, dtype=torch.long) #!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!
    # dataset = TensorDataset(all_input_ids, all_attention_mask, all_token_type_ids, all_slot_label_mask)
    dataset = TensorDataset(all_input_ids, all_attention_mask, all_token_type_ids,all_slot_label_mask, all_pro_labels_ids)

    return dataset

In [9]:
device = get_device(pred_config)
# model = load_model(pred_config, device)

ignore_index = 0
pad_token_label_id = ignore_index
tokenizer = load_tokenizer(pred_config)
lines = read_input_file(pred_config)
dataset = convert_input_file_to_tensor_dataset(lines, pred_config, tokenizer, pad_token_label_id)
batch = dataset[1]


In [15]:
def predict(pred_config):
    args = pred_config
    device = 'cpu'#get_device(pred_config)
    model = load_model(pred_config, device)

    intent_label_lst = get_intent_labels(pred_config)
    slot_label_lst = get_slot_labels(pred_config)

    ignore_index = 0
    # Convert input file to TensorDataset
    pad_token_label_id = ignore_index
    tokenizer = load_tokenizer(args)
    lines = read_input_file(pred_config)
    dataset = convert_input_file_to_tensor_dataset(lines, pred_config, tokenizer, pad_token_label_id)

    # Predict
    sampler = SequentialSampler(dataset)
    data_loader = DataLoader(dataset, sampler=sampler, batch_size=pred_config.batch_size)

    intent_token_preds = None
    all_referee_preds = None
    slot_preds = None

    for batch in tqdm(data_loader, desc="Predicting"):
        batch = tuple(t.to(device) for t in batch)
        with torch.no_grad():
            inputs = {"input_ids": batch[0],
                      "attention_mask": batch[1],
                      "pro_labels_ids": batch[4]}
            if args.model_type != "distilbert":
                inputs["token_type_ids"] = batch[2]

            print(inputs['input_ids'])


            outputs = model(**inputs)
            slot_logits, intent_token_logits, referee_token_logits,all_referee_token_logits = outputs


             # ============================= Slot prediction ==============================
            if slot_preds is None:
                slot_preds = slot_logits.detach().cpu().numpy()
                all_slot_label_mask = batch[3].detach().cpu().numpy()

                #out_slot_labels_ids = slot_labels_ids.detach().cpu().numpy()
            else:
                slot_preds = np.append(slot_preds, slot_logits.detach().cpu().numpy(), axis=0)
                all_slot_label_mask = np.append(all_slot_label_mask, batch[3].detach().cpu().numpy(), axis=0)

                #out_slot_labels_ids = np.append(out_slot_labels_ids, slot_labels_ids.detach().cpu().numpy(),axis=0)


            # ============================= Pronoun referee prediction ==============================



            if all_referee_preds is None:
                all_referee_preds = all_referee_token_logits.detach().cpu().numpy()
                referee_preds = referee_token_logits.detach().cpu().numpy()

                pro_sample_mask_np = (torch.max(inputs["pro_labels_ids"],dim = 1)[0] ==1 ).detach().cpu().numpy()

                # all_out_referee_labels_ids = referee_labels_ids.detach().cpu().numpy()
                # out_referee_labels_ids = all_out_referee_labels_ids[pro_sample_mask_np]


            else:
                all_referee_preds = np.append(all_referee_preds,all_referee_token_logits.detach().cpu().numpy(), axis = 0)
                referee_preds = np.append(referee_preds, referee_token_logits.detach().cpu().numpy(), axis = 0)

                # pro_sample_mask_np = (torch.max(inputs["pro_labels_ids"],dim = 1)[0] == 1).detach().cpu().numpy()
                # new_all_out_referee_labels_ids = referee_labels_ids.detach().cpu().numpy()
                # all_out_referee_labels_ids = np.append(all_out_referee_labels_ids,new_all_out_referee_labels_ids,axis = 0)
                # small_new_out_referee_labels_ids = new_all_out_referee_labels_ids[pro_sample_mask_np]
                # out_referee_labels_ids = np.append(out_referee_labels_ids, small_new_out_referee_labels_ids, axis = 0)




            # ============================== Intent Token Seq =============================
            if intent_token_preds is None:
                intent_token_preds = intent_token_logits.detach().cpu().numpy()
                # out_intent_token_ids = intent_token_ids.detach().cpu().numpy()
            else:
                intent_token_preds = np.append(intent_token_preds, intent_token_logits.detach().cpu().numpy(),axis=0)
                # out_intent_token_ids = np.append(out_intent_token_ids,intent_token_ids.detach().cpu().numpy(), axis=0)


    slot_preds = np.argmax(slot_preds, axis=2)
    slot_label_map = {i: label for i, label in enumerate(slot_label_lst)}
    # out_slot_label_list = [[] for _ in range(out_slot_labels_ids.shape[0])]
    slot_preds_list = [[] for _ in range(slot_preds.shape[0])]

    # generate mask
    for i in range(slot_preds.shape[0]):
        for j in range(slot_preds.shape[1]):
            if all_slot_label_mask[i, j] != pad_token_label_id:
                # out_slot_label_list[i].append(slot_label_map[out_slot_labels_ids[i][j]])
                slot_preds_list[i].append(slot_label_map[slot_preds[i][j]])



    referee_token_map = {0:'PAD', 1:'O' ,2: 'B-referee'} # All referee are just one word in EGPSR

    referee_preds = np.argmax(referee_preds, axis=2)
    all_referee_preds = np.argmax(all_referee_preds, axis=2)


    referee_preds_list = [[] for _ in range(referee_preds.shape[0])]
    all_referee_preds_list = [[] for _ in range(all_referee_preds.shape[0])]


    # for i in range(referee_preds.shape[0]):
    #     for j in range(referee_preds.shape[1]):
    #         if all_slot_label_mask[i, j] != pad_token_label_id: #out_slot_labels_ids,out_referee_labels_ids
    #             referee_preds_list[i].append(referee_token_map[referee_preds[i][j]])

    for i in range(all_referee_preds.shape[0]):
        for j in range(all_referee_preds.shape[1]):
            if all_slot_label_mask[i, j] != pad_token_label_id: #all_out_referee_labels_ids
                all_referee_preds_list[i].append(referee_token_map[all_referee_preds[i][j]])



    # ============================= Intent Seq Prediction ============================
    intent_token_map = {i: label for i, label in enumerate(intent_label_lst)}

    intent_token_preds = np.argmax(intent_token_preds, axis=2)
    # out_intent_token_list = [[] for _ in range(out_intent_token_ids.shape[0])]
    intent_token_preds_list = [[] for _ in range(intent_token_preds.shape[0])]

    for i in range(intent_token_preds.shape[0]):
        for j in range(intent_token_preds.shape[1]):
            if all_slot_label_mask[i, j] != pad_token_label_id:
                # out_intent_token_list[i].append(intent_token_map[out_intent_token_ids[i][j]])
                intent_token_preds_list[i].append(intent_token_map[intent_token_preds[i][j]])


    # print('slot_preds_list: ',len(slot_preds_list),len(slot_preds_list[0]))
    # print('intent_token_preds_list: ',len(intent_token_preds_list),len(intent_token_preds_list[0]))
    # print('all_referee_preds_list: ',len(all_referee_preds_list),len(all_referee_preds_list[0]))




    print(intent_token_map)
    print(slot_label_map)

    # Write to output file
    pronouns = ['him','her','it','its']
    with open(pred_config.output_file, "w", encoding="utf-8") as f:
        for idx,(words, slot_preds, intent_preds,referee_preds) in enumerate(zip(lines, slot_preds_list, intent_token_preds_list,all_referee_preds_list)):
            if idx <= 10:
                print('words:              ',words, len(words))
                print('slot_preds:         ',slot_preds, len(slot_preds))
                print('referee_preds:      ',referee_preds, len(referee_preds))
                print('intent_preds:       ',intent_preds, len(intent_preds))

            line = ""
            if 'B-referee' not in referee_preds:#all([word not in pronouns for word in words]):
                for word, i_pred, s_pred in zip(words, intent_preds, slot_preds):
                    if s_pred == 'O' and i_pred == 'O':
                        line = line + word + " "
                    else:
                        line = line + "[{}:{}:{}] ".format(word, i_pred,s_pred)
                f.write(line.strip()+'\n')
            else:
                r_idx = referee_preds.index('B-referee')
                for word, i_pred, s_pred, r_pred in zip(words, intent_preds, slot_preds,referee_preds):
                    if s_pred == 'O' and i_pred == 'O':
                        line = line + word + " "
                    else:
                        if word not in pronouns:
                            line = line + "[{}:{}:{}] ".format(word, i_pred,s_pred)
                            if r_pred == 'B-referee':
                                ref = word
                        else:
                            line = line + "[{}:{}:{}:{}] ".format(word,words[r_idx], i_pred,s_pred)

                f.write('\n')
                f.write('---------------------------------------------------------------------\n')
                f.write('* Pro Case: \n')
                f.write(line.strip()+'\n')
                f.write('---------------------------------------------------------------------\n \n')
            print(line)
            print('=====================================')


    logger.info("Prediction Done!")
    return model, slot_preds_list, intent_token_preds_list, all_referee_preds_list,inputs

In [26]:
from data_loader import load_and_cache_examples
from transformers import BertModel
# train_dataset = load_and_cache_examples(args, tokenizer, mode="train")

if __name__ == "__main__":
    parser = argparse.ArgumentParser()
    parser.add_argument("--input_file", default="sample_pred_in.txt", type=str, help="Input file for prediction")

    #
    # parser.add_argument("--input_file", default="data/gpsr_pro_instance/test/seq.in", type=str, help="Input file for prediction")

    # OOV test
    # parser.add_argument("--input_file", default="data/gpsr_pro_instance_say_vocab/checked/seq.in", type=str, help="Input file for prediction")


    # parser.add_argument("--task", default='gpsr_pro_instance', type=str, help="The name of the task to train")
    parser.add_argument("--task", default='gpsr_pro_instance_say', type=str, help="The name of the task to train")

    # parser.add_argument("--model_type", default="multibert", type=str,
    #                     help="Model type selected in the list: " + ", ".join(MODEL_CLASSES.keys()))

    parser.add_argument("--model_type", default="multibert", type=str,
                        help="Model type selected in the list: " + ", ".join(MODEL_CLASSES.keys()))
    parser.add_argument("--data_dir", default="./data", type=str, help="The input data dir")
    parser.add_argument("--intent_label_file", default="intent_label.txt", type=str, help="Intent Label file")
    parser.add_argument("--slot_label_file", default="slot_label.txt", type=str, help="Slot Label file")
    parser.add_argument("--intent_seq", type=int, default=1, help="whether we use intent seq setting")


    parser.add_argument("--pro", type=int, default=1, help="support pronoun disambiguition")#!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!



    parser.add_argument("--num_mask", type=int, default=7, help="assumptive number of slot in one sentence")
    parser.add_argument("--ignore_index", default=0, type=int,
                        help='Specifies a target value that is ignored and does not contribute to the input gradient')

    parser.add_argument("--output_file", default="final_predict.txt", type=str, help="Output file for prediction")
    parser.add_argument("--model_dir", default="bert_based_model_04-07-14:03:07", type=str, help="Path to save, load model")
    parser.add_argument("--max_seq_len", default=32, type=int,
                        help="The maximum total input sequence length after tokenization.")
    parser.add_argument("--batch_size", default=128, type=int, help="Batch size for prediction")
    parser.add_argument("--no_cuda", action="store_true", help="Avoid using CUDA when available")
    parser.add_argument('-f')  #########################

    pred_config = parser.parse_args()

    pred_config.model_name_or_path = MODEL_PATH_MAP[pred_config.model_type]

    tokenizer = load_tokenizer(pred_config)

    model, slot_preds_list, intent_token_preds_list, all_referee_preds_list,pred_inputs =  predict(pred_config)

Predicting:   0%|          | 0/1 [00:00<?, ?it/s]

tensor([[ 101, 2424, 1996, 6207, 2006, 1996, 2795,  102,    0,    0,    0,    0,
            0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
            0,    0,    0,    0,    0,    0,    0,    0]])





AttributeError: 'NoneType' object has no attribute 'detach'

In [None]:
# from utils import compute_metrics_final
# def eval(slot_preds_list,intent_token_preds_list,all_referee_preds_list):
#     out_slot_label_list = [line.strip().split() for line in open('./data/gpsr_pro_instance_say_vocab/checked/seq.out')]
#     out_intent_token_list = [line.strip().split() for line in open('./data/gpsr_pro_instance_say_vocab/checked/seq_intent.out')]
#     out_referee_label_list = [line.strip().split() for line in open('./data/gpsr_pro_instance_say_vocab/checked/seq_pro.out')]
#     out_referee_label_list = [[ele if ele != 'referral' else 'O' for ele in lst] for lst in out_referee_label_list]
#
#     # change 'pad' yo 'O', because some cases do not have pro, so 'O' is 'PAD' in preds
#     all_referee_preds_list = [[ele if ele != 'PAD' else 'O' for ele in lst] for lst in all_referee_preds_list]
#
#
#     # remove none pro cases
#     all_referee_preds_list = [all_referee_preds_list[i] for i in range(len(all_referee_preds_list)) if 'B-referee' in out_referee_label_list[i]]
#     out_referee_label_list = [out_referee_label_list[i] for i in range(len(out_referee_label_list)) if 'B-referee' in out_referee_label_list[i]]
#
#     total_result = compute_metrics_final(
#                                            slot_preds_list,
#                                            out_slot_label_list,
#                                            intent_token_preds_list,
#                                            out_intent_token_list,
#                                            all_referee_preds_list,
#                                            out_referee_label_list
#                                           )
#
#     com_lst = [line.strip().split() for line in open('./data/gpsr_pro_instance_say_vocab/checked/seq.in')]
#     for i in range(len(slot_preds_list)):
#         pred = slot_preds_list[i]
#         lab = out_slot_label_list[i]
#
#         if pred != lab:
#             print(i)
#             print(com_lst[i])
#             print(lab)
#             print(pred)
#             print('-------------------------------------')
#
#
#     return total_result
# eval(slot_preds_list,intent_token_preds_list,all_referee_preds_list)

In [None]:
# def print_pro(all_referee_preds_list):
#     out_referee_label_list = [line.strip().split() for line in open('./data/gpsr_pro_instance_say_vocab/checked/seq_pro.out')]
#     out_referee_label_list = [[ele if ele != 'referral' else 'O' for ele in lst] for lst in out_referee_label_list]
#     all_referee_preds_list = [[ele if ele != 'PAD' else 'O' for ele in lst] for lst in all_referee_preds_list]
#     for i in range(len(out_referee_label_list)):
#         lab = out_referee_label_list[i]
#         pred = all_referee_preds_list[i]
#         # if 'B-referee' in  lab:
#         #     print(i)
#         #     print(lab)
#         #     print(pred)
#         #     print('-------------------------------------')
#
#         print(i)
#         print(lab)
#         print(pred)
#         print('-------------------------------------')
# print_pro(all_referee_preds_list)

In [11]:
model.modules

<bound method Module.modules of JointBERTMultiIntent(
  (bert): BertModel(
    (embeddings): BertEmbeddings(
      (word_embeddings): Embedding(30522, 768, padding_idx=0)
      (position_embeddings): Embedding(512, 768)
      (token_type_embeddings): Embedding(2, 768)
      (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
      (dropout): Dropout(p=0.1, inplace=False)
    )
    (encoder): BertEncoder(
      (layer): ModuleList(
        (0): BertLayer(
          (attention): BertAttention(
            (self): BertSelfAttention(
              (query): Linear(in_features=768, out_features=768, bias=True)
              (key): Linear(in_features=768, out_features=768, bias=True)
              (value): Linear(in_features=768, out_features=768, bias=True)
              (dropout): Dropout(p=0.1, inplace=False)
            )
            (output): BertSelfOutput(
              (dense): Linear(in_features=768, out_features=768, bias=True)
              (LayerNorm): LayerNorm((76

In [None]:
import onnx
import onnxruntime



# dummy_input = inputs
# torch.onnx.export(model,               # model being run
#                 dummy_input,               # model input (or a tuple for multiple inputs)
#                 "normal_resolution.onnx",   # where to save the model (can be a file or file-like object)
#                 export_params=True,        # store the trained parameter weights inside the model file
#                 opset_version=11,          # the ONNX version to export the model to
#                 do_constant_folding=True,  # whether to execute constant folding for optimization
#                 input_names = ['input_ids','attention_mask','token_type_ids'],   # the model's input names
#                 output_names = ['slot_logits', 'intent_token_logits', 'referee_token_logits','all_referee_token_logits'], ##['all_logits','other'],
#                 )
#
# # ['slot_logits', 'intent_token_logits', 'referee_token_logits','all_referee_token_logits']
#
#
# from onnxruntime.quantization import quantize_dynamic, QuantType
#
# model_fp32 = 'normal_resolution.onnx'
# model_quant = 'normal_resolution.quant.onnx'
# quantized_model = quantize_dynamic(model_fp32, model_quant)
# # for quantization
# # propagate through the model
# # outputs = model(dummy_input)

# BERT

In [19]:
inputs

{'input_ids': tensor([[  101,  8957,  3531,  ...,     0,     0,     0],
         [  101,  3531,  2425,  ...,     0,     0,     0],
         [  101,  2071,  2017,  ...,     0,     0,     0],
         ...,
         [  101,  2404,  1037,  ...,     0,     0,     0],
         [  101, 17021,  1996,  ...,     0,     0,     0],
         [  101,  3288,  1996,  ...,     0,     0,     0]]),
 'attention_mask': tensor([[1, 1, 1,  ..., 0, 0, 0],
         [1, 1, 1,  ..., 0, 0, 0],
         [1, 1, 1,  ..., 0, 0, 0],
         ...,
         [1, 1, 1,  ..., 0, 0, 0],
         [1, 1, 1,  ..., 0, 0, 0],
         [1, 1, 1,  ..., 0, 0, 0]]),
 'pro_labels_ids': tensor([[0, 0, 0,  ..., 0, 0, 0],
         [0, 0, 0,  ..., 0, 0, 0],
         [0, 0, 0,  ..., 0, 0, 0],
         ...,
         [0, 0, 0,  ..., 0, 0, 0],
         [0, 0, 0,  ..., 0, 0, 0],
         [0, 0, 0,  ..., 0, 0, 0]]),
 'token_type_ids': tensor([[0, 0, 0,  ..., 0, 0, 0],
         [0, 0, 0,  ..., 0, 0, 0],
         [0, 0, 0,  ..., 0, 0, 0],
      

In [21]:
# inputs = {"input_ids": batch[0][None,:].to('cuda'),
#           "attention_mask": batch[1][None,:].to('cuda'),
#           'token_type_ids':batch[2][None,:].to('cuda')}


inputs = {"input_ids": pred_inputs['input_ids'][0][None,:],
          "attention_mask": pred_inputs['attention_mask'][1][None,:],
          'token_type_ids':pred_inputs['token_type_ids'][2][None,:]}

dummy_input = inputs
torch.onnx.export(model.bert,               # model being run
                dummy_input,               # model input (or a tuple for multiple inputs)
                "bert.onnx",   # where to save the model (can be a file or file-like object)
                export_params=True,        # store the trained parameter weights inside the model file
                opset_version=11,          # the ONNX version to export the model to
                do_constant_folding=True,  # whether to execute constant folding for optimization
                input_names = ['input_ids','attention_mask','token_type_ids'],   # the model's input names
                output_names = ['sequence_output','pooled_output'], ##['all_logits','other'],
                )


from onnxruntime.quantization import quantize_dynamic, QuantType

model_fp32 = 'bert.onnx'
model_quant = './quantized_models/bert.quant.onnx'
quantized_model = quantize_dynamic(model_fp32, model_quant)

Ignore MatMul due to non constant B: /[MatMul_81]
Ignore MatMul due to non constant B: /[MatMul_86]
Ignore MatMul due to non constant B: /[MatMul_139]
Ignore MatMul due to non constant B: /[MatMul_144]
Ignore MatMul due to non constant B: /[MatMul_197]
Ignore MatMul due to non constant B: /[MatMul_202]
Ignore MatMul due to non constant B: /[MatMul_255]
Ignore MatMul due to non constant B: /[MatMul_260]
Ignore MatMul due to non constant B: /[MatMul_313]
Ignore MatMul due to non constant B: /[MatMul_318]
Ignore MatMul due to non constant B: /[MatMul_371]
Ignore MatMul due to non constant B: /[MatMul_376]
Ignore MatMul due to non constant B: /[MatMul_429]
Ignore MatMul due to non constant B: /[MatMul_434]
Ignore MatMul due to non constant B: /[MatMul_487]
Ignore MatMul due to non constant B: /[MatMul_492]
Ignore MatMul due to non constant B: /[MatMul_545]
Ignore MatMul due to non constant B: /[MatMul_550]
Ignore MatMul due to non constant B: /[MatMul_603]
Ignore MatMul due to non constant

In [None]:
# out = model.bert(**inputs)
# print(out[0].shape)
# print(out[1].shape)


# slot_classifier

In [None]:
dummy_input = out[0]
torch.onnx.export(model.slot_classifier,               # model being run
                dummy_input,               # model input (or a tuple for multiple inputs)
                "slot_classifier.onnx",   # where to save the model (can be a file or file-like object)
                export_params=True,        # store the trained parameter weights inside the model file
                opset_version=11,          # the ONNX version to export the model to
                do_constant_folding=True,  # whether to execute constant folding for optimization
                input_names = ['sequence_output'],   # the model's input names
                output_names = ['slot_logits'], ##['all_logits','other'],
                )

# ['slot_logits', 'intent_token_logits', 'referee_token_logits','all_referee_token_logits']


from onnxruntime.quantization import quantize_dynamic, QuantType

model_fp32 = 'slot_classifier.onnx'
model_quant = './quantized_models/slot_classifier.quant.onnx'
quantized_model = quantize_dynamic(model_fp32, model_quant)

# intent_token_classifier

In [None]:
dummy_input = out[0]
torch.onnx.export(model.intent_token_classifier,               # model being run
                dummy_input,               # model input (or a tuple for multiple inputs)
                "intent_token_classifier.onnx",   # where to save the model (can be a file or file-like object)
                export_params=True,        # store the trained parameter weights inside the model file
                opset_version=11,          # the ONNX version to export the model to
                do_constant_folding=True,  # whether to execute constant folding for optimization
                input_names = ['sequence_output'],   # the model's input names
                output_names = ['intent_token_logits'], ##['all_logits','other'],
                )

# ['slot_logits', 'intent_token_logits', 'referee_token_logits','all_referee_token_logits']


from onnxruntime.quantization import quantize_dynamic, QuantType

model_fp32 = 'intent_token_classifier.onnx'
model_quant = './quantized_models/intent_token_classifier.quant.onnx'
quantized_model = quantize_dynamic(model_fp32, model_quant)

# pro_classifier

In [None]:
# inputs = {"input_ids": batch[0],
#           "attention_mask": batch[1],
#           "pro_labels_ids": batch[4],
#           'token_type_ids':batch[2]}

pro_labels_ids = batch[4][None,:]

sequence_output = out[0]
if 1 in pro_labels_ids:  # if use pro, and the batch contain pronouns
    # 1. concate pronoun to each word in the sequence
    pro_token_mask = pro_labels_ids > 0
    pro_sample_mask = torch.max(pro_token_mask.long(),dim = 1)[0] > 0
    print(pro_sample_mask)
    pro_vec = sequence_output[pro_token_mask]
    pro_sequence_output = sequence_output[pro_sample_mask]
    pro_vec = pro_vec[:, None, :]  # add new dimention
    repeat_pro = pro_vec.repeat(1, 32, 1)#self.args.max_seq_len
    concated_input = torch.cat((pro_sequence_output, repeat_pro), dim=2)


print(concated_input.shape)

In [None]:
dummy_input = concated_input
torch.onnx.export(model.pro_classifier,               # model being run
                dummy_input,               # model input (or a tuple for multiple inputs)
                "pro_classifier.onnx",   # where to save the model (can be a file or file-like object)
                export_params=True,        # store the trained parameter weights inside the model file
                opset_version=11,          # the ONNX version to export the model to
                do_constant_folding=True,  # whether to execute constant folding for optimization
                input_names = ['concated_input'],   # the model's input names
                output_names = ['referee_token_logits'], ##['all_logits','other'],
                )

# ['slot_logits', 'intent_token_logits', 'referee_token_logits','all_referee_token_logits']


from onnxruntime.quantization import quantize_dynamic, QuantType

model_fp32 = 'pro_classifier.onnx'
model_quant = './quantized_models/pro_classifier.quant.onnx'
quantized_model = quantize_dynamic(model_fp32, model_quant)

# Try Onnx Models

In [None]:
import onnxruntime
import time

def initONNX(path):
    start = time.time()
    sess_options = onnxruntime.SessionOptions()
    #sess_options.enable_profiling = True

    sess_options.intra_op_num_threads = 1#4
    sess_options.execution_mode = onnxruntime.ExecutionMode.ORT_PARALLEL
    sess_options.graph_optimization_level = onnxruntime.GraphOptimizationLevel.ORT_ENABLE_ALL

    ort_session  = onnxruntime.InferenceSession(path, sess_options)
    print("Loading time ONNX: ", time.time() - start)
    return ort_session



bert_ort_session = initONNX('./quantized_models/bert.quant.onnx')
slot_classifier_ort_session = initONNX('./quantized_models/slot_classifier.quant.onnx')
intent_token_classifier_ort_session = initONNX('./quantized_models/intent_token_classifier.quant.onnx')
pro_classifier_ort_session = initONNX('./quantized_models/pro_classifier.quant.onnx')

## bert

In [None]:
bert_inputs = {"input_ids": np.array(batch[0][None,:]),
            "attention_mask": np.array(batch[1][None,:]),
             "token_type_ids": np.array(batch[2][None,:])}
sequence_output,pooled_output = bert_ort_session.run(None, bert_inputs)
sequence_output.shape

## intent token

In [None]:
intent_token_logits = slot_classifier_ort_session.run(None, {'sequence_output':sequence_output})
print(len(intent_token_logits),len(intent_token_logits[0]),len(intent_token_logits[0][0]),len(intent_token_logits[0][0][0]))

## slot token

In [None]:
slot_logits = intent_token_classifier_ort_session.run(None, {'sequence_output':sequence_output})
print(len(slot_logits),len(slot_logits[0]),len(slot_logits[0][0]),len(slot_logits[0][0][0]))

## pro

In [None]:
pro_labels_ids = batch[4][None,:]

sequence_output = out[0]
if 1 in pro_labels_ids:  # if use pro, and the batch contain pronouns
    # 1. concate pronoun to each word in the sequence
    pro_token_mask = pro_labels_ids > 0
    pro_sample_mask = torch.max(pro_token_mask.long(),dim = 1)[0] > 0
    print(pro_sample_mask)
    pro_vec = sequence_output[pro_token_mask]
    pro_sequence_output = sequence_output[pro_sample_mask]
    pro_vec = pro_vec[:, None, :]  # add new dimention
    repeat_pro = pro_vec.repeat(1, 32, 1)#self.args.max_seq_len
    concated_input = torch.cat((pro_sequence_output, repeat_pro), dim=2)

concated_input = concated_input.detach().numpy()
print(concated_input.shape)

In [None]:
referee_token_logits = pro_classifier_ort_session.run(None, {'concated_input':concated_input})
print(len(referee_token_logits),len(referee_token_logits[0]),len(referee_token_logits[0][0]),len(referee_token_logits[0][0][0]))

In [None]:
print(np.array(referee_token_logits).shape)
print(np.squeeze(np.array(referee_token_logits)).shape)
a = np.squeeze(np.array(referee_token_logits))
np.argmax(a,axis =1)

# Mobile BERT

In [22]:
inputs = {"input_ids": batch[0][None,:],
          "attention_mask": batch[1][None,:],
          'token_type_ids':batch[2][None,:]}

dummy_input = inputs
torch.onnx.export(model.mobilebert,               # model being run
                dummy_input,               # model input (or a tuple for multiple inputs)
                "mobile_bert.onnx",   # where to save the model (can be a file or file-like object)
                export_params=True,        # store the trained parameter weights inside the model file
                opset_version=11,          # the ONNX version to export the model to
                do_constant_folding=True,  # whether to execute constant folding for optimization
                input_names = ['input_ids','attention_mask','token_type_ids'],   # the model's input names
                output_names = ['sequence_output','pooled_output'], ##['all_logits','other'],
                )


from onnxruntime.quantization import quantize_dynamic, QuantType

model_fp32 = 'mobile_bert.onnx'
model_quant = './quantized_models/mobile_bert.quant.onnx'
quantized_model = quantize_dynamic(model_fp32, model_quant)

  torch.tensor(1000),


Ignore MatMul due to non constant B: /[MatMul_196]
Ignore MatMul due to non constant B: /[MatMul_201]
Ignore MatMul due to non constant B: /[MatMul_266]
Ignore MatMul due to non constant B: /[MatMul_271]
Ignore MatMul due to non constant B: /[MatMul_336]
Ignore MatMul due to non constant B: /[MatMul_341]
Ignore MatMul due to non constant B: /[MatMul_406]
Ignore MatMul due to non constant B: /[MatMul_411]
Ignore MatMul due to non constant B: /[MatMul_476]
Ignore MatMul due to non constant B: /[MatMul_481]
Ignore MatMul due to non constant B: /[MatMul_546]
Ignore MatMul due to non constant B: /[MatMul_551]
Ignore MatMul due to non constant B: /[MatMul_616]
Ignore MatMul due to non constant B: /[MatMul_621]
Ignore MatMul due to non constant B: /[MatMul_686]
Ignore MatMul due to non constant B: /[MatMul_691]
Ignore MatMul due to non constant B: /[MatMul_756]
Ignore MatMul due to non constant B: /[MatMul_761]
Ignore MatMul due to non constant B: /[MatMul_826]
Ignore MatMul due to non consta

# DistilBERT

In [35]:
inputs = {"input_ids": batch[0][None,:],
          "attention_mask": batch[1][None,:]}
dummy_input = inputs
torch.onnx.export(model.distilbert,               # model being run
                dummy_input,               # model input (or a tuple for multiple inputs)
                "distil_bert.onnx",   # where to save the model (can be a file or file-like object)
                export_params=True,        # store the trained parameter weights inside the model file
                opset_version=11,          # the ONNX version to export the model to
                do_constant_folding=True,  # whether to execute constant folding for optimization
                input_names = ['input_ids','attention_mask'],   # the model's input names
                output_names = ['sequence_output'], ##['all_logits','other'],
                )


from onnxruntime.quantization import quantize_dynamic, QuantType

model_fp32 = 'distil_bert.onnx'
model_quant = './quantized_models/distil_bert.quant.onnx'
quantized_model = quantize_dynamic(model_fp32, model_quant)

Ignore MatMul due to non constant B: /[MatMul_56]
Ignore MatMul due to non constant B: /[MatMul_66]
Ignore MatMul due to non constant B: /[MatMul_121]
Ignore MatMul due to non constant B: /[MatMul_131]
Ignore MatMul due to non constant B: /[MatMul_186]
Ignore MatMul due to non constant B: /[MatMul_196]
Ignore MatMul due to non constant B: /[MatMul_251]
Ignore MatMul due to non constant B: /[MatMul_261]
Ignore MatMul due to non constant B: /[MatMul_316]
Ignore MatMul due to non constant B: /[MatMul_326]
Ignore MatMul due to non constant B: /[MatMul_381]
Ignore MatMul due to non constant B: /[MatMul_391]


In [None]:
# ! pip install onnx
# ! pip install onnxruntime
import onnx
model = onnx.load('./quantized_models/bert.quant.onnx')
output =[node.name for node in model.graph.output]

input_all = [node.name for node in model.graph.input]
input_initializer =  [node.name for node in model.graph.initializer]
net_feed_input = list(set(input_all)  - set(input_initializer))

print('Inputs: ', net_feed_input)
print('Outputs: ', output)

In [None]:
model.graph.input
model.graph.output

# Numpy Classifier

In [22]:
model

JointBERTMultiIntent(
  (bert): BertModel(
    (embeddings): BertEmbeddings(
      (word_embeddings): Embedding(30522, 768, padding_idx=0)
      (position_embeddings): Embedding(512, 768)
      (token_type_embeddings): Embedding(2, 768)
      (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
      (dropout): Dropout(p=0.1, inplace=False)
    )
    (encoder): BertEncoder(
      (layer): ModuleList(
        (0): BertLayer(
          (attention): BertAttention(
            (self): BertSelfAttention(
              (query): Linear(in_features=768, out_features=768, bias=True)
              (key): Linear(in_features=768, out_features=768, bias=True)
              (value): Linear(in_features=768, out_features=768, bias=True)
              (dropout): Dropout(p=0.1, inplace=False)
            )
            (output): BertSelfOutput(
              (dense): Linear(in_features=768, out_features=768, bias=True)
              (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affi

In [None]:
model.slot_classifier.linear.weight.shape
model.slot_classifier._modules['dropout'].p
model.slot_classifier._modules

In [None]:
import onnx
from numpy import save
from numpy import load

In [23]:
from numpy import save
from numpy import load


model_type = 'bert'

slot_classifier_w = model.slot_classifier._modules['linear'].weight.detach().numpy()
save(f'./numpy_para/{model_type}/slot_classifier/weights.npy', slot_classifier_w)
slot_classifier_b = model.slot_classifier._modules['linear'].bias.detach().numpy()
save(f'./numpy_para/{model_type}/slot_classifier/bias.npy', slot_classifier_b)


intent_token_classifier_w = model.intent_token_classifier._modules['linear'].weight.detach().numpy()
save(f'./numpy_para/{model_type}/intent_token_classifier/weights.npy', intent_token_classifier_w)
intent_token_classifier_b = model.intent_token_classifier._modules['linear'].bias.detach().numpy()
save(f'./numpy_para/{model_type}/intent_token_classifier/bias.npy', intent_token_classifier_b)



pro_classifier_w1 = model.pro_classifier._modules['linear1'].weight.detach().numpy()
save(f'./numpy_para/{model_type}/pro_classifier/weights1.npy', pro_classifier_w1)
pro_classifier_b1 = model.pro_classifier._modules['linear1'].bias.detach().numpy()
save(f'./numpy_para/{model_type}/pro_classifier/bias1.npy', pro_classifier_b1)

pro_classifier_w2 = model.pro_classifier._modules['linear2'].weight.detach().numpy()
save(f'./numpy_para/{model_type}/pro_classifier/weights2.npy', pro_classifier_w2)
pro_classifier_b2 = model.pro_classifier._modules['linear2'].bias.detach().numpy()
save(f'./numpy_para/{model_type}/pro_classifier/bias2.npy', pro_classifier_b2)

In [25]:
class slot_classifier_np():
    def __init__(self,dir):
        self.linear_weights = load(dir+'/weights.npy')
        self.linear_bias = load(dir+'/bias.npy')
    # def ReLU(self,x):
    #     return x * (x > 0)
    def forward(self,x):
        x = np.squeeze(x)
        # x = self.ReLU(x)
        x = x @ np.transpose(self.linear_weights) + self.linear_bias
        return x

s =  slot_classifier_np(f'./numpy_para/{model_type}/slot_classifier')
s

<__main__.slot_classifier_np at 0x7fc09fccbdf0>

In [19]:
class intent_token_classifier_np():
    def __init__(self,dir):
        self.linear_weights = load(dir + '/weights.npy')
        self.linear_bias = load(dir + '/bias.npy')

    # def ReLU(self,x):
    #     return x * (x > 0)
    def forward(self,x):
        x = np.squeeze(x)
        # x = self.ReLU(x)
        x = x @ np.transpose(self.linear_weights) + self.linear_bias
        return x

i =  intent_token_classifier_np('./numpy_para/intent_token_classifier')
i

<__main__.intent_token_classifier_np at 0x7f48c6d260d0>

In [20]:
class pro_classifier_np():
    def __init__(self,dir):
        self.linear_weights1 = load(dir + '/weights1.npy')
        self.linear_bias1 = load(dir + '/bias1.npy')
        self.linear_weights2 = load(dir + '/weights2.npy')
        self.linear_bias2 = load(dir + '/bias2.npy')
    def ReLU(self,x):
        return x * (x > 0)
    def forward(self,x):
        x = np.squeeze(x)
        x = x @ np.transpose(self.linear_weights1) + self.linear_bias1
        x = self.ReLU(x)
        x = x @ np.transpose(self.linear_weights2) + self.linear_bias2
        return x

p =  pro_classifier_np('./numpy_para/pro_classifier')
p

<__main__.pro_classifier_np at 0x7f48c65a11f0>

In [None]:
import time
import onnxruntime
import numpy as np
from transformers import BertTokenizer

class CommandProcessor():
    def __init__(self,session = None):
        # self.INTENT_CLASSES = ['PAD','O','B-greet','I-greet','B-guide','I-guide','B-follow','I-follow','B-find','I-find','B-take','I-take','B-go','I-go','B-know','I-know']
        # self.SLOT_CLASSES = ['PAD','O','B-obj','B-dest','I-sour','I-obj','I-dest','B-per','B-sour','I-per']

        self.INTENT_CLASSES = ['PAD','O', 'B-greet', 'I-greet', 'B-know', 'I-know', 'B-follow', 'I-follow', 'B-take', 'I-take', 'B-tell', 'I-tell', 'B-guide', 'I-guide', 'B-go', 'I-go', 'B-answer', 'I-answer', 'B-find','I-find']
        self.SLOT_CLASSES = ['PAD', 'O','I-obj', 'B-sour', 'B-dest','I-sour','B-what','B-obj','I-dest','I-per', 'I-what', 'B-per']

        self.PRO_CLASSES = ['PAD','O','B-referee']

        self.referee_token_map = {i:label for i,label in enumerate(self.PRO_CLASSES)}
        self.intent_token_map = {i:label for i,label in enumerate(self.INTENT_CLASSES)}
        self.slot_label_map = {i:label for i,label in enumerate(self.SLOT_CLASSES)}

        self.max_seq_len = 32
        self.pro_lst = ['him','her','it','its']
        self.tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')
        self.pad_token_label_id = 0

        self.input_text_path = './sample_pred_in.txt'#!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!
        self.output_file = './outputs'

        self.bert_ort_session = self.initONNX('./quantized_models/bert.quant.onnx')
        # self.slot_classifier_ort_session = self.initONNX('./quantized_models/slot_classifier.quant.onnx')
        # self.intent_token_classifier_ort_session = self.initONNX('./quantized_models/intent_token_classifier.quant.onnx')
        # self.pro_classifier_ort_session = self.initONNX('./quantized_models/pro_classifier.quant.onnx')

    def initONNX(self,path):
        start = time.time()
        sess_options = onnxruntime.SessionOptions()

        sess_options.intra_op_num_threads = 1#4
        sess_options.execution_mode = onnxruntime.ExecutionMode.ORT_PARALLEL
        sess_options.graph_optimization_level = onnxruntime.GraphOptimizationLevel.ORT_ENABLE_ALL

        ort_session  = onnxruntime.InferenceSession(path, sess_options)
        print("Loading time ONNX: ", time.time() - start)
        return ort_session

    def read_input_file(self):
        with open(self.input_text_path, "r", encoding="utf-8") as f:
            words = f.readline().strip().split()
            # for line in f:
            #     line = line.strip()
            #     words = line.split()
            #     break # I should delete precessed commands!!!!!!!!!!!!!

        return words

    def convert_input_file_to_dataloader(self,words,
                                         cls_token_segment_id=0,
                                         pad_token_segment_id=0,
                                         sequence_a_segment_id=0,
                                         mask_padding_with_zero=True):

        tokenizer = self.tokenizer

        # Setting based on the current model type
        cls_token = tokenizer.cls_token
        sep_token = tokenizer.sep_token
        unk_token = tokenizer.unk_token
        pad_token_id = tokenizer.pad_token_id

        tokens = []
        slot_label_mask = []
        pro_labels_ids = []
        for word in words:
            word_tokens = tokenizer.tokenize(word)
            if not word_tokens:
                word_tokens = [unk_token]  # For handling the bad-encoded word
            if word in self.pro_lst:
                pro_label = 1
            else:
                pro_label = 0
            tokens.extend(word_tokens)
            # Use the real label id for the first token of the word, and padding ids for the remaining tokens
            slot_label_mask.extend([self.pad_token_label_id + 1] + [self.pad_token_label_id] * (len(word_tokens) - 1))
            pro_labels_ids.extend([pro_label] + [self.pad_token_label_id] * (len(word_tokens) - 1)) #!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!


        # Account for [CLS] and [SEP]
        special_tokens_count = 2
        if len(tokens) > self.max_seq_len - special_tokens_count:
            tokens = tokens[: (self.max_seq_len - special_tokens_count)]
            slot_label_mask = slot_label_mask[:(self.max_seq_len - special_tokens_count)]
            pro_labels_ids = pro_labels_ids[:(self.max_seq_len - special_tokens_count)] #!!!!!!!!!!!!!!!!!!!!!!!!!!!

        # Add [SEP] token
        tokens += [sep_token]
        token_type_ids = [sequence_a_segment_id] * len(tokens)
        slot_label_mask += [self.pad_token_label_id]
        pro_labels_ids += [self.pad_token_label_id]#!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!

        # Add [CLS] token
        tokens = [cls_token] + tokens
        token_type_ids = [cls_token_segment_id] + token_type_ids
        slot_label_mask = [self.pad_token_label_id] + slot_label_mask
        pro_labels_ids = [self.pad_token_label_id] + pro_labels_ids#!!!!!!!!!!!!!!!!!!!!!!!
        input_ids = tokenizer.convert_tokens_to_ids(tokens)

        # The mask has 1 for real tokens and 0 for padding tokens. Only real tokens are attended to.
        attention_mask = [1 if mask_padding_with_zero else 0] * len(input_ids)

        # Zero-pad up to the sequence length.
        padding_length = self.max_seq_len - len(input_ids)
        input_ids = input_ids + ([pad_token_id] * padding_length)
        attention_mask = attention_mask + ([0 if mask_padding_with_zero else 1] * padding_length)
        token_type_ids = token_type_ids + ([pad_token_segment_id] * padding_length)
        slot_label_mask = slot_label_mask + ([self.pad_token_label_id] * padding_length)
        pro_labels_ids = pro_labels_ids + ([self.pad_token_label_id] * padding_length)


        input_ids = np.array(input_ids)
        attention_mask = np.array(attention_mask)
        token_type_ids = np.array(token_type_ids)
        pro_labels_ids = np.array(pro_labels_ids)
        # print('input_ids shape: ',input_ids.shape)
        # print('attention_mask shape: ',attention_mask.shape)
        # print('token_type_ids shape: ',token_type_ids.shape)
        # print('pro_labels_ids shape: ',pro_labels_ids.shape)
        sample = {'input_ids':input_ids[None,:], 'attention_mask':attention_mask[None,:], 'token_type_ids': token_type_ids[None,:]}

        return sample,slot_label_mask,pro_labels_ids

    def predict(self):
        lines = self.read_input_file()
        sample,slot_label_mask,pro_labels_ids = self.convert_input_file_to_dataloader(lines)
        start = time.time()

        sequence_output, _ = self.bert_ort_session.run(None, sample)
        # ============================= Slot prediction ==============================
        slot_classifier = slot_classifier_np('./numpy_para/slot_classifier')
        slot_logits = slot_classifier.forward(sequence_output)
        # slot_logits = self.slot_classifier_ort_session.run(None, {'sequence_output':sequence_output})
        slot_preds = np.squeeze(np.array(slot_logits))
        slot_preds = np.argmax(slot_preds, axis=1)

        # ============================== Intent Token Seq =============================
        intent_token_classifier = intent_token_classifier_np('./numpy_para/intent_token_classifier')
        intent_token_logits = intent_token_classifier.forward(sequence_output)
        # intent_token_logits = self.intent_token_classifier_ort_session.run(None, {'sequence_output':sequence_output})
        intent_token_preds = np.squeeze(np.array(intent_token_logits))
        intent_token_preds = np.argmax(intent_token_preds, axis=1)

        # ============================= Pronoun referee prediction ==============================
        if any(pro_labels_ids):
            sq_sequence_output = np.squeeze(sequence_output)
            pro_token = sq_sequence_output[pro_labels_ids == 1]
            # gpsr has 2 pronouns referred to the same referral, we only need to encode one pronoun
            if pro_token.shape[0] != 1:
                pro_token = pro_token[0, :]
            repeat_pro = np.tile(pro_token,(self.max_seq_len,1))
            concated_input = np.concatenate((sq_sequence_output,repeat_pro),axis = 1)[None,:]

            pro_classifier = pro_classifier_np('./numpy_para/pro_classifier')
            referee_token_logits = pro_classifier.forward(concated_input)
            # referee_token_logits = self.pro_classifier_ort_session.run(None, {'concated_input':concated_input})
            referee_preds = np.squeeze(np.array(referee_token_logits))
            referee_preds = np.argmax(referee_preds, axis=1)

        else:
            referee_preds = np.ones(self.max_seq_len)


        print('------------------------------------------------------')
        print("Total inference time: ", time.time() - start)


        slot_preds_list = []
        intent_token_preds_list = []
        referee_preds_list = []

        for token_idx in range(len(slot_label_mask)):
            if slot_label_mask[token_idx] != self.pad_token_label_id:
                referee_preds_list.append(self.referee_token_map[referee_preds[token_idx]])
                intent_token_preds_list.append(self.intent_token_map[intent_token_preds[token_idx]])
                slot_preds_list.append(self.slot_label_map[slot_preds[token_idx]])


        self.write_readable_outputs(slot_preds_list,intent_token_preds_list,referee_preds_list)

        return

    def write_readable_outputs(self,slot_preds_list,intent_token_preds_list,referee_preds_list):
        words = self.read_input_file()
        line = ''
        for token_idx,(word, i_pred, s_pred, r_pred) in enumerate(zip(words, intent_token_preds_list, slot_preds_list,referee_preds_list)):
            if s_pred == 'O' and i_pred == 'O' and r_pred == 'O':
                line = line + word + " "
            elif i_pred != 'O':
                if word not in self.pro_lst:
                    line = line + "[{}:{}:{}] ".format(word, i_pred,s_pred)

                    if r_pred == 'B-referee':
                        r_idx = token_idx

                else:
                    line = line + "[{}({}):{}:{}] ".format(word,words[r_idx], i_pred,s_pred)

        with open(self.output_file, "a", encoding="utf-8") as f:
            if 'B-referee' in referee_preds_list:
                f.write('\n')
                f.write('---------------------------------------------------------------------\n')
                f.write('* Pro Case: \n')
                f.write(line.strip()+'\n')
                f.write('---------------------------------------------------------------------\n \n')
            else:
                f.write(line.strip()+'\n')
            print(line)
            print('=====================================')
        return

inference = CommandProcessor()
inference.predict()