In [None]:
import os
import copy
import json
import logging
import torch
from torch.utils.data import TensorDataset

from utils import get_intent_labels, get_slot_labels
from seqeval.metrics.sequence_labeling import get_entities

logger = logging.getLogger(__name__)

In [None]:
class InputExample(object):
    """
    One raw example (before tokenization) for joint intent / slot modelling.

    Args:
        guid: Unique id for the example.
        words: list. The words of the sequence.
        intent_label: (Optional) The intent label of the example.
        slot_labels: (Optional) list. The slot labels of the example,
            one entry per word.
    """

    def __init__(self, guid, words, intent_label=None, slot_labels=None):
        self.guid, self.words = guid, words
        self.intent_label, self.slot_labels = intent_label, slot_labels

    def __repr__(self):
        # The printable form is simply the JSON dump of the instance.
        return str(self.to_json_string())

    def to_dict(self):
        """Return a deep copy of the instance attributes as a dictionary."""
        return copy.deepcopy(self.__dict__)

    def to_json_string(self):
        """Return the attributes as indented, key-sorted JSON ending in a newline."""
        serialized = json.dumps(self.to_dict(), indent=2, sort_keys=True)
        return serialized + "\n"

In [None]:
class InputExampleMultiIntent(InputExample):
    """
    Raw example for the multi-intent setting.

    Extends ``InputExample`` with the extra per-example fields produced by
    ``JointProcessorMultiIntent._create_examples``:

    Args:
        guid: Unique id for the example.
        words: list. The words of the sequence.
        intent_label: (Optional) The intent label(s) of the example.
        slot_labels: (Optional) list. Slot label ids, one per word.
        intent_tokens: (Optional) list. Intent label ids, one per word.
        B_tag_mask: (Optional) list of lists; one row per slot mask, marking
            the first word of an entity.
        BI_tag_mask: (Optional) list of lists; one row per slot mask, weighting
            every word of an entity.
        tag_intent_label: (Optional) list. One intent id per slot mask.
    """

    def __init__(self, guid, words, intent_label=None, slot_labels=None,
                 intent_tokens=None, B_tag_mask=None, BI_tag_mask=None,
                 tag_intent_label=None):
        super(InputExampleMultiIntent, self).__init__(
            guid, words, intent_label=intent_label, slot_labels=slot_labels)
        self.intent_tokens = intent_tokens
        self.B_tag_mask, self.BI_tag_mask = B_tag_mask, BI_tag_mask
        self.tag_intent_label = tag_intent_label

In [None]:
class InputExampleMultiIntent_Pro(InputExample):
    """
    Multi-intent raw example that additionally carries per-word ``pro_labels``.

    Args:
        guid: Unique id for the example.
        words: list. The words of the sequence.
        intent_label: (Optional) The intent label(s) of the example.
        slot_labels: (Optional) list. Slot label ids, one per word.
        pro_labels: (Optional) list. One 0/1 label per word, as built by
            ``JointProcessorMultiIntent_Pro._create_examples``.
        intent_tokens: (Optional) list. Intent label ids, one per word.
        B_tag_mask: (Optional) list of lists; one row per slot mask.
        BI_tag_mask: (Optional) list of lists; one row per slot mask.
        tag_intent_label: (Optional) list. One intent id per slot mask.
    """

    def __init__(self, guid, words, intent_label=None, slot_labels=None,
                 pro_labels=None, intent_tokens=None, B_tag_mask=None,
                 BI_tag_mask=None, tag_intent_label=None):
        super(InputExampleMultiIntent_Pro, self).__init__(
            guid, words, intent_label=intent_label, slot_labels=slot_labels)
        self.intent_tokens = intent_tokens
        self.B_tag_mask, self.BI_tag_mask = B_tag_mask, BI_tag_mask
        self.tag_intent_label = tag_intent_label
        # Extra field specific to the "_Pro" variant.
        self.pro_labels = pro_labels


In [None]:
class InputFeatures(object):
    """
    Model-ready features for one example.

    Args:
        input_ids: list. Token ids of the sequence.
        attention_mask: list. Mask aligned with ``input_ids``.
        token_type_ids: list. Segment ids aligned with ``input_ids``.
        intent_label_id: The intent label id(s) of the example.
        slot_labels_ids: list. Slot label ids aligned with ``input_ids``.
    """

    def __init__(self, input_ids, attention_mask, token_type_ids, intent_label_id, slot_labels_ids):
        self.input_ids, self.attention_mask = input_ids, attention_mask
        self.token_type_ids = token_type_ids
        self.intent_label_id, self.slot_labels_ids = intent_label_id, slot_labels_ids

    def __repr__(self):
        # The printable form is simply the JSON dump of the instance.
        return str(self.to_json_string())

    def to_dict(self):
        """Return a deep copy of the instance attributes as a dictionary."""
        return copy.deepcopy(self.__dict__)

    def to_json_string(self):
        """Return the attributes as indented, key-sorted JSON ending in a newline."""
        serialized = json.dumps(self.to_dict(), indent=2, sort_keys=True)
        return serialized + "\n"

In [None]:
class InputFeaturesMultiIntent(InputFeatures):
    """
    Model-ready features for one multi-intent example.

    Adds to ``InputFeatures``:

    Args:
        intent_tokens_ids: list. Intent label ids aligned with ``input_ids``.
        B_tag_mask: list of lists; one row per slot mask, aligned with
            ``input_ids``.
        BI_tag_mask: list of lists; one row per slot mask, aligned with
            ``input_ids``.
        tag_intent_label: list. One intent id per slot mask.
    """

    def __init__(self, input_ids, attention_mask, token_type_ids,
                 intent_label_id, slot_labels_ids, intent_tokens_ids,
                 B_tag_mask, BI_tag_mask, tag_intent_label):
        super(InputFeaturesMultiIntent, self).__init__(
            input_ids, attention_mask, token_type_ids, intent_label_id, slot_labels_ids)
        self.intent_tokens_ids = intent_tokens_ids
        self.B_tag_mask, self.BI_tag_mask = B_tag_mask, BI_tag_mask
        self.tag_intent_label = tag_intent_label

In [None]:
class InputFeaturesMultiIntent_Pro(InputFeatures):
    """
    Model-ready features for one multi-intent example with "pro" labels.

    Adds to ``InputFeatures``:

    Args:
        intent_tokens_ids: list. Intent label ids aligned with ``input_ids``.
        B_tag_mask: list of lists; one row per slot mask.
        BI_tag_mask: list of lists; one row per slot mask.
        tag_intent_label: list. One intent id per slot mask.
        pro_labels_ids: "pro" label ids for the example
            (presumably aligned with ``input_ids`` -- TODO confirm against
            ``convert_examples_to_features_multi_Pro``).
    """
    def __init__(self,
                 input_ids,
                 attention_mask,
                 token_type_ids,
                 intent_label_id,
                 slot_labels_ids,
                 intent_tokens_ids,
                 B_tag_mask,
                 BI_tag_mask,
                 tag_intent_label,
                 pro_labels_ids):
        super().__init__(input_ids, attention_mask, token_type_ids, intent_label_id, slot_labels_ids)
        self.intent_tokens_ids = intent_tokens_ids
        self.B_tag_mask = B_tag_mask
        self.BI_tag_mask = BI_tag_mask
        self.tag_intent_label = tag_intent_label
        # NOTE: a leftover debug ``print(pro_labels_ids)`` used to live here; it
        # wrote one line to stdout per feature and has been removed.
        self.pro_labels_ids = pro_labels_ids

In [None]:
class JointProcessor(object):
    """Processor for the (single-intent) JointBERT data set.

    Reads the three parallel text files of a split and turns every line
    into an ``InputExample`` whose labels are already mapped to indices.
    """

    def __init__(self, args):
        self.args = args
        # Label vocabularies; positions in these lists are the label ids.
        self.intent_labels = get_intent_labels(args)
        self.slot_labels = get_slot_labels(args)
        # File names expected inside <data_dir>/<task>/<mode>/.
        self.input_text_file = 'seq.in'
        self.intent_label_file = 'label'
        self.slot_labels_file = 'seq.out'

    @classmethod
    def _read_file(cls, input_file, quotechar=None):
        """Return the lines of a UTF-8 text file, each stripped of surrounding whitespace.

        ``quotechar`` is accepted but not used.
        """
        with open(input_file, "r", encoding="utf-8") as handle:
            return [raw_line.strip() for raw_line in handle]

    def _create_examples(self, texts, intents, slots, set_type):
        """Build one ``InputExample`` per (text, intent, slot) line triple.

        Labels missing from the vocabularies are mapped to the "UNK" entry.
        """
        examples = []
        for position, (text, intent, slot) in enumerate(zip(texts, intents, slots)):
            guid = "%s-%s" % (set_type, position)

            # Whitespace split also copes with words separated by two spaces.
            words = text.split()

            if intent in self.intent_labels:
                intent_label = self.intent_labels.index(intent)
            else:
                intent_label = self.intent_labels.index("UNK")

            slot_labels = [
                self.slot_labels.index(tag) if tag in self.slot_labels else self.slot_labels.index("UNK")
                for tag in slot.split()
            ]

            assert len(words) == len(slot_labels)
            examples.append(InputExample(guid=guid, words=words, intent_label=intent_label, slot_labels=slot_labels))
        return examples

    def get_examples(self, mode):
        """
        Load every example of one split.

        Args:
            mode: train, dev, test
        """
        data_path = os.path.join(self.args.data_dir, self.args.task, mode)
        logger.info("LOOKING AT {}".format(data_path))
        texts = self._read_file(os.path.join(data_path, self.input_text_file))
        intents = self._read_file(os.path.join(data_path, self.intent_label_file))
        slots = self._read_file(os.path.join(data_path, self.slot_labels_file))
        return self._create_examples(texts=texts, intents=intents, slots=slots, set_type=mode)

In [None]:
class JointProcessorMultiIntent(object):
    """Processor for the multi-intent JointBERT data sets.

    Reads four parallel text files of a split (utterances, '#'-joined intents,
    BIO slot tags and per-word intent tags) and turns every line into an
    ``InputExampleMultiIntent`` whose labels are already mapped to indices.
    """
    def __init__(self, args):
        self.args = args
        # data/atis/intent_label.txt
        self.intent_labels = get_intent_labels(args)
        # data/atis/slot_label.txt
        self.slot_labels = get_slot_labels(args)
        self.input_text_file = 'seq.in'
        self.intent_label_file = 'label'
        self.slot_labels_file = 'seq.out'
        self.intent_tokens_file = 'seq_intent.out'

    @classmethod
    def _read_file(cls, input_file, quotechar=None):
        """
        Read a UTF-8 text file and return its lines, each stripped of
        surrounding whitespace. ``quotechar`` is accepted but not used.
        """
        with open(input_file, "r", encoding="utf-8") as f:
            return [line.strip() for line in f]

    @staticmethod
    def _label_index(labels, label):
        """Return the position of ``label`` in ``labels``, or of "UNK" when it is absent."""
        return labels.index(label) if label in labels else labels.index("UNK")

    def _create_examples(self, texts, intents, slots, intent_tokens, set_type):
        """
        Creates examples for the training and dev sets.

        Args:
            texts: list of utterances (str; whitespace-joined words)
            intents: list of intent strings; several intents are joined by '#'
            slots: list of BIO slot-tag strings (whitespace-joined, one tag per word)
            intent_tokens: list of per-word intent-tag strings (whitespace-joined)
            set_type: split name, used as the prefix of each example guid

        Return:
            examples: a list of ``InputExampleMultiIntent``

        Raises:
            AssertionError: when words, slot tags and intent tags of a line
                differ in length.
        """
        examples = []
        for i, (text, intent, slot, intent_token) in enumerate(zip(texts, intents, slots, intent_tokens)):
            # e.g. "train-0"
            guid = "%s-%s" % (set_type, i)
            # 1. input_text
            words = text.split()  # Some are spaced twice
            slot_tags = slot.split()

            # 2. intents -> multi-hot indicator vector over all intent labels
            intent_label = [0 for _ in self.intent_labels]
            for int_tok in intent.split('#'):
                intent_label[self._label_index(self.intent_labels, int_tok)] = 1

            # 3. slot tags -> list of slot label indices
            slot_labels = [self._label_index(self.slot_labels, s) for s in slot_tags]

            # 4. per-word intent tags -> list of intent label indices
            intent_token_list = [self._label_index(self.intent_labels, s) for s in intent_token.split()]

            # Entities of the utterance, capped at the configured number of masks.
            # seq = ['B-PER', 'I-PER', 'O', 'B-LOC']
            # [('PER', 0, 1), ('LOC', 3, 3)]
            MAX_SLOT = self.args.num_mask
            entities = get_entities(slot_tags)
            if len(entities) > MAX_SLOT:
                entities = entities[:MAX_SLOT]

            # 5. B tag mask: B * M * L
            # BI tag mask: B * M * L
            # tag intent label: B * M
            B_tag_mask = [[0 for _ in slot_tags] for _ in range(MAX_SLOT)]
            BI_tag_mask = [[0 for _ in slot_tags] for _ in range(MAX_SLOT)]
            tag_intent_label = [self.intent_labels.index("PAD") for _ in range(MAX_SLOT)]

            try:
                for idx, tag in enumerate(entities):
                    start, end = tag[1], tag[2]
                    span = end - start + 1
                    B_tag_mask[idx][start] = 1
                    # Every word of the entity gets an equal share summing to 1.
                    BI_tag_mask[idx][start:end + 1] = [1. / span] * span
                    tag_intent_label[idx] = intent_token_list[start]
                    assert tag_intent_label[idx] != self.intent_labels.index("O"), 'The intent tagged is UNK or O!' # we don't have unk
            except Exception:
                # Best effort, as before: keep the (possibly partial) masks and go on.
                # ``Exception`` rather than a bare ``except`` so that
                # KeyboardInterrupt / SystemExit are no longer swallowed.
                logger.warning(
                    "Error while building tag masks for %s: text=%r slots=%r entities=%r intent_tokens=%r",
                    guid, text, slot_tags, entities, intent_token_list, exc_info=True)

            assert len(words) == len(slot_labels) == len(intent_token_list)
            examples.append(InputExampleMultiIntent(guid=guid,
                                                    words=words,
                                                    intent_label=intent_label,
                                                    slot_labels=slot_labels,
                                                    intent_tokens=intent_token_list,
                                                    B_tag_mask=B_tag_mask,
                                                    BI_tag_mask=BI_tag_mask,
                                                    tag_intent_label=tag_intent_label))
        return examples

    def get_examples(self, mode):
        """
        Args:
            mode: train, dev, test

        Returns:
            list of example
        """
        data_path = os.path.join(self.args.data_dir, self.args.task, mode)
        logger.info("LOOKING AT {}".format(data_path))
        return self._create_examples(texts=self._read_file(os.path.join(data_path, self.input_text_file)),
                                     intents=self._read_file(os.path.join(data_path, self.intent_label_file)),
                                     slots=self._read_file(os.path.join(data_path, self.slot_labels_file)),
                                     intent_tokens=self._read_file(os.path.join(data_path, self.intent_tokens_file)),
                                     set_type=mode)

In [None]:
class JointProcessorMultiIntent_Pro(object):
    """Processor for the multi-intent JointBERT data sets with "pro" tags.

    Same inputs as the multi-intent processor plus one extra parallel file
    ('seq_pro.out') holding one tag per word; each line becomes an
    ``InputExampleMultiIntent_Pro``.
    """
    def __init__(self, args):
        self.args = args
        # data/atis/intent_label.txt
        self.intent_labels = get_intent_labels(args)
        # data/atis/slot_label.txt
        self.slot_labels = get_slot_labels(args)
        self.input_text_file = 'seq.in'
        self.intent_label_file = 'label'
        self.slot_labels_file = 'seq.out'
        self.intent_tokens_file = 'seq_intent.out'
        self.pro_token_file = 'seq_pro.out'

    @classmethod
    def _read_file(cls, input_file, quotechar=None):
        """
        Read a UTF-8 text file and return its lines, each stripped of
        surrounding whitespace. ``quotechar`` is accepted but not used.
        """
        with open(input_file, "r", encoding="utf-8") as f:
            return [line.strip() for line in f]

    @staticmethod
    def _label_index(labels, label):
        """Return the position of ``label`` in ``labels``, or of "UNK" when it is absent."""
        return labels.index(label) if label in labels else labels.index("UNK")

    def _create_examples(self, texts, intents, slots, intent_tokens, pro_token,set_type):
        """
        Creates examples for the training and dev sets.

        Args:
            texts: list of utterances (str; whitespace-joined words)
            intents: list of intent strings; several intents are joined by '#'
            slots: list of BIO slot-tag strings (whitespace-joined, one tag per word)
            intent_tokens: list of per-word intent-tag strings (whitespace-joined)
            pro_token: list of per-word "pro" tag strings (whitespace-joined)
            set_type: split name, used as the prefix of each example guid

        Return:
            examples: a list of ``InputExampleMultiIntent_Pro``

        Raises:
            AssertionError: when words, slot tags and intent tags of a line
                differ in length.
        """
        examples = []
        # ``pro_line`` (not ``pro_token``) so the loop does not shadow the parameter.
        for i, (text, intent, slot, intent_token, pro_line) in enumerate(
                zip(texts, intents, slots, intent_tokens, pro_token)):
            # e.g. "train-0"
            guid = "%s-%s" % (set_type, i)
            # 1. input_text
            words = text.split()  # Some are spaced twice
            slot_tags = slot.split()

            # 2. intents -> multi-hot indicator vector over all intent labels
            intent_label = [0 for _ in self.intent_labels]
            for int_tok in intent.split('#'):
                intent_label[self._label_index(self.intent_labels, int_tok)] = 1

            # 3. slot tags -> list of slot label indices
            slot_labels = [self._label_index(self.slot_labels, s) for s in slot_tags]

            # 4. per-word intent tags -> list of intent label indices
            intent_token_list = [self._label_index(self.intent_labels, s) for s in intent_token.split()]

            # Pro labels: 0 for words tagged B-referee / I-referee, 1 for everything else.
            pro_labels = [0 if pro in ('I-referee', 'B-referee') else 1 for pro in pro_line.split()]
            if len(pro_labels) != len(words):
                # Not fatal here (previous behaviour is kept), but worth surfacing:
                # the feature conversion presumably pairs pro labels with words -- TODO confirm.
                logger.warning("%s: %d pro labels for %d words", guid, len(pro_labels), len(words))

            # Entities of the utterance, capped at the configured number of masks.
            # seq = ['B-PER', 'I-PER', 'O', 'B-LOC']
            # [('PER', 0, 1), ('LOC', 3, 3)]
            MAX_SLOT = self.args.num_mask
            entities = get_entities(slot_tags)
            if len(entities) > MAX_SLOT:
                entities = entities[:MAX_SLOT]

            # 5. B tag mask: B * M * L
            # BI tag mask: B * M * L
            # tag intent label: B * M
            B_tag_mask = [[0 for _ in slot_tags] for _ in range(MAX_SLOT)]
            BI_tag_mask = [[0 for _ in slot_tags] for _ in range(MAX_SLOT)]
            tag_intent_label = [self.intent_labels.index("PAD") for _ in range(MAX_SLOT)]

            try:
                for idx, tag in enumerate(entities):
                    start, end = tag[1], tag[2]
                    span = end - start + 1
                    B_tag_mask[idx][start] = 1
                    # Every word of the entity gets an equal share summing to 1.
                    BI_tag_mask[idx][start:end + 1] = [1. / span] * span
                    tag_intent_label[idx] = intent_token_list[start]
                    assert tag_intent_label[idx] != self.intent_labels.index("O"), 'The intent tagged is UNK or O!' # we don't have unk
            except Exception:
                # Best effort, as before: keep the (possibly partial) masks and go on.
                # ``Exception`` rather than a bare ``except`` so that
                # KeyboardInterrupt / SystemExit are no longer swallowed.
                logger.warning(
                    "Error while building tag masks for %s: text=%r slots=%r entities=%r intent_tokens=%r",
                    guid, text, slot_tags, entities, intent_token_list, exc_info=True)

            assert len(words) == len(slot_labels) == len(intent_token_list)
            examples.append(InputExampleMultiIntent_Pro(guid=guid,
                                                    words=words,
                                                    intent_label=intent_label,
                                                    slot_labels=slot_labels,
                                                    pro_labels = pro_labels,
                                                    intent_tokens=intent_token_list,
                                                    B_tag_mask=B_tag_mask,
                                                    BI_tag_mask=BI_tag_mask,
                                                    tag_intent_label=tag_intent_label))
        return examples

    def get_examples(self, mode):
        """
        Args:
            mode: train, dev, test

        Returns:
            list of example
        """
        data_path = os.path.join(self.args.data_dir, self.args.task, mode)
        logger.info("LOOKING AT {}".format(data_path))
        return self._create_examples(texts=self._read_file(os.path.join(data_path, self.input_text_file)),
                                     intents=self._read_file(os.path.join(data_path, self.intent_label_file)),
                                     slots=self._read_file(os.path.join(data_path, self.slot_labels_file)),
                                     intent_tokens=self._read_file(os.path.join(data_path, self.intent_tokens_file)),
                                     pro_token = self._read_file(os.path.join(data_path, self.pro_token_file)),
                                     set_type=mode)

In [None]:
# Task name -> processor class used to load that task's examples.
processors = {
    "atis": JointProcessorMultiIntent,
    "snips": JointProcessorMultiIntent,
    "mixsnips": JointProcessorMultiIntent,
    "mixatis": JointProcessorMultiIntent,
    "mixsnips_large": JointProcessorMultiIntent,
    "atis_seq": JointProcessorMultiIntent,
    "snips_seq": JointProcessorMultiIntent,
    "mixsnips_single": JointProcessor,  # the only single-intent task
    "dstc4": JointProcessorMultiIntent,
    "gpsr": JointProcessorMultiIntent,
    "gpsr_pro": JointProcessorMultiIntent_Pro,  # variant with the extra pro tags
}

In [None]:
def convert_examples_to_features(examples, max_seq_len, tokenizer,
                                 pad_token_label_id=-100,
                                 cls_token_segment_id=0,
                                 pad_token_segment_id=0,
                                 sequence_a_segment_id=0,
                                 mask_padding_with_zero=True):
    """
    Convert single-intent examples into fixed-length ``InputFeatures``.

    Args:
        examples: list of examples exposing ``guid``, ``words``, ``slot_labels``
            and ``intent_label``
        max_seq_len: length every produced sequence is truncated / padded to
        tokenizer: object providing ``cls_token``, ``sep_token``, ``unk_token``,
            ``pad_token_id``, ``tokenize`` and ``convert_tokens_to_ids``
        pad_token_label_id: slot label used for special tokens, padding and
            non-first word pieces
        cls_token_segment_id: segment id of the [CLS] position
        pad_token_segment_id: segment id of padding positions
        sequence_a_segment_id: segment id of the word pieces and [SEP]
        mask_padding_with_zero: when True, real tokens get mask 1 and padding 0;
            when False the two values are swapped
    Returns:
        list of ``InputFeatures``, one per example
    """
    # Setting based on the current model type
    cls_token = tokenizer.cls_token
    sep_token = tokenizer.sep_token
    unk_token = tokenizer.unk_token
    pad_token_id = tokenizer.pad_token_id

    # [CLS] and [SEP] each occupy one position of the sequence.
    special_tokens_count = 2
    max_pieces = max_seq_len - special_tokens_count
    real_flag, pad_flag = (1, 0) if mask_padding_with_zero else (0, 1)

    features = []
    for ex_index, example in enumerate(examples):
        if ex_index % 5000 == 0:
            logger.info("Writing example %d of %d" % (ex_index, len(examples)))

        # Tokenize word by word so labels stay aligned with word pieces.
        pieces = []
        piece_labels = []
        for word, slot_label in zip(example.words, example.slot_labels):
            # A word the tokenizer cannot split at all becomes the unknown token.
            word_tokens = tokenizer.tokenize(word) or [unk_token]
            pieces += word_tokens
            # Real label on the first piece of the word, padding label on the rest.
            piece_labels += [int(slot_label)] + [pad_token_label_id] * (len(word_tokens) - 1)

        # Truncate (no padding yet) so the special tokens still fit.
        if len(pieces) > max_pieces:
            pieces = pieces[:max_pieces]
            piece_labels = piece_labels[:max_pieces]

        # Wrap with [CLS] ... [SEP].
        tokens = [cls_token] + pieces + [sep_token]
        slot_labels_ids = [pad_token_label_id] + piece_labels + [pad_token_label_id]
        token_type_ids = [cls_token_segment_id] + [sequence_a_segment_id] * (len(pieces) + 1)
        input_ids = tokenizer.convert_tokens_to_ids(tokens)

        # Only real tokens are attended to.
        attention_mask = [real_flag] * len(input_ids)

        # Pad everything up to the sequence length.
        padding_length = max_seq_len - len(input_ids)
        input_ids = input_ids + [pad_token_id] * padding_length
        attention_mask = attention_mask + [pad_flag] * padding_length
        token_type_ids = token_type_ids + [pad_token_segment_id] * padding_length
        slot_labels_ids = slot_labels_ids + [pad_token_label_id] * padding_length

        for what, seq in (("input", input_ids),
                          ("attention mask", attention_mask),
                          ("token type", token_type_ids),
                          ("slot labels", slot_labels_ids)):
            assert len(seq) == max_seq_len, "Error with {} length {} vs {}".format(what, len(seq), max_seq_len)

        intent_label_id = int(example.intent_label)

        # Show the first few converted examples in the log.
        if ex_index < 5:
            logger.info("*** Example ***")
            logger.info("guid: %s" % example.guid)
            logger.info("tokens: %s" % " ".join(str(x) for x in tokens))
            logger.info("input_ids: %s" % " ".join(str(x) for x in input_ids))
            logger.info("attention_mask: %s" % " ".join(str(x) for x in attention_mask))
            logger.info("token_type_ids: %s" % " ".join(str(x) for x in token_type_ids))
            logger.info("intent_label: %s (id = %d)" % (example.intent_label, intent_label_id))
            logger.info("slot_labels: %s" % " ".join(str(x) for x in slot_labels_ids))

        features.append(InputFeatures(input_ids=input_ids,
                                      attention_mask=attention_mask,
                                      token_type_ids=token_type_ids,
                                      intent_label_id=intent_label_id,
                                      slot_labels_ids=slot_labels_ids))
    return features

In [None]:
def convert_examples_to_features_multi(examples, max_seq_len, tokenizer,
                                 pad_token_label_id=-100,
                                 cls_token_segment_id=0,
                                 pad_token_segment_id=0,
                                 sequence_a_segment_id=0,
                                 mask_padding_with_zero=True):
    """
    Convert multi-intent examples into fixed-length ``InputFeaturesMultiIntent``.

    Args:
        examples: list of examples exposing ``guid``, ``words``, ``slot_labels``,
            ``intent_tokens``, ``B_tag_mask``, ``BI_tag_mask``, ``intent_label``
            and ``tag_intent_label``
        max_seq_len: length every produced sequence is truncated / padded to
        tokenizer: object providing ``cls_token``, ``sep_token``, ``unk_token``,
            ``pad_token_id``, ``tokenize`` and ``convert_tokens_to_ids``
        pad_token_label_id: label used for special tokens, padding and
            non-first word pieces
        cls_token_segment_id: segment id of the [CLS] position
        pad_token_segment_id: segment id of padding positions
        sequence_a_segment_id: segment id of the word pieces and [SEP]
        mask_padding_with_zero: when True, real tokens get mask 1 and padding 0;
            when False the two values are swapped
    Returns:
        features: list of ``InputFeaturesMultiIntent``, one per example. The
        two tag masks come back as ``num_mask`` lists of length ``max_seq_len``.
    """

    # Setting based on the current model type
    cls_token = tokenizer.cls_token
    sep_token = tokenizer.sep_token
    unk_token = tokenizer.unk_token
    pad_token_id = tokenizer.pad_token_id

    features = []
    for (ex_index, example) in enumerate(examples):
        if ex_index % 5000 == 0:
            logger.info("Writing example %d of %d" % (ex_index, len(examples)))

        # Tokenize word by word (for NER)
        tokens = []
        slot_labels_ids = []
        intent_tokens_ids = []
        B_tag_mask_list = []
        BI_tag_mask_list = []

        # The example stores the masks as (num_mask x num_words); transpose them to
        # one tuple per word so they can be expanded / padded together with the tokens.
        mask_rows = list(example.B_tag_mask)
        B_tag_mask = list(zip(*mask_rows))
        BI_tag_mask = list(zip(*example.BI_tag_mask))

        # The number of masks is the number of rows. Reading it from the rows
        # (instead of from the first transposed column) also works for an
        # example without words; previously that case left ``num_mask`` unset
        # or silently reused the value of the preceding example.
        num_mask = len(mask_rows)
        # All-zero mask entry used for word-piece continuations, [CLS], [SEP] and padding.
        zero_mask = tuple([0 for _ in range(num_mask)])

        for word, slot_label, intent_token, B_pos_mask, BI_pos_mask in zip(
                example.words,
                example.slot_labels,
                example.intent_tokens,
                B_tag_mask,
                BI_tag_mask,
            ):
            word_tokens = tokenizer.tokenize(word)
            if not word_tokens:
                word_tokens = [unk_token]  # For handling the bad-encoded word
            tokens.extend(word_tokens)
            #### IMPORTANT: This is the case mentioned in the paper ####
            # redbreast => red, ##bre, ##ast => we will only put the first one as the token
            # Use the real label id for the first token of the word, and padding ids for the remaining tokens
            extra_pieces = len(word_tokens) - 1
            slot_labels_ids.extend([int(slot_label)] + [pad_token_label_id] * extra_pieces)
            intent_tokens_ids.extend([int(intent_token)] + [pad_token_label_id] * extra_pieces)
            B_tag_mask_list.extend([B_pos_mask] + [zero_mask] * extra_pieces)
            BI_tag_mask_list.extend([BI_pos_mask] + [zero_mask] * extra_pieces)

        # Account for [CLS] and [SEP]
        special_tokens_count = 2
        # limit the maximum length, please note no padding yet
        if len(tokens) > max_seq_len - special_tokens_count:
            tokens = tokens[:(max_seq_len - special_tokens_count)]
            slot_labels_ids = slot_labels_ids[:(max_seq_len - special_tokens_count)]
            intent_tokens_ids = intent_tokens_ids[:(max_seq_len - special_tokens_count)]
            B_tag_mask_list = B_tag_mask_list[:(max_seq_len - special_tokens_count)]
            BI_tag_mask_list = BI_tag_mask_list[:(max_seq_len - special_tokens_count)]

        # Add [SEP] token
        # sequence_a_segment_id: 0
        tokens += [sep_token]
        slot_labels_ids += [pad_token_label_id]
        intent_tokens_ids += [pad_token_label_id]
        B_tag_mask_list += [zero_mask]
        BI_tag_mask_list += [zero_mask]
        token_type_ids = [sequence_a_segment_id] * len(tokens)

        # Add [CLS] token
        # cls_token_segment_id: 0
        tokens = [cls_token] + tokens
        slot_labels_ids = [pad_token_label_id] + slot_labels_ids
        intent_tokens_ids = [pad_token_label_id] + intent_tokens_ids
        B_tag_mask_list = [zero_mask] + B_tag_mask_list
        BI_tag_mask_list = [zero_mask] + BI_tag_mask_list
        token_type_ids = [cls_token_segment_id] + token_type_ids

        # convert tokens to ids
        input_ids = tokenizer.convert_tokens_to_ids(tokens)

        # The mask has 1 for real tokens and 0 for padding tokens. Only real
        # tokens are attended to.
        attention_mask = [1 if mask_padding_with_zero else 0] * len(input_ids)

        # Zero-pad up to the sequence length.
        padding_length = max_seq_len - len(input_ids)
        input_ids = input_ids + ([pad_token_id] * padding_length)
        attention_mask = attention_mask + ([0 if mask_padding_with_zero else 1] * padding_length)
        token_type_ids = token_type_ids + ([pad_token_segment_id] * padding_length)
        slot_labels_ids = slot_labels_ids + ([pad_token_label_id] * padding_length)
        intent_tokens_ids = intent_tokens_ids + ([pad_token_label_id] * padding_length)
        B_tag_mask_list = B_tag_mask_list + ([zero_mask] * padding_length)
        BI_tag_mask_list = BI_tag_mask_list + ([zero_mask] * padding_length)
        assert len(input_ids) == max_seq_len, "Error with input length {} vs {}".format(len(input_ids), max_seq_len)
        assert len(attention_mask) == max_seq_len, "Error with attention mask length {} vs {}".format(len(attention_mask), max_seq_len)
        assert len(token_type_ids) == max_seq_len, "Error with token type length {} vs {}".format(len(token_type_ids), max_seq_len)
        assert len(slot_labels_ids) == max_seq_len, "Error with slot labels length {} vs {}".format(len(slot_labels_ids), max_seq_len)
        assert len(intent_tokens_ids) == max_seq_len, "Error with intent tokens length {} vs {}".format(len(intent_tokens_ids), max_seq_len)
        assert len(B_tag_mask_list) == max_seq_len, "Error with B_tag_mask_list length {} vs {}".format(len(B_tag_mask_list), max_seq_len)
        assert len(BI_tag_mask_list) == max_seq_len, "Error with BI_tag_mask_list length {} vs {}".format(len(BI_tag_mask_list), max_seq_len)

        # for multi-intent process, it is a list of int
        intent_label_id = [int(i) for i in example.intent_label]
        tag_intent_label = [int(i) for i in example.tag_intent_label]

        # convert the B_tag_mask and BI_tag_mask back to (num_mask x max_seq_len) lists
        B_tag_mask_list = [list(row) for row in zip(*B_tag_mask_list)]
        BI_tag_mask_list = [list(row) for row in zip(*BI_tag_mask_list)]
        if ex_index < 5:
            logger.info("*** Example ***")
            logger.info("guid: %s" % example.guid)
            logger.info("tokens: %s" % " ".join([str(x) for x in tokens]))
            logger.info("input_ids: %s" % " ".join([str(x) for x in input_ids]))
            logger.info("attention_mask: %s" % " ".join([str(x) for x in attention_mask]))
            logger.info("token_type_ids: %s" % " ".join([str(x) for x in token_type_ids]))
            logger.info("intent_label: %s (id = %s)" % (" ".join([str(i) for i in example.intent_label]),\
                                                        " ".join([str(i) for i in intent_label_id])))
            logger.info("slot_labels: %s" % " ".join([str(x) for x in slot_labels_ids]))
            logger.info("intent_tokens: %s" % " ".join([str(x) for x in intent_tokens_ids]))
        features.append(
            InputFeaturesMultiIntent(input_ids=input_ids,
                          attention_mask=attention_mask,
                          token_type_ids=token_type_ids,
                          intent_label_id=intent_label_id,
                          slot_labels_ids=slot_labels_ids,
                          intent_tokens_ids=intent_tokens_ids,
                          B_tag_mask=B_tag_mask_list,
                          BI_tag_mask=BI_tag_mask_list,
                          tag_intent_label=tag_intent_label,
                          ))
    return features

In [None]:
def convert_examples_to_features_multi_Pro(examples, max_seq_len, tokenizer,
                                 pad_token_label_id=-100,
                                 cls_token_segment_id=0,
                                 pad_token_segment_id=0,
                                 sequence_a_segment_id=0,
                                 mask_padding_with_zero=True):
    """
    Convert multi-intent examples that carry pronoun labels into fixed-length features.

    Each word is tokenized separately. The first sub-token of a word receives the
    word's slot / pro / intent-token label and its B / BI mask column; the remaining
    sub-tokens, [CLS], [SEP] and padding positions receive ``pad_token_label_id``
    (labels) or an all-zero column (masks). Every sequence is truncated and padded
    to exactly ``max_seq_len`` positions.

    Args:
        examples: list of examples exposing ``guid``, ``words``, ``slot_labels``,
            ``pro_labels``, ``intent_tokens``, ``intent_label``, ``tag_intent_label``,
            ``B_tag_mask`` and ``BI_tag_mask``. The two masks are treated as lists of
            rows aligned with ``words`` (one row per mask).
        max_seq_len: total sequence length after adding [CLS]/[SEP] and padding.
        tokenizer: object providing ``tokenize``, ``convert_tokens_to_ids``,
            ``cls_token``, ``sep_token``, ``unk_token`` and ``pad_token_id``.
        pad_token_label_id: label id written at positions that carry no real label.
        cls_token_segment_id: segment id of the [CLS] position.
        pad_token_segment_id: segment id of padding positions.
        sequence_a_segment_id: segment id of the sentence tokens and [SEP].
        mask_padding_with_zero: if True the attention mask is 1 for real tokens and
            0 for padding; if False the values are swapped.
    Returns:
        features: list of ``InputFeaturesMultiIntent_Pro``, one per example.
    Raises:
        AssertionError: if any per-token sequence does not end up with length
            ``max_seq_len``.
    """

    # Setting based on the current model type
    cls_token = tokenizer.cls_token
    sep_token = tokenizer.sep_token
    unk_token = tokenizer.unk_token
    pad_token_id = tokenizer.pad_token_id

    # Account for [CLS] and [SEP]
    special_tokens_count = 2
    max_content_len = max_seq_len - special_tokens_count

    features = []
    for (ex_index, example) in enumerate(examples):
        if ex_index % 5000 == 0:
            logger.info("Writing example %d of %d" % (ex_index, len(examples)))

        # Transpose the masks from "one row per mask" to "one tuple per word" so they
        # can be expanded, truncated and padded in lock-step with the tokens.
        B_tag_mask = list(zip(*example.B_tag_mask))
        BI_tag_mask = list(zip(*example.BI_tag_mask))

        # The number of mask rows. Taken from the un-transposed mask so it is defined
        # even when the example has no words; whenever the transposed list is
        # non-empty this equals len(B_tag_mask[0]).
        num_mask = len(example.B_tag_mask)
        if not B_tag_mask:
            logger.warning(
                "Example %s has an empty tag mask (words=%s, slot_labels=%s, intent_tokens=%s)",
                example.guid, example.words, example.slot_labels, example.intent_tokens)
        # Mask column used for every position that is not the first sub-token of a word
        zero_mask = tuple([0] * num_mask)

        # Tokenize word by word (for NER)
        tokens = []
        slot_labels_ids = []
        pro_labels_ids = []
        intent_tokens_ids = []
        B_tag_mask_list = []
        BI_tag_mask_list = []
        for word, slot_label, intent_token, B_pos_mask, BI_pos_mask, pro_label in zip(
                example.words,
                example.slot_labels,
                example.intent_tokens,
                B_tag_mask,
                BI_tag_mask,
                example.pro_labels,
            ):
            word_tokens = tokenizer.tokenize(word)
            if not word_tokens:
                word_tokens = [unk_token]  # For handling the bad-encoded word
            tokens.extend(word_tokens)
            #### IMPORTANT: This is the case mentioned in the paper ####
            # redbreast => red, ##bre, ##ast => we will only put the first one as the token
            # Use the real label id for the first token of the word, and padding ids for the remaining tokens
            num_continuations = len(word_tokens) - 1
            slot_labels_ids.extend([int(slot_label)] + [pad_token_label_id] * num_continuations)
            pro_labels_ids.extend([int(pro_label)] + [pad_token_label_id] * num_continuations)
            intent_tokens_ids.extend([int(intent_token)] + [pad_token_label_id] * num_continuations)
            B_tag_mask_list.extend([B_pos_mask] + [zero_mask] * num_continuations)
            BI_tag_mask_list.extend([BI_pos_mask] + [zero_mask] * num_continuations)

        # limit the maximum length, please note no padding yet
        if len(tokens) > max_content_len:
            tokens = tokens[:max_content_len]
            slot_labels_ids = slot_labels_ids[:max_content_len]
            pro_labels_ids = pro_labels_ids[:max_content_len]
            intent_tokens_ids = intent_tokens_ids[:max_content_len]
            B_tag_mask_list = B_tag_mask_list[:max_content_len]
            BI_tag_mask_list = BI_tag_mask_list[:max_content_len]

        # Add [SEP] token
        # sequence_a_segment_id: 0
        tokens += [sep_token]
        slot_labels_ids += [pad_token_label_id]
        pro_labels_ids += [pad_token_label_id]
        intent_tokens_ids += [pad_token_label_id]
        B_tag_mask_list += [zero_mask]
        BI_tag_mask_list += [zero_mask]
        token_type_ids = [sequence_a_segment_id] * len(tokens)

        # Add [CLS] token
        # cls_token_segment_id: 0
        tokens = [cls_token] + tokens
        slot_labels_ids = [pad_token_label_id] + slot_labels_ids
        pro_labels_ids = [pad_token_label_id] + pro_labels_ids
        intent_tokens_ids = [pad_token_label_id] + intent_tokens_ids
        B_tag_mask_list = [zero_mask] + B_tag_mask_list
        BI_tag_mask_list = [zero_mask] + BI_tag_mask_list
        token_type_ids = [cls_token_segment_id] + token_type_ids

        # convert tokens to ids
        input_ids = tokenizer.convert_tokens_to_ids(tokens)

        # The mask has 1 for real tokens and 0 for padding tokens. Only real
        # tokens are attended to.
        attention_mask = [1 if mask_padding_with_zero else 0] * len(input_ids)

        # Zero-pad up to the sequence length.
        padding_length = max_seq_len - len(input_ids)
        input_ids = input_ids + ([pad_token_id] * padding_length)
        attention_mask = attention_mask + ([0 if mask_padding_with_zero else 1] * padding_length)
        token_type_ids = token_type_ids + ([pad_token_segment_id] * padding_length)
        slot_labels_ids = slot_labels_ids + ([pad_token_label_id] * padding_length)
        pro_labels_ids = pro_labels_ids + ([pad_token_label_id] * padding_length)
        intent_tokens_ids = intent_tokens_ids + ([pad_token_label_id] * padding_length)
        B_tag_mask_list = B_tag_mask_list + ([zero_mask] * padding_length)
        BI_tag_mask_list = BI_tag_mask_list + ([zero_mask] * padding_length)
        assert len(input_ids) == max_seq_len, "Error with input length {} vs {}".format(len(input_ids), max_seq_len)
        assert len(attention_mask) == max_seq_len, "Error with attention mask length {} vs {}".format(len(attention_mask), max_seq_len)
        assert len(token_type_ids) == max_seq_len, "Error with token type length {} vs {}".format(len(token_type_ids), max_seq_len)
        assert len(slot_labels_ids) == max_seq_len, "Error with slot labels length {} vs {}".format(len(slot_labels_ids), max_seq_len)
        assert len(pro_labels_ids) == max_seq_len, "Error with pro labels length {} vs {}".format(len(pro_labels_ids), max_seq_len)
        assert len(intent_tokens_ids) == max_seq_len, "Error with intent tokens length {} vs {}".format(len(intent_tokens_ids), max_seq_len)
        assert len(B_tag_mask_list) == max_seq_len, "Error with B_tag_mask_list length {} vs {}".format(len(B_tag_mask_list), max_seq_len)
        assert len(BI_tag_mask_list) == max_seq_len, "Error with BI_tag_mask_list length {} vs {}".format(len(BI_tag_mask_list), max_seq_len)

        # for multi-intent process, it is a list of int
        intent_label_id = [int(i) for i in example.intent_label]
        tag_intent_label = [int(i) for i in example.tag_intent_label]

        # convert the B_tag_mask and BI_tag_mask back to one row (list) per mask
        B_tag_mask_list = [list(row) for row in zip(*B_tag_mask_list)]
        BI_tag_mask_list = [list(row) for row in zip(*BI_tag_mask_list)]

        # Log the first few converted examples for inspection
        if ex_index < 5:
            logger.info("*** Example ***")
            logger.info("guid: %s" % example.guid)
            logger.info("tokens: %s" % " ".join([str(x) for x in tokens]))
            logger.info("input_ids: %s" % " ".join([str(x) for x in input_ids]))
            logger.info("attention_mask: %s" % " ".join([str(x) for x in attention_mask]))
            logger.info("token_type_ids: %s" % " ".join([str(x) for x in token_type_ids]))
            logger.info("intent_label: %s (id = %s)" % (" ".join([str(i) for i in example.intent_label]),\
                                                        " ".join([str(i) for i in intent_label_id])))
            logger.info("slot_labels: %s" % " ".join([str(x) for x in slot_labels_ids]))
            logger.info("pro_labels_ids: %s" % " ".join([str(x) for x in pro_labels_ids]))
            logger.info("intent_tokens: %s" % " ".join([str(x) for x in intent_tokens_ids]))

        features.append(
            InputFeaturesMultiIntent_Pro(input_ids=input_ids,
                          attention_mask=attention_mask,
                          token_type_ids=token_type_ids,
                          intent_label_id=intent_label_id,
                          slot_labels_ids=slot_labels_ids,
                          intent_tokens_ids=intent_tokens_ids,
                          B_tag_mask=B_tag_mask_list,
                          BI_tag_mask=BI_tag_mask_list,
                          tag_intent_label=tag_intent_label,
                          pro_labels_ids = pro_labels_ids,
                          ))
    return features

In [None]:
def load_and_cache_examples(args, tokenizer, mode):
    """
    Load (or build and cache) the features for one data split and wrap them in a
    TensorDataset.

    Features are read from a cache file under ``args.data_dir`` when it exists and
    carries every attribute the current mode needs; otherwise they are rebuilt from
    the dataset files and the cache file is (re)written.

    Args:
        args: parsed options; reads ``data_dir``, ``task``, ``model_name_or_path``,
            ``max_seq_len``, ``num_mask``, ``ignore_index``, ``pro`` and ``multi_intent``.
        tokenizer: tokenizer forwarded to the feature converters.
        mode: train/dev/test

    Return:
        dataset: TensorDataset whose tensors are, in order: input ids, attention
            mask, token type ids, intent labels, slot labels; in multi-intent mode
            additionally intent tokens, B tag mask, BI tag mask and tag intent labels;
            in pro mode additionally the pro labels as the last tensor.

    Raises:
        Exception: if ``mode`` is not one of train, dev, test.
    """
    if mode not in ("train", "dev", "test"):
        raise Exception("For mode, Only train, dev, test is available")

    # Load data features from cache or dataset file
    cached_features_file = os.path.join(
        args.data_dir,
        'cached_{}_{}_{}_{}_{}'.format(
            mode,
            args.task,
            list(filter(None, args.model_name_or_path.split("/"))).pop(),
            args.max_seq_len,
            args.num_mask,
        )
    )

    # Attributes the tensor construction below reads from every feature. The cache
    # file name does not encode args.pro / args.multi_intent, so a file written under
    # another mode (or by an older feature class) can be picked up here; validating
    # the attributes lets us rebuild instead of failing with an AttributeError.
    required_attrs = ["input_ids", "attention_mask", "token_type_ids",
                      "intent_label_id", "slot_labels_ids"]
    if args.pro or args.multi_intent:
        required_attrs += ["intent_tokens_ids", "B_tag_mask", "BI_tag_mask", "tag_intent_label"]
    if args.pro:
        required_attrs.append("pro_labels_ids")

    features = None
    # try to load from the cached data first
    if os.path.exists(cached_features_file):
        logger.info("Loading features from cached file %s", cached_features_file)
        # NOTE: torch.load unpickles arbitrary Python objects; only point data_dir at
        # cache files written by this function.
        features = torch.load(cached_features_file)
        missing = sorted({attr for feature in features
                          for attr in required_attrs if not hasattr(feature, attr)})
        if missing:
            logger.warning("Cached features in %s lack attribute(s) %s; rebuilding the cache",
                           cached_features_file, ", ".join(missing))
            features = None

    if features is None:
        # Load data features from dataset file
        logger.info("Creating features from dataset file at %s", args.data_dir)
        # The processor is only needed when features have to be (re)built.
        processor = JointProcessorMultiIntent_Pro(args)#processors[args.task](args)
        examples = processor.get_examples(mode)

        # Use cross entropy ignore index as padding label id so that only real label ids contribute to the loss later
        pad_token_label_id = args.ignore_index
        if args.pro:
            features = convert_examples_to_features_multi_Pro(examples, args.max_seq_len, tokenizer,
                                                        pad_token_label_id=pad_token_label_id)
        elif args.multi_intent:
            features = convert_examples_to_features_multi(examples, args.max_seq_len, tokenizer,
                                                        pad_token_label_id=pad_token_label_id)
        else:
            features = convert_examples_to_features(examples, args.max_seq_len, tokenizer,
                                                   pad_token_label_id=pad_token_label_id)
        logger.info("Saving features into cached file %s", cached_features_file)
        torch.save(features, cached_features_file)

    def _as_tensor(attr, dtype):
        # Stack one attribute of every feature into a single tensor
        return torch.tensor([getattr(f, attr) for f in features], dtype=dtype)

    # Convert to Tensors and build dataset
    tensors = [
        _as_tensor("input_ids", torch.long),
        _as_tensor("attention_mask", torch.long),
        _as_tensor("token_type_ids", torch.long),
    ]
    if args.pro or args.multi_intent:
        # as the intent has been transfer to multiple intent
        # we have to transfer the intent to a list of binary (hence float dtype)
        tensors += [
            _as_tensor("intent_label_id", torch.float),
            _as_tensor("slot_labels_ids", torch.long),
            _as_tensor("intent_tokens_ids", torch.long),
            _as_tensor("B_tag_mask", torch.long),
            _as_tensor("BI_tag_mask", torch.float),
            _as_tensor("tag_intent_label", torch.long),
        ]
        if args.pro:
            # pro labels always come last so the first nine tensors match multi-intent mode
            tensors.append(_as_tensor("pro_labels_ids", torch.long))
    else:
        tensors += [
            _as_tensor("intent_label_id", torch.long),
            _as_tensor("slot_labels_ids", torch.long),
        ]
    return TensorDataset(*tensors)

In [None]:
import argparse
import random
from datetime import datetime
import time
import argparse
from utils import init_logger, load_tokenizer, read_prediction_text, set_seed, MODEL_CLASSES, MODEL_PATH_MAP



if __name__ == '__main__':
    # Random start-up delay of up to 10 seconds.
    # NOTE(review): presumably meant to stagger parallel launches — confirm it is still needed in the notebook.
    time_wait = random.uniform(0, 10)
    time.sleep(time_wait)

    # Command-line style configuration; in the notebook every option falls back to its default.
    parser = argparse.ArgumentParser()

    # --- task, paths and model selection ---
    parser.add_argument("--task", default='gpsr_pro', type=str, help="The name of the task to train")
    # model_dir was previously declared with required=True; it is optional here so the defaults work in a notebook
    parser.add_argument("--model_dir", default='./gpsr_model', type=str, help="Path to save, load model")
    parser.add_argument("--data_dir", default="./data", type=str, help="The input data dir")
    parser.add_argument("--intent_label_file", default="intent_label.txt", type=str, help="Intent Label file")
    parser.add_argument("--slot_label_file", default="slot_label.txt", type=str, help="Slot Label file")
    parser.add_argument("--model_type", default="multibert", type=str, help="Model type selected in the list: " + ", ".join(MODEL_CLASSES.keys()))

    # --- model variant switches (int flags, 0/1) ---
    parser.add_argument("--intent_seq", type=int, default=1, help="whether we use intent seq setting")  # previous default: 0
    parser.add_argument("--pro", type=int, default=1, help="support pronoun disambiguation")
    parser.add_argument("--multi_intent", type=int, default=1, help="whether we use multi intent setting")
    parser.add_argument("--tag_intent", type=int, default=1, help="whether we can use tag to predict intent")
    parser.add_argument("--BI_tag", type=int, default=1, help='use BI sum or just B')
    parser.add_argument("--cls_token_cat", type=int, default=1, help='whether we cat the cls to the slot output of bert')
    parser.add_argument("--intent_attn", type=int, default=1, help='whether we use attention mechanism on the CLS intent output')
    # max slot num = 7
    parser.add_argument("--num_mask", type=int, default=7, help="assumptive number of slot in one sentence")

    # --- training hyper-parameters ---
    parser.add_argument('--seed', type=int, default=25, help="random seed for initialization")
    parser.add_argument("--train_batch_size", default=64, type=int, help="Batch size for training.")
    parser.add_argument("--eval_batch_size", default=128, type=int, help="Batch size for evaluation.")
    parser.add_argument("--max_seq_len", default=35, type=int, help="The maximum total input sequence length after tokenization.")
    parser.add_argument("--learning_rate", default=5e-5, type=float, help="The initial learning rate for Adam.")
    parser.add_argument("--num_train_epochs", default=3.0, type=float, help="Total number of training epochs to perform.")  # previous default: 10.0
    parser.add_argument("--weight_decay", default=0.0, type=float, help="Weight decay if we apply some.")
    parser.add_argument('--gradient_accumulation_steps', type=int, default=1,
                        help="Number of updates steps to accumulate before performing a backward/update pass.")
    parser.add_argument("--adam_epsilon", default=1e-8, type=float, help="Epsilon for Adam optimizer.")
    parser.add_argument("--max_grad_norm", default=1, type=float, help="Max gradient norm.")
    parser.add_argument("--max_steps", default=-1, type=int, help="If > 0: set total number of training steps to perform. Override num_train_epochs.")
    parser.add_argument("--warmup_steps", default=0, type=int, help="Linear warmup over warmup_steps.")
    parser.add_argument("--dropout_rate", default=0.1, type=float, help="Dropout for fully-connected layers")
    parser.add_argument('--logging_steps', type=int, default=500, help="Log every X updates steps.")
    parser.add_argument('--save_steps', type=int, default=300, help="Save checkpoint every X updates steps.")
    parser.add_argument("--do_train", action="store_true", help="Whether to run training.")
    parser.add_argument("--do_eval", action="store_true", help="Whether to run eval on the test set.")
    parser.add_argument("--no_cuda", action="store_true", help="Avoid using CUDA when available")
    parser.add_argument("--ignore_index", default=0, type=int,
                        help='Specifies a target value that is ignored and does not contribute to the input gradient')
    parser.add_argument('--slot_loss_coef', type=float, default=2.0, help='Coefficient for the slot loss.')
    parser.add_argument('--tag_intent_coef', type=float, default=1.0, help='Coefficient for the tag intent loss')

    # CRF option
    parser.add_argument("--use_crf", action="store_true", help="Whether to use CRF")
    parser.add_argument("--slot_pad_label", default="PAD", type=str, help="Pad token for slot label pad (to be ignore when calculate loss)")
    # NOTE(review): the help text here used to be a copy of --learning_rate's;
    # presumably this is an early-stopping patience — confirm against the trainer.
    parser.add_argument("--patience", default=0, type=int, help="Patience value (default: 0).")

    # Accept a bare `-f <value>` option so parse_args() does not reject it.
    # NOTE(review): presumably absorbs the `-f <connection file>` argument a Jupyter kernel is started with — confirm.
    parser.add_argument('-f')

    args = parser.parse_args()

    # Resolve the pretrained model name/path from the selected model type
    args.model_name_or_path = MODEL_PATH_MAP[args.model_type]
    
    #processors['gpsr_pro'](args)

In [None]:
# Manual check of the pro feature conversion, bypassing the feature cache:
# build the 'gpsr_pro' processor, read the training examples, convert them to
# features with a freshly loaded tokenizer, and display the pro label ids of
# the first feature.
p = processors['gpsr_pro'](args)
examples = p.get_examples("train")
# args.ignore_index is used as the label id for positions without a real label
features = convert_examples_to_features_multi_Pro(examples, args.max_seq_len, load_tokenizer(args),
                                                        pad_token_label_id=args.ignore_index)
features[0].pro_labels_ids

pro_labels_ids:  [0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]
[0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]
pro_labels_ids:  [0, 1, 1, 1, 1, 1, 1, 0, 1, 1, 0, 1, 1, 1, 0, 1, 1, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]
[0, 1, 1, 1, 1, 1, 1, 0, 1, 1, 0, 1, 1, 1, 0, 1, 1, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]
pro_labels_ids:  [0, 1, 1, 1, 1, 0, 1, 0, 1, 1, 0, 1, 1, 1, 0, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]
[0, 1, 1, 1, 1, 0, 1, 0, 1, 1, 0, 1, 1, 1, 0, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]
pro_labels_ids:  [0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]
[0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]
pro_labels_ids:  [0, 1, 1, 0, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 

[0,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0]

In [None]:
# Build the training TensorDataset through the caching loader and display its second sample.
# NOTE(review): the recorded output of this cell is an AttributeError for a missing
# `pro_labels_ids`; load_and_cache_examples reads an existing cache file when one is
# present, so a stale cache file is a possible cause — confirm.
tokenizer = load_tokenizer(args)
train_dataset = load_and_cache_examples(args, tokenizer, mode="train")
train_dataset[1]


processor:  <__main__.JointProcessorMultiIntent_Pro object at 0x7f3d7dfabd00>


AttributeError: 'InputFeaturesMultiIntent_Pro' object has no attribute 'pro_labels_ids'

In [None]:
# slot = ['O', 'O', 'O', 'B-obj', 'I-obj', 'I-obj', 'O', 'B-dest', 'I-dest', 'I-dest', 'I-dest', 'I-dest', 'O', 'O', 'B-dest', 'I-dest']
# entities = get_entities(slot)
# print(entities)
#
# MAX_SLOT = 7
# if len(entities) > MAX_SLOT:
#     entities = entities[:MAX_SLOT]
#
# # 5. B tag mask: B * M * L
# # BI tag mask: B * M * L
# # tag intent label: B * M
# B_tag_mask = [[0 for _ in slot] for utter in range(MAX_SLOT)]
# BI_tag_mask = [[0 for _ in slot] for utter in range(MAX_SLOT)]
# tag_intent_label = [0 for _ in range(MAX_SLOT)] #[self.intent_labels.index("PAD") for _ in range(MAX_SLOT)]
#
# for idx, tag in enumerate(entities):
#     B_tag_mask[idx][tag[1]] = 1
#     BI_tag_mask[idx][tag[1]:tag[2]+1] = [1./(tag[2]-tag[1]+1)] * (tag[2]-tag[1]+1)
#     # BI_tag_mask[idx][tag[1]:tag[2]+1] = [1] * (tag[2]-tag[1]+1)
#
#     intent_token_list = ['O', 'O', 'O', 'take', 'take', 'take', 'O', 'take', 'take', 'take', 'take', 'take', 'O', 'O', 'take', 'take']
#     tag_intent_label[idx] = intent_token_list[tag[1]]
#     assert tag_intent_label[idx] != 1, 'The intent tagged is UNK or O!' #self.intent_labels.index("O"), 'The intent tagged is UNK or O!'
#
#
# # print('words \n',words,'\n \n')
# print('entities \n',entities,'\n \n')
# print('B_tag_mask \n',B_tag_mask,'\n \n')
# print('BI_tag_mask \n',BI_tag_mask,'\n \n')
# print('tag_intent_label \n',tag_intent_label,'\n \n')