In [None]:
!pip install transformers

Collecting transformers
[?25l  Downloading https://files.pythonhosted.org/packages/88/b1/41130a228dd656a1a31ba281598a968320283f48d42782845f6ba567f00b/transformers-4.2.2-py3-none-any.whl (1.8MB)
[K     |▏                               | 10kB 21.4MB/s eta 0:00:01[K     |▍                               | 20kB 28.6MB/s eta 0:00:01[K     |▋                               | 30kB 21.1MB/s eta 0:00:01[K     |▊                               | 40kB 19.8MB/s eta 0:00:01[K     |█                               | 51kB 20.9MB/s eta 0:00:01[K     |█▏                              | 61kB 15.7MB/s eta 0:00:01[K     |█▎                              | 71kB 16.2MB/s eta 0:00:01[K     |█▌                              | 81kB 16.9MB/s eta 0:00:01[K     |█▊                              | 92kB 15.0MB/s eta 0:00:01[K     |█▉                              | 102kB 16.1MB/s eta 0:00:01[K     |██                              | 112kB 16.1MB/s eta 0:00:01[K     |██▎                             | 

In [None]:
import json
import os

class RawData:
    """One annotated utterance as read from the training JSON file.

    Attributes are stored exactly as passed in: ``id``, ``intent``,
    ``positions``, ``slots`` and ``text``.
    """

    def __init__(self, id, intent, positions, slots, text):
        # Assignment order fixes the key order of the JSON produced by __repr__.
        self.id = id
        self.intent = intent
        self.positions = positions
        self.slots = slots
        self.text = text

    def __repr__(self):
        """Return the instance attributes as pretty-printed JSON."""
        return json.dumps(vars(self), indent=2)


"""
Reads the JSON training data file and
returns a list of RawData objects.
"""


def read_train_json_file(filename):
    """Load the training examples stored in a JSON file.

    The file is expected to hold one object whose keys are example ids and
    whose values carry the fields "intent", "positions", "slots" and "text".

    Args:
        filename: path of the JSON file to read.

    Returns:
        list: one RawData per entry, in the order the entries appear.

    Raises:
        FileNotFoundError: if ``filename`` does not exist.
    """
    if not os.path.exists(filename):
        raise FileNotFoundError("No file found with that path!")

    with open(filename, "r", encoding="utf-8") as json_file:
        data = json.load(json_file)

    return [
        RawData(key, entry["intent"], entry["positions"], entry["slots"], entry["text"])
        for key, entry in data.items()
    ]

# read from json file
train_data = read_train_json_file("train.json")

In [None]:
example = train_data[0]
example

{
  "id": "0",
  "intent": "AddToPlaylist",
  "positions": {
    "music_item": [
      6,
      9
    ],
    "playlist_owner": [
      14,
      15
    ],
    "playlist": [
      17,
      32
    ]
  },
  "slots": {
    "music_item": "tune",
    "playlist_owner": "my",
    "playlist": "elrow Guest List"
  },
  "text": "Add a tune to my elrow Guest List"
}

In [None]:
# encode this example

text = example.text
slots = example.slots
intent = example.intent

In [None]:
import numpy as np

# encode text
# NOTE(review): this notebook-level max_len is not read by any cell shown
# here; encode_slots() takes its own max_len parameter (default 43).
# Confirm which value is intended, or remove this one.
max_len = 45



In [None]:
import tensorflow as tf
from transformers import AutoModel, AutoTokenizer, BertTokenizer
# Checkpoint name used for the tokenizer here; it is also the default
# checkpoint of JointIntentAndSlotFillingModel, so both stay in sync.
model_name = "bert-base-cased"
tokenizer = AutoTokenizer.from_pretrained(model_name)

HBox(children=(FloatProgress(value=0.0, description='Downloading', max=433.0, style=ProgressStyle(description_…




HBox(children=(FloatProgress(value=0.0, description='Downloading', max=213450.0, style=ProgressStyle(descripti…




HBox(children=(FloatProgress(value=0.0, description='Downloading', max=435797.0, style=ProgressStyle(descripti…




In [None]:
td = tokenizer(text)
td

{'input_ids': [101, 24930, 1181, 170, 9253, 1106, 1139, 8468, 7596, 12044, 5619, 102], 'token_type_ids': [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0], 'attention_mask': [1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1]}

In [None]:
tokenizer.decode(td["input_ids"])

'[CLS] Add a tune to my elrow Guest List [SEP]'

In [None]:
# https://huggingface.co/transformers/preprocessing.html

def encode_texts(tokenizer, texts):
    """Tokenize a batch of texts into TensorFlow tensors.

    Args:
        tokenizer: callable tokenizer; invoked with padding and truncation
            enabled and ``return_tensors="tf"``.
        texts: the texts to encode.

    Returns:
        Whatever the tokenizer returns for the batch.
    """
    encoding = tokenizer(texts, padding=True, truncation=True, return_tensors="tf")
    return encoding

texts = [d.text for d in train_data]
tds = encode_texts(tokenizer, texts)
tds.keys()

dict_keys(['input_ids', 'token_type_ids', 'attention_mask'])

In [None]:
encoded_texts = tds

In [None]:
# encode labels
intents = [d.intent for d in train_data]
# Sorted so that the intent -> index assignment is identical on every kernel
# start. Iterating a set of strings has no stable order across Python
# processes, which would silently change the label ids between runs.
unique_intents = sorted(set(intents))
unique_intents

['PlayMusic',
 'BookRestaurant',
 'RateBook',
 'GetWeather',
 'SearchCreativeWork',
 'AddToPlaylist',
 'SearchScreeningEvent']

In [None]:
# intent name -> integer label (the position of the name in unique_intents)
intent_dict = {name: label for label, name in enumerate(unique_intents)}
intent_dict

{'AddToPlaylist': 5,
 'BookRestaurant': 1,
 'GetWeather': 3,
 'PlayMusic': 0,
 'RateBook': 2,
 'SearchCreativeWork': 4,
 'SearchScreeningEvent': 6}

In [None]:
# map to train_data values
def encode_intents(intents, intent_dict):
    """Convert intent names into an int32 tensor of label ids.

    Args:
        intents: iterable of intent names.
        intent_dict: mapping from intent name to integer label.

    Returns:
        A tensor built by ``tf.convert_to_tensor`` with dtype "int32",
        holding one label per entry of ``intents``.

    Raises:
        KeyError: if a name is missing from ``intent_dict``.
    """
    label_ids = [intent_dict[name] for name in intents]
    return tf.convert_to_tensor(label_ids, dtype="int32")

encoded_intents = encode_intents(intents, intent_dict)

In [None]:
# test intent classification
from transformers import TFBertModel
from tensorflow.keras.layers import Dropout, Dense, GlobalAveragePooling1D
from tensorflow.keras.optimizers import Adam
from tensorflow.keras.losses import SparseCategoricalCrossentropy
from tensorflow.keras.metrics import SparseCategoricalAccuracy

In [None]:
tds.keys()

dict_keys(['input_ids', 'token_type_ids', 'attention_mask'])

In [None]:
import calendar
import time

def get_time_stamp():
    """Return the current UTC time as integer seconds since the epoch."""
    return calendar.timegm(time.gmtime())

get_time_stamp()

1611906001

In [None]:
# encode slots
# Index 0 is reserved for "nil" (token belongs to no slot). The remaining slot
# names are sorted so that the slot -> index assignment is reproducible across
# kernel restarts; iterating a set of strings has no stable order between
# Python processes. Using a comprehension also avoids rebinding the
# notebook-level names `td` and `slots`.
unique_slots = ["nil"] + sorted(
    {slot for sample in train_data for slot in sample.slots}
)
unique_slots

['nil',
 'spatial_relation',
 'city',
 'location_name',
 'timeRange',
 'movie_type',
 'object_part_of_series_type',
 'playlist',
 'year',
 'poi',
 'country',
 'geographic_poi',
 'current_location',
 'artist',
 'best_rating',
 'party_size_number',
 'movie_name',
 'served_dish',
 'restaurant_name',
 'sort',
 'rating_unit',
 'track',
 'condition_description',
 'state',
 'cuisine',
 'object_name',
 'party_size_description',
 'facility',
 'rating_value',
 'object_location_type',
 'music_item',
 'entity_name',
 'object_type',
 'restaurant_type',
 'object_select',
 'album',
 'playlist_owner',
 'genre',
 'condition_temperature',
 'service']

In [None]:
# slot name -> integer label (the position of the name in unique_slots)
slot_map = {slot: label for label, slot in enumerate(unique_slots)}
slot_map

{'album': 35,
 'artist': 13,
 'best_rating': 14,
 'city': 2,
 'condition_description': 22,
 'condition_temperature': 38,
 'country': 10,
 'cuisine': 24,
 'current_location': 12,
 'entity_name': 31,
 'facility': 27,
 'genre': 37,
 'geographic_poi': 11,
 'location_name': 3,
 'movie_name': 16,
 'movie_type': 5,
 'music_item': 30,
 'nil': 0,
 'object_location_type': 29,
 'object_name': 25,
 'object_part_of_series_type': 6,
 'object_select': 34,
 'object_type': 32,
 'party_size_description': 26,
 'party_size_number': 15,
 'playlist': 7,
 'playlist_owner': 36,
 'poi': 9,
 'rating_unit': 20,
 'rating_value': 28,
 'restaurant_name': 18,
 'restaurant_type': 33,
 'served_dish': 17,
 'service': 39,
 'sort': 19,
 'spatial_relation': 1,
 'state': 23,
 'timeRange': 4,
 'track': 21,
 'year': 8}

In [None]:
def get_slot_from_word(word, slot_dict):
    """Find the slot whose value contains ``word`` as a whitespace-separated token.

    Args:
        word: the word to look up.
        slot_dict: mapping from slot label to slot value (a string).

    Returns:
        The label of the first slot, in dict order, whose value contains
        ``word``, or None when no slot value contains it.
    """
    matches = (
        label for label, value in slot_dict.items() if word in value.split()
    )
    return next(matches, None)

print(train_data[0].text)
print(train_data[0].slots)
print("slot_name for my is : ",get_slot_from_word("my", train_data[0].slots))

Add a tune to my elrow Guest List
{'music_item': 'tune', 'playlist_owner': 'my', 'playlist': 'elrow Guest List'}
slot_name for my is :  playlist_owner


In [None]:
def encode_slots(all_slots, all_texts, 
                 toknizer, slot_map, max_len=43):
    """Build one row of slot label ids per text, one label per wordpiece.

    Each text is split on whitespace. Every word is looked up in that
    text's slot dict with ``get_slot_from_word`` and the resulting label is
    repeated once for every wordpiece the tokenizer produces for the word.
    This keeps label ``k`` of a row describing the ``k``-th wordpiece of the
    text even when words that belong to no slot are split into several pieces.

    Column 0 of every row is left at 0 for the leading special token, and the
    unused tail of the row also stays 0.

    Args:
        all_slots: per-text dicts mapping slot label -> slot value.
        all_texts: the raw texts, parallel to ``all_slots``.
        toknizer: tokenizer object exposing ``tokenize(word)``. The parameter
            name is kept as-is for backward compatibility.
        slot_map: mapping from slot label to integer id.
        max_len: number of columns of the returned matrix.

    Returns:
        numpy.ndarray: int32 array of shape ``(len(all_texts), max_len)``.
    """
    encoded_slots = np.zeros(shape=(len(all_texts), max_len), dtype=np.int32)

    for idx, text in enumerate(all_texts):
        # slot label -> slot value for this text
        slot_names = all_slots[idx]

        enc = []  # label ids for this text, one per wordpiece
        for rt in text.split():
            # Use the tokenizer that was passed in (not a global) to learn
            # how many wordpieces this word occupies.
            n_pieces = len(toknizer.tokenize(rt))

            rt_slot_name = get_slot_from_word(rt, slot_names)
            # 0 marks a word that is not associated with any slot.
            label = slot_map[rt_slot_name] if rt_slot_name is not None else 0

            # Every wordpiece of the word gets the word's label, slot or not,
            # so later labels are not shifted by split words.
            enc.extend([label] * n_pieces)

        # Column 0 is skipped, so at most max_len - 1 labels fit; drop any
        # overflow rather than failing with a shape-mismatch error.
        enc = enc[:max_len - 1]
        encoded_slots[idx, 1:len(enc) + 1] = enc

    return encoded_slots
    

In [None]:
all_slots = [td.slots for td in train_data]
all_texts = [td.text for td in train_data]

In [None]:
# Size the label matrix from the padded input ids so both always share the
# same sequence length, instead of relying on encode_slots' default max_len.
encoded_slots = encode_slots(all_slots, all_texts, tokenizer, slot_map,
                             max_len=int(encoded_texts["input_ids"].shape[1]))

In [None]:
encoded_slots[0]

array([ 0,  0,  0, 30,  0, 36,  7,  7,  7,  7,  0,  0,  0,  0,  0,  0,  0,
        0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,
        0,  0,  0,  0,  0,  0,  0,  0,  0], dtype=int32)

In [None]:
class JointIntentAndSlotFillingModel(tf.keras.Model):
    """BERT encoder with two linear heads: per-token slots and per-text intent.

    Args:
        intent_num_labels: output size of the intent head.
        slot_num_labels: output size of the slot head.
        model_name: checkpoint passed to ``TFBertModel.from_pretrained``.
        dropout_prob: rate of the dropout layer applied before both heads.
    """

    def __init__(self, intent_num_labels=None, slot_num_labels=None,
                 model_name=model_name, dropout_prob=0.1):
        super().__init__(name="joint_intent_slot")
        self.bert = TFBertModel.from_pretrained(model_name)
        # A single dropout layer is shared by both heads.
        self.dropout = Dropout(dropout_prob)
        # Neither head has an activation: both emit raw logits.
        self.intent_classifier = Dense(intent_num_labels, name="intent_classifier")
        self.slot_classifier = Dense(slot_num_labels, name="slot_classifier")

    def call(self, inputs, **kwargs):
        """Run the encoder and both heads.

        Args:
            inputs: model inputs, forwarded unchanged to the BERT layer.
            **kwargs: forwarded to the BERT layer; ``training`` (default
                False) also controls the dropout layer.

        Returns:
            tuple: ``(slot_logits, intent_logits)``.
        """
        is_training = kwargs.get("training", False)
        bert_outputs = self.bert(inputs, **kwargs)

        # Per-token hidden states, shape (batch_size, max_length, output_dim),
        # feed the slot head.
        token_states = self.dropout(bert_outputs.last_hidden_state,
                                    training=is_training)
        slot_logits = self.slot_classifier(token_states)

        # Pooled "[CLS]" representation, shape (batch_size, output_dim),
        # feeds the intent head.
        sentence_state = self.dropout(bert_outputs.pooler_output,
                                      training=is_training)
        intent_logits = self.intent_classifier(sentence_state)

        return slot_logits, intent_logits

In [None]:
joint_model = JointIntentAndSlotFillingModel(
    intent_num_labels=len(intent_dict), slot_num_labels=len(slot_map))

HBox(children=(FloatProgress(value=0.0, description='Downloading', max=526681800.0, style=ProgressStyle(descri…




Some layers from the model checkpoint at bert-base-cased were not used when initializing TFBertModel: ['mlm___cls', 'nsp___cls']
- This IS expected if you are initializing TFBertModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing TFBertModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
All the layers of TFBertModel were initialized from the model checkpoint at bert-base-cased.
If your task is similar to the task the model of the checkpoint was trained on, you can already use TFBertModel for predictions without further training.


In [None]:
opt = Adam(learning_rate=3e-5, epsilon=1e-08)
# One loss per model output, in the order call() returns them:
# slot logits first, then intent logits. from_logits=True because the
# classifier heads are plain Dense layers without an activation.
losses = [SparseCategoricalCrossentropy(from_logits=True),
          SparseCategoricalCrossentropy(from_logits=True)]
metrics = [SparseCategoricalAccuracy("accuracy")]
joint_model.compile(optimizer=opt, loss=losses, metrics=metrics)

In [None]:
# Feed the three tokenizer outputs to the model by name.
x = {
    key: encoded_texts[key]
    for key in ("input_ids", "token_type_ids", "attention_mask")
}

# Targets are ordered like the model outputs: slot labels, then intent labels.
history = joint_model.fit(
    x,
    (encoded_slots, encoded_intents),
    epochs=2,
    batch_size=32,
    shuffle=True,
)

Epoch 1/2
Please report this to the TensorFlow team. When filing the bug, set the verbosity to 10 (on Linux, `export AUTOGRAPH_VERBOSITY=10`) and attach the full output.
Cause: <cyfunction Socket.send at 0x7fa5b5319110> is not a module, class, method, function, traceback, frame, or code object
Please report this to the TensorFlow team. When filing the bug, set the verbosity to 10 (on Linux, `export AUTOGRAPH_VERBOSITY=10`) and attach the full output.
Cause: <cyfunction Socket.send at 0x7fa5b5319110> is not a module, class, method, function, traceback, frame, or code object


The parameters `output_attentions`, `output_hidden_states` and `use_cache` cannot be updated when calling a model.They have to be set to True/False in the config object (i.e.: `config=XConfig.from_pretrained('name', output_attentions=True)`).


Cause: while/else statement not yet supported


The parameter `return_dict` cannot be set in graph mode and will always be set to `True`.


Cause: while/else statement not yet supported


The parameters `output_attentions`, `output_hidden_states` and `use_cache` cannot be updated when calling a model.They have to be set to True/False in the config object (i.e.: `config=XConfig.from_pretrained('name', output_attentions=True)`).
The parameter `return_dict` cannot be set in graph mode and will always be set to `True`.


Epoch 2/2


In [None]:
def nlu(text, tokenizer, model, intent_names, slot_names):
    """Predict the intent and the slot values of a single utterance.

    Args:
        text: the utterance to analyse.
        tokenizer: tokenizer providing ``encode``, ``convert_ids_to_tokens``
            and ``convert_tokens_to_string``.
        model: callable returning ``(slot_logits, intent_logits)`` for a
            batch of input ids.
        intent_names: sequence mapping intent index -> intent name.
        slot_names: sequence mapping slot index -> slot name; the name
            "nil" marks positions that belong to no slot.

    Returns:
        dict: ``{"intent": <intent name>, "slots": {<slot name>: <text>}}``.
        Only slots with at least one predicted token are present, in the
        order they first occur in the text.
    """
    input_ids = tokenizer.encode(text)
    inputs = tf.constant(input_ids)[None, :]  # batch_size = 1
    slot_logits, intent_logits = model(inputs)

    slot_ids = slot_logits.numpy().argmax(axis=-1)[0, :]
    intent_id = intent_logits.numpy().argmax(axis=-1)[0]

    info = {"intent": intent_names[intent_id], "slots": {}}

    # Recover the tokens from the very ids that were fed to the model, so
    # tokens[pos] and slot_ids[pos] always describe the same position. This
    # holds regardless of how the text is capitalised and also works for
    # empty text.
    tokens = tokenizer.convert_ids_to_tokens(input_ids)

    slot_tokens = {}  # slot name -> collected wordpieces, in text order
    last_pos = {}     # slot name -> position of the last wordpiece collected

    # Skip the first and last positions: assumes encode() wraps the text in
    # one leading and one trailing special token ([CLS] ... [SEP]).
    for pos in range(1, len(tokens) - 1):
        slot_name = slot_names[slot_ids[pos]]
        if slot_name == "nil":
            continue

        token = tokens[pos]
        pieces = slot_tokens.setdefault(slot_name, [])

        # A "##" wordpiece continues the previous token. If that previous
        # token was not itself collected for this slot, pull it in so the
        # word is not cut in half. Working by position (not by token text)
        # keeps this correct when the same wordpiece occurs more than once.
        if (token.startswith("##") and pos - 1 >= 1
                and last_pos.get(slot_name) != pos - 1):
            pieces.append(tokens[pos - 1])

        pieces.append(token)
        last_pos[slot_name] = pos

    # Merge the wordpieces of every slot back into plain text.
    for slot_name, pieces in slot_tokens.items():
        slot_value = tokenizer.convert_tokens_to_string(pieces)
        info["slots"][slot_name] = slot_value.strip()

    return info


In [None]:
# Persist the fine-tuned weights of the joint model.
# NOTE(review): only the weights are saved here; the intent and slot index
# mappings (intent_dict / slot_map) are not written alongside them, so
# reloading these weights requires rebuilding the same index assignment.
joint_model.save_weights("bert_joint_weights/")

In [None]:
def read_dev_data(file="dev.json"):
    """Read the "text" field of every entry in a JSON file.

    Args:
        file: path of the JSON file; it holds one object whose values each
            carry a "text" field.

    Returns:
        list: the texts, in the order the entries appear in the file.
    """
    with open(file, "r", encoding="utf-8") as json_file:
        data = json.load(json_file)

    return [data[key]["text"] for key in data]
dev_texts = read_dev_data()

In [None]:
from tqdm import tqdm

# Run the joint model on every dev utterance, one at a time.
results = [
    nlu(dev_text, tokenizer, joint_model, unique_intents, unique_slots)
    for dev_text in tqdm(dev_texts)
]

100%|██████████| 2887/2887 [03:12<00:00, 15.02it/s]


In [None]:
# process results
# Key every prediction by its position in the dev set, as a string.
results_dict = {str(position): prediction
                for position, prediction in enumerate(results)}

In [None]:
# Write the predictions, keyed by dev-set position, as indented JSON.
with open("prediction.json", "w") as f:
    json.dump(results_dict, f, indent=2)