In [None]:
!pip install transformers

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/


In [None]:
import json
import os

class RawData:
    """One annotated example read from the dataset JSON file.

    Attributes:
        id: Key of the example in the JSON file.
        positions: The example's "positions" entry (in the sample shown in
            this notebook: slot name -> [start, end] pair).
        slots: The example's "slots" entry (slot name -> slot text).
        text: The raw utterance.
    """

    def __init__(self, id, positions, slots, text):
        # Assignment order is kept (id, positions, slots, text) because
        # __repr__ serialises the attributes in insertion order.
        self.id, self.positions = id, positions
        self.slots, self.text = slots, text

    def __repr__(self):
        # Show the instance as pretty-printed JSON of its attributes.
        return json.dumps(vars(self), indent=2)


"""
reads json from data file
returns a list containing RawData objects
"""


def read_train_json_file(filename):
    """Load an annotated dataset JSON file as a list of RawData objects.

    The file must contain one JSON object; each of its keys becomes the `id`
    of a RawData, and each value must provide the "positions", "slots" and
    "text" entries.

    Args:
        filename: Path of the JSON file to read.

    Returns:
        A list with one RawData per top-level key, in the order the keys are
        stored in the file.

    Raises:
        FileNotFoundError: If `filename` does not exist.
        KeyError: If an entry lacks "positions", "slots" or "text".
    """
    # Fail early with an explicit message when the path is missing.
    if not os.path.exists(filename):
        raise FileNotFoundError("No file found with that path!")

    with open(filename, "r", encoding="utf-8") as json_file:
        records = json.load(json_file)

    return [
        RawData(key, entry["positions"], entry["slots"], entry["text"])
        for key, entry in records.items()
    ]

# read from json file
train_data = read_train_json_file("train_final.json")

In [None]:
example = train_data[0]
example

{
  "id": "0",
  "positions": {
    "genre": [
      19,
      35
    ],
    "year": [
      41,
      49
    ]
  },
  "slots": {
    "genre": "romantic comedies",
    "year": "right now"
  },
  "text": "are there any good romantic comedies out right now"
}

In [None]:
type(example)

__main__.RawData

In [None]:
import tensorflow as tf
from transformers import AutoTokenizer

model_name = "bert-base-cased"
tokenizer = AutoTokenizer.from_pretrained(model_name)

Downloading:   0%|          | 0.00/29.0 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/570 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/208k [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/426k [00:00<?, ?B/s]

In [None]:

def encode_texts(tokenizer, texts):
    """Tokenize a batch of texts with the given tokenizer.

    The tokenizer is called with padding and truncation switched on and is
    asked for TensorFlow tensors ("tf").

    Args:
        tokenizer: Callable tokenizer (here: a Hugging Face tokenizer).
        texts: The texts to encode.

    Returns:
        Whatever the tokenizer returns for the batch.
    """
    tokenizer_options = {
        "padding": True,
        "truncation": True,
        "return_tensors": "tf",
    }
    batch_encoding = tokenizer(texts, **tokenizer_options)
    return batch_encoding

texts = [d.text for d in train_data]
tds = encode_texts(tokenizer, texts)
tds.keys()

dict_keys(['input_ids', 'token_type_ids', 'attention_mask'])

In [None]:
encoded_texts = tds

In [None]:
encoded_texts['input_ids'][0]

<tf.Tensor: shape=(93,), dtype=int32, numpy=
array([  101,  1132,  1175,  1251,  1363,  6376, 25795,  1149,  1268,
        1208,   102,     0,     0,     0,     0,     0,     0,     0,
           0,     0,     0,     0,     0,     0,     0,     0,     0,
           0,     0,     0,     0,     0,     0,     0,     0,     0,
           0,     0,     0,     0,     0,     0,     0,     0,     0,
           0,     0,     0,     0,     0,     0,     0,     0,     0,
           0,     0,     0,     0,     0,     0,     0,     0,     0,
           0,     0,     0,     0,     0,     0,     0,     0,     0,
           0,     0,     0,     0,     0,     0,     0,     0,     0,
           0,     0,     0,     0,     0,     0,     0,     0,     0,
           0,     0,     0], dtype=int32)>

In [None]:
encoded_texts['attention_mask'][0]

<tf.Tensor: shape=(93,), dtype=int32, numpy=
array([1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0], dtype=int32)>

In [None]:
# encode slots
# Collect every slot label that occurs in the training data.
# The labels are sorted because iteration order of a set of strings is not
# stable between interpreter runs (string hashing is randomised); without
# sorting, the label -> index mapping built from this list would change from
# one kernel session to the next and no longer match a previously trained model.
slot_names = sorted({slot for td in train_data for slot in td.slots})
# Index 0 is reserved for "no slot" / padding.
slot_names.insert(0, "<PAD>")
slot_names

['<PAD>',
 'plot',
 'review',
 'origin',
 'artist',
 'year',
 'song',
 'quote',
 'character',
 'award',
 'title',
 'director',
 'trailer',
 'opinion',
 'relationship',
 'genre',
 'rating']

In [None]:
# slot name -> integer label id (the name's position in slot_names)
slot_map = {name: label_id for label_id, name in enumerate(slot_names)}
slot_map

{'<PAD>': 0,
 'artist': 4,
 'award': 9,
 'character': 8,
 'director': 11,
 'genre': 15,
 'opinion': 13,
 'origin': 3,
 'plot': 1,
 'quote': 7,
 'rating': 16,
 'relationship': 14,
 'review': 2,
 'song': 6,
 'title': 10,
 'trailer': 12,
 'year': 5}

In [None]:
# gets slot name from its values
def get_slot_from_word(word, slot_dict):
    """Return the label of the first slot whose value contains `word`.

    Each slot value is split on whitespace and `word` must equal one of the
    resulting pieces exactly (substrings do not match).

    Args:
        word: A single whitespace-delimited word.
        slot_dict: Mapping of slot label -> slot text.

    Returns:
        The first matching slot label in the mapping's iteration order, or
        None when no slot value contains the word.
    """
    matching_labels = (
        label for label, value in slot_dict.items() if word in value.split()
    )
    return next(matching_labels, None)

print(train_data[0].text)
print(train_data[0].slots)
# Look up the word that the message actually names ("right"); it belongs to
# the same "right now" slot value, so the printed slot is still "year".
print("slot_name for right is : ", get_slot_from_word("right", train_data[0].slots))

are there any good romantic comedies out right now
{'genre': 'romantic comedies', 'year': 'right now'}
slot_name for right is :  year


In [None]:
import numpy as np

max_len = len(encoded_texts["input_ids"][0])

def encode_slots(all_slots, all_texts,
                 toknizer, slot_map, max_len=max_len):
    """Build the per-word-piece slot label matrix for a list of texts.

    Every text is split on whitespace; each word is tokenized on its own with
    `toknizer` and all of its word pieces receive the label of the slot the
    word belongs to (0 when it belongs to none).

    NOTE(review): slot membership is decided by `get_slot_from_word`, i.e. by
    whether the word occurs in any slot value, not by character positions, so
    a word that also appears outside its slot span is labelled as well.

    Args:
        all_slots: One slot dict (slot name -> slot text) per text.
        all_texts: The raw texts, parallel to `all_slots`.
        toknizer: Tokenizer exposing `tokenize(word)`. (The misspelt parameter
            name is kept so existing keyword callers keep working.)
        slot_map: Mapping of slot name -> integer label id.
        max_len: Number of columns of the result; defaults to the padded
            length of the encoded training inputs.

    Returns:
        An int32 array of shape (len(all_texts), max_len). Column 0 is always
        0 and label sequences start at column 1, matching the encoded inputs
        shown in this notebook, which begin with one special token.

    Raises:
        KeyError: If a matched slot name is missing from `slot_map`.
    """
    encoded_slots = np.zeros(shape=(len(all_texts), max_len), dtype=np.int32)

    # Keep column 0 and the last column free for the leading/trailing special
    # tokens; longer label sequences are cut instead of raising a shape error.
    label_budget = max(max_len - 2, 0)

    for idx, text in enumerate(all_texts):
        slot_values = all_slots[idx]
        enc = []  # labels for this text, one per word piece

        for raw_token in text.split():
            # Use the tokenizer that was passed in, not a module-level global.
            word_pieces = toknizer.tokenize(raw_token)
            slot_name = get_slot_from_word(raw_token, slot_values)
            label = slot_map[slot_name] if slot_name is not None else 0
            # One label per word piece for slot AND non-slot words; emitting a
            # single label for a multi-piece word would shift all later labels.
            enc.extend([label] * len(word_pieces))

        enc = enc[:label_budget]
        encoded_slots[idx, 1:len(enc) + 1] = enc

    return encoded_slots
    

In [None]:
max_len

93

In [None]:
all_slots = [td.slots for td in train_data]
all_texts = [td.text for td in train_data]

In [None]:
encoded_slots = encode_slots(all_slots, all_texts, tokenizer, slot_map)

In [None]:
encoded_slots[0]

array([ 0,  0,  0,  0,  0, 15, 15,  0,  5,  5,  0,  0,  0,  0,  0,  0,  0,
        0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,
        0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,
        0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,
        0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,
        0,  0,  0,  0,  0,  0,  0,  0], dtype=int32)

In [None]:
from transformers import TFBertModel
from tensorflow.keras.layers import Dropout, Dense, GlobalAveragePooling1D
from tensorflow.keras.optimizers import Adam
from tensorflow.keras.losses import SparseCategoricalCrossentropy
from tensorflow.keras.metrics import SparseCategoricalAccuracy
class SlotFillingModel(tf.keras.Model):
    """BERT encoder followed by dropout and a per-token linear slot classifier.

    Args:
        slot_num_labels: Number of slot classes, i.e. the width of the output
            layer. Must be provided; the default of None is not a usable value.
        model_name: Name of the pretrained BERT checkpoint to load.
        dropout_prob: Dropout rate applied to the encoder's token outputs.
    """

    def __init__(self,  slot_num_labels=None,
                 model_name=model_name, dropout_prob=0.1):
        super().__init__(name="joint_intent_slot")
        self.bert = TFBertModel.from_pretrained(model_name)
        self.dropout = Dropout(dropout_prob)
        # No activation: the layer emits raw logits.
        self.slot_classifier = Dense(slot_num_labels,
                                     name="slot_classifier")

    def call(self, inputs, **kwargs):
        """Return slot logits for every input position.

        Args:
            inputs: Model inputs, forwarded unchanged to the BERT encoder.
            **kwargs: Forwarded to the BERT encoder; a `training` entry, when
                present, also switches the dropout layer on or off.

        Returns:
            The unnormalised output of the slot classifier (presumably shaped
            (batch, sequence_length, slot_num_labels) - TODO confirm).
        """
        encoder_outputs = self.bert(inputs, **kwargs)
        # Only the per-token hidden states are needed for slot tagging; the
        # pooled sentence representation is not used by this model.
        sequence_output = self.dropout(encoder_outputs.last_hidden_state,
                                       training=kwargs.get("training", False))
        return self.slot_classifier(sequence_output)

In [None]:
joint_model = SlotFillingModel(slot_num_labels=len(slot_map))

Downloading:   0%|          | 0.00/502M [00:00<?, ?B/s]

Some layers from the model checkpoint at bert-base-cased were not used when initializing TFBertModel: ['nsp___cls', 'mlm___cls']
- This IS expected if you are initializing TFBertModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing TFBertModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
All the layers of TFBertModel were initialized from the model checkpoint at bert-base-cased.
If your task is similar to the task the model of the checkpoint was trained on, you can already use TFBertModel for predictions without further training.


In [None]:
# Optimizer used to fine-tune the whole model.
opt = Adam(learning_rate=3e-5, epsilon=1e-08)

# from_logits=True because SlotFillingModel.call returns the raw output of a
# Dense layer that was created without an activation.
losses = [SparseCategoricalCrossentropy(from_logits=True)]

# NOTE(review): no mask or sample weights are set up here, so positions that
# are only padding (label 0) presumably count toward loss and accuracy - confirm.
metrics = [SparseCategoricalAccuracy("accuracy")]
# compile model
joint_model.compile(optimizer=opt, loss=losses, metrics=metrics)

In [None]:
x = {"input_ids": encoded_texts["input_ids"], "token_type_ids": encoded_texts["token_type_ids"],  "attention_mask": encoded_texts["attention_mask"]}

history = joint_model.fit(x, (encoded_slots), epochs=2, batch_size=32, shuffle=True)

Epoch 1/2
Epoch 2/2


In [None]:
def nlu(text, tokenizer, model, slot_names):
    """Predict the slots contained in a single text.

    The text is encoded once; the same ids are fed to the model and converted
    back to tokens, so token i always lines up with prediction i regardless of
    how the text is capitalised.

    Args:
        text: The utterance to analyse.
        tokenizer: Hugging Face tokenizer matching the model.
        model: Callable returning slot logits for a batch of input ids
            (indexed here as [batch, position, label]).
        slot_names: List mapping label id -> slot name; "<PAD>" marks
            "no slot".

    Returns:
        {"slots": {slot_name: slot_text, ...}} with one entry per slot that
        was predicted for at least one real (non-special) token.
    """
    input_ids = tokenizer.encode(text)
    inputs = tf.constant(input_ids)[None, :]  # batch_size = 1
    slot_logits = model(inputs)
    slot_ids = slot_logits.numpy().argmax(axis=-1)[0, :]

    # Tokens derived from the very ids the model saw (special tokens included).
    tokens = tokenizer.convert_ids_to_tokens(input_ids)
    # The sentence delimiters are never part of a slot value.
    delimiters = {tokenizer.cls_token, tokenizer.sep_token}

    tokens_by_slot = _group_tokens_by_slot(tokens, slot_ids, slot_names, delimiters)

    info = {"slots": {}}
    for slot_name, slot_tokens in tokens_by_slot.items():
        slot_value = tokenizer.convert_tokens_to_string(slot_tokens)
        info["slots"][slot_name] = slot_value.strip()

    return info


def _group_tokens_by_slot(tokens, slot_ids, slot_names, ignored_tokens=()):
    """Collect, per predicted slot name, the tokens labelled with it in text order."""
    grouped = {}
    last_position = {}  # slot name -> position of the last token stored for it

    for position, (token, slot_id) in enumerate(zip(tokens, slot_ids)):
        slot_name = slot_names[slot_id]
        if slot_name == "<PAD>" or token in ignored_tokens:
            continue

        collected = grouped.setdefault(slot_name, [])

        # A "##" piece continues the previous token. If that previous token
        # was not stored for this slot already, pull it in so the word is
        # complete. Positions are compared (not token strings) so repeated
        # tokens in the text cannot confuse the check.
        continues_word = token.startswith("##") and position > 0
        if continues_word and last_position.get(slot_name) != position - 1:
            previous_token = tokens[position - 1]
            if previous_token not in ignored_tokens:
                collected.append(previous_token)

        collected.append(token)
        last_position[slot_name] = position

    return grouped


In [None]:
nlu("what are the latest sci-fi movies directed by Mathew McConaughey", tokenizer, joint_model, slot_names)

{'slots': {'director': 'Mathew McConaughey', 'genre': 'sci - fi'}}

In [None]:
nlu("list the five star movies acted by Aamir Khan in 2010", tokenizer, joint_model,  slot_names)

{'slots': {'artist': 'Aamir Khan', 'rating': 'five star', 'year': '2010'}}

In [None]:
nlu("Which is the recent Kannada romantic comedy movie starring Puneeth Rajkumar", tokenizer, joint_model,  slot_names)

{'slots': {'artist': 'Puneeth Rajkumar', 'genre': 'romantic comedy movie'}}

In [None]:
test_data = read_train_json_file("test_final.json")

In [None]:
all_slots = [td.slots for td in test_data]
all_texts = [td.text for td in test_data]

In [None]:
encoded_slots = encode_slots(all_slots, all_texts, tokenizer, slot_map)

In [None]:
encoded_slots.shape

(2687, 93)

In [None]:
# Run the trained model on every test utterance and keep the predicted slot
# dict of each result.
# NOTE: the loop variables `text` and `op` must stay module-level names;
# later cells display their final values.
test_slots=[]
for text in all_texts: 
  op = nlu(text,tokenizer,joint_model,slot_names)
  test_slots.append(op['slots'])


In [None]:
op

{'slots': {'artist': 'zac efron',
  'origin': 'based on a nicholas sparks novel',
  'plot': 'a soldier searching for the woman in a photograph that saved his life'}}

In [None]:
text

'zac efron is a soldier searching for the woman in a photograph that saved his life in this movie based on a nicholas sparks novel'

In [None]:
op['slots']

{'artist': 'zac efron',
 'origin': 'based on a nicholas sparks novel',
 'plot': 'a soldier searching for the woman in a photograph that saved his life'}

In [None]:
encoded_slots[0]

array([0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
       1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0], dtype=int32)

In [None]:
encoded_test_slots = encode_slots(test_slots, all_texts, tokenizer, slot_map)

In [None]:
encoded_test_slots[0]

array([0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
       1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0], dtype=int32)

In [None]:
def perf_measure(y_actual, y_hat):
    """Count confusion-matrix cells over two parallel 2-D label sequences.

    Label 0 is the negative class. For every position:
      * TP - labels are equal and non-zero
      * TN - both labels are 0
      * FP - prediction is non-zero and differs from the actual label
      * FN - prediction is 0 while the actual label is non-zero

    Args:
        y_actual: Reference labels, one sequence per example.
        y_hat: Predicted labels, same layout as `y_actual`.

    Returns:
        The tuple (TP, FP, TN, FN).
    """
    true_pos = false_pos = true_neg = false_neg = 0

    for row in range(len(y_hat)):
        actual_row = y_actual[row]
        predicted_row = y_hat[row]
        for col in range(len(actual_row)):
            actual = actual_row[col]
            predicted = predicted_row[col]
            if actual == predicted:
                # Agreement: positive when a slot label, negative when 0.
                if actual != 0:
                    true_pos += 1
                else:
                    true_neg += 1
            elif predicted != 0:
                false_pos += 1
            else:
                false_neg += 1

    return (true_pos, false_pos, true_neg, false_neg)

In [None]:
(TP,FP,TN,FN) = perf_measure(encoded_slots,encoded_test_slots)

In [None]:
TP

39696

In [None]:
FP

5491

In [None]:
TN

200289

In [None]:
FN

4415

In [None]:
acc = (TP + TN)/(TP+FP+TN+FN)

In [None]:
acc

0.9603587164003505

In [None]:
F1_score = TP/(TP + 1/2 * (FP + FN))

In [None]:
F1_score

0.8890680642343614

In [None]:
prec = TP/(TP+FP)

In [None]:
rec = TP/(TP+FN)

In [None]:
prec

0.8784827494633413

In [None]:
rec

0.8999115866790597