# Set up

In [1]:
import os, zipfile, glob, json, string, shutil

from google.colab.files import upload
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.models import Sequential

import tensorflow.keras.utils as ku
import tensorflow.keras.layers as layers
import tensorflow as tf
import pandas as pd
import numpy as np

In [2]:
seed = 42
tf.random.set_seed(seed)
np.random.seed(seed)

# Dataset

## Kaggle

In [3]:
print("Installing kaggle")
!pip install kaggle -q
print("Upload kaggle.json")
upload()
print("Setting kaggle up...")
!mkdir ~/.kaggle
!mv kaggle.json ~/.kaggle
!chmod 600 ~/.kaggle/kaggle.json
!rm -fr sample_data
print("Done!")

Installing kaggle
Upload kaggle.json


Saving kaggle.json to kaggle.json
Setting kaggle up...
Done!


## Downloading dataset

In [4]:
data_name = "data"
train_data = "data/train"
test_data = "data/test"

In [5]:
if not os.path.exists(data_name):
  !kaggle datasets download -d taejinwoo/multiwoz-22
  with zipfile.ZipFile("multiwoz-22.zip", "r") as zip_ref: zip_ref.extractall("./")
  os.remove("multiwoz-22.zip")
  os.rename("MultiWOZ_2.2", data_name)

Downloading multiwoz-22.zip to /content
 34% 5.00M/14.8M [00:00<00:00, 48.5MB/s]
100% 14.8M/14.8M [00:00<00:00, 79.3MB/s]


## Dataframe

In [6]:
sentences = []
for i, filepath in enumerate(glob.glob(f"{train_data}/*.json")):
  with open(filepath, 'r') as f:
    data = json.load(f)

  utterances = []
  for item in data:
    for turn in item["turns"]:
      utterances.append(turn["utterance"])

  sentences.append(utterances)

df_ = pd.DataFrame({"sentences": sentences})
print("Total num of rows", len(df_.index))
df_.head()

Total num of rows 17


Unnamed: 0,sentences
0,[I am looking for the Addenbrookes Hospital wi...
1,[I am looking for something fun to do in the s...
2,[i need a place to dine in the center thats ex...
3,[I need a train to stansted airport that leave...
4,"[Hi, I'm looking for places to visit. Mainly i..."


In [7]:
max_rows = 100
all_sentences = []

for i in range(len(df_.loc[:, "sentences"]) - 1):
  sentences = df_.loc[i, "sentences"]

  for sentence in sentences:
    if len(all_sentences) + 1 > max_rows: break

    all_sentences.append(sentence)

In [8]:
def clean_text(txt):
    txt = "".join(v for v in txt if v not in string.punctuation).lower()
    txt = txt.encode("utf8").decode("ascii",'ignore')
    return txt

corpus = [clean_text(x) for x in all_sentences]
corpus[:10]

['i am looking for the addenbrookes hospital with hepatology department',
 'the telephone number is 01223217712',
 'thank you goodbye',
 'have a wonderful day',
 'im planning a trip to cambridge and need a place to dine can you find something that serves jamaican food in the centre',
 'there are no jamaican restaurants in the centre would you like to try another area or another food type',
 'are you sure it should be expensive',
 'im sorry there doesnt seem to be a jamaican restaurant in centre would you like me to look for something else',
 'i really want jamaican food can you check another area if you find one it should be expensive if not ill try thai in the centre',
 'that should be bangkok city its address is 24 green street city centrecb23jx you need the phone number']

## Create n-gram sequences

In [9]:
tokenizer = Tokenizer()

def create_sequences(corpus):
  tokenizer.fit_on_texts(corpus)
  total_words = len(tokenizer.word_index) + 1

  sequences = []
  for sentence in corpus:
    tok_sentence = tokenizer.texts_to_sequences([sentence])[0]
    for i in range(1, total_words):
      n_gram_seq = tok_sentence[:i+1]
      sequences.append(n_gram_seq)

  return sequences, total_words

inp_sequences, total_words = create_sequences(corpus)
inp_sequences[:10]

[[4, 52],
 [4, 52, 35],
 [4, 52, 35, 7],
 [4, 52, 35, 7, 2],
 [4, 52, 35, 7, 2, 171],
 [4, 52, 35, 7, 2, 171, 172],
 [4, 52, 35, 7, 2, 171, 172, 17],
 [4, 52, 35, 7, 2, 171, 172, 17, 173],
 [4, 52, 35, 7, 2, 171, 172, 17, 173, 174],
 [4, 52, 35, 7, 2, 171, 172, 17, 173, 174]]

## Pad n-gram sequences

In [10]:
def split_sequences_labels(sequences, pad_len):
  padded_sequences = np.array(ku.pad_sequences(sequences, maxlen=pad_len, padding="pre"))

  inputs, labels = padded_sequences[:,:-1], padded_sequences[:,-1]
  labels = ku.to_categorical(labels, num_classes=total_words)
  return inputs, labels

max_seq_len = max([len(seq) for seq in inp_sequences])
inputs, labels = split_sequences_labels(inp_sequences, max_seq_len)

# LSTM

## Building model

In [32]:
loss = tf.keras.losses.CategoricalCrossentropy()
optim = tf.keras.optimizers.Adam()

input_len = max_seq_len - 1
model = Sequential()

# Add Input Embedding Layer
model.add(layers.Embedding(256, 30, input_length=input_len))

# Add Dense Layers
model.add(layers.Dense(512, activation='relu'))
model.add(layers.Dense(256, activation='relu'))
model.add(layers.Dense(128, activation='relu'))
model.add(layers.Dense(64, activation='relu'))
model.add(layers.Dense(32, activation='relu'))
model.add(layers.Dense(16, activation='relu'))
model.add(layers.Dense(8, activation='relu'))
model.add(layers.Dense(4, activation='relu'))
model.add(layers.Dense(8, activation='relu'))
model.add(layers.Dense(16, activation='relu'))
model.add(layers.Dense(32, activation='relu'))
model.add(layers.Dense(64, activation='relu'))
model.add(layers.Dense(128, activation='relu'))
model.add(layers.Dense(256, activation='relu'))
model.add(layers.Dense(512, activation='relu'))
model.add(layers.Dropout(0.5))

# Add Hidden Layer 1 - LSTM Layer
model.add(layers.LSTM(256))
model.add(layers.Dropout(0.5))

model.add(layers.Dense(total_words, activation='softmax'))

model.compile(loss=loss, optimizer=optim)

model.summary()

Model: "sequential_6"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 embedding_6 (Embedding)     (None, 30, 30)            7680      
                                                                 
 dense_34 (Dense)            (None, 30, 512)           15872     
                                                                 
 dense_35 (Dense)            (None, 30, 256)           131328    
                                                                 
 dense_36 (Dense)            (None, 30, 128)           32896     
                                                                 
 dense_37 (Dense)            (None, 30, 64)            8256      
                                                                 
 dense_38 (Dense)            (None, 30, 32)            2080      
                                                                 
 dense_39 (Dense)            (None, 30, 16)           

## Train

In [33]:
callbacks = [
    tf.keras.callbacks.ModelCheckpoint(
        filepath='./training_checkpoints/ckpt_{epoch}',
        save_weights_only=True),
    tf.keras.callbacks.EarlyStopping(
        monitor='loss',
        patience=5,
        verbose=1,
        restore_best_weights=True),
]


epochs = 200
batch_size = 32
history = model.fit(inputs, labels, epochs=epochs, batch_size=batch_size, callbacks=callbacks)

Epoch 1/200
Epoch 2/200
Epoch 3/200
Epoch 4/200
Epoch 5/200
Epoch 6/200
Epoch 7/200
Epoch 8/200
Epoch 9/200
Epoch 10/200
Epoch 11/200
Epoch 12/200
Epoch 13/200
Epoch 14/200
Epoch 15/200
Epoch 16/200
Epoch 17/200
Epoch 18/200
Epoch 19/200
Epoch 20/200
Epoch 21/200
Epoch 22/200
Epoch 23/200
Epoch 24/200
Epoch 25/200
Epoch 26/200
Epoch 27/200
Epoch 28/200
Epoch 29/200
Epoch 30/200
Epoch 31/200
Epoch 32/200
Epoch 33/200
Epoch 34/200
Epoch 35/200
Epoch 36/200
Epoch 37/200
Epoch 38/200
Epoch 39/200
Epoch 40/200
Epoch 41/200
Epoch 42/200
Epoch 43/200
Epoch 44/200
Epoch 45/200
Epoch 46/200
Epoch 47/200
Epoch 48/200
Epoch 49/200
Epoch 50/200
Epoch 51/200
Epoch 52/200
Epoch 53/200
Epoch 53: early stopping


In [34]:
model.save("sentence_completion_tf_model")
shutil.make_archive("sentence_completion_tf_model", "zip", "sentence_completion_tf_model")



'/content/sentence_completion_tf_model.zip'

## Inference

In [35]:
def generate_text(seed_text, next_words, model, max_sequence_len):
    for _ in range(next_words):
        token_list = tokenizer.texts_to_sequences([seed_text])[0]
        token_list = ku.pad_sequences([token_list], maxlen=max_sequence_len-1, padding='pre')
        predicted = np.argmax(model.predict(token_list, verbose=0), axis=-1)

        output_word = ""
        for word,index in tokenizer.word_index.items():
            if index == predicted:
                output_word = word.lower()
                break
        seed_text += " "+output_word

    return seed_text.title()

In [45]:
print(generate_text("Hi", 17 , model, max_seq_len))
print(generate_text("Today I was going to", 10, model, max_seq_len))
print(generate_text("Now we shall head over to", 20, model, max_seq_len))

Hi Am Looking For Information Can You Help Me With A Place To Stay While Im In Town
Today I Was Going To And A Table Day Day Bye That In Me Up
Now We Shall Head Over To And From And What Time Will You Need The Taxi Taxi Food Type The Taxi The Of 4 For For
