# Setup

In [None]:
import os, zipfile, glob, json, string, shutil

from google.colab.files import upload
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.models import Sequential

import tensorflow.keras.utils as ku
import tensorflow.keras.layers as layers
import tensorflow as tf
import pandas as pd
import numpy as np

In [None]:
# Pin the NumPy and TensorFlow global RNG seeds to a single shared value.
seed = 42
np.random.seed(seed)
tf.random.set_seed(seed)

# Dataset

## Kaggle

In [None]:
print("Installing kaggle")
!pip install kaggle -q
print("Upload kaggle.json")
upload()
print("Setting kaggle up...")
!mkdir ~/.kaggle
!mv kaggle.json ~/.kaggle
!chmod 600 ~/.kaggle/kaggle.json
!rm -fr sample_data
print("Done!")

Installing kaggle
Upload kaggle.json


Saving kaggle.json to kaggle.json
Setting kaggle up...
Done!


## Downloading dataset

In [None]:
# Dataset folder name and the train/test sub-folders beneath it.
data_name = "data"
train_data = f"{data_name}/train"
test_data = f"{data_name}/test"

In [None]:
if not os.path.exists(data_name):
  !kaggle datasets download -d taejinwoo/multiwoz-22
  with zipfile.ZipFile("multiwoz-22.zip", "r") as zip_ref: zip_ref.extractall("./")
  os.remove("multiwoz-22.zip")
  os.rename("MultiWOZ_2.2", data_name)

Downloading multiwoz-22.zip to /content
 95% 14.0M/14.8M [00:00<00:00, 22.7MB/s]
100% 14.8M/14.8M [00:01<00:00, 15.4MB/s]


## Dataframe

In [None]:
# Build one row per training JSON file; each row holds every utterance in that file.
# glob returns paths in arbitrary order, so sort them: the row order decides which
# sentences survive the later max_rows cut, and must be stable across runs.
sentences = []
for filepath in sorted(glob.glob(f"{train_data}/*.json")):
  with open(filepath, 'r', encoding="utf-8") as f:
    data = json.load(f)

  # Flatten item -> turns -> utterance text for this file.
  utterances = [turn["utterance"] for item in data for turn in item["turns"]]
  sentences.append(utterances)

df_ = pd.DataFrame({"sentences": sentences})
print("Total num of rows", len(df_.index))
df_.head()

Total num of rows 17


Unnamed: 0,sentences
0,[I am looking for a moderately priced 4 star h...
1,"[Thankyou, I am looking for a trian that leave..."
2,"[I need a taxi going to tandoori place., Where..."
3,[I am looking for something fun to do in the s...
4,[Hello! I am looking for the address and phone...


In [None]:
# Collect at most `max_rows` utterances, walking the dataframe rows in order.
max_rows = 500
all_sentences = []

# Iterate over every row (the previous `range(len(...) - 1)` never read the last one)
# and stop the whole scan, not just the current row, once the cap is reached.
for row_sentences in df_["sentences"]:
  remaining = max_rows - len(all_sentences)
  if remaining <= 0:
    break

  all_sentences.extend(row_sentences[:remaining])

In [None]:
def clean_text(txt):
    """Return `txt` without punctuation, lowercased, with non-ASCII characters removed."""
    # Drop every character listed in string.punctuation in a single pass.
    without_punctuation = txt.translate(str.maketrans("", "", string.punctuation))
    lowered = without_punctuation.lower()
    # Non-ASCII characters encode to bytes >= 0x80 in UTF-8, which the lenient
    # ASCII decode silently discards.
    return lowered.encode("utf8").decode("ascii", 'ignore')

# Cleaned corpus, one entry per collected sentence; preview the first ten.
corpus = list(map(clean_text, all_sentences))
corpus[:10]

['i am looking for a moderately priced 4 star hotel',
 'all the moderately priced 4star accommodations are guesthouses is that ok',
 'that would be just fine thank you',
 'which are would you like',
 'any area will do recommend something',
 'how about the avalon',
 'great if they can book for 6 people for 5 nights starting on wednesday can you try that for me',
 'i was able to complete that booking for you your confirmation number is o4e7tj08 ',
 'thank you are there any places to go in the east',
 'if youre going to the east i recommend taking a boat ride with camboats']

## Create n-gram sequences

In [None]:
# Notebook-wide tokenizer: fitted inside create_sequences and read again by
# generate_text, so both use the same word <-> id mapping.
tokenizer = Tokenizer()

def create_sequences(corpus, tok=None):
  """Fit the tokenizer on `corpus` and build its n-gram prefix sequences.

  For a sentence tokenized to [t1, t2, ..., tk] this yields the prefixes
  [t1, t2], [t1, t2, t3], ..., [t1, ..., tk]; sentences with fewer than two
  tokens contribute nothing.

  Args:
    corpus: list of cleaned sentence strings.
    tok: optional tokenizer-like object exposing `fit_on_texts`, `word_index`
      and `texts_to_sequences`. Defaults to the notebook-level `tokenizer`.

  Returns:
    (sequences, total_words): the list of prefixes (lists of token ids) and
    the vocabulary size plus one (id 0 is not assigned to any word).
  """
  tok = tokenizer if tok is None else tok
  tok.fit_on_texts(corpus)
  total_words = len(tok.word_index) + 1

  sequences = []
  for sentence in corpus:
    tok_sentence = tok.texts_to_sequences([sentence])[0]
    # Bound the prefix length by the sentence length. Looping up to total_words
    # (as before) re-appended the full sentence once per vocabulary entry and
    # produced empty sequences for empty sentences.
    for end in range(2, len(tok_sentence) + 1):
      sequences.append(tok_sentence[:end])

  return sequences, total_words

# Build the n-gram sequences for the whole corpus (this also fits the tokenizer)
# and preview the first ten.
inp_sequences, total_words = create_sequences(corpus)
inp_sequences[:10]

[[3, 67],
 [3, 67, 38],
 [3, 67, 38, 7],
 [3, 67, 38, 7, 5],
 [3, 67, 38, 7, 5, 100],
 [3, 67, 38, 7, 5, 100, 101],
 [3, 67, 38, 7, 5, 100, 101, 74],
 [3, 67, 38, 7, 5, 100, 101, 74, 91],
 [3, 67, 38, 7, 5, 100, 101, 74, 91, 47],
 [3, 67, 38, 7, 5, 100, 101, 74, 91, 47]]

## Get inputs and labels

In [None]:
def split_sequences_labels(sequences, pad_len):
  """Pre-pad `sequences` to `pad_len` and split each into input tokens and a label.

  The last column of the padded matrix becomes the label; all preceding columns
  are the model input. Labels are one-hot encoded over `total_words` classes,
  which is read from the notebook's global scope.

  Returns:
    (inputs, labels)
  """
  padded = np.array(ku.pad_sequences(sequences, maxlen=pad_len, padding="pre"))

  context_tokens = padded[:, :-1]
  next_tokens = padded[:, -1]
  one_hot_labels = ku.to_categorical(next_tokens, num_classes=total_words)
  return context_tokens, one_hot_labels

# Pad every sequence to the length of the longest one, then split inputs from labels.
max_seq_len = max(map(len, inp_sequences))
inputs, labels = split_sequences_labels(inp_sequences, max_seq_len)

# LSTM

## Building model

In [None]:
loss = tf.keras.losses.CategoricalCrossentropy()
optim = tf.keras.optimizers.Adam()

# Each training row is a padded sequence with its final (label) token removed.
input_len = max_seq_len - 1

model = Sequential()

# Input embedding. Its input_dim must cover every token id (0 .. total_words - 1);
# the earlier hard-coded 256 left any id >= 256 outside the lookup table, while the
# output layer below was already sized by total_words.
model.add(layers.Embedding(total_words, 100, input_length=input_len))

# Dense stack applied to the embedded sequence: narrows to 16 units, then widens again.
for units in (1024, 512, 128, 64, 32, 16, 32, 64, 128, 512, 1024):
  model.add(layers.Dense(units, activation='relu'))
model.add(layers.Dropout(0.4))

# LSTM layer
model.add(layers.LSTM(1024))
model.add(layers.Dropout(0.4))

# Output head: softmax over the vocabulary, matching the one-hot labels.
model.add(layers.Dense(256, activation='relu'))
model.add(layers.Dense(128, activation='relu'))
model.add(layers.Dense(total_words, activation='softmax'))

model.compile(loss=loss, optimizer=optim)

model.summary()

Model: "sequential"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 embedding (Embedding)       (None, 32, 100)           25600     
                                                                 
 dense (Dense)               (None, 32, 1024)          103424    
                                                                 
 dense_1 (Dense)             (None, 32, 512)           524800    
                                                                 
 dense_2 (Dense)             (None, 32, 128)           65664     
                                                                 
 dense_3 (Dense)             (None, 32, 64)            8256      
                                                                 
 dense_4 (Dense)             (None, 32, 32)            2080      
                                                                 
 dense_5 (Dense)             (None, 32, 16)            5

## Train

In [None]:
# Weights-only checkpoints, with the epoch number embedded in the file name.
checkpoint_cb = tf.keras.callbacks.ModelCheckpoint(
    filepath='./training_checkpoints/ckpt_{epoch}',
    save_weights_only=True)

# Early stopping watches the training loss (no validation data is passed to fit),
# with a patience of 5 epochs and best-weight restoration enabled.
early_stopping_cb = tf.keras.callbacks.EarlyStopping(
    monitor='loss',
    patience=5,
    verbose=1,
    restore_best_weights=True)

callbacks = [checkpoint_cb, early_stopping_cb]

epochs = 20
batch_size = 128
history = model.fit(
    inputs,
    labels,
    epochs=epochs,
    batch_size=batch_size,
    callbacks=callbacks,
)

Epoch 1/20
Epoch 2/20
Epoch 3/20
Epoch 4/20
Epoch 5/20
Epoch 6/20
Epoch 7/20
Epoch 8/20
Epoch 9/20
Epoch 10/20
Epoch 11/20
  30/3153 [..............................] - ETA: 4:06 - loss: 0.0490

KeyboardInterrupt: ignored

In [None]:
# Save the model under a fixed name, then zip the saved output into an archive
# of the same name.
export_name = "sentence_completion_tf_model"
model.save(export_name)
shutil.make_archive(export_name, "zip", export_name)



'/content/sentence_completion_tf_model.zip'

In [None]:
# Second training pass on the same `model` object: up to 10 more epochs with the
# same data, batch size and callbacks. Rebinds `history`, replacing the earlier fit's result.
history = model.fit(inputs, labels, epochs=10, batch_size=batch_size, callbacks=callbacks)

Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10
Epoch 10: early stopping


## Inference

In [None]:
def generate_text(seed_text, next_words, model, max_sequence_len):
    """Greedily extend `seed_text` by up to `next_words` predicted words.

    Each step tokenizes the running text with the notebook-level `tokenizer`,
    pre-pads it to `max_sequence_len - 1` tokens, and appends the word whose
    id has the highest predicted score.

    Args:
        seed_text: text to start from.
        next_words: maximum number of words to append.
        model: object whose `predict` returns one score per vocabulary id.
        max_sequence_len: padded sequence length used when building the
            training data (label token included).

    Returns:
        The extended text, title-cased.
    """
    for _ in range(next_words):
        token_list = tokenizer.texts_to_sequences([seed_text])[0]
        token_list = ku.pad_sequences([token_list], maxlen=max_sequence_len-1, padding='pre')
        scores = model.predict(token_list, verbose=0)
        # Single-row batch: take the scalar id rather than comparing against an array.
        predicted = int(np.argmax(scores, axis=-1)[0])

        # Direct id -> word lookup instead of scanning word_index on every step.
        output_word = tokenizer.index_word.get(predicted, "")
        if not output_word:
            # No word for this id (e.g. the padding id 0). The input would not
            # change, so every further step would predict the same id; stop here.
            break
        seed_text += " " + output_word

    return seed_text.title()

In [None]:
# (seed text, number of words to generate) pairs used as sample prompts.
prompts = [
  ("Hi", 10),
  ("Today I was going to", 7),
  ("My address is", 16),
  ("I", 20),
  ("Would you like to try", 20),
  ("How are you", 20),
]

for prompt, n_words in prompts:
  print(generate_text(prompt, n_words, model, max_seq_len))

Hi Food Was Able To Book That House The South Number
Today I Was Going To Great Day In Cambridge Town To To
My Address Is Restaurants Rd The South Number Is Ejaop5Qe And You Try To Book It For You Cb20Qq
I Have A Restaurants Serving Hills For You Thank You Cheers Take Us Price Range Would You Like To Make In
Would You Like To Try The Gandhi Guest Food Ditton Now Ditton College Some Every Cross Camboats And You Like To Book One For You
How Are You Cambridge You Any There Know Else I Can Help You With Today Today Today Today Today Today Today Today Today
