In [1]:
import numpy as np
import matplotlib.pyplot as plt
import pylab as pl
import json
import cv2
import pandas as pd
from sklearn import preprocessing
import tensorflow as tf
from tensorflow import keras
from keras.models import Sequential, Model
from keras.layers import Dropout
from keras.layers import Embedding
from keras.layers import Flatten
from keras.layers import Dense
from tensorflow.keras import layers
from tensorflow.keras import activations
from sklearn.preprocessing import OneHotEncoder
from keras.regularizers import l2
import tensorflow_hub as hub
import tensorflow_text as text
import re

In [2]:
# Parse the dataset JSON. The context manager closes the file handle as soon
# as parsing is done (the bare open() call left it open), and the explicit
# encoding keeps the read independent of the platform default.
with open('clinic150/data/data_full.json', encoding='utf-8') as data_file:
    data = json.load(data_file)

In [3]:
# List the top-level keys of the loaded JSON object (one entry per dataset split).
data.keys()

dict_keys(['oos_val', 'val', 'train', 'oos_test', 'test', 'oos_train'])

In [4]:
# Peek at the validation split: a list of [sentence, label] pairs.
# NOTE(review): this displays the entire list; a slice such as data['val'][:5]
# would keep the cell output short.
data['val']

[['in spanish, meet me tomorrow is said how', 'translate'],
 ['in french, how do i say, see you later', 'translate'],
 ['how do you say hello in japanese', 'translate'],
 ['how do i ask about the weather in chinese', 'translate'],
 ['how can i say "cancel my order" in french', 'translate'],
 ['how do i say dinner in spanish', 'translate'],
 ['how do you say good bye in french', 'translate'],
 ['how do i say thank you in spanish', 'translate'],
 ['how do i say good bye in chinese', 'translate'],
 ['how can i say thank you very much in chinese', 'translate'],
 ['i need to know how to say hello in france', 'translate'],
 ['would you tell me how to say goodbye in france', 'translate'],
 ['how do you say hello in mexico', 'translate'],
 ['tell me how to say hello in chile', 'translate'],
 ['i want to know how to say goodbye in france', 'translate'],
 ['would you translate a sentence into russian for me, please', 'translate'],
 ['would you translate a phrase into mandarin for me', 'translate

In [5]:
# Wrap each split in a two-column frame: the utterance text and its label.
df_train, df_val, df_test = (
    pd.DataFrame(data[split_name], columns=['Sentence', "Domain"])
    for split_name in ('train', 'val', 'test')
)

In [6]:
# Summary (count / unique / top / freq) of both columns of the training split.
df_train.describe()

Unnamed: 0,Sentence,Domain
count,15000,15000
unique,15000,150
top,what expression would i use to say i love you ...,translate
freq,1,100


In [7]:
# Summary (count / unique / top / freq) of both columns of the validation split.
df_val.describe()

Unnamed: 0,Sentence,Domain
count,3000,3000
unique,3000,150
top,"in spanish, meet me tomorrow is said how",translate
freq,1,20


In [8]:
# Summary (count / unique / top / freq) of both columns of the test split.
df_test.describe()

Unnamed: 0,Sentence,Domain
count,4500,4500
unique,4500,150
top,how would you say fly in italian,translate
freq,1,30


In [9]:
# Check class balance: number of training examples per label.
df_train['Domain'].value_counts()

translate          100
order_status       100
goodbye            100
account_blocked    100
what_song          100
                  ... 
reminder           100
change_speed       100
tire_pressure      100
no                 100
card_declined      100
Name: Domain, Length: 150, dtype: int64

In [10]:
# Same class-balance check for the validation split.
df_val['Domain'].value_counts()

translate          20
order_status       20
goodbye            20
account_blocked    20
what_song          20
                   ..
reminder           20
change_speed       20
tire_pressure      20
no                 20
card_declined      20
Name: Domain, Length: 150, dtype: int64

In [11]:
# Same class-balance check for the test split.
df_test['Domain'].value_counts()

translate          30
order_status       30
goodbye            30
account_blocked    30
what_song          30
                   ..
reminder           30
change_speed       30
tire_pressure      30
no                 30
card_declined      30
Name: Domain, Length: 150, dtype: int64

In [12]:
# Print the per-column missing-value counts of every split, one split after another.
for split_label, split_frame in (("Train", df_train), ("Val", df_val), ("Test", df_test)):
    print(split_label)
    print(split_frame.isnull().sum())

Train
Sentence    0
Domain      0
dtype: int64
Val
Sentence    0
Domain      0
dtype: int64
Test
Sentence    0
Domain      0
dtype: int64


In [13]:
# Length (in tokens) of the longest sentence
def check_len(data):
    """Return the largest whitespace-token count found in ``data['Sentence']``.

    Args:
        data: Any mapping-like object (e.g. a DataFrame) whose ``'Sentence'``
            entry is an iterable of strings.

    Returns:
        The number of whitespace-separated tokens in the longest sentence.

    Raises:
        ValueError: If ``data['Sentence']`` is empty.
    """
    token_counts = (len(sentence.split()) for sentence in data['Sentence'])
    return max(token_counts)
# Size of the normalised vocabulary
def checkvocab_size(data):
    """Count the distinct normalised tokens in ``data['Sentence']``.

    Each sentence is split on whitespace; every token is lower-cased and has
    all characters that are neither word characters nor whitespace removed.
    A token made only of such characters normalises to the empty string,
    which is counted as one entry like any other value.

    Args:
        data: Any mapping-like object (e.g. a DataFrame) whose ``'Sentence'``
            entry is an iterable of strings.

    Returns:
        The number of distinct normalised tokens.
    """
    non_word_chars = re.compile(r'[^\w\s]')
    distinct_tokens = {
        non_word_chars.sub('', token.lower())
        for sentence in data['Sentence']
        for token in sentence.split()
    }
    return len(distinct_tokens)

In [14]:
# Token count of the longest training sentence (not the sentence itself);
# reused further down as the padded sequence length.
max_len = check_len(df_train)
max_len

28

In [15]:
# Distinct-token count of the training sentences as computed by checkvocab_size.
# NOTE(review): this value later caps the Tokenizer (num_words) and sizes the
# Embedding; checkvocab_size normalises tokens with its own regex, so the count
# may not equal len(tokenizer.word_index) -- confirm.
vocab_size = checkvocab_size(df_train)
vocab_size

5220

In [16]:
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences

In [17]:
# Learn the word -> integer-id vocabulary from the training sentences only;
# "<OOV>" is the placeholder token for words outside that vocabulary.
tokenizer = Tokenizer(num_words = vocab_size, oov_token= "<OOV>")
tokenizer.fit_on_texts(df_train["Sentence"])

In [18]:
# Inspect the learned word -> id mapping (id 1 is the "<OOV>" token).
tokenizer.word_index

{'<OOV>': 1,
 'my': 2,
 'to': 3,
 'i': 4,
 'the': 5,
 'you': 6,
 'a': 7,
 'for': 8,
 'what': 9,
 'me': 10,
 'is': 11,
 'how': 12,
 'do': 13,
 'on': 14,
 'can': 15,
 'in': 16,
 'of': 17,
 'need': 18,
 'please': 19,
 'card': 20,
 'tell': 21,
 'have': 22,
 'are': 23,
 'know': 24,
 'it': 25,
 'get': 26,
 'at': 27,
 'and': 28,
 'credit': 29,
 'from': 30,
 'account': 31,
 'when': 32,
 "what's": 33,
 'that': 34,
 'list': 35,
 'will': 36,
 'be': 37,
 'car': 38,
 'want': 39,
 'like': 40,
 'your': 41,
 'this': 42,
 'if': 43,
 'with': 44,
 'would': 45,
 'change': 46,
 'does': 47,
 'new': 48,
 'an': 49,
 'bank': 50,
 'time': 51,
 'many': 52,
 'much': 53,
 'go': 54,
 'find': 55,
 'there': 56,
 'make': 57,
 'am': 58,
 'where': 59,
 'long': 60,
 'take': 61,
 'up': 62,
 'out': 63,
 'name': 64,
 'next': 65,
 'help': 66,
 'any': 67,
 'bill': 68,
 'set': 69,
 'should': 70,
 'about': 71,
 'oil': 72,
 'visa': 73,
 'now': 74,
 'use': 75,
 'was': 76,
 'reservation': 77,
 "i'm": 78,
 'pay': 79,
 'phone': 80,


In [19]:
# Convert every training sentence into its list of integer word ids.
# NOTE(review): the last line displays all sequences; a slice such as
# train_sequences[:5] would keep the cell output short.
train_sequences = tokenizer.texts_to_sequences(df_train["Sentence"])
train_sequences

[[9, 2853, 45, 4, 75, 3, 117, 4, 522, 6, 43, 4, 202, 49, 504],
 [15, 6, 21, 10, 12, 3, 117, 1868, 13, 119, 270, 53, 2854, 16, 402],
 [9, 11, 5, 2855, 17, 2856, 11, 2857, 16, 302],
 [21, 10, 12, 3, 117, 2858, 11, 7, 2216, 2859, 16, 504],
 [43, 4, 202, 2860, 12, 45, 4, 117, 34, 4, 58, 7, 644],
 [12, 13, 4, 117, 2861, 16, 2862],
 [4, 18, 6, 3, 618, 5, 2217, 2863, 36, 37, 56, 2864, 261, 1636],
 [19, 21, 10, 12, 3, 322, 8, 7, 2218, 16, 302],
 [15, 6, 21, 10, 12, 4, 45, 117, 2865, 583, 2866, 16, 302],
 [9, 11, 5, 534, 156, 3, 117, 1868, 58, 7, 2867, 16, 302],
 [12, 45, 4, 117, 1868, 186, 24, 5, 2868, 16, 402],
 [4, 45, 40, 3, 24, 5, 1166, 156, 3, 2869, 49, 2219, 16, 1636],
 [9, 2220, 45, 4, 75, 3, 21, 296, 4, 58, 136, 43, 4, 202, 1022],
 [12, 45, 4, 117, 403, 6, 43, 4, 202, 1167],
 [15, 6, 21, 10, 12, 4, 45, 2870, 117, 403, 6, 153, 7, 302, 252],
 [4, 45, 40, 3, 24, 12, 3, 117, 392, 16, 302],
 [43, 4, 202, 535, 12, 45, 4, 117, 1084],
 [4, 45, 4, 117, 1084, 43, 4, 202, 535],
 [12, 13, 6, 117, 

In [20]:
# Bring every training sequence to exactly max_len ids: shorter ones are padded
# at the end, longer ones are cut at the end. The shape is shown as a sanity check.
pad_train_sequences = pad_sequences(train_sequences, maxlen = max_len, truncating = 'post', padding = 'post')
pad_train_sequences.shape

(15000, 28)

In [21]:
# Encode and pad the validation and test sentences with the tokenizer fitted
# on the training split and the same max_len / 'post' settings as for training.
val_sequences = tokenizer.texts_to_sequences(df_val["Sentence"])
pad_val_sequences = pad_sequences(val_sequences, maxlen = max_len, truncating ='post', padding ='post')

test_sequences = tokenizer.texts_to_sequences(df_test['Sentence'])
pad_test_sequences = pad_sequences(test_sequences, maxlen = max_len, truncating ='post', padding = 'post')
                                   

In [22]:
# Number of distinct labels in the training split; used as the classifier's output width.
len_labels = df_train['Domain'].nunique()

In [23]:
# One-hot encode the training labels; the encoder is fitted on the training split only.
ohe = OneHotEncoder(handle_unknown='ignore')
labels = df_train['Domain'].to_numpy().reshape(-1,1)
train_labels = ohe.fit(labels).transform(labels).toarray()

In [24]:
# Encode the validation and test labels with the encoder fitted on the training labels.
val_labels, test_labels = (
    ohe.transform(split_frame['Domain'].to_numpy().reshape(-1,1)).toarray()
    for split_frame in (df_val, df_test)
)

In [25]:
def Intentmodel(vocab_size, len_labels, MAX_SEQUENCE_LENGTH,
                embedding_dim=128, filter_sizes=(2, 3), num_filters=1024,
                drop=0.75, trainable_embedding=False):
    """Build, compile and summarise a multi-branch convolutional text classifier.

    The network embeds a fixed-length sequence of integer word ids, runs one
    convolution per entry of ``filter_sizes`` over the embedded sequence,
    max-pools each feature map over its whole length, applies dropout,
    concatenates the branches and classifies with a softmax layer.

    All layers come from the ``tensorflow.keras`` namespace so that a single
    Keras implementation is used for the whole graph.

    Args:
        vocab_size: Size of the embedding table (``input_dim``); every word id
            fed to the model must be smaller than this value.
        len_labels: Number of output classes.
        MAX_SEQUENCE_LENGTH: Length of every (padded) input sequence.
        embedding_dim: Width of each word vector.
        filter_sizes: Convolution window heights, one branch per entry.
        num_filters: Number of filters in each convolution branch.
        drop: Dropout rate applied to each pooled branch.
        trainable_embedding: Whether the embedding weights are updated during
            training. NOTE: this function loads no pretrained vectors, so with
            the default ``False`` the embedding keeps its random initial
            values; pass ``True`` to learn it.

    Returns:
        A compiled Keras ``Model`` (categorical cross-entropy loss, Adam
        optimizer, ``'acc'`` metric). Its summary is printed as a side effect.

    Raises:
        ValueError: If ``filter_sizes`` is empty or contains a window larger
            than ``MAX_SEQUENCE_LENGTH``.
    """
    filter_sizes = tuple(filter_sizes)
    if not filter_sizes:
        raise ValueError("filter_sizes must contain at least one window size")
    if max(filter_sizes) > MAX_SEQUENCE_LENGTH:
        raise ValueError(
            "every filter size must be <= MAX_SEQUENCE_LENGTH "
            f"({MAX_SEQUENCE_LENGTH}), got {filter_sizes}"
        )

    print("Creating Model...")
    inputs = layers.Input(shape=(MAX_SEQUENCE_LENGTH,), dtype='int32')
    # The sequence length is already fixed by the Input layer, so the
    # Embedding does not need a separate input_length argument.
    embedding = layers.Embedding(input_dim=vocab_size, output_dim=embedding_dim,
                                 trainable=trainable_embedding)(inputs)
    # Add a trailing channel axis so the sequence can be treated as a 2-D image.
    reshape = layers.Reshape((MAX_SEQUENCE_LENGTH, embedding_dim, 1))(embedding)

    # Layers are created type by type (all convolutions, then all pools, then
    # all dropouts) so the auto-generated layer names keep their usual order.
    conv_maps = [
        layers.Conv2D(num_filters, kernel_size=(size, embedding_dim), padding='valid',
                      kernel_initializer='normal', activation='relu')(reshape)
        for size in filter_sizes
    ]
    # Each pool window spans the full height of its feature map, i.e. it keeps
    # the maximum activation of every filter over the whole sentence.
    pooled_maps = [
        layers.MaxPool2D(pool_size=(MAX_SEQUENCE_LENGTH - size + 1, 1), strides=(1, 1),
                         padding='valid')(conv_map)
        for size, conv_map in zip(filter_sizes, conv_maps)
    ]
    dropped_maps = [layers.Dropout(drop)(pooled) for pooled in pooled_maps]

    # Concatenate needs at least two inputs; a single branch is passed through.
    if len(dropped_maps) > 1:
        concatenated_tensor = layers.Concatenate(axis=1)(dropped_maps)
    else:
        concatenated_tensor = dropped_maps[0]
    flatten = layers.Flatten()(concatenated_tensor)
    dropout = layers.Dropout(0.5)(flatten)
    preds = layers.Dense(len_labels, activation='softmax')(dropout)

    # Functional model mapping the word-id sequence to class probabilities.
    model = keras.Model(inputs=inputs, outputs=preds)

    model.compile(loss='categorical_crossentropy',
                  optimizer='adam',
                  metrics=['acc'])

    model.summary()
    return model

In [26]:
%load_ext tensorboard

In [27]:
# Directory handed to the TensorBoard callback as its log_dir.
sdir = "ques3"

In [28]:
# Build the classifier, then train it for 100 epochs in batches of 128,
# evaluating on the validation split and logging to TensorBoard under sdir.
imodel = Intentmodel(vocab_size, len_labels, max_len)
callbacks = tf.keras.callbacks.TensorBoard(log_dir = sdir)
imodel.fit(pad_train_sequences, train_labels, batch_size= 128, epochs = 100, 
          validation_data = (pad_val_sequences, val_labels), callbacks =[callbacks])

Creating Model...
Model: "model"
__________________________________________________________________________________________________
 Layer (type)                   Output Shape         Param #     Connected to                     
 input_1 (InputLayer)           [(None, 28)]         0           []                               
                                                                                                  
 embedding (Embedding)          (None, 28, 128)      668160      ['input_1[0][0]']                
                                                                                                  
 reshape (Reshape)              (None, 28, 128, 1)   0           ['embedding[0][0]']              
                                                                                                  
 conv2d (Conv2D)                (None, 27, 1, 1024)  263168      ['reshape[0][0]']                
                                                                            

Epoch 98/100
Epoch 99/100
Epoch 100/100


<keras.callbacks.History at 0x29e370c1bd0>

In [29]:
# Score the trained model on the test split; the result is [loss, accuracy],
# matching the loss and the 'acc' metric set in compile().
imodel.evaluate(pad_test_sequences, test_labels)



[0.38257887959480286, 0.9079999923706055]

In [1]:
!tensorboard dev list

https://tensorboard.dev/experiment/JnTXbrpjQDanI3ygmwbTxQ/
	Name                 Q3
	Description          Q3
	Id                   JnTXbrpjQDanI3ygmwbTxQ
	Created              2022-11-28 09:29:49 (2 hours ago)
	Updated              2022-11-28 09:29:51 (2 hours ago)
	Runs                 2
	Tags                 5
	Scalars              600
	Tensor bytes         0
	Binary object bytes  56148
https://tensorboard.dev/experiment/EzDmlPoCRHeOwOLlP7dQng/
	Name                 Q1
	Description          Q1
	Id                   EzDmlPoCRHeOwOLlP7dQng
	Created              2022-11-27 22:07:54 (14 hours ago)
	Updated              2022-11-27 22:07:55 (14 hours ago)
	Runs                 2
	Tags                 5
	Scalars              120
	Tensor bytes         0
	Binary object bytes  52459


Total: 2 experiment(s)
