In [1]:
import tensorflow as tf

from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.layers import Embedding, LSTM, Dense, Bidirectional
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.models import Sequential
from tensorflow.keras.optimizers import Adam
import numpy as np 
import pandas as pd

In [2]:
data = pd.read_csv('sentence_scoping_train.csv',index_col=False)
data.head(1)

Unnamed: 0,mr,ref,period
0,"name[nameVariable], food[Chinese], familyFrien...","it's a Chinese place, also nameVariable is nea...",2


In [3]:
txt = list(data['ref'][0:50])
text = ' '.join(str(e) for e in txt)

#print(text[0:200])
tokenizer = Tokenizer()

corpus = text.lower().split("\n")

tokenizer.fit_on_texts(corpus)
total_words = len(tokenizer.word_index) + 1

print(tokenizer.word_index)
print(total_words)


{'is': 1, 'a': 2, 'it': 3, 'namevariable': 4, 'rating': 5, 'friendly': 6, 'in': 7, 'and': 8, 'near': 9, 'nearvariable': 10, 'place': 11, "it's": 12, 'also': 13, 'has': 14, 'an': 15, 'restaurant': 16, 'riverside': 17, 'with': 18, 'family': 19, 'kid': 20, 'average': 21, 'coffee': 22, 'shop': 23, "isn't": 24, 'pub': 25, 'italian': 26, 'excellent': 27, 'chinese': 28, 'city': 29, 'centre': 30, 'high': 31, 'moderately': 32, 'priced': 33, 'fast': 34, 'food': 35, 'expensive': 36, 'french': 37, 'japanese': 38, 'mediocre': 39, 'decent': 40, 'cheap': 41, 'the': 42, 'price': 43, 'range': 44, 'of': 45, '£20': 46, '25': 47, 'low': 48, 'indian': 49, 'english': 50}
51


In [4]:
input_sequences = []
for line in corpus:
	token_list = tokenizer.texts_to_sequences([line])[0]
	for i in range(1, len(token_list)):
		n_gram_sequence = token_list[:i+1]
		input_sequences.append(n_gram_sequence)

# pad sequences 
max_sequence_len = max([len(x) for x in input_sequences])
input_sequences = np.array(pad_sequences(input_sequences, maxlen=max_sequence_len, padding='pre'))

# create predictors and label
xs, labels = input_sequences[:,:-1],input_sequences[:,-1]

ys = tf.keras.utils.to_categorical(labels, num_classes=total_words)

In [5]:
print(tokenizer.word_index['in'])
print(tokenizer.word_index['the'])
print(tokenizer.word_index['of'])

7
42
45


In [6]:
print(tokenizer.word_index)

{'is': 1, 'a': 2, 'it': 3, 'namevariable': 4, 'rating': 5, 'friendly': 6, 'in': 7, 'and': 8, 'near': 9, 'nearvariable': 10, 'place': 11, "it's": 12, 'also': 13, 'has': 14, 'an': 15, 'restaurant': 16, 'riverside': 17, 'with': 18, 'family': 19, 'kid': 20, 'average': 21, 'coffee': 22, 'shop': 23, "isn't": 24, 'pub': 25, 'italian': 26, 'excellent': 27, 'chinese': 28, 'city': 29, 'centre': 30, 'high': 31, 'moderately': 32, 'priced': 33, 'fast': 34, 'food': 35, 'expensive': 36, 'french': 37, 'japanese': 38, 'mediocre': 39, 'decent': 40, 'cheap': 41, 'the': 42, 'price': 43, 'range': 44, 'of': 45, '£20': 46, '25': 47, 'low': 48, 'indian': 49, 'english': 50}


In [7]:
model = Sequential()
model.add(Embedding(total_words, 100, input_length=max_sequence_len-1))
model.add(Bidirectional(LSTM(150)))
model.add(Dense(total_words, activation='softmax'))
adam = Adam(lr=0.01)
model.compile(loss='categorical_crossentropy', optimizer=adam, metrics=['accuracy'])
#earlystop = EarlyStopping(monitor='val_loss', min_delta=0, patience=5, verbose=0, mode='auto')
history = model.fit(xs, ys, epochs=10, verbose=1)
#print model.summary()
print(model)


Train on 894 samples
Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10
<tensorflow.python.keras.engine.sequential.Sequential object at 0x00000178748D1848>


In [None]:
seed_text = "it's a Chinese place, also nameVariable is near nearVariable. it is family friendly. it's near"
next_words = 100
  
for _ in range(next_words):
	token_list = tokenizer.texts_to_sequences([seed_text])[0]
	token_list = pad_sequences([token_list], maxlen=max_sequence_len-1, padding='pre')
	predicted = model.predict_classes(token_list, verbose=0)
	output_word = ""
	for word, index in tokenizer.word_index.items():
		if index == predicted:
			output_word = word
			break
	seed_text += " " + output_word
print(seed_text)