In [17]:
import numpy as np
import tensorflow as tf

#### Split sentences into words

In [18]:
# Toy corpus: two space-separated sentences that the rest of the notebook
# splits on single spaces.
sentences = [
    'this tutorial wordvec scratch chapter toxic comment classification',
    'this chapter focuses toxic comment classification aditionally wordvec tutorial',
]


In [19]:
# Flatten the corpus into a single list of words, keeping order and duplicates.
all_words = [word for sentence in sentences for word in sentence.split(' ')]
len(all_words), all_words

(17,
 ['this',
  'tutorial',
  'wordvec',
  'scratch',
  'chapter',
  'toxic',
  'comment',
  'classification',
  'this',
  'chapter',
  'focuses',
  'toxic',
  'comment',
  'classification',
  'aditionally',
  'wordvec',
  'tutorial'])

#### Build the vocabulary and add an UNK entry for out-of-vocabulary (OOV) words

In [20]:
# Id 0 is reserved for the out-of-vocabulary placeholder.
word_index = {'UNK': 0}
for word in all_words:
    # setdefault leaves the id of an already-seen word untouched; a new word
    # receives the next free id, which is the current size of the dictionary.
    word_index.setdefault(word, len(word_index))

# Vocabulary in id order (dicts preserve insertion order) and its size,
# which includes the 'UNK' entry.
vocab_words = list(word_index)
vocab_len = len(word_index)
word_index, vocab_len

({'UNK': 0,
  'this': 1,
  'tutorial': 2,
  'wordvec': 3,
  'scratch': 4,
  'chapter': 5,
  'toxic': 6,
  'comment': 7,
  'classification': 8,
  'focuses': 9,
  'aditionally': 10},
 11)

#### Tokenize sentences

In [21]:
# Split every sentence once, then derive both its length and its token ids
# from the same word lists.
split_sentences = [sentence.split(' ') for sentence in sentences]
lengths = [len(words) for words in split_sentences]
tokenized_sentences = [
    [word_index[word] for word in words]
    for words in split_sentences
]
# Length of the longest sentence; used below as the padding target and as the
# number of timesteps of the model input.
max_len = max(lengths)
tokenized_sentences


[[1, 2, 3, 4, 5, 6, 7, 8], [1, 5, 9, 6, 7, 8, 10, 3, 2]]

#### Make all sentences of equal length using padding

In [22]:
# Left-pad every sentence shorter than max_len with 0 so all sentences have the
# same length; sentences already at max_len are reused unchanged.
# NOTE: 0 is also the id given to 'UNK', so padding and unknown words end up
# sharing the same one-hot slot.
tokeinzed_padded = [
    sent if len(sent) == max_len else [0] * (max_len - len(sent)) + sent
    for sent in tokenized_sentences
]

In [23]:
# Display the padded token ids: every inner list now has max_len entries.
tokeinzed_padded

[[0, 1, 2, 3, 4, 5, 6, 7, 8], [1, 5, 9, 6, 7, 8, 10, 3, 2]]

#### Change every token to a one-hot encoding

In [24]:
# Turn every token id into a vocab_len-long 0/1 vector with a single 1 at the
# position given by the id. Result nesting: sentence -> timestep -> vocab slot.
one_hot_tokeinzed = []
for sent in tokeinzed_padded:
    # Start from all-zero rows (one per token), then switch on the hot slot.
    rows = [[0] * vocab_len for _ in sent]
    for row, token in zip(rows, sent):
        row[token] = 1
    one_hot_tokeinzed.append(rows)

In [25]:
# Stack the nested lists into a single array shaped
# (number of sentences, max_len, vocab_len) so it can be fed to the model.
one_hot_tokeinzed = np.array(one_hot_tokeinzed)
one_hot_tokeinzed.shape

(2, 9, 11)

#### LSTM autoencoder 

In [26]:
# --- Encoder -----------------------------------------------------------------
# Input: one sentence as max_len timesteps of vocab_len-wide one-hot vectors.
inputs = tf.keras.Input(shape=(max_len, vocab_len))
enc_seq_10 = tf.keras.layers.LSTM(10, activation='relu', return_sequences=True)(inputs)
enc_seq_20 = tf.keras.layers.LSTM(20, activation='relu', return_sequences=True)(enc_seq_10)
# The last encoder LSTM does not return sequences, so it emits a single
# 30-dimensional vector per sentence: the sentence representation.
encoded = tf.keras.layers.LSTM(30, activation='relu')(enc_seq_20)

# --- Decoder -----------------------------------------------------------------
# Repeat the sentence vector once per timestep so the decoder LSTMs can unroll
# it back into a sequence of length max_len.
dec_repeated = tf.keras.layers.RepeatVector(max_len)(encoded)
dec_seq_20 = tf.keras.layers.LSTM(20, activation='relu', return_sequences=True)(dec_repeated)
dec_seq_10 = tf.keras.layers.LSTM(10, activation='relu', return_sequences=True)(dec_seq_20)
# Project every timestep back to vocab_len values to reconstruct the input.
decoded = tf.keras.layers.TimeDistributed(tf.keras.layers.Dense(vocab_len))(dec_seq_10)



In [27]:
# Encoder-only model: maps a one-hot sentence to its 30-dimensional vector.
# It is built from the same layers as the autoencoder, so training `model`
# also trains `encoder`.
encoder = tf.keras.Model(inputs=inputs, outputs=encoded)

# Full autoencoder, trained to reproduce its input with a mean-squared-error loss.
model = tf.keras.Model(inputs=inputs, outputs=decoded)
model.compile(loss='mse', optimizer='adam')

In [28]:
# Print the layer-by-layer architecture with output shapes and parameter counts.
model.summary()

Model: "model_3"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 input_2 (InputLayer)        [(None, 9, 11)]           0         
                                                                 
 lstm_5 (LSTM)               (None, 9, 10)             880       
                                                                 
 lstm_6 (LSTM)               (None, 9, 20)             2480      
                                                                 
 lstm_7 (LSTM)               (None, 30)                6120      
                                                                 
 repeat_vector_1 (RepeatVect  (None, 9, 30)            0         
 or)                                                             
                                                                 
 lstm_8 (LSTM)               (None, 9, 20)             4080      
                                                           

In [29]:
# Train the autoencoder to reconstruct its input: the same one-hot array is
# passed as both the features and the target.
model.fit(one_hot_tokeinzed, one_hot_tokeinzed, epochs=3)

Epoch 1/3
Epoch 2/3
Epoch 3/3


<keras.callbacks.History at 0x2c16a5ca0>

#### For a new data point, tokenize it, convert it to a one-hot encoding, and get the sentence representation from the encoder model.

In [32]:
x_test = 'this tutorial wordvec toxic comment classification wordvec scratch chapter'

# Map every word to its vocabulary id. Words that were not seen while building
# the vocabulary fall back to the 'UNK' id instead of raising KeyError.
split_sent = x_test.split(' ')
tokenized_test = [word_index.get(word, word_index['UNK']) for word in split_sent]

# The model input has exactly max_len timesteps, so bring the sentence to that
# length the same way the training data was prepared: keep at most the last
# max_len tokens and left-pad shorter sentences with 0.
tokenized_test = tokenized_test[-max_len:]
tokenized_test = [0] * (max_len - len(tokenized_test)) + tokenized_test

tokenized_test
 

[1, 2, 3, 6, 7, 8, 3, 4, 5]

In [31]:
# One-hot encode the test tokens: start from all-zero rows (one per token,
# vocab_len wide) and switch on the slot given by each token id.
test_rows = [[0] * vocab_len for _ in tokenized_test]
for row, token in zip(test_rows, tokenized_test):
    row[token] = 1

# Wrap the single sentence in an outer list so the data has a leading
# "sentence" axis of size 1, like the training data.
one_hot_tokeinzed_test = [test_rows]
one_hot_tokeinzed_test

[[[0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0],
  [0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0],
  [0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0],
  [0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0],
  [0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0],
  [0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0],
  [0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0],
  [0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0],
  [0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0]]]

In [16]:
# Convert the nested Python list to a NumPy array before predicting, matching
# how the training data was fed to the model. A plain list is ambiguous for
# Keras, which can also read a list as "one entry per model input".
sentence_vector = encoder.predict(np.array(one_hot_tokeinzed_test))
# One row per input sentence, one column per unit of the last encoder LSTM.
sentence_vector.shape



(1, 30)