In [1]:
import os
import re
import itertools
import numpy as np
import pandas as pd
import tensorflow as tf
import tensorflow.keras as keras
import spacy

# Seed every random number generator the notebook relies on. Seeding NumPy
# alone leaves TensorFlow's weight initialisation and batch shuffling
# unseeded, so training runs would differ between kernel restarts.
SEED = 42
np.random.seed(SEED)
tf.random.set_seed(SEED)

In [2]:
def convert_to_int(label):
    """Encode a sentiment label as an integer class id.

    Returns 1 for "positive" and 0 for "negative". Any other value falls
    through and yields None.
    """
    for text, class_id in (("positive", 1), ("negative", 0)):
        if label == text:
            return class_id
    return None
    
# Build the CSV path from its components. The previous single literal
# "~\Downloads\..." relied on backslashes inside a non-raw string (invalid
# escape sequences) and only resolved on Windows.
DATASET_PATH = os.path.join(os.path.expanduser("~"), "Downloads", "datasets", "IMDB-Dataset.csv")
dataset = pd.read_csv(DATASET_PATH)
# Encode the text labels as 1 (positive) / 0 (negative).
dataset['sentiment'] = dataset['sentiment'].map(convert_to_int)
# dataset = pd.read_csv("/content/drive/MyDrive/IMDB-Dataset.csv")

# Positional train / validation / test split.
# NOTE(review): rows are taken in file order without shuffling; this assumes
# the CSV is not sorted by label - confirm.
TRAIN_END = 44000
VAL_END = 47000

# Column 0 holds the review text (input of the autoencoder).
train_ae = dataset.iloc[:TRAIN_END, 0]
val_ae = dataset.iloc[TRAIN_END:VAL_END, 0]
test_ae = dataset.iloc[VAL_END:, 0]

# Column 1 holds the encoded sentiment label.
train_y = dataset.iloc[:TRAIN_END, 1]
val_y = dataset.iloc[TRAIN_END:VAL_END, 1]
test_y = dataset.iloc[VAL_END:, 1]

print(train_ae.head())
print("\n", train_y.head())

0    One of the other reviewers has mentioned that ...
1    A wonderful little production. <br /><br />The...
2    I thought this was a wonderful way to spend ti...
3    Basically there's a family where a little boy ...
4    Petter Mattei's "Love in the Time of Money" is...
Name: review, dtype: object

 0    1
1    1
2    1
3    0
4    1
Name: sentiment, dtype: int64


# Tokenization

In [3]:
# Strip HTML markup (e.g. "<br />") from the reviews before tokenization.
# The same review is printed before and after to show the effect.
print(train_ae[1][:190])
html_re = re.compile(r'<[^>]+>')
# Clean every split, not only the training one: otherwise validation and test
# reviews would still contain tag text that the training data never had.
train_ae = train_ae.replace(html_re, "", regex=True)
val_ae = val_ae.replace(html_re, "", regex=True)
test_ae = test_ae.replace(html_re, "", regex=True)
print(train_ae[1][:190])

A wonderful little production. <br /><br />The filming technique is very unassuming- very old-time-BBC fashion and gives a comforting, and sometimes discomforting, sense of realism to the en
A wonderful little production. The filming technique is very unassuming- very old-time-BBC fashion and gives a comforting, and sometimes discomforting, sense of realism to the entire piece. 


In [4]:
# Fit the Keras tokenizer on the training reviews only, so the vocabulary is
# built without looking at validation or test text.
PUNCTUATION_FILTER = '!"#$%&()*+,-./:;<=>?@[\\]^_`{|}~\t\n'
tokenizer = keras.preprocessing.text.Tokenizer(filters=PUNCTUATION_FILTER)
tokenizer.fit_on_texts(train_ae)

# Preview the first 20 (index -> word) entries and the full vocabulary size.
first_entries = dict(itertools.islice(tokenizer.index_word.items(), 20))
vocabulary_size = len(tokenizer.word_index)
print("Word Indices: ")
print(first_entries, "...")
print(f"Word Count: {vocabulary_size}")

Word Indices: 
{1: 'the', 2: 'and', 3: 'a', 4: 'of', 5: 'to', 6: 'is', 7: 'in', 8: 'it', 9: 'i', 10: 'this', 11: 'that', 12: 'was', 13: 'as', 14: 'for', 15: 'with', 16: 'movie', 17: 'but', 18: 'film', 19: 'on', 20: 'not'} ...
Word Count: 119224


In [5]:
# Cap the vocabulary used when converting text to ids. Per the Keras
# Tokenizer documentation only the most frequent num_words-1 words are kept.
MAX_NUM_TOKENS = 10000
tokenizer.num_words = MAX_NUM_TOKENS


def _to_token_ids(text):
    """Return the list of token ids the fitted tokenizer yields for one review."""
    return tokenizer.texts_to_sequences([text])[0]


# Replace each review string with its list of token ids, split by split.
train_ae = train_ae.apply(_to_token_ids)
val_ae = val_ae.apply(_to_token_ids)
test_ae = test_ae.apply(_to_token_ids)

train_ae.head()

0    [26, 4, 1, 79, 2077, 44, 1059, 11, 98, 146, 38...
1    [3, 393, 119, 353, 1, 1350, 2947, 6, 51, 51, 1...
2    [9, 190, 10, 12, 3, 393, 94, 5, 1133, 54, 19, ...
3    [689, 221, 3, 232, 116, 3, 119, 426, 3595, 128...
4    [111, 7, 1, 54, 4, 287, 6, 3, 2074, 1438, 18, ...
Name: review, dtype: object

# Padding

In [6]:
# Token count of every training review, computed once and reused for all
# three summary statistics.
sequence_lengths = train_ae.apply(len)
print('sequence_length mean: ', sequence_lengths.mean())
print('sequence_length median: ', sequence_lengths.median())
print('sequence_length std deviation: ', np.sqrt(sequence_lengths.var()))
# Fixed length every review is padded / truncated to.
MAX_SEQUENCE_LENGTH = 300 # mean + std deviation*0.5

sequence_length mean:  217.23727272727274
sequence_length median:  164.0
sequence_length std deviation:  158.58784251928319


In [7]:
# Bring every review to exactly MAX_SEQUENCE_LENGTH ids: shorter reviews are
# padded at the end, longer ones are cut at the end.
_pad_options = dict(maxlen=MAX_SEQUENCE_LENGTH, padding='post', truncating='post')
train_ae = keras.preprocessing.sequence.pad_sequences(train_ae, **_pad_options)
val_ae = keras.preprocessing.sequence.pad_sequences(val_ae, **_pad_options)
test_ae = keras.preprocessing.sequence.pad_sequences(test_ae, **_pad_options)
print(train_ae)

[[  26    4    1 ...    0    0    0]
 [   3  393  119 ...    0    0    0]
 [   9  190   10 ...    0    0    0]
 ...
 [   1 3047    5 ...  391    3  701]
 [ 528   10   26 ...    0    0    0]
 [   9  152   24 ...    0    0    0]]


# Word to Vector

In [8]:
# spaCy pipeline used to obtain one vector per vocabulary word.
# NOTE(review): "en_core_web_sm" is a small model; confirm its `.vector`
# output is the kind of word embedding intended here.
word2vec_model = spacy.load("en_core_web_sm")

df_index_word = pd.Series(tokenizer.index_word) # Get all the words in the vocabulary
# Keep the first MAX_NUM_TOKENS-1 entries (explicitly positional), leaving
# one slot for the placeholder row added below.
df_index_word = df_index_word.iloc[:MAX_NUM_TOKENS-1]
# Prepend a placeholder row so that it ends up with token_id 0, an id the
# tokenizer's index_word does not use. `pd.concat` replaces `Series.append`,
# which was removed in pandas 2.0.
df_index_word = pd.concat([pd.Series(["temp_oov_word"]), df_index_word])
df_index_word = df_index_word.reset_index() # New column for token_id
df_index_word.columns = ['token_id', 'token']

df_index_word['word2vec'] = df_index_word.token.apply(lambda x: word2vec_model(x).vector) # Embedding for each word
df_index_word.at[0, "word2vec"] = np.zeros_like(df_index_word.at[0, "word2vec"]) # Replace the embedding for "temp_oov_word" with 0s
df_index_word.head()

Unnamed: 0,token_id,token,word2vec
0,0,temp_oov_word,"[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ..."
1,1,the,"[-0.2032743, -0.4009009, -0.73730505, 0.536398..."
2,2,and,"[-0.9291044, -0.13172856, -0.814826, 0.6040276..."
3,3,a,"[0.02819018, 0.009864252, -0.095220566, 0.5522..."
4,4,of,"[-1.1193825, 0.18575507, -0.59173, -0.4724092,..."


In [9]:
# Stack the per-token vectors into one 2-D array; row i is the vector of
# token_id i.
embedding_matrix = np.array(df_index_word["word2vec"].tolist())
print(embedding_matrix.shape) # MAX_NUM_TOKENS x vector dim 
embedding_matrix[:3, :5]

(10000, 96)


array([[ 0.        ,  0.        ,  0.        ,  0.        ,  0.        ],
       [-0.2032743 , -0.4009009 , -0.73730505,  0.5363982 ,  0.3764372 ],
       [-0.9291044 , -0.13172856, -0.814826  ,  0.6040276 , -0.67340827]],
      dtype=float32)

In [10]:
# Frozen embedding layer initialised from the precomputed vectors. It is used
# here as a lookup table that turns padded id sequences into float tensors.
EMBEDDING_DIM = embedding_matrix.shape[1] #96
pretrained_vectors = keras.initializers.Constant(embedding_matrix)
embedding_layer = keras.layers.Embedding(
    input_dim=MAX_NUM_TOKENS,
    output_dim=EMBEDDING_DIM,
    embeddings_initializer=pretrained_vectors,
    input_length=MAX_SEQUENCE_LENGTH,
    mask_zero=True,
    trainable=False,
)

# Sanity check on a single padded review.
embedding_result = embedding_layer(inputs=train_ae[0])
print(embedding_result.shape) # MAX_SEQUENCE_LENGTH x EMBEDDING_DIM
print("Embeddings of the first 5 words (each vector is restricted to size 5)")
print(train_ae[0][:5])
print(embedding_result[:5, :5]) 

# Replace the id arrays of every split with their embedded tensors.
train_ae, val_ae, test_ae = [
    embedding_layer(inputs=split) for split in (train_ae, val_ae, test_ae)
]

(300, 96)
Embeddings of the first 5 words (each vector is restricted to size 5)
[  26    4    1   79 2077]
tf.Tensor(
[[ 0.49959356  0.704657    0.4497372   0.32913145  0.01701519]
 [-1.1193825   0.18575507 -0.59173    -0.4724092   0.04607756]
 [-0.2032743  -0.4009009  -0.73730505  0.5363982   0.3764372 ]
 [-0.33528453 -0.5446908   0.7285543  -0.51831293  0.10952553]
 [-0.39640257  0.09997782 -0.0515418   0.10987999  0.6886112 ]], shape=(5, 5), dtype=float32)


# Convolutional Autoencoder

In [11]:
# Encoder: compresses an embedded review (MAX_SEQUENCE_LENGTH x EMBEDDING_DIM)
# into a 50-dimensional code. Sequence lengths after each stage, as reported
# by the model summary: 300 -> 147 -> 73 -> 58 -> 14 -> 1.
conv_encoder = keras.models.Sequential([
    keras.layers.Conv1D(filters=96, kernel_size=8, strides=2, input_shape=[MAX_SEQUENCE_LENGTH, EMBEDDING_DIM]),
    keras.layers.MaxPool1D(pool_size=2),
    keras.layers.Activation('selu'),
    keras.layers.BatchNormalization(),

    keras.layers.Conv1D(filters=128, kernel_size=16, strides=1),
    keras.layers.MaxPool1D(pool_size=4),
    keras.layers.Activation('selu'),
    keras.layers.BatchNormalization(),

    # kernel_size equals the remaining length (14), so this collapses the
    # sequence axis to a single step with 50 channels.
    keras.layers.Conv1D(filters=50, kernel_size=14),
    keras.layers.Reshape([50])
])
conv_encoder.summary()

# Decoder: expands the 50-dimensional code back to the input shape.
conv_decoder = keras.models.Sequential([
    # kernel size and strides are chosen such that we get 300x96 = 28800 = 180x160 size.
    # Lengths x channels: (50, 1) -> (57, 64) -> (144, 128) -> (180, 160).
    keras.layers.Reshape([50,1], input_shape=[50]),
    # The stray `input_shape=[1,50]` that used to sit on this layer was
    # removed: it is not the first layer of the model and the value
    # contradicted the (50, 1) input it actually receives.
    keras.layers.Conv1DTranspose(64, kernel_size=8, padding='valid'),
    keras.layers.BatchNormalization(),
    keras.layers.Conv1DTranspose(128, kernel_size=32, strides=2, padding='valid', activation='selu'),
    keras.layers.BatchNormalization(),
    keras.layers.Conv1DTranspose(160, kernel_size=37, strides=1, padding='valid', activation='selu'),
    keras.layers.BatchNormalization(),
    keras.layers.Reshape([MAX_SEQUENCE_LENGTH, EMBEDDING_DIM])
])
conv_decoder.summary()

# Full autoencoder: encoder followed by decoder.
conv_ae = keras.models.Sequential([conv_encoder, conv_decoder])

Model: "sequential"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
conv1d (Conv1D)              (None, 147, 96)           73824     
_________________________________________________________________
max_pooling1d (MaxPooling1D) (None, 73, 96)            0         
_________________________________________________________________
activation (Activation)      (None, 73, 96)            0         
_________________________________________________________________
batch_normalization (BatchNo (None, 73, 96)            384       
_________________________________________________________________
conv1d_1 (Conv1D)            (None, 58, 128)           196736    
_________________________________________________________________
max_pooling1d_1 (MaxPooling1 (None, 14, 128)           0         
_________________________________________________________________
activation_1 (Activation)    (None, 14, 128)           0

In [12]:
# Train the autoencoder to reproduce its own input: the embedded reviews act
# as both features and targets, scored with mean squared error.
# metrics='accuracy' is useful only when doing classification
conv_ae.compile(loss='mse', optimizer='adam')
conv_ae.fit(
    x=train_ae,
    y=train_ae,
    validation_data=(val_ae, val_ae),
    batch_size=256,
    epochs=10,
)

Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10


<tensorflow.python.keras.callbacks.History at 0x20106503608>

# Pre-Trained Classifier

In [13]:
# Start the classifier from the trained encoder.
# `clone_model` only copies the architecture and creates freshly initialised
# weights, so the trained encoder weights must be copied over explicitly.
# Without this step the "pre-trained" classifier starts from random weights.
encoder_classifier = keras.models.clone_model(conv_encoder)
encoder_classifier.set_weights(conv_encoder.get_weights())

# Classification head on top of the 50-dimensional code.
# NOTE(review): `activity_regularizer` penalises layer outputs, not weights -
# confirm that `kernel_regularizer` was not the intent.
encoder_classifier.add(keras.layers.Dense(40, activation='selu', activity_regularizer='l2'))
encoder_classifier.add(keras.layers.Dense(20, activation='selu', activity_regularizer='l2'))
encoder_classifier.add(keras.layers.Dense(1, activation='sigmoid'))
encoder_classifier.summary()

Model: "sequential"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
conv1d (Conv1D)              (None, 147, 96)           73824     
_________________________________________________________________
max_pooling1d (MaxPooling1D) (None, 73, 96)            0         
_________________________________________________________________
activation (Activation)      (None, 73, 96)            0         
_________________________________________________________________
batch_normalization (BatchNo (None, 73, 96)            384       
_________________________________________________________________
conv1d_1 (Conv1D)            (None, 58, 128)           196736    
_________________________________________________________________
max_pooling1d_1 (MaxPooling1 (None, 14, 128)           0         
_________________________________________________________________
activation_1 (Activation)    (None, 14, 128)           0

In [14]:
# Stop once validation accuracy has not improved for 3 epochs and roll back
# to the best weights seen.
early_stopping = keras.callbacks.EarlyStopping(monitor="val_accuracy", patience=3, verbose=1, restore_best_weights=True)
# `metrics` is passed as a list, the form documented for `compile`.
encoder_classifier.compile(optimizer='adam', loss='binary_crossentropy', metrics=['accuracy'])
# The callback has to be handed to `fit`; previously it was created but never
# used, so training always ran for the full 30 epochs.
encoder_classifier.fit(
    x=train_ae,
    y=tf.convert_to_tensor(train_y),
    epochs=30,
    batch_size=256,
    validation_data=(val_ae, tf.convert_to_tensor(val_y)),
    callbacks=[early_stopping],
)

Epoch 1/30
Epoch 2/30
Epoch 3/30
Epoch 4/30
Epoch 5/30
Epoch 6/30
Epoch 7/30
Epoch 8/30
Epoch 9/30
Epoch 10/30
Epoch 11/30
Epoch 12/30
Epoch 13/30
Epoch 14/30
Epoch 15/30
Epoch 16/30
Epoch 17/30
Epoch 18/30
Epoch 19/30
Epoch 20/30
Epoch 21/30
Epoch 22/30
Epoch 23/30
Epoch 24/30
Epoch 25/30
Epoch 26/30
Epoch 27/30
Epoch 28/30
Epoch 29/30
Epoch 30/30


<tensorflow.python.keras.callbacks.History at 0x2011fe50cc8>

# Encoded Features as Input

In [16]:
# Encode the reviews once with the trained encoder and use the resulting
# 50-dimensional codes as fixed input features.
train_new = conv_encoder.predict(train_ae)
val_new = conv_encoder.predict(val_ae)

# Small dense classifier operating on the encoded features only.
classifier = keras.models.Sequential([
    keras.layers.Dense(40, activation='relu', input_shape=[50]),
    keras.layers.Dense(20, activation='relu'),
    keras.layers.Dense(1, activation='sigmoid')
])
classifier.summary()
# `metrics` is passed as a list, the form documented for `compile`, instead of
# relying on a bare string being coerced.
classifier.compile(optimizer='adam', loss='binary_crossentropy', metrics=['accuracy'])
classifier.fit(
    x=train_new,
    y=tf.convert_to_tensor(train_y),
    epochs=30,
    batch_size=256,
    validation_data=(val_new, tf.convert_to_tensor(val_y)),
)

Model: "sequential_3"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
dense_3 (Dense)              (None, 40)                2040      
_________________________________________________________________
dense_4 (Dense)              (None, 20)                820       
_________________________________________________________________
dense_5 (Dense)              (None, 1)                 21        
Total params: 2,881
Trainable params: 2,881
Non-trainable params: 0
_________________________________________________________________
Epoch 1/30
Epoch 2/30
Epoch 3/30
Epoch 4/30
Epoch 5/30
Epoch 6/30
Epoch 7/30
Epoch 8/30
Epoch 9/30
Epoch 10/30
Epoch 11/30
Epoch 12/30
Epoch 13/30
Epoch 14/30
Epoch 15/30
Epoch 16/30
Epoch 17/30
Epoch 18/30
Epoch 19/30
Epoch 20/30
Epoch 21/30
Epoch 22/30
Epoch 23/30
Epoch 24/30
Epoch 25/30
Epoch 26/30
Epoch 27/30
Epoch 28/30
Epoch 29/30
Epoch 30/30


<tensorflow.python.keras.callbacks.History at 0x204f2c01848>