# Sentiment Analysis IMDB

This notebook shows a simple, straightforward way to achieve 90% accuracy on the IMDB dataset. Note that this is not the only way to achieve such accuracy.

## Load Data

In [None]:
import nlp_proj_utils as utils
import pandas as pd

# Widen the column display so that long review texts are not cut off early
pd.options.display.max_colwidth = 500

# Fetch the IMDB train/test data frames (the helper is said to download them if necessary)
train, test = utils.get_imdb_dataset()

# Show three randomly chosen rows of the training frame
train.sample(3)

## Prepare Data 

In this part, I remove all HTML tags, punctuation and stopwords from the dataset. To reach a higher accuracy, I select the 3000 most common words in the training data, and only the words in this list are kept for further analysis.
1. Remove HTML tag (<br /> in this case) from the review text
2. Remove punctuations (replace with whitespace)
3. Split review text into tokens
4. Remove tokens that are considered as "stopwords"
5. For the rest, do lemmatization

In [None]:
import string
import nltk

# Translation table that maps every ASCII punctuation character to a single space,
# so stripping punctuation never glues neighbouring words together.
transtbl = str.maketrans(string.punctuation, ' '*len(string.punctuation))

# Kept as a set: preprocessing() tests every token of every review for membership,
# and a set lookup is constant-time where the original list needed a linear scan.
stopwords = set(nltk.corpus.stopwords.words('english'))

# Shared lemmatizer instance reused by preprocessing()
lemmatizer = nltk.WordNetLemmatizer()

In [None]:
def preprocessing(line: str) -> str:
    """
    Clean one raw review and return its tokens joined by single spaces.

    Steps: replace ``<br />`` tags and punctuation with spaces, tokenize,
    lowercase, drop stopwords, then lemmatize each remaining token as a verb.

    Args:
        line: raw review text.

    Returns:
        The preprocessed tokens concatenated by whitespace.
    """
    # Substitute a space (not an empty string) for the tag so that words on
    # either side of a line break, e.g. "great<br />acting", stay separate.
    line = line.replace('<br />', ' ').translate(transtbl)

    # Lowercase each token once and reuse it for both the stopword test
    # and the lemmatizer.
    lowered = (t.lower() for t in nltk.word_tokenize(line))
    tokens = [lemmatizer.lemmatize(t, 'v') for t in lowered if t not in stopwords]

    return ' '.join(tokens)

# Quick sanity check of the pipeline on a toy sentence that contains an HTML line break
preprocessing("I bought several books yesterday<br /> and I really love them!")

In [None]:
# `tqdm._tqdm_notebook` is a private, deprecated shim; `tqdm.notebook` is the
# public home of the notebook progress bar. Fall back only for old tqdm releases.
try:
    from tqdm.notebook import tqdm as tqdm_notebook
except ImportError:
    from tqdm._tqdm_notebook import tqdm_notebook

# Register `progress_apply` on pandas objects
tqdm_notebook.pandas()

# Add the cleaned text as a new `text_prep` column on both splits
for df in train, test:
    df['text_prep'] = df['text'].progress_apply(preprocessing)

In [None]:
# Spot-check two random rows to compare the raw `text` with the new `text_prep` column
train.sample(2)

### Keep the most common words

In [None]:
# Flatten every preprocessed training review into one long list of tokens
all_words = []
for review in tqdm_notebook(train['text_prep']):
    all_words.extend(review.split())

In [None]:
# Use FreqDist to count how often each token occurs in the training reviews
voca = nltk.FreqDist(all_words)
# Print the distribution's summary as a quick check of the vocabulary size
print(voca)

In [None]:
# Peek at the ten most frequent tokens and their counts
voca.most_common(10)

In [None]:
# Keep only the tokens (dropping the counts) of the 3000 most frequent entries
topwords = [entry[0] for entry in voca.most_common(3000)]

In [None]:
# import 
import numpy as np
import nlp_proj_utils as utils
from tensorflow.keras.models import Model  
from tensorflow.keras.layers import Dense, Input, Dropout, LSTM, Activation, Embedding
from tensorflow.keras.preprocessing import sequence

import matplotlib.pyplot as plt
# Render matplotlib figures inline in the notebook, at high-DPI ("retina") resolution
%matplotlib inline
%config InlineBackend.figure_format='retina'


# Seed NumPy's global random generator; note that this call does not seed TensorFlow
np.random.seed(1)

In [None]:
# Load the pre-trained GloVe lookup tables: word -> vocabulary index, and word -> embedding vector
word_to_index, word_to_vec_map = utils.load_glove_vecs()

### Select the first 200 words for embedding

In [None]:
# Sequence length per review: passed to sentences_to_indices and used as the model's input dimension
maxlen = 200
print('max number of words in a sentence:', maxlen)

In [None]:
# Turn the preprocessed text of each split into index lists with one shared call,
# so both splits are guaranteed to use the same vocabulary, length and word filter
train_text, test_text = (
    utils.sentences_to_indices(frame['text_prep'], word_to_index, maxlen, topwords)
    for frame in (train, test)
)

In [None]:
# Inspect the converted training features
# (presumably an integer array of shape (n_reviews, maxlen) — TODO confirm against utils.sentences_to_indices)
train_text

Convert the sentiment labels to integers: 1 for `pos`, 0 otherwise.

In [None]:
# Vectorised comparison instead of a per-row Python lambda: 'pos' -> 1, anything else -> 0.
# The explicit int64 cast keeps the dtype the same on every platform.
train_y = (train['sentiment'] == 'pos').astype('int64')
test_y = (test['sentiment'] == 'pos').astype('int64')

### Embedding layer

In [None]:
def pretrained_embedding_layer(word_to_index, word_to_vec_map):
    """
    Build a frozen Keras Embedding layer initialised from pre-trained word vectors.

    Args:
        word_to_index (dict[str->int]): map from a word to its index in vocabulary
        word_to_vec_map (dict[str->np.ndarray]): map from a word to a vector with shape (N,) where N is the length of a word vector (50 in our case)

    Return:
        Keras.layers.Embedding: non-trainable Embedding layer whose row ``i`` holds
        the vector of the word mapped to index ``i``

    Raises:
        ValueError: if word_to_vec_map is empty, so the vector length cannot be determined
        KeyError: if a word in word_to_index has no entry in word_to_vec_map
    """
    if not word_to_vec_map:
        raise ValueError("word_to_vec_map is empty; cannot infer the embedding dimension")

    # One extra row so that the largest index is a valid row of the matrix
    # (assumes indices start at 1, leaving row 0 as all zeros — TODO confirm against the loader)
    vocab_len = len(word_to_index) + 1

    # Peek at a single vector for its length instead of copying every value into a list
    emb_dim = next(iter(word_to_vec_map.values())).shape[0]

    # Start from zeros with shape (vocab_len, emb_dim); rows never assigned below stay zero
    emb_matrix = np.zeros((vocab_len, emb_dim))

    # Row "index" of the matrix becomes the vector of the word with that index
    for word, index in word_to_index.items():
        emb_matrix[index, :] = word_to_vec_map[word]

    # The vectors are pre-trained, so the layer is frozen (trainable=False):
    # training updates only the layers stacked on top of it.
    return Embedding(
        input_dim=vocab_len,
        output_dim=emb_dim,
        trainable=False,
        weights=[emb_matrix])

## Build a LSTM Model

I will train a two-layer LSTM model on the data.

In [None]:
def build_model(input_dim, word_to_index, word_to_vec_map):
    """
    Assemble the sentiment classifier: embedding -> 2 stacked LSTMs -> sigmoid unit.

    Args:
        input_dim: number of word indices per input sequence
        word_to_index (dict[str->int]): map from a word to its index in vocabulary
        word_to_vec_map (dict[str->np.ndarray]): map from a word to a vector with shape (N,) where N is the length of a word vector (50 in our case)

    Returns:
        Keras.models.Model: uncompiled 2-layer LSTM model with a single sigmoid output
    """
    # Each sample is a fixed-length sequence of integer word indices
    token_ids = Input(shape=(input_dim,), dtype='int32')

    # Look the indices up in the pre-trained embedding
    glove_lookup = pretrained_embedding_layer(word_to_index, word_to_vec_map)
    hidden = glove_lookup(token_ids)

    # First LSTM is N -> N: it returns the hidden state of every time step
    hidden = LSTM(128, return_sequences=True, recurrent_dropout=0.5)(hidden)
    hidden = Dropout(rate=0.8)(hidden)

    # Second LSTM is N -> 1: only the final hidden state is passed on
    hidden = LSTM(128, recurrent_dropout=0.5)(hidden)
    hidden = Dropout(rate=0.8)(hidden)

    # Single sigmoid unit for the binary sentiment output
    sentiment_prob = Dense(1, activation='sigmoid')(hidden)

    return Model(inputs=token_ids, outputs=sentiment_prob)

In [None]:
# Instantiate the network for sequences of `maxlen` word indices
imdb_model = build_model(
    input_dim=maxlen,
    word_to_index=word_to_index,
    word_to_vec_map=word_to_vec_map,
)

In [None]:
# Print the layer-by-layer architecture with output shapes and parameter counts
imdb_model.summary()

## Compile the Model

In [None]:
# Binary cross-entropy pairs with the single sigmoid output unit created in build_model;
# accuracy is tracked as an additional metric alongside the loss.
imdb_model.compile(
    loss='binary_crossentropy', 
    optimizer='adam',
    metrics=['accuracy'])

In [None]:
# Baseline run: trains for the full 200 epochs with no callbacks attached
history = imdb_model.fit(
    train_text,
    train_y,
    epochs=200,
    shuffle=True,
    # Keras documents validation_data as a tuple (x_val, y_val); a list is
    # not unpacked into features/labels by every tf.keras version.
    validation_data=(test_text, test_y),
)

utils.plot_history(history, ['loss', 'val_loss'])

utils.plot_history(history, ['acc', 'val_acc'])

imdb_model.evaluate(train_text, train_y)
imdb_model.evaluate(test_text, test_y)

## Callbacks

Callbacks (aka hooks) are functions called every N epochs that help you monitor and log the training process. By default, they will be called every 1 epoch. We will be using two common callbacks here: `EarlyStopping` and `ModelCheckpoint`. The first is used to prevent overfitting and the second is used to keep track of the best models we got so far.

In [None]:
from tensorflow.keras.callbacks import EarlyStopping
from tensorflow.keras.callbacks import ModelCheckpoint

In [None]:
CPK_PATH = 'model_cpk.hdf5'    # path to store checkpoint

# Stop training once validation loss has gone 20 epochs without improvement
early_stoppping_hook = EarlyStopping(monitor='val_loss', patience=20)

# Write a checkpoint to CPK_PATH only when validation loss beats the best seen so far
model_cpk_hook = ModelCheckpoint(CPK_PATH, monitor='val_loss', save_best_only=True)

## Train the Model

In [None]:
history = imdb_model.fit(
    train_text,
    train_y,
    epochs=200,  # upper bound; early stopping may end training sooner
    shuffle=True,
    # Keras documents validation_data as a tuple (x_val, y_val)
    validation_data=(test_text, test_y),
    # The hooks must be passed here to take effect: without them there is no
    # early stopping and the checkpoint file at CPK_PATH is never written,
    # so the later load_weights(CPK_PATH) would fail.
    callbacks=[early_stoppping_hook, model_cpk_hook],
)
print('Training finished')

## Evaluation

Load the best model and do evaluation:

In [None]:
# Load the model checkpoint
imdb_model.load_weights(CPK_PATH)

# Accuracy on validation 
imdb_model.evaluate(test_text, test_y)

## Save the model and data

In [None]:
import pickle
import h5py
import os

In [None]:
# Directory that receives the serialized model; created on demand
model_root = 'resources/model'
os.makedirs(model_root, exist_ok=True)

json_path = os.path.join(model_root, "model.json")
weights_path = os.path.join(model_root, "weights.h5")

# Architecture goes to JSON ...
with open(json_path, "w") as json_file:
    json_file.write(imdb_model.to_json())

# ... and the learned weights to a separate HDF5 file
imdb_model.save_weights(weights_path)

In [None]:
# Persist the lookup tables needed to preprocess text at inference time,
# each as its own pickle file in the current working directory
for filename, payload in (
    ('word_to_index.pkl', word_to_index),
    ('word_to_vec_map.pkl', word_to_vec_map),
    ('topwords.pkl', topwords),
):
    with open(filename, 'wb') as fp:
        pickle.dump(payload, fp)