# Preparing the Data

In [None]:
import tensorflow as tf
import pyprind
import pandas as pd
from string import punctuation
import re
import numpy as np
# Raise TensorFlow's logging threshold to ERROR to keep cell output quiet.
# NOTE(review): `tf.logging` is the TensorFlow 1.x spelling — confirm the
# installed TensorFlow version still exposes it.
tf.logging.set_verbosity(tf.logging.ERROR)

In [None]:
# Read the review dataset and keep only its first 4000 rows, selected by
# position.
df = pd.read_csv('movie_data.csv', encoding='utf-8').iloc[:4000]

In [None]:
# Preprocessing the data:
#   1. surround every punctuation character with spaces and lowercase the text
#   2. count how often each whitespace-separated token occurs
#   3. map each token to an integer, most frequent token first, starting at 1
#   4. convert every review into its list of integers

# Separate words and count each word's occurrence
from collections import Counter

# Translation table turning each punctuation character c into ' c ', so that
# punctuation marks become stand-alone tokens once the text is split().
# str.translate applies it in a single pass per review.
punct_spacer = str.maketrans({c: ' ' + c + ' ' for c in punctuation})

counts = Counter()
cleaned_reviews = []
pbar = pyprind.ProgBar(len(df['review']), title='Counting words occurrences')
for review in df['review']:
    text = review.translate(punct_spacer).lower()
    cleaned_reviews.append(text)
    pbar.update()
    counts.update(text.split())

# Store the cleaned texts with one positional column assignment.  Writing
# row by row through df.loc[i, 'review'] with a running counter treats the
# counter as an index *label*, which only addresses the intended row when the
# index is exactly 0..n-1 (otherwise .loc appends new rows).
df['review'] = cleaned_reviews

# Create a mapping
# Map each unique word to an integer.  Despite its name, `word_counts` holds
# the words themselves, ordered from most to least frequent; numbering starts
# at 1, so 0 is never assigned to a word.
word_counts = sorted(counts, key=counts.get, reverse=True)
print(word_counts[:5])
word_to_int = {word: ii for ii, word in enumerate(word_counts, 1)}


mapped_reviews = []
pbar = pyprind.ProgBar(len(df['review']), title='Map reviews to ints')
for text in cleaned_reviews:
    mapped_reviews.append([word_to_int[word] for word in text.split()])
    pbar.update()

In [None]:
# Define same-length sequences
# if sequence length < 200: left-pad with zeros
# if sequence length > 200: use the last 200 elements
# if the review has no tokens at all: keep the row as all zeros

sequence_length = 200   # (Known as T in our RNN formulas)
sequences = np.zeros((len(mapped_reviews), sequence_length), dtype=int)

for i, row in enumerate(mapped_reviews):
    # Keep at most the last `sequence_length` token ids of this review.
    tail = row[-sequence_length:]
    # An empty review must be skipped: with zero tokens the target slice
    # `-0:` would select the entire row, and assigning an empty array to it
    # raises a broadcasting ValueError.
    if tail:
        # Right-align the tokens; the untouched left part stays zero.
        sequences[i, -len(tail):] = tail

In [None]:
# Split by position: the first `train_size` reviews are used for training,
# the remaining ones for testing.
# Labels are sliced positionally from the underlying array.  A label-based
# slice such as df.loc[:2000, 'sentiment'] includes the end label, which
# yields 2001 labels for the 2000 training rows.
train_size = 2000
sentiment_labels = df['sentiment'].values

X_train = sequences[:train_size, :]
y_train = sentiment_labels[:train_size]
X_test = sequences[train_size:, :]
y_test = sentiment_labels[train_size:]

In [None]:
# Fix the seed of NumPy's global random number generator.  Nothing visible
# in this notebook draws from it directly, so this presumably targets code
# imported further down — confirm.
np.random.seed(123)  # for reproducibility


# Define a function to generate mini-batches:
def create_batch_generator(x, y=None, batch_size=64):
    """Yield consecutive, full-size mini-batches of ``x`` (and ``y``).

    Trailing items that do not fill a complete batch are dropped, so every
    yielded batch has exactly ``batch_size`` items.

    Args:
        x: Sliceable sequence of inputs.
        y: Optional sliceable sequence of targets, batched in step with ``x``.
        batch_size: Number of items per batch.

    Yields:
        ``x_batch`` when ``y`` is None, otherwise ``(x_batch, y_batch)``.
    """
    # Number of leading items that fit into whole batches.
    usable = (len(x) // batch_size) * batch_size
    features = x[:usable]

    if y is None:
        for start in range(0, len(features), batch_size):
            yield features[start:start + batch_size]
        return

    targets = y[:usable]
    for start in range(0, len(features), batch_size):
        stop = start + batch_size
        yield features[start:stop], targets[start:stop]

# Building an RNN Model

In [None]:
from sentimentrnn import create_batch_generator, SentimentRNN

# Vocabulary size passed to the model: the largest word index plus one,
# presumably so that index 0 (the padding value in `sequences`) is covered
# as well — confirm against SentimentRNN.
n_words = max(word_to_int.values()) + 1

# NOTE(review): the meaning of each hyperparameter is defined by SentimentRNN
# in the `sentimentrnn` module, which is not visible here.
rnn = SentimentRNN(
    n_words=n_words,
    seq_len=sequence_length,
    embed_size=256,
    lstm_size=128,
    num_layers=1,
    batch_size=200,
    learning_rate=0.001,
)

In [None]:
# Train the model on the training split, requesting 20 epochs.
rnn.train(X_train, y_train, num_epochs=20)

In [None]:
# Score the model on the test split.  Labels are truncated to len(preds)
# because predict() may return fewer predictions than there are test rows
# (presumably it only processes whole batches — confirm).
preds = rnn.predict(X_test)
y_true = y_test[:len(preds)]
n_correct = np.sum(preds == y_true)
accuracy = n_correct / len(y_true)
print(f'Test Acc.: {accuracy:.3f}')