# NLP project

In this project I will use NLP on the IMDB dataset. The first step is to read this dataset and prepare it for the NLP model.

In [8]:
import pandas as pd
import numpy as np
from keras.preprocessing.text import Tokenizer
from keras.layers import Input, LSTM, Dense, Embedding
from keras.models import Model
from keras.utils import pad_sequences

# Read in the dataset
data = pd.read_csv("IMDB Dataset.csv")



In [9]:
data.head()

Unnamed: 0,review,sentiment
0,One of the other reviewers has mentioned that ...,positive
1,A wonderful little production. <br /><br />The...,positive
2,I thought this was a wonderful way to spend ti...,positive
3,Basically there's a family where a little boy ...,negative
4,"Petter Mattei's ""Love in the Time of Money"" is...",positive


In [21]:
# Convert text to lowercase
data['review'] = data['review'].apply(lambda x: x.lower())

# Tokenize the text
review_tokenizer = Tokenizer()
review_tokenizer.fit_on_texts(data['review'])
sentiment_tokenizer = Tokenizer()
sentiment_tokenizer.fit_on_texts(data['sentiment'])

# Convert text to sequences of integers
review_sequences = review_tokenizer.texts_to_sequences(data['review'])
sentiment_sequences = sentiment_tokenizer.texts_to_sequences(data['sentiment'])

# Pad sequences to a fixed length
max_sequence_length = 100
review_data = pad_sequences(review_sequences, maxlen=max_sequence_length, padding='post')
sentiment_data = pad_sequences(sentiment_sequences, maxlen=max_sequence_length, padding='post')

In this code, we first read in the dataset. We then convert the text to lowercase and tokenize the text using Keras' Tokenizer class. We also pad the sequences to a fixed length of 100.

Next, we split the data into training and validation sets.

In [22]:
review_data

array([[   34,  1637,     9, ...,   125,  4103,   486],
       [ 9719,    31,     1, ...,  1977,    69,   221],
       [ 3059,    12,  2971, ...,    63,    16,   350],
       ...,
       [   26,     3,  1156, ..., 22840,     2,  6050],
       [    5,    68,   135, ...,    67,   739,    42],
       [  699,   479,    11, ...,   794,    11,    17]])

In [15]:
sentiment_data

array([[1, 0, 0, ..., 0, 0, 0],
       [1, 0, 0, ..., 0, 0, 0],
       [1, 0, 0, ..., 0, 0, 0],
       ...,
       [2, 0, 0, ..., 0, 0, 0],
       [2, 0, 0, ..., 0, 0, 0],
       [2, 0, 0, ..., 0, 0, 0]])

In [23]:
# Split the data into training and validation sets
from sklearn.model_selection import train_test_split

X_train, X_val, y_train, y_val = train_test_split(review_data, sentiment_data, test_size=0.2, random_state=42)


We can now define our encoder and decoder models.

In [24]:
# Define input sequence
encoder_inputs = Input(shape=(None,))
# Define output sequence
decoder_inputs = Input(shape=(None,))

# Define encoder embedding layer
encoder_embedding = Embedding(len(review_tokenizer.word_index) + 1, 256)
encoder_embedding_output = encoder_embedding(encoder_inputs)

# Define encoder LSTM layer
encoder_lstm = LSTM(256, return_state=True)
encoder_outputs, state_h, state_c = encoder_lstm(encoder_embedding_output)
encoder_states = [state_h, state_c]

# Define decoder embedding layer
decoder_embedding = Embedding(len(sentiment_tokenizer.word_index) + 1, 256)
decoder_embedding_output = decoder_embedding(decoder_inputs)

# Define decoder LSTM layer
decoder_lstm = LSTM(256, return_sequences=True, return_state=True)
decoder_outputs, _, _ = decoder_lstm(decoder_embedding_output, initial_state=encoder_states)

# Define output layer
decoder_dense = Dense(len(sentiment_tokenizer.word_index) + 1, activation='softmax')
decoder_outputs = decoder_dense(decoder_outputs)

# Define model
model = Model([encoder_inputs, decoder_inputs], decoder_outputs)

In this code, we define the input and output sequences, as well as the embedding and LSTM layers for the encoder and decoder. We also define the output layer and the entire model.

We can now compile and train the model.

In [25]:
from keras.utils import to_categorical


# Compile model
model.compile(optimizer='adam', loss='categorical_crossentropy', metrics=['accuracy'])

# Train model
# Define batch size and number of epochs
batch_size = 64
epochs = 10

# Define generator for training data
def generate_batch(X=X_train, y=y_train, batch_size=batch_size):
    while True:
        for i in range(0, len(X), batch_size):
            encoder_input_data = X[i:i + batch_size]
            decoder_input_data = y[i:i + batch_size, :-1]
            decoder_output_data = y[i:i + batch_size, 1:]
            encoder_input_data = np.array(encoder_input_data)
            decoder_input_data = np.array(decoder_input_data)
            decoder_output_data = np.array(decoder_output_data)
            decoder_output_data = to_categorical(decoder_output_data, num_classes=len(sentiment_tokenizer.word_index) + 1)
            yield ([encoder_input_data, decoder_input_data], decoder_output_data)

# Define generator for validation data
def generate_validation(X=X_val, y=y_val):
    encoder_input_data = np.array(X)
    decoder_input_data = np.array(y[:, :-1])
    decoder_output_data = np.array(y[:, 1:])
    decoder_output_data = to_categorical(decoder_output_data, num_classes=len(sentiment_tokenizer.word_index) + 1)
    return ([encoder_input_data, decoder_input_data], decoder_output_data)

# Train model
model.fit_generator(generator=generate_batch(),
                    steps_per_epoch=len(X_train)//batch_size,
                    epochs=epochs,
                    validation_data=generate_validation(),
                    validation_steps=len(X_val)//batch_size)

  model.fit_generator(generator=generate_batch(),


Epoch 1/10
Epoch 2/10
Epoch 3/10
  8/625 [..............................] - ETA: 22:10 - loss: 1.2041e-08 - accuracy: 1.0000

KeyboardInterrupt: 