### Import libraries

In [1]:
import pandas as pd
import numpy as np

import tensorflow as tf
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences

### Load data

In [2]:
# Read the IMDB reviews CSV (path is relative to the working directory) and
# preview the first rows.
csv_path = 'IMDB Dataset.csv'
dataset = pd.read_csv(csv_path)
dataset.head()

Unnamed: 0,review,sentiment
0,One of the other reviewers has mentioned that ...,positive
1,A wonderful little production. <br /><br />The...,positive
2,I thought this was a wonderful way to spend ti...,positive
3,Basically there's a family where a little boy ...,negative
4,"Petter Mattei's ""Love in the Time of Money"" is...",positive


### Preprocess data

In [3]:
# Encode the text labels as integers: 'positive' -> 1, 'negative' -> 0.
# The column is overwritten in place, so the cell must be safe to re-run:
# mapping an already-encoded column would turn every label into NaN.
sentiment_to_label = {'positive': 1, 'negative': 0}
if not pd.api.types.is_numeric_dtype(dataset['sentiment']):
    encoded = dataset['sentiment'].map(sentiment_to_label)
    # Series.map yields NaN for values missing from the dict; fail loudly
    # instead of letting NaN labels reach the loss function.
    if encoded.isna().any():
        unexpected = dataset.loc[encoded.isna(), 'sentiment'].unique()
        raise ValueError(f"Unexpected sentiment values: {unexpected!r}")
    dataset['sentiment'] = encoded.astype('int64')

In [4]:
# Pull the review texts and their encoded labels out of the frame as arrays.
reviews, labels = (dataset[column].values for column in ('review', 'sentiment'))

### Split data

In [5]:
# Positional hold-out: the first 40,000 rows train the model, the remainder
# is kept for evaluation. No shuffling is applied here.
split_at = 40000
X_train_texts, X_test_texts = reviews[:split_at], reviews[split_at:]
y_train, y_test = labels[:split_at], labels[split_at:]

### Tokenize

In [6]:
# Learn the vocabulary from the training reviews only. Words that are not in
# that vocabulary are later encoded with the '<OOV>' placeholder token.
tokenizer = Tokenizer(
    oov_token='<OOV>',
)
tokenizer.fit_on_texts(X_train_texts)

In [7]:
# Size of the embedding table: one row per token in the fitted word index,
# plus one extra row for index 0 (the value pad_sequences fills with).
word_index = tokenizer.word_index
vocab_size = 1 + len(word_index)

In [8]:
# Turn every review into a list of token ids, then pad with trailing zeros
# to a fixed length of 2500 so both splits share one input shape.
max_len = 2500

X_train_sequences = tokenizer.texts_to_sequences(X_train_texts)
X_test_sequences = tokenizer.texts_to_sequences(X_test_texts)

X_train_padded = pad_sequences(X_train_sequences, maxlen=max_len, padding='post')
X_test_padded = pad_sequences(X_test_sequences, maxlen=max_len, padding='post')

### Model

In [20]:
# Seed Python, NumPy and TensorFlow so that weight initialisation and batch
# shuffling are repeatable across "Restart & Run All". (Bit-exact GPU runs
# may additionally need tf.config.experimental.enable_op_determinism().)
tf.keras.utils.set_random_seed(42)

# Bag-of-embeddings sentiment classifier:
#   token ids -> 10-d embeddings -> mean over the 2500 positions
#   -> 24-unit ReLU layer -> single sigmoid unit (probability of "positive").
# NOTE(review): the inputs are zero-padded and the embedding does not set
# mask_zero, so padding positions are included in the average — confirm this
# is intended before changing it.
model = tf.keras.Sequential([
    tf.keras.layers.Embedding(input_dim=vocab_size, output_dim=10, input_length=2500),
    tf.keras.layers.GlobalAveragePooling1D(),
    tf.keras.layers.Dense(24, activation='relu'),
    tf.keras.layers.Dense(1, activation='sigmoid')
])

# Binary cross-entropy matches the 0/1 labels and the sigmoid output.
model.compile(loss='binary_crossentropy', optimizer='adam', metrics=['accuracy'])

In [21]:
# Train for 20 epochs, evaluating on the held-out split after each epoch.
# The returned History is kept (instead of being shown only as a repr) so the
# per-epoch loss/accuracy in `history.history` can be inspected or plotted.
history = model.fit(
    X_train_padded,
    y_train,
    epochs=20,
    validation_data=(X_test_padded, y_test),
    verbose=1,
)

Epoch 1/20
Epoch 2/20
Epoch 3/20
Epoch 4/20
Epoch 5/20
Epoch 6/20
Epoch 7/20
Epoch 8/20
Epoch 9/20
Epoch 10/20
Epoch 11/20
Epoch 12/20
Epoch 13/20
Epoch 14/20
Epoch 15/20
Epoch 16/20
Epoch 17/20
Epoch 18/20
Epoch 19/20
Epoch 20/20


<keras.src.callbacks.History at 0x1b74eb558d0>

### Prediction

In [44]:
# Two hand-written reviews used as a quick sanity check of the trained model.
review = [
    "Scene with the CGI Leia was horrible. I wouldn't recommend this movie it to others.",
    "The movie was great. I enjoyed every secon of it although actors play wasn't perfect but plot made up for it. I would recommend it to others."
]

# Apply the same encoding and padding that the training data went through.
review_sequence = tokenizer.texts_to_sequences(review)
review_padded = pad_sequences(review_sequence, padding='post', maxlen=2500)

# The model emits one sigmoid score per review; above 0.5 counts as positive.
probabilities = model.predict(review_padded)
prediction = probabilities > 0.5
print(prediction.flatten())

[False  True]
