In [None]:
import os
import io
import string
from tqdm import tqdm
from bs4 import BeautifulSoup
import pandas as pd
import numpy as np

import nltk
from nltk.corpus import stopwords

from sklearn.model_selection import train_test_split

import tensorflow as tf
from tensorflow import keras
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences

import matplotlib.pyplot as plt
from wordcloud import WordCloud

In [157]:
data = pd.read_csv('IMDB Dataset.csv',on_bad_lines='skip',engine='python')

data.shape

(50000, 2)

In [158]:
data['sentiment'] = data['sentiment'].map({'positive': 1, 'negative': 0})

In [159]:
shuffled_data = data.sample(frac=1, random_state=2023)

train_size = 0.8  # 80% for training, 20% for testing
train_data, test_data = train_test_split(shuffled_data, train_size=train_size, random_state=2023)

In [160]:
print(f'Train shappe: {train_data.shape}')
print(f'Test shappe: {test_data.shape}')

Train shappe: (40000, 2)
Test shappe: (10000, 2)


In [161]:
X_train = train_data['review']
y_train = train_data['sentiment']

X_test = test_data['review']
y_test = test_data['sentiment']

Preprocessing - Clean text

In [163]:
nltk.download('stopwords')
stop_words = set(stopwords.words('english'))

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.


In [164]:
table = str.maketrans('', '', string.punctuation)

In [165]:
X_train_cleaned = []

for item in tqdm(X_train):
    sentence = str(item).lower()
    # if we have 2 joined words
    sentence = sentence.replace(",", " , ")
    sentence = sentence.replace(".", " . ")
    sentence = sentence.replace("-", " - ")
    sentence = sentence.replace("/", " / ")
    # remove <br> tag
    soup = BeautifulSoup(sentence)
    sentence = soup.get_text()
    # remove stop words and punctuations
    words = sentence.split()
    filtered_sentence = ""
    for word in words:
        word = word.translate(table) # remove all punctuations
        if word not in stop_words:
            filtered_sentence += word + ' '

    X_train_cleaned.append(filtered_sentence.strip())

  soup = BeautifulSoup(sentence)
100%|██████████| 40000/40000 [00:31<00:00, 1259.22it/s]


In [167]:
all_words = [word for sentence in X_train_cleaned for word in sentence.split()]

# Calculate the vocabulary size (number of unique words)
total_vocab_size = len(set(all_words))

print("Total Vocabulary Size:", total_vocab_size)

Total Vocabulary Size: 110111


Tokenization

In [168]:
vocab_size = 50000
max_length = 2000
trunc_type='post'
padding_type='post'
oov_tok = ""

In [169]:
tokenizer = Tokenizer(num_words=vocab_size, oov_token=oov_tok)
tokenizer.fit_on_texts(X_train_cleaned)

In [170]:
word_index = tokenizer.word_index

training_sequences = tokenizer.texts_to_sequences(X_train_cleaned)
training_padded = pad_sequences(training_sequences, maxlen=max_length, padding=padding_type, truncating=trunc_type)

testing_sequences = tokenizer.texts_to_sequences(X_test)
testing_padded = pad_sequences(testing_sequences, maxlen=max_length, padding=padding_type, truncating=trunc_type)

Split the train into train and validation

In [171]:
training_padded, validation_padded, y_train, y_valid = train_test_split(training_padded, y_train, test_size=0.2, random_state=2023)

In [172]:
training_padded = np.array(training_padded)
training_labels = np.array(y_train)

validation_padded = np.array(validation_padded)
validation_labels = np.array(y_valid)

testing_padded = np.array(testing_padded)
testing_labels = np.array(y_test)

In [173]:
print(f'Train shappe: {train_data.shape}')
print(f'Validation shappe: {validation_padded.shape}')
print(f'Test shappe: {test_data.shape}')

Train shappe: (40000, 2)
Validation shappe: (8000, 2000)
Test shappe: (10000, 2)


In [174]:
embedding_dim = round(vocab_size ** 0.25) # use the fourth root of the vocab size
l2_regularization = 0.01 # tends to amplify differences between nonzero values and zero or close-tozero ones


In [175]:
def create_model(vocab_size, embedding_dim):
    model = tf.keras.Sequential([
            tf.keras.layers.Embedding(vocab_size, embedding_dim), # turns positive integers into dense vectors of fixed size
            tf.keras.layers.GlobalAveragePooling1D(),
            tf.keras.layers.Dense(24, activation='relu', kernel_regularizer=tf.keras.regularizers.l2(l2_regularization)),
            tf.keras.layers.Dropout(.25),
            tf.keras.layers.Dense(1, activation='sigmoid')
        ])

    adam = tf.keras.optimizers.Adam(learning_rate=0.0001, beta_1=0.9, beta_2=0.999, amsgrad=False)

    model.compile(loss='binary_crossentropy', optimizer=adam, metrics=['accuracy'])

    return model

In [176]:
model = create_model(vocab_size, embedding_dim)
# Display the model's architecture
model.summary()

In [180]:
num_epochs = 100
history = model.fit(training_padded,
                    training_labels,
                    epochs=num_epochs,
                    validation_data=(validation_padded, validation_labels),
                    verbose=2)

Epoch 1/100
1000/1000 - 18s - 18ms/step - accuracy: 0.4965 - loss: 0.8296 - val_accuracy: 0.4979 - val_loss: 0.7833
Epoch 2/100
1000/1000 - 20s - 20ms/step - accuracy: 0.4988 - loss: 0.7542 - val_accuracy: 0.4979 - val_loss: 0.7310
Epoch 3/100
1000/1000 - 20s - 20ms/step - accuracy: 0.5013 - loss: 0.7169 - val_accuracy: 0.5099 - val_loss: 0.7062
Epoch 4/100
1000/1000 - 21s - 21ms/step - accuracy: 0.4991 - loss: 0.7005 - val_accuracy: 0.4979 - val_loss: 0.6964
Epoch 5/100
1000/1000 - 16s - 16ms/step - accuracy: 0.5041 - loss: 0.6947 - val_accuracy: 0.4991 - val_loss: 0.6936
Epoch 6/100
1000/1000 - 21s - 21ms/step - accuracy: 0.5003 - loss: 0.6933 - val_accuracy: 0.5021 - val_loss: 0.6932
Epoch 7/100
1000/1000 - 20s - 20ms/step - accuracy: 0.5007 - loss: 0.6932 - val_accuracy: 0.4979 - val_loss: 0.6931
Epoch 8/100
1000/1000 - 20s - 20ms/step - accuracy: 0.5000 - loss: 0.6932 - val_accuracy: 0.5023 - val_loss: 0.6931
Epoch 9/100
1000/1000 - 21s - 21ms/step - accuracy: 0.4996 - loss: 0.693

In [214]:
sentences = ["The worst movie I have ever seen. A complete waste of time",
             ]
sequences = tokenizer.texts_to_sequences(sentences)
padded = pad_sequences(sequences, maxlen=max_length, padding=padding_type, truncating=trunc_type)
predictions = model.predict(padded)
print(predictions)

[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 23ms/step
[[0.29735908]]


In [215]:
yhat = np.zeros_like(predictions)
for i in range(len(predictions)):
    if predictions[i] >= 0.5:
        yhat[i] = 1
    else:
        yhat[i] = 0
print(f"decisions = \n{yhat}")

decisions = 
[[0.]]


In [216]:
if max(yhat) == 1:
    print("Positive")
else:
    print("Negative")


#print(f"reviews = \n{review_labels}")

Negative
