<a href="https://colab.research.google.com/github/OormiC/IMDB_review_sentiment/blob/main/Project1IMDB.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
!pip install --q --upgrade keras-nlp
!pip install --q --upgrade keras

In [None]:
# Import dependencies

# Tools for data manipulation
import pandas as pd
import re
import numpy as np

# Tools for building model
from sklearn.model_selection import train_test_split
import keras_nlp
from keras.models import Sequential
from keras import layers
from keras import regularizers
from keras import backend as K
from keras.callbacks import ModelCheckpoint
from keras.utils import to_categorical

# Tools for preprocessing data
from bs4 import BeautifulSoup
from nltk import word_tokenize
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
import gensim
from tensorflow.keras.preprocessing.text import Tokenizer
from keras.preprocessing.sequence import pad_sequences

# Read in parquet files as pandas dataframes
df_train = pd.read_parquet('/train-00000-of-00001.parquet', engine='pyarrow')
df_test = pd.read_parquet('/test-00000-of-00001.parquet', engine='pyarrow')
df = pd.concat([df_train, df_test])
print(len(df))
print(df.head())

50000
                                                text  label
0  I rented I AM CURIOUS-YELLOW from my video sto...      0
1  "I Am Curious: Yellow" is a risible and preten...      0
2  If only to avoid making this type of film in t...      0
3  This film was probably inspired by Godard's Ma...      0
4  Oh, brother...after hearing about this ridicul...      0


In [None]:
# Define function variables
wnl = WordNetLemmatizer()
stop_words = set(stopwords.words("english"))

def _clean(data):
    # Filter out the non-meaningful words
    meaning = [x for x in data.split() if x not in stop_words]
    review_text = " ".join(meaning)
    # Remove HTML
    review_text = BeautifulSoup(review_text, "lxml").get_text()
    # Remove non-letters
    letters_only = re.compile(r'[^A-Za-z\s]').sub("", review_text)
    # Convert to lower case
    data = letters_only.lower()

    return data

def _lemmatize(tokens: list) -> list:
    # 1. Lemmatize
    lemmatized_tokens = [wnl.lemmatize(word) for word in tokens]

    return lemmatized_tokens

def _preprocess(review):
    # 1. Clean text
    review = _clean(review)

    # 2. Tokenize
    tokens = word_tokenize(review)

    # 3. Lemmatize
    lemma = _lemmatize(tokens)

    # return the result.
    return lemma

data_to_list = df['text'].values.tolist()
review_list = [_preprocess(review) for review in data_to_list]

print(review_list[0])

  review_text = BeautifulSoup(review_text, "lxml").get_text()


['i', 'rented', 'i', 'am', 'curiousyellow', 'video', 'store', 'controversy', 'surrounded', 'first', 'released', 'i', 'also', 'heard', 'first', 'seized', 'u', 'custom', 'ever', 'tried', 'enter', 'country', 'therefore', 'fan', 'film', 'considered', 'controversial', 'i', 'really', 'see', 'myselfthe', 'plot', 'centered', 'around', 'young', 'swedish', 'drama', 'student', 'named', 'lena', 'want', 'learn', 'everything', 'life', 'in', 'particular', 'want', 'focus', 'attention', 'making', 'sort', 'documentary', 'average', 'swede', 'thought', 'certain', 'political', 'issue', 'vietnam', 'war', 'race', 'issue', 'united', 'state', 'in', 'asking', 'politician', 'ordinary', 'denizen', 'stockholm', 'opinion', 'politics', 'sex', 'drama', 'teacher', 'classmate', 'married', 'menwhat', 'kill', 'i', 'am', 'curiousyellow', 'year', 'ago', 'considered', 'pornographic', 'really', 'sex', 'nudity', 'scene', 'far', 'between', 'even', 'shot', 'like', 'cheaply', 'made', 'porno', 'while', 'countryman', 'mind', 'find

In [None]:
# Transform our array of text into 2D numeric arrays
max_words = 5000
max_len = 200

# Train set
tokenizer = Tokenizer(num_words=max_words)
tokenizer.fit_on_texts(review_list)
sequences = tokenizer.texts_to_sequences(review_list)
padded_list = pad_sequences(sequences, maxlen=max_len)

# Examples of padded review
print(padded_list[0])

[   0    0    0    0    0    0    0    0    0    0    0    0    0    0
    0    0    0    0    0    0    0    0    0    0    0    0    0    0
    0    0    0    0    0    0    0    0    0    0    0    0    0    0
    0    0    0    0    0    0    0    0    0    0    0    0    0    0
    0    0    0    0    0    0    1 1424    1 2251  298  921 3305   30
  549    1   27  480   30   91   57  678 2212  502 1445  124    3 1118
 2685    1   18   17   48  112  113 3383  368  711  694   68  751  201
   37   44  756   68  803  601  182  312  507  785  109  681  912  742
 2317  199 1163  742 2073  632   44 1980 3627 1737  529 2382  314  368
 1270  979  301    1 2251   56  546 1118   18  314  986   19  151 3384
   13  150    7   33 3575  367  253   65 1472  524  314  986  565 3383
  362   13 2848 4298  994   10   94  230  237 1519  314   19  561  101
  314  516    3  516 1512 1018  168 1262   23   16  220  516  538  750
    1 2251   10    3  174 1549 1567 3114  224 3434 1263 3383  362   40
   18 

In [None]:
# Define train and test data
y = df['label']
X_train, X_test, y_train, y_test = train_test_split(padded_list, y, test_size=0.20, random_state=42)

# Convert the labels to one-hot encoded vectors
y_train = to_categorical(y_train)
y_test = to_categorical(y_test)

# Build, compile, and fit an RNN (single LSTM) model
model1 = Sequential()
model1.add(layers.Embedding(max_words, 20)) #The embedding layer
model1.add(layers.LSTM(15,dropout=0.5)) #Our LSTM layer
model1.add(layers.Dense(2,activation='sigmoid')) # Two possible outcomes (negative or positive)

model1.compile(optimizer='rmsprop',loss='categorical_crossentropy', metrics=['accuracy'])

checkpoint1 = ModelCheckpoint("best_model1.keras", monitor='val_accuracy', verbose=1,save_best_only=True, mode='auto', save_weights_only=False)
history = model1.fit(X_train, y_train, epochs=5,validation_data=(X_test, y_test),callbacks=[checkpoint1])

Epoch 1/5
[1m1250/1250[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 62ms/step - accuracy: 0.7464 - loss: 0.4882
Epoch 1: val_accuracy improved from -inf to 0.87240, saving model to best_model1.keras
[1m1250/1250[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m92s[0m 71ms/step - accuracy: 0.7465 - loss: 0.4881 - val_accuracy: 0.8724 - val_loss: 0.3248
Epoch 2/5
[1m1250/1250[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 62ms/step - accuracy: 0.8852 - loss: 0.2884
Epoch 2: val_accuracy did not improve from 0.87240
[1m1250/1250[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m135s[0m 65ms/step - accuracy: 0.8852 - loss: 0.2884 - val_accuracy: 0.8656 - val_loss: 0.3789
Epoch 3/5
[1m1250/1250[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 61ms/step - accuracy: 0.8959 - loss: 0.2658
Epoch 3: val_accuracy improved from 0.87240 to 0.88640, saving model to best_model1.keras
[1m1250/1250[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m81s[0m 65ms/step - accuracy: 0.89

In [None]:
# Display model loss and accuracy
model_loss, model_accuracy = model1.evaluate(X_test, y_test, verbose=2)
print(f"Loss: {model_loss}, Accuracy: {model_accuracy}")

313/313 - 6s - 19ms/step - accuracy: 0.8854 - loss: 0.2842
Loss: 0.28420761227607727, Accuracy: 0.8853999972343445


In [None]:
# Put in a review to test prediction
sentiment = ['Negative','Positive']
sequence = tokenizer.texts_to_sequences(['this movie is the best ever!'])
test = pad_sequences(sequence, maxlen=max_len)
sentiment[np.around(model1.predict(test), decimals=0).argmax(axis=1)[0]]

[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 282ms/step


'Positive'