# Sentiment Analysis on IMDB Reviews using LSTM


In [2]:
import pandas as pd    # to load dataset
import numpy as np     # for mathematic equation
from nltk.corpus import stopwords   # to get collection of stopwords
from sklearn.model_selection import train_test_split       # for splitting dataset
from tensorflow.keras.preprocessing.text import Tokenizer  # to encode text to int
from tensorflow.keras.preprocessing.sequence import pad_sequences   # to do padding or truncating
from tensorflow.keras.models import Sequential     # the model
from tensorflow.keras.layers import Embedding, LSTM, Dense # layers of the architecture
from tensorflow.keras.callbacks import ModelCheckpoint   # save model
from tensorflow.keras.models import load_model   # load saved model
import re

<hr>
<i>Preview dataset</i>

In [2]:
data = pd.read_csv('IMDB Dataset.csv')
print(data)

                                                  review sentiment
0      One of the other reviewers has mentioned that ...  positive
1      A wonderful little production. <br /><br />The...  positive
2      I thought this was a wonderful way to spend ti...  positive
3      Basically there's a family where a little boy ...  negative
4      Petter Mattei's "Love in the Time of Money" is...  positive
...                                                  ...       ...
49995  I thought this movie did a down right good job...  positive
49996  Bad plot, bad dialogue, bad acting, idiotic di...  negative
49997  I am a Catholic taught in parochial elementary...  negative
49998  I'm going to have to disagree with the previou...  negative
49999  No one expects the Star Trek movies to be high...  negative

[50000 rows x 2 columns]


<hr>
<b>Stop Word</b> is a commonly used words in a sentence, usually a search engine is programmed to ignore this words (i.e. "the", "a", "an", "of", etc.)

<i>Declaring the english stop words</i>

In [3]:
english_stops = set(stopwords.words('english'))

## data cleaning

<hr>

### Load and Clean Dataset

In the original dataset, the reviews are still dirty. There are still html tags, numbers, uppercase, and punctuations. This will not be good for training, so in <b>load_dataset()</b> function, beside loading the dataset using <b>pandas</b>, I also pre-process the reviews by removing html tags, non alphabet (punctuations and numbers), stop words, and lower case all of the reviews.

### Encode Sentiments
In the same function, I also encode the sentiments into integers (0 and 1). Where 0 is for negative sentiments and 1 is for positive sentiments.

In [4]:
import re
import string
import nltk
def wordopt(text_list):
    cleaned_texts = []
    v=nltk.word_tokenize(text_list)
    for text in v:
        # print(text)
        text = text.lower()
        text = re.sub('\[.*?\]', '', text)
        text = re.sub("\\W", " ", text)
        text = re.sub('https?://\S+|www\.\S+', '', text)
        text = re.sub('<.*?>+', '', text)
        text = re.sub('[%s]' % re.escape(string.punctuation), '', text)
        text = re.sub('\n', '', text)
        text = re.sub('\w*\d\w*', '', text)
        cleaned_texts.append(text)
    return cleaned_texts


In [5]:
# data["review"].apply(wordopt)

## data preprocessing

In [6]:
data["review"] = data["review"].apply(wordopt)
# data

In [7]:
data

Unnamed: 0,review,sentiment
0,"[one, of, the, other, reviewers, has, mentione...",positive
1,"[a, wonderful, little, production, , , br, ...",positive
2,"[i, thought, this, was, a, wonderful, way, to,...",positive
3,"[basically, there, s, a, family, where, a, li...",negative
4,"[petter, mattei, s, , love, in, the, time, ...",positive
...,...,...
49995,"[i, thought, this, movie, did, a, down, right,...",positive
49996,"[bad, plot, , bad, dialogue, , bad, acting, ...",negative
49997,"[i, am, a, catholic, taught, in, parochial, el...",negative
49998,"[i, m, going, to, have, to, disagree, with, t...",negative


In [8]:
import re
from nltk.stem import WordNetLemmatizer
from nltk.corpus import stopwords

lemmatizer = WordNetLemmatizer()
corpus = []

for index, row in data.iterrows():
    text_list = row['review']
    
    # Join the list of words into a single string
    text = ' '.join(text_list)

    # Remove non-alphabetic characters
    text = re.sub('[^a-zA-Z]', ' ', text)

    # Split the words
    text = text.split()

    # Lemmatize and remove stopwords
    text = [lemmatizer.lemmatize(word) for word in text if word not in set(stopwords.words('english'))]

    # Join the lemmatized words into a single string
    text = ' '.join(text)

    # Append the processed text to the corpus
    corpus.append(text)

# Assign the processed corpus back to the "Comments" column
data["review"] = corpus


In [9]:
data

Unnamed: 0,review,sentiment
0,one reviewer mentioned watching oz episode hoo...,positive
1,wonderful little production br br filming tech...,positive
2,thought wonderful way spend time hot summer we...,positive
3,basically family little boy jake think zombie ...,negative
4,petter mattei love time money visually stunnin...,positive
...,...,...
49995,thought movie right good job n creative origin...,positive
49996,bad plot bad dialogue bad acting idiotic direc...,negative
49997,catholic taught parochial elementary school nu...,negative
49998,going disagree previous comment side maltin on...,negative


## save the clean dataset

In [10]:
data
data.to_csv('review_clean.csv',index=False)

## check sentiment count

In [11]:
data.sentiment.value_counts()

sentiment
positive    25000
negative    25000
Name: count, dtype: int64

## modeling 

In [12]:
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report,confusion_matrix

## Converting the Words to Vector using tfidf vectorizer


In [13]:
# Converting the Words to Vector using tfidf vectorizer
from sklearn.feature_extraction.text import TfidfVectorizer
tf=TfidfVectorizer(ngram_range=(1,3),max_features=2500)
X=tf.fit_transform(corpus).toarray()
y=data['sentiment']

<hr>

### Split Dataset
In this work, I decided to split the data into 80% of Training and 20% of Testing set using <b>train_test_split</b> method from Scikit-Learn. By using this method, it automatically shuffles the dataset. We need to shuffle the data because in the original dataset, the reviews and sentiments are in order, where they list positive reviews first and then negative reviews. By shuffling the data, it will be distributed equally in the model, so it will be more accurate for predictions.

In [None]:
x_train,x_test,y_train,y_test=train_test_split(X,y,test_size=0.20,random_state=1,stratify=y)


## model artitecture model

<hr>

### Build Architecture/Model
<b>Embedding Layer</b>: in simple terms, it creates word vectors of each word in the <i>word_index</i> and group words that are related or have similar meaning by analyzing other words around them.

<b>LSTM Layer</b>: to make a decision to keep or throw away data by considering the current input, previous output, and previous memory. There are some important components in LSTM.
<ul>
    <li><b>Forget Gate</b>, decides information is to be kept or thrown away</li>
    <li><b>Input Gate</b>, updates cell state by passing previous output and current input into sigmoid activation function</li>
    <li><b>Cell State</b>, calculate new cell state, it is multiplied by forget vector (drop value if multiplied by a near 0), add it with the output from input gate to update the cell state value.</li>
    <li><b>Ouput Gate</b>, decides the next hidden state and used for predictions</li>
</ul>

<b>Dense Layer</b>: compute the input with the weight matrix and bias (optional), and using an activation function. I use <b>Sigmoid</b> activation function for this work because the output is only 0 or 1.

The optimizer is <b>Adam</b> and the loss function is <b>categorical Crossentropy</b> because again the output is only 0 and 1, which is a binary number.

<hr>

### Tokenize and Pad/Truncate Reviews
A Neural Network only accepts numeric data, so we need to encode the reviews. I use <b>tensorflow.keras.preprocessing.text.Tokenizer</b> to encode the reviews into integers, where each unique word is automatically indexed (using <b>fit_on_texts</b> method) based on <b>x_train</b>. <br>
<b>x_train</b> and <b>x_test</b> is converted into integers using <b>texts_to_sequences</b> method.

Each reviews has a different length, so we need to add padding (by adding 0) or truncating the words to the same length (in this case, it is the mean of all reviews length) using <b>tensorflow.keras.preprocessing.sequence.pad_sequences</b>.


<b>post</b>, pad or truncate the words in the back of a sentence<br>
<b>pre</b>, pad or truncate the words in front of a sentence

In [3]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
from sklearn.metrics import classification_report, accuracy_score, precision_recall_fscore_support
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import LSTM, Dense, Embedding, Flatten, Dropout
from tensorflow.keras.optimizers import Adam
from tensorflow.keras.utils import to_categorical
from tensorflow.keras.preprocessing.text import Tokenizer  # to encode text to int
from tensorflow.keras.preprocessing.sequence import pad_sequences   # to do padding or truncating

# Load the Amazon review dataset
dataset_path = 'review_clean.csv'
df = pd.read_csv(dataset_path)

# Preprocess the data
max_words = 10000
max_len = 256
# Tokenize and pad sequences (assuming you are using a simple Tokenizer)
tokenizer = Tokenizer(num_words=max_words, oov_token="<OOV>")
tokenizer.fit_on_texts(df['review'])
sequences = tokenizer.texts_to_sequences(df['review'])
padded_sequences = pad_sequences(sequences, maxlen=max_len, padding='post', truncating='post')

# Convert string labels to numerical values
label_encoder = LabelEncoder()
df['sentiment'] = label_encoder.fit_transform(df['sentiment'])
num_classes = len(label_encoder.classes_)

labels = np.array(df['sentiment'])

# Convert labels to categorical for multiclass classification
y_train = to_categorical(labels, num_classes=num_classes)

# Split the data into training and testing sets
x_train, x_test, y_train, y_test = train_test_split(padded_sequences, y_train, test_size=0.2, random_state=42)

# Build the model

embedding_dim = 128

model = Sequential()
model.add(Embedding(input_dim=max_words, output_dim=embedding_dim))  # Remove input_length here
model.add(LSTM(64, return_sequences=True))
model.add(Dropout(0.5))
model.add(Flatten())
model.add(Dense(num_classes, activation='softmax'))

# Compile the model
model.compile(optimizer=Adam(learning_rate=1e-4), loss='categorical_crossentropy', metrics=['accuracy'])

# Train the model
model.fit(x_train, y_train, epochs=10, batch_size=32, validation_split=0.2)


# Evaluate on the test set
lstm_y_pred = np.argmax(model.predict(x_test), axis=-1)

# Calculate metrics
accuracy = accuracy_score(np.argmax(y_test, axis=-1), lstm_y_pred)
precision, recall, f1, _ = precision_recall_fscore_support(np.argmax(y_test, axis=-1), lstm_y_pred, average='weighted')

# Print metrics
print(f"Accuracy: {accuracy}")
print(f"Precision: {precision}")
print(f"Recall: {recall}")
print(f"F1 Score: {f1}")

# Generate classification report
class_names = [str(label) for label in label_encoder.classes_]
# Generate classification report
print("\nClassification Report:")
print(classification_report(np.argmax(y_test, axis=-1), lstm_y_pred, target_names=class_names))

Epoch 1/10
[1m1000/1000[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m119s[0m 117ms/step - accuracy: 0.6465 - loss: 0.5752 - val_accuracy: 0.8719 - val_loss: 0.3083
Epoch 2/10
[1m1000/1000[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m110s[0m 110ms/step - accuracy: 0.8992 - loss: 0.2507 - val_accuracy: 0.8745 - val_loss: 0.2970
Epoch 3/10
[1m1000/1000[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m111s[0m 111ms/step - accuracy: 0.9190 - loss: 0.2056 - val_accuracy: 0.8605 - val_loss: 0.3315
Epoch 4/10
[1m1000/1000[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m119s[0m 119ms/step - accuracy: 0.9310 - loss: 0.1818 - val_accuracy: 0.8646 - val_loss: 0.3261
Epoch 5/10
[1m1000/1000[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m113s[0m 113ms/step - accuracy: 0.9410 - loss: 0.1582 - val_accuracy: 0.8675 - val_loss: 0.3501
Epoch 6/10
[1m1000/1000[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m120s[0m 120ms/step - accuracy: 0.9498 - loss: 0.1377 - val_accuracy: 0.8558 - val_loss:

In [4]:
print(f"Accuracy: {accuracy}")


Accuracy: 0.8585


## save the model

In [5]:
import pickle

# Save the model
model.save('sentiment_analysis_model.h5')

# Save the tokenizer
with open('tokenizer.pickle', 'wb') as handle:
    pickle.dump(tokenizer, handle, protocol=pickle.HIGHEST_PROTOCOL)

# Save the label encoder
with open('label_encoder.pickle', 'wb') as handle:
    pickle.dump(label_encoder, handle, protocol=pickle.HIGHEST_PROTOCOL)




## user prediction

In [9]:
def predict_sentiment(review):
    # Load tokenizer and label encoder
    with open('tokenizer.pickle', 'rb') as handle:
        tokenizer = pickle.load(handle)

    with open('label_encoder.pickle', 'rb') as handle:
        label_encoder = pickle.load(handle)

    # Preprocess the user input
    sequence = tokenizer.texts_to_sequences([review])
    padded_sequence = pad_sequences(sequence, maxlen=256, padding='post', truncating='post')

    # Load the model
    model = load_model('sentiment_analysis_model.h5')

    # Make prediction
    prediction = model.predict(padded_sequence)
    predicted_class = label_encoder.inverse_transform([np.argmax(prediction)])

    return predicted_class[0]

# Example usage:
user_review = "Hatts of to JAMES CAMERON for thinking and creating a vision like this. It takes a lotseof hardwork & research to build a whole new world. And there is a reason why this movie is still the no 1 movie in the world. I've never seen this kind of visuals in any other film. And this movie was made in 2009 that was an amazing achievement by the VFX creators & the director itself. You can easily get connected with the movie plot and the way the director has shown the Pandora world was just unbelievable. Cast of this film has done a fabulous job while performing so well and get into the character that not a single one will disappoint. Emotional scenes are so powerful that you feel the characters and their pain for what their suffering. Visual Effects makes this movie so powerful that every creature and big scenery scenes looks real. That's why it has re-released once again to feel the same experience. Don't miss this one on the big screen if you haven't seen it. It's a total new world experience. Can't wait for the PART 2"
predicted_sentiment = predict_sentiment(user_review)
print("Predicted sentiment:", predicted_sentiment)




[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 169ms/step
Predicted sentiment: positive


In [37]:
import re
import string
import pickle
import nltk
import numpy as np
from tensorflow.keras.preprocessing.sequence import pad_sequences

# Load the LSTM model
from tensorflow.keras.models import load_model
model = load_model('sentiment_analysis_model.h5')

# Load the tokenizer and label encoder
with open('tokenizer.pickle', 'rb') as handle:
    tokenizer = pickle.load(handle)

# Define the maximum sequence length
max_len = 256

def preprocess_input(text):
    # Clean the text using regular expressions
    text = re.sub(r'https?://\S+|www\.\S+', '', text)  # Remove URLs
    text = re.sub(r'<.*?>+', '', text)  # Remove HTML tags
    text = re.sub(r'[^\w\s]', '', text)  # Remove punctuation
    text = re.sub(r'\d', '', text)  # Remove digits
    text = text.lower()  # Convert to lowercase
    return text

def preprocess_and_predict(user_input):
    # Preprocess user input
    preprocessed_input = preprocess_input(user_input)

    # Tokenize and pad the preprocessed input sequence
    sequence = tokenizer.texts_to_sequences([preprocessed_input])
    padded_sequence = pad_sequences(sequence, maxlen=256, padding='post', truncating='post')

    # Predict using the LSTM model
    prediction = model.predict(padded_sequence)

    return np.argmax(prediction)  # Return the index of the predicted class

# Get user input
user_input = input('Enter Your Message: ')

# Predict and print the result
result = preprocess_and_predict(user_input)
print("Prediction:", result)

# positive - 1
# negative - 0


Prediction: 1
