# IMDB Movie Rating Sentiment Analysis

## Import Libraries

In [21]:
#!pip install gensim

In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import nltk
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from nltk.stem import WordNetLemmatizer
import string
import gensim
import re
from gensim.models import Word2Vec

nltk.download('punkt')
nltk.download('stopwords')
nltk.download('wordnet')
nltk.download('omw-1.4')

[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\Salma\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\Salma\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package wordnet to
[nltk_data]     C:\Users\Salma\AppData\Roaming\nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package omw-1.4 to
[nltk_data]     C:\Users\Salma\AppData\Roaming\nltk_data...
[nltk_data]   Package omw-1.4 is already up-to-date!


True

In [2]:
# Relative path: the CSV is expected next to the notebook's working directory
file_path = "movie.csv"
Movie_Data = pd.read_csv(file_path)
# Preview the first rows (columns: text, label)
Movie_Data.head()

Unnamed: 0,text,label
0,I grew up (b. 1965) watching and loving the Th...,0
1,"When I put this movie in my DVD player, and sa...",0
2,Why do people who do not know what a particula...,0
3,Even though I have great interest in Biblical ...,0
4,Im a die hard Dads Army fan and nothing will e...,1


In [3]:
# Check column dtypes and non-null counts before any preprocessing
Movie_Data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 40000 entries, 0 to 39999
Data columns (total 2 columns):
 #   Column  Non-Null Count  Dtype 
---  ------  --------------  ----- 
 0   text    40000 non-null  object
 1   label   40000 non-null  int64 
dtypes: int64(1), object(1)
memory usage: 625.1+ KB


## Apply the preprocessing techniques to the data

In [4]:
# Characters treated as punctuation; a token made up only of these is dropped.
_PUNCTUATION_CHARS = frozenset(string.punctuation)

# NLTK resources shared by every preprocessing() call, built on first use.
_PREPROCESSING_RESOURCES = {}


def _preprocessing_resources():
    """Return the (stop-word set, lemmatizer) pair, building it once and reusing it."""
    if not _PREPROCESSING_RESOURCES:
        stop_words = frozenset(stopwords.words('english'))
        lemmatizer = WordNetLemmatizer()
        # Publish both entries together so a failure above leaves the cache empty
        _PREPROCESSING_RESOURCES.update(stop_words=stop_words, lemmatizer=lemmatizer)
    return (_PREPROCESSING_RESOURCES['stop_words'],
            _PREPROCESSING_RESOURCES['lemmatizer'])


def _is_punctuation(token):
    """Return True when every character of the token is a punctuation character."""
    return all(char in _PUNCTUATION_CHARS for char in token)


def preprocessing(text):
    """
    Preprocesses a string of words by removing stopwords, punctuation, digits, and applying lemmatization.

    The text is tokenized, lower-cased, filtered and lemmatized, then joined
    back into a single space-separated string.

    Args:
        text (str): Input string of words

    Returns:
        str: Preprocessed string of words. If any step raises, the error is
        printed and a single space (" ") is returned instead.
    """
    try:
        stop_words, lem = _preprocessing_resources()

        # Tokenize, then lower-case so the stop-word lookup is case-insensitive
        tokens = [token.lower() for token in word_tokenize(text)]

        # Drop stop words, punctuation-only tokens and pure-digit tokens.
        # Punctuation is checked character by character: a plain
        # `token in string.punctuation` is a substring test and lets
        # multi-character tokens such as "..." or "--" through.
        kept_tokens = [
            token for token in tokens
            if token not in stop_words
            and not _is_punctuation(token)
            and not token.isdigit()
        ]

        # Apply lemmatization
        return " ".join(lem.lemmatize(token) for token in kept_tokens)

    except Exception as e:
        # Best-effort: keep processing the remaining rows, but say what went wrong
        print(f"Error occurred during preprocessing: {e}")
        return " "

In [5]:
# Clean every review; the 'text' column is overwritten with the preprocessed string
Movie_Data['text'] = Movie_Data['text'].apply(preprocessing)
Movie_Data.head()

Unnamed: 0,text,label
0,grew b watching loving thunderbird mate school...,0
1,put movie dvd player sat coke chip expectation...,0
2,people know particular time past like feel nee...,0
3,even though great interest biblical movie bore...,0
4,im die hard dad army fan nothing ever change g...,1


## Convert the text to word vectors using pre-trained GloVe embeddings

In [6]:
import gensim.downloader as api

# Show the names of the pre-trained models offered by gensim's downloader
print(list(api.info()['models']))

['fasttext-wiki-news-subwords-300', 'conceptnet-numberbatch-17-06-300', 'word2vec-ruscorpora-300', 'word2vec-google-news-300', 'glove-wiki-gigaword-50', 'glove-wiki-gigaword-100', 'glove-wiki-gigaword-200', 'glove-wiki-gigaword-300', 'glove-twitter-25', 'glove-twitter-50', 'glove-twitter-100', 'glove-twitter-200', '__testing_word2vec-matrix-synopsis']


In [8]:
# Load the pre-trained GloVe model ('glove-twitter-100' from the list printed above).
# The result is bound to the global name `model`, which text_vectorize() reads directly.
model = api.load('glove-twitter-100')

In [9]:
def text_vectorize(text):
    """
    Converts a cleaned string of words into a single averaged word vector.

    The text is tokenized and every token present in the global embedding
    `model` contributes its vector; the result is the element-wise mean of
    those vectors.

    Args: 
        text (str): Cleaned string of words
    Returns:
        numpy.ndarray: 1-D vector of length `model.vector_size` (100 for the
        embeddings loaded in this notebook). A zero vector of the same length
        is returned when no token is found in the embeddings or when
        tokenization/lookup raises (the error is printed).
    
    """
    # Take the dimensionality from the embeddings instead of hard-coding 100,
    # so the zero fallback always has the same length as the averaged vectors.
    # Looked up outside the try block: a missing/invalid `model` is a setup
    # problem and should surface rather than turn every row into zeros.
    dimensions = model.vector_size
    # presumably the embedding vectors are float32 (as the displayed values suggest);
    # using the same dtype keeps np.stack from upcasting the whole matrix
    zero_vector = np.zeros(dimensions, dtype=np.float32)

    try:
        vectors = [model[token] for token in word_tokenize(text) if token in model]

        if vectors:
            return np.mean(vectors, axis=0)
        return zero_vector

    except Exception as e:
        print(f"Error occurred during text vectorization: {e}")
        return zero_vector

In [10]:
# Turn each cleaned review into its averaged embedding, stored in a new 'text_vector' column
Movie_Data['text_vector'] = Movie_Data['text'].apply(text_vectorize)
Movie_Data.head()

Unnamed: 0,text,label,text_vector
0,grew b watching loving thunderbird mate school...,0,"[0.20546971, 0.05887542, 0.09890097, -0.023451..."
1,put movie dvd player sat coke chip expectation...,0,"[0.18131112, 0.20964852, 0.047303382, -0.11553..."
2,people know particular time past like feel nee...,0,"[0.08031409, 0.04627317, 0.09079919, 0.0920195..."
3,even though great interest biblical movie bore...,0,"[0.09096001, 0.006144219, 0.034156606, -0.0272..."
4,im die hard dad army fan nothing ever change g...,1,"[0.10691197, 0.13346525, 0.056612603, -0.04111..."


## LSTM - Model 

In [11]:
import tensorflow as tf
from tensorflow import keras
from tensorflow.keras import layers
from sklearn.model_selection import train_test_split

In [12]:
# Stack the per-review vectors into one 2D array (one row per review)
X = np.stack(Movie_Data['text_vector'].values)
# Append a trailing axis of size 1 so the input is 3D, as the LSTM layer expects
X = X[:, :, np.newaxis]
y = Movie_Data['label'].values

# Hold out 20% of the rows for testing; the fixed random_state keeps the split repeatable
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
)

In [13]:
def LSTM_Model(sequence_length=100, lstm_units=128):
    """
    Create an LSTM binary classifier.

    Architecture: LSTM -> Dense(32, relu) -> Dense(16, relu) -> Dense(1, sigmoid).
    The model is returned uncompiled.

    Args:
        sequence_length (int): Number of time steps in each input sample; the
            input shape is (sequence_length, 1). Defaults to 100, the length of
            the text vectors used in this notebook.
        lstm_units (int): Number of internal units of the LSTM layer.
            Defaults to 128.

    Returns:
        keras.Sequential: The assembled model, or None if construction failed
        (the error is printed).
    """
    try:
        return keras.Sequential([
            # LSTM layer reading one feature per time step
            layers.LSTM(lstm_units, input_shape=(sequence_length, 1)),
            # Two fully connected layers on top of the final LSTM state
            layers.Dense(32, activation='relu'),
            layers.Dense(16, activation='relu'),
            # Single sigmoid unit: output is the probability of the positive class
            layers.Dense(1, activation='sigmoid'),
        ])
    except Exception as e:
        # Report the actual cause; callers still receive None as before
        print(f"Error occurred during LSTM model creation: {e}")
        return None

In [14]:
# Seed TensorFlow's global random generator before the weights are initialised so
# repeated runs start from the same state (best effort: some ops may still be
# non-deterministic). 42 matches the random_state used for the train/test split.
tf.random.set_seed(42)

# Create the LSTM model
lstm_model = LSTM_Model()
lstm_model.summary()

Model: "sequential"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 lstm (LSTM)                 (None, 128)               66560     
                                                                 
 dense (Dense)               (None, 32)                4128      
                                                                 
 dense_1 (Dense)             (None, 16)                528       
                                                                 
 dense_2 (Dense)             (None, 1)                 17        
                                                                 
Total params: 71,233
Trainable params: 71,233
Non-trainable params: 0
_________________________________________________________________


In [17]:
# Compile the model: binary cross-entropy pairs with the single sigmoid output unit,
# Adam is the optimizer, and accuracy is reported alongside the loss
lstm_model.compile(loss='binary_crossentropy', optimizer='adam', metrics=['accuracy'])

In [18]:
# Train the model for 20 epochs in batches of 512 samples
# NOTE(review): the held-out test split is passed as validation_data, so the
# per-epoch validation metrics are measured on the same rows that serve as the
# test set — consider a separate validation split.
lstm_model.fit(X_train, y_train, batch_size=512, epochs=20, validation_data=(X_test, y_test))

Epoch 1/20
Epoch 2/20
Epoch 3/20
Epoch 4/20
Epoch 5/20
Epoch 6/20
Epoch 7/20
Epoch 8/20
Epoch 9/20
Epoch 10/20
Epoch 11/20
Epoch 12/20
Epoch 13/20
Epoch 14/20
Epoch 15/20
Epoch 16/20
Epoch 17/20
Epoch 18/20
Epoch 19/20
Epoch 20/20


<keras.callbacks.History at 0x1e1008918d0>

# Test Example

In [26]:
def prediction(text):
    """
    Predicts the sentiment label for a given text.

    The text goes through the same steps as the training data: preprocessing(),
    then text_vectorize(), then the trained `lstm_model`.

    Args:
        text (str): Text to be classified.

    Returns:
        str: "Positive" when the rounded model output is 1, otherwise "Negative".
        If any step raises, the error is printed and the integer -1 is
        returned instead.
    """

    try:
        preprocessed_text = preprocessing(text)
        vector = text_vectorize(preprocessed_text)
        # Add the batch axis in front and the feature axis at the end so the
        # input has the same 3D layout as the training data
        batch = vector[np.newaxis, :, np.newaxis]

        # The sigmoid output is a probability; round it to the nearest class (0 or 1)
        probability = lstm_model.predict(batch)[0][0]
        predicted_class = int(np.round(probability))

        return "Positive" if predicted_class == 1 else "Negative"
    except Exception as e:
        print(f"Error occured during label prediction: {e}")
        return -1  # Return -1 on error

In [27]:
# Example usage: classify one hand-written review with the trained model
text = "This is a great movie!"
# prediction() returns "Positive"/"Negative", or -1 if something failed
predicted_label = prediction(text)
print(f"Predicted Label: {predicted_label}")

Predicted Label: Positive


# Conclusion

In this task, I used the IMDB movie rating sentiment analysis dataset. The dataset was preprocessed to improve its quality: the text was converted to lowercase, stopwords, punctuation, and digits were removed, and the remaining words were lemmatized using the NLTK library.

To represent the words in vector form, I used pre-trained GloVe word embeddings (the `glove-twitter-100` model, loaded through gensim), which provide vectors of 100 dimensions. Each review is represented by the average of the vectors of its words, and this fixed-length vector is the input to the subsequent model.

Next, I partitioned the dataset into an 80% training set and a 20% testing set. The training set was used to train an LSTM model for 20 epochs, with the testing set also serving as the validation data during training.

To evaluate the performance of the model, I employed the accuracy metric, which is appropriate for this binary classification problem. The model was trained using the binary cross-entropy loss function and the Adam optimizer.

Upon completion of the training process, the LSTM model achieved a training loss of 0.5751 and an accuracy of 70%. For the testing data, the model obtained a loss of 0.5832 and an accuracy of 69%.

# Future Work

As future work to improve the model's performance, several approaches can be considered. Firstly, incorporating a larger and more diverse dataset during training can help the model better capture the nuances and variations in sentiment. Additionally, exploring alternative embedding techniques like BERT or ELMo, or even introducing an embedding layer within the deep learning model, may yield improved performance.

Adjusting the model's parameters, such as the number of layers, hidden units, and activation functions, can also be explored to optimize performance. Furthermore, leveraging pre-trained models, such as using a pre-trained language model for transfer learning, has the potential to enhance the model's capabilities.

It is worth noting that sentiment analysis faces challenges due to the diverse nature of human expression and language use. To address this, incorporating a lexicon-based approach could aid in detecting slang words or expressions that the classifier may struggle to analyze accurately.

Continuing to iterate and refine the model based on careful analysis of errors and misclassifications, along with incorporating domain-specific knowledge, can contribute to further performance improvements.