In [1]:
# https://www.kaggle.com/datasets/lakshmi25npathi/imdb-dataset-of-50k-movie-reviews
import numpy as np
import pandas as pd
from keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
from keras.models import Sequential
from keras.layers import Dense, LSTM, Embedding, SpatialDropout1D
from sklearn.model_selection import train_test_split
import time



In [2]:
# Step 1: Load and preprocess the dataset
# Read the IMDB reviews CSV (path is relative to the working directory) into a DataFrame.
df = pd.read_csv(filepath_or_buffer='IMDB Dataset.csv')


In [3]:

# Map 'positive' to 1 and 'negative' to 0.
# The mapping is guarded so this cell can be re-run safely: once the column holds
# 0/1, mapping it again would turn every label into NaN because 0 and 1 are not
# keys of the mapping dictionary.
sentiment_to_label = {'positive': 1, 'negative': 0}
if not df['sentiment'].isin(list(sentiment_to_label.values())).all():
    df['sentiment'] = df['sentiment'].map(sentiment_to_label)

# Series.map() yields NaN for any value that is not a key of the mapping, so a NaN
# here means an unexpected/missing label (or a corrupted column from an earlier run).
if df['sentiment'].isna().any():
    raise ValueError(
        "df['sentiment'] contains values other than 'positive'/'negative'; "
        "reload the dataset and re-run this cell."
    )

# Tokenize the text data: keep only the 5000 most frequent words.
tokenizer = Tokenizer(num_words=5000)
tokenizer.fit_on_texts(df['review'])

# Convert each review to a list of word indices, then pad/truncate to exactly 100
# indices so that all rows have the same length.
review_sequences = tokenizer.texts_to_sequences(df['review'])
X = pad_sequences(review_sequences, padding='pre', maxlen=100)
y = df['sentiment']

# Tokenizer Initialization:
# We create a Tokenizer object called tokenizer and specify that we want it to consider only the top 5000 most frequent words (num_words=5000).
# Fitting the Tokenizer:
# We provide the tokenizer with our review texts (df['review']), and it learns from them. Essentially, it creates a vocabulary of words from these reviews and assigns each word a unique number.
# Tokenizing Sequences:
# Now that the tokenizer knows the words and their corresponding numbers, we convert each review text into a sequence of numbers. Instead of words, each review is represented by a sequence of these numbers.
# Padding Sequences:
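# To ensure that all sequences have the same length, we pad sequences shorter than 100 numbers with zeros at the beginning (padding='pre'), and sequences longer than 100 numbers are truncated (by default also from the beginning, so the last 100 numbers are kept). This ensures that every review has the same length, which is necessary when training a neural network.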
# Assigning Labels:
# We also extract the sentiment labels (positive or negative) from the DataFrame and assign them to y. This tells us whether each review is positive or negative.

In [4]:
# Step 2: Define the deep neural network model architecture
# Layer order: word embedding -> spatial dropout -> LSTM -> single sigmoid output.
embedding_dim = 128
model = Sequential([
    Embedding(input_dim=5000, output_dim=embedding_dim, input_length=100),
    SpatialDropout1D(0.2),
    LSTM(100, dropout=0.2, recurrent_dropout=0.2),
    Dense(1, activation='sigmoid'),
])



In [5]:
# Step 3: Compile the model
model.compile(loss='binary_crossentropy', optimizer='adam', metrics=['accuracy'])

# Step 4: Train the model
# Hold out 20% of the data; random_state fixes the split so it is reproducible.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# perf_counter() is a monotonic, high-resolution clock intended for measuring
# elapsed time; time.time() can jump if the system clock is adjusted mid-training.
start_time = time.perf_counter()
# NOTE: the held-out test split doubles as the validation set here, so the
# per-epoch validation metrics are not an independent estimate.
# The returned History object is kept so the per-epoch metrics can be inspected later.
history = model.fit(X_train, y_train, epochs=5, batch_size=64, validation_data=(X_test, y_test))
end_time = time.perf_counter()
training_time = end_time - start_time
print(f"Training time: {training_time:.2f} seconds")



Epoch 1/5
Epoch 2/5
Epoch 3/5
Epoch 4/5
Epoch 5/5
Training time: 1200.30 seconds


In [6]:
# Step 5: Evaluate the model
# Score the trained model on the held-out split and report both numbers.
loss, accuracy = model.evaluate(X_test, y_test)
print(f'Test Loss: {loss}', f'Test Accuracy: {accuracy}', sep='\n')



Test Loss: 0.3378496766090393
Test Accuracy: 0.864799976348877


In [None]:
# Step 6: Make predictions on new data (dynamic input)
# The loop ends on an empty line (or on EOF / a kernel interrupt) so that this
# cell can finish instead of running forever.
print("Press Enter on an empty line to stop.")
while True:
    try:
        user_input = input("Enter a movie review: ")
    except (EOFError, KeyboardInterrupt):
        break
    if not user_input.strip():
        break

    # Preprocess the user input with the same tokenizer used for training
    input_sequence = tokenizer.texts_to_sequences([user_input])
    if not input_sequence[0]:
        # No word of the input is in the tokenizer's vocabulary; the padded
        # sequence would be all zeros and the prediction would be meaningless.
        print("None of the words are in the model's vocabulary; cannot predict.")
        continue
    # maxlen/padding must match the values used when padding the training data
    input_sequence = pad_sequences(input_sequence, padding='pre', maxlen=100)

    # perf_counter() is the monotonic clock intended for elapsed-time measurement
    start_time = time.perf_counter()
    # Make prediction: one sample in, one sigmoid output out
    prediction = model.predict(input_sequence)[0][0]
    end_time = time.perf_counter()

    # Print prediction (sigmoid output >= 0.5 is treated as the positive class)
    if prediction >= 0.5:
        print("Predicted sentiment: Positive")
    else:
        print("Predicted sentiment: Negative")

    prediction_time = end_time - start_time
    print(f"Prediction time: {prediction_time:.2f} seconds")


Enter a movie review: The leafs are dry 
Predicted sentiment: Negative
Prediction time: 2.77 seconds
Enter a movie review: Outstanding Acting by Randeep. Its great to see A Hero on screen. SwatantryaVeer Savarkar is like Epitome of Patriotism and Hinduness. He is fountain head of Hindutva Idelogy who runs India now and father of Intellectuallism in India. He is one icon of Dalit Movement. This movie shown all of this with good pace and direction. Randeeps acting is incredible and must to watch. Savarkar is from Land of Hindu King Shree Chatrapati Shivaji ,Maharaj. The movie deserve all the love of Patriot Indians. Must watch movie for everyone. Randeep, Ankita Lokhande all did oustading efforts to make this movie. Epic Tale of Heroism for The Nation.
Predicted sentiment: Positive
Prediction time: 0.16 seconds


In [None]:
# Layer-by-layer breakdown of the model architecture:

# 1. **Embedding Layer**:
#    - The `Embedding` layer is responsible for converting the sequences of numbers representing words into dense vectors of fixed size. It essentially learns to represent words in a continuous vector space where words with similar meanings have similar vector representations.
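#    - `input_dim=5000` specifies the size of the vocabulary the embedding can index. It matches the tokenizer's `num_words=5000` limit, so only the most frequent words in the dataset get an embedding, not every unique word.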
#    - `output_dim=embedding_dim=128` specifies the dimensionality of the dense embedding. Each word will be represented by a vector of size 128.
#    - `input_length=100` specifies the length of the input sequences, i.e., the fixed length of every review after padding or truncation.

# 2. **SpatialDropout1D Layer**:
#    - The `SpatialDropout1D` layer applies dropout to the embeddings. Dropout is a regularization technique used to prevent overfitting by randomly setting a fraction of input units to zero during training. 
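#    - `0.2` is the dropout rate. Unlike ordinary dropout, `SpatialDropout1D` drops entire feature channels rather than individual elements: during each training update, about 20% of the 128 embedding dimensions are set to zero across all time steps of a sequence.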

# 3. **LSTM Layer**:
#    - The `LSTM` (Long Short-Term Memory) layer is a type of recurrent neural network (RNN) layer that is capable of learning long-term dependencies in sequence data.
#    - `100` is the number of units (or cells) in the LSTM layer. This parameter controls the dimensionality of the output space.
#    - `dropout=0.2` specifies the dropout rate for the input units of the LSTM layer. It means that 20% of the input units will be dropped during training.
#    - `recurrent_dropout=0.2` specifies the dropout rate applied to the recurrent state (the hidden state passed from one time step to the next), i.e., 20% of the units feeding the recurrent transformation are dropped during training.

# 4. **Dense Layer with Sigmoid Activation**:
#    - The `Dense` layer is a fully connected layer where each neuron is connected to every neuron in the previous layer.
#    - `1` specifies the number of neurons in the layer, as we are dealing with binary classification (positive or negative sentiment), so we have only one output neuron.
#    - `activation='sigmoid'` specifies the activation function for the output neuron. Sigmoid activation squashes the output to the range [0, 1], which is suitable for binary classification where the output represents the probability of the positive class.

# In summary, this architecture takes tokenized and padded sequences of words, processes them through an embedding layer, applies dropout for regularization, passes the sequences through an LSTM layer to capture sequential information, and finally, produces a binary sentiment prediction using a dense layer with a sigmoid activation function.