In [2]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
from sklearn.metrics import accuracy_score, classification_report
from keras.models import Sequential

from keras.layers import Embedding, LSTM, Dense
import tensorflow as tf
from tensorflow.keras.preprocessing.text import Tokenizer
from keras.preprocessing.sequence import pad_sequences
from google.colab import drive
import tensorflow as tf
import nltk
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from nltk.stem import WordNetLemmatizer, PorterStemmer
import string
import re

In [3]:
import pandas as pd

# Read the CSV file
df = pd.read_csv('/content/IMDB Dataset.csv')

# Print the first 5 rows
print(df.head(5))


                                              review sentiment
0  One of the other reviewers has mentioned that ...  positive
1  A wonderful little production. <br /><br />The...  positive
2  I thought this was a wonderful way to spend ti...  positive
3  Basically there's a family where a little boy ...  negative
4  Petter Mattei's "Love in the Time of Money" is...  positive


In [4]:
# Count the number of positive and negative reviews
sentiment_counts = df['sentiment'].value_counts()

# Print the counts
print("Number of positive reviews:", sentiment_counts['positive'])
print("Number of negative reviews:", sentiment_counts['negative'])


Number of positive reviews: 25000
Number of negative reviews: 25000


In [5]:
X = df['review'].astype(str)
y = df['sentiment'].map({'positive': 1, 'negative': 0})

In [6]:
print("Length of X:", len(X))
print("Length of y:", len(y))

Length of X: 50000
Length of y: 50000


In [7]:
X

0        One of the other reviewers has mentioned that ...
1        A wonderful little production. <br /><br />The...
2        I thought this was a wonderful way to spend ti...
3        Basically there's a family where a little boy ...
4        Petter Mattei's "Love in the Time of Money" is...
                               ...                        
49995    I thought this movie did a down right good job...
49996    Bad plot, bad dialogue, bad acting, idiotic di...
49997    I am a Catholic taught in parochial elementary...
49998    I'm going to have to disagree with the previou...
49999    No one expects the Star Trek movies to be high...
Name: review, Length: 50000, dtype: object

In [8]:
y

0        1
1        1
2        1
3        0
4        1
        ..
49995    1
49996    0
49997    0
49998    0
49999    0
Name: sentiment, Length: 50000, dtype: int64

In [9]:
nltk.download('stopwords')
nltk.download('punkt')
stop_words = set(stopwords.words('english'))
# Load NLTK WordNet for lemmatization
nltk.download('wordnet')
# Create instances for lemmatization and stemming
lemmatizer = WordNetLemmatizer()
stemmer = PorterStemmer()

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.
[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.
[nltk_data] Downloading package wordnet to /root/nltk_data...


In [10]:
def preprocess_text(text):
    # Remove HTML tags
    text = re.sub(r'<.*?>', '', text)
    # Remove punctuation
    text = text.translate(str.maketrans('', '', string.punctuation))
    # Remove non-word characters and special characters
    text = re.sub(r'[^a-zA-Z\s]', '', text)
    # Lowercasing
    text = text.lower()
    # Tokenize the data
    tokens = word_tokenize(text)
    # Remove stop words
    tokens = [token for token in tokens if token not in stop_words]
    # Apply lemmatization and stemming
    tokens = [lemmatizer.lemmatize(token) for token in tokens]
    tokens = [stemmer.stem(token) for token in tokens]
    # Join tokens back into a string
    preprocessed_text = ' '.join(tokens)
    return preprocessed_text

In [11]:
Xp = X.apply(preprocess_text)

In [12]:
Xp

0        one review mention watch oz episod youll hook ...
1        wonder littl product film techniqu unassum old...
2        thought wonder way spend time hot summer weeke...
3        basic there famili littl boy jake think there ...
4        petter mattei love time money visual stun film...
                               ...                        
49995    thought movi right good job wasnt creativ orig...
49996    bad plot bad dialogu bad act idiot direct anno...
49997    cathol taught parochi elementari school nun ta...
49998    im go disagre previou comment side maltin one ...
49999    one expect star trek movi high art fan expect ...
Name: review, Length: 50000, dtype: object

In [13]:
y

0        1
1        1
2        1
3        0
4        1
        ..
49995    1
49996    0
49997    0
49998    0
49999    0
Name: sentiment, Length: 50000, dtype: int64

In [14]:
print("Length of X:", len(X))
print("Length of y:", len(y))

Length of X: 50000
Length of y: 50000


In [15]:
# Tokenize the text data
tokenizer = Tokenizer()
tokenizer.fit_on_texts(Xp)
Xp_sequences = tokenizer.texts_to_sequences(Xp)

# Pad sequences to ensure uniform length
max_sequence_length = max([len(seq) for seq in Xp_sequences])
Xp_pad = pad_sequences(Xp_sequences, maxlen=max_sequence_length)

In [16]:
Xp_pad

array([[    0,     0,     0, ...,   448,  3350,   387],
       [    0,     0,     0, ...,   278,    20,   153],
       [    0,     0,     0, ...,    16,    10,   128],
       ...,
       [    0,     0,     0, ...,  3267, 16177,  1063],
       [    0,     0,     0, ...,  1799,  1437,   321],
       [    0,     0,     0, ...,   911,   611,     1]], dtype=int32)

In [17]:
# Encode the target variable
label_encoder = LabelEncoder()
y_encoded = label_encoder.fit_transform(y)

In [18]:
y_encoded

array([1, 1, 1, ..., 0, 0, 0])

In [19]:
# Split the data into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(Xp_pad, y_encoded, test_size=0.2, random_state=42)

In [None]:
# Define the model
model = Sequential()
model.add(Embedding(input_dim=len(tokenizer.word_index)+1, output_dim=100))
model.add(LSTM(100, dropout=0.2, recurrent_dropout=0))
model.add(Dense(64, activation='relu'))  # Adding a dense layer
model.add(Dense(1, activation='sigmoid'))

In [21]:
# Compile the model
model.compile(loss='binary_crossentropy', optimizer='adam', metrics=['accuracy'])

In [None]:
# Train the model
model.fit(X_train, y_train, epochs=10, batch_size=64, validation_data=(X_test, y_test))

Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10


<keras.src.callbacks.History at 0x7eefc17a7550>

In [None]:
# Evaluate the model
loss, accuracy = model.evaluate(X_test, y_test)



In [34]:
# New text data for prediction
new_texts = ["The film lacks depth and fails to engage the audience. A forgettable experience", "Poorly executed storyline and lackluster acting make it hard to stay interested. Not worth the ticket price"]

# Tokenize and pad the new text data
new_sequences = tokenizer.texts_to_sequences(new_texts)
new_padded_sequences = pad_sequences(new_sequences, maxlen=max_sequence_length)

# Make predictions
predictions = model.predict(new_padded_sequences)

# Convert predictions to sentiment labels
sentiment_labels = ['Postive' if pred > 0.5 else 'B=Negative' for pred in predictions]

# Print predictions
for text, label in zip(new_texts, sentiment_labels):
    print(f'Text: {text} --> Sentiment: {label}')



Text: The film lacks depth and fails to engage the audience. A forgettable experience --> Sentiment: B=Negative
Text: Poorly executed storyline and lackluster acting make it hard to stay interested. Not worth the ticket price --> Sentiment: B=Negative


In [33]:
# New text data for prediction
new_texts = ["Disappointing. Predictable plot and dull performances make it a waste of time", "This film is a masterpiece, captivating from beginning to end. A must-watch!"]

# Tokenize and pad the new text data
new_sequences = tokenizer.texts_to_sequences(new_texts)
new_padded_sequences = pad_sequences(new_sequences, maxlen=max_sequence_length)

# Make predictions
predictions = model.predict(new_padded_sequences)

# Convert predictions to sentiment labels
sentiment_labels = ['Positive' if pred > 0.5 else 'Negative' for pred in predictions]

# Print predictions
for text, label in zip(new_texts, sentiment_labels):
    print(f'Text: {text} --> Sentiment: {label}')



Text: Disappointing. Predictable plot and dull performances make it a waste of time --> Sentiment: Negative
Text: This film is a masterpiece, captivating from beginning to end. A must-watch! --> Sentiment: Positive
