In [1]:
import pandas as pd
import numpy as np
import tensorflow as tf
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from nltk.stem import WordNetLemmatizer
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Embedding, LSTM, Dense

# Load the dataset
df = pd.read_csv("IMDB Dataset.csv")


In [2]:
df.head()

Unnamed: 0,review,sentiment
0,One of the other reviewers has mentioned that ...,positive
1,A wonderful little production. <br /><br />The...,positive
2,I thought this was a wonderful way to spend ti...,positive
3,Basically there's a family where a little boy ...,negative
4,"Petter Mattei's ""Love in the Time of Money"" is...",positive


In [3]:
# Remove <br /> tags from all columns in the DataFrame
df = df.replace('<br />', '', regex=True)

# Write the cleaned DataFrame back to a CSV file
df.to_csv('cleaned_IMDB_Dataset.csv', index=False)

In [4]:
df.head()

Unnamed: 0,review,sentiment
0,One of the other reviewers has mentioned that ...,positive
1,A wonderful little production. The filming tec...,positive
2,I thought this was a wonderful way to spend ti...,positive
3,Basically there's a family where a little boy ...,negative
4,"Petter Mattei's ""Love in the Time of Money"" is...",positive


In [5]:
# Split the dataset into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(df["review"], df["sentiment"], test_size=0.2, random_state=42)

# Convert the tweets into feature vectors using TfidfVectorizer
tfidf_vectorizer = TfidfVectorizer()
X_train = tfidf_vectorizer.fit_transform(X_train)
X_test = tfidf_vectorizer.transform(X_test)

# Train a Random Forest classifier on the training set
rf = RandomForestClassifier()
rf.fit(X_train, y_train)

# Make predictions on the testing set
y_pred = rf.predict(X_test)

# Calculate the accuracy of the classifier
accuracy = accuracy_score(y_test, y_pred)
print("Accuracy:", accuracy)


Accuracy: 0.8486


In [8]:
import nltk
nltk.download('stopwords')
nltk.download('punkt')
nltk.download('wordnet')
nltk.download('omw-1.4')

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package wordnet to /root/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package omw-1.4 to /root/nltk_data...


True

In [9]:
X_train, X_test, y_train, y_test = train_test_split(df["review"], df["sentiment"], test_size=0.2, random_state=42)
# Tokenize and lemmatize tweets
stop_words = set(stopwords.words("english"))
lemmatizer = WordNetLemmatizer()

def tokenize_and_lemmatize(review):
    tokens = word_tokenize(review)
    tokens = [token for token in tokens if token not in stop_words] # remove stop words
    tokens = [lemmatizer.lemmatize(token) for token in tokens] # lemmatize
    return " ".join(tokens)

df["review"] = df["review"].apply(tokenize_and_lemmatize)

# Convert tweets into feature vectors using TF-IDF
tfidf_vectorizer = TfidfVectorizer()
X = tfidf_vectorizer.fit_transform(df["review"])
y = df["sentiment"]

# Train a Random Forest classifier on the dataset
rf = RandomForestClassifier()
rf.fit(X, y)

# Make predictions on the dataset
y_pred = rf.predict(X)

# Calculate the accuracy of the classifier
accuracy = accuracy_score(y, y_pred)
print("Accuracy of the model after Tokenize and lemmatize:", accuracy)

Accuracy of the model after Tokenize and lemmatize: 1.0


Designed and trained the above model using Supervised learning (Recurrent Neural Networks)


In [10]:
# Tokenize the text data
tokenizer = Tokenizer(num_words=5000, split=' ')
tokenizer.fit_on_texts(df['review'].values)
X = tokenizer.texts_to_sequences(df['review'].values)
X = pad_sequences(X)

# Convert the sentiment labels to numeric values
y = pd.get_dummies(df['sentiment']).values

# Split the data into training and testing sets
split = int(0.8 * len(X))
X_train = X[:split]
X_test = X[split:]
y_train = y[:split]
y_test = y[split:]

In [11]:
X_train.shape

(40000, 1088)

In [12]:
y_train.shape

(40000, 2)

In [15]:

# Define the RNN model
model = Sequential()
model.add(Embedding(5000, 128, input_length=X.shape[1]))
model.add(LSTM(128, dropout=0.2, recurrent_dropout=0.2))
model.add(Dense(2, activation='softmax'))

# Compile the model
model.compile(loss='categorical_crossentropy', optimizer='adam', metrics=['accuracy'])

# Train the model
model.fit(X_train, y_train, epochs=3, batch_size=128, validation_data=(X_test, y_test))

# Evaluate the model on the test data
_, accuracy = model.evaluate(X_test, y_test, batch_size=128)
print('Accuracy of RNN:', accuracy)



Epoch 1/3
Epoch 2/3
Epoch 3/3
Accuracy of RNN: 0.885699987411499
