<a href="https://colab.research.google.com/github/Siddhi-S-Thakur/Movie-Review-Sentiment-Analysis/blob/main/movieReview.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.model_selection import train_test_split

In [None]:
# Load the IMDB reviews with the pure-Python parser; rows the parser cannot
# handle are skipped rather than raising.
df = pd.read_csv(
    "IMDB Dataset.csv",
    engine="python",
    on_bad_lines="skip",
)

## Data preprocessing

In [None]:
# Encode the sentiment labels as integers (positive -> 1, negative -> 0).
# A per-value lookup replaces `DataFrame.replace(..., inplace=True)`: pandas
# 2.2 deprecates the implicit object -> int downcast that `replace` performs
# here. Values outside the mapping are passed through unchanged, so re-running
# this cell on already-encoded labels is a no-op.
SENTIMENT_LABELS = {"positive": 1, "negative": 0}
df["sentiment"] = df["sentiment"].map(lambda label: SENTIMENT_LABELS.get(label, label))

  df.replace({"sentiment": {"positive": 1, "negative": 0}}, inplace=True)


In [None]:
# Remove fully duplicated rows in place, keeping the first occurrence of each.
df.drop_duplicates(keep="first", inplace=True)

In [None]:
# Case-normalise the text: every review is lower-cased.
df["review"] = df["review"].str.lower()

In [None]:

# Removing HTML tags from reviews
import re

def remove_html(text):
    """Strip HTML tags from ``text``.

    Each tag is replaced by a single space instead of being deleted outright,
    so words separated only by markup (e.g. ``"great<br /><br />the"``) are
    not fused into a single token.

    Args:
        text: The string to clean.

    Returns:
        ``text`` with every ``<...>`` span replaced by a space.
    """
    # Non-greedy match so each tag is consumed on its own rather than
    # swallowing everything between the first '<' and the last '>'.
    pattern = r'<.*?>'
    return re.sub(pattern, ' ', text)

# Run the HTML stripper over every review, element by element.
df['review'] = df['review'].map(remove_html)


In [None]:
# Sanity check: display the first two rows of the frame.
df.head(n=2)

Unnamed: 0,review,sentiment
0,one of the other reviewers has mentioned that ...,1
1,a wonderful little production. the filming tec...,1


In [None]:
import nltk

In [None]:
from nltk.corpus import stopwords
# Make the NLTK stop-word corpus available locally.
nltk.download('stopwords')

# Held as a set for the membership tests in the filter below.
stop_words = set(stopwords.words('english'))

# Rebuild each review from its whitespace-delimited tokens, dropping every
# token that is an English stop word.
df['review'] = df['review'].apply(
    lambda review: ' '.join(token for token in review.split() if token not in stop_words)
)

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.


In [None]:
def remove_urls(text):
    """Return ``text`` with URLs deleted.

    A URL is anything that starts with ``http://``, ``https://`` or ``www.``
    and runs up to the next whitespace character.
    """
    url_regex = re.compile(r'https?://\S+|www\.\S+')
    return url_regex.sub('', text)

# Cast to str so remove_urls always receives a string, then strip URLs
# element by element.
df['review'] = df['review'].astype(str).map(remove_urls)

In [None]:
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score


In [None]:
# Features are the cleaned review texts; the target is the sentiment column.
X, y = df['review'], df['sentiment']

In [None]:
# Hold out 20% of the rows for evaluation; the fixed seed keeps the split
# reproducible across runs.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
)

### TF-IDF

In [None]:
# Convert the review text to TF-IDF features, capped at 5000 terms, with
# scikit-learn's built-in English stop-word list applied.
tfidf = TfidfVectorizer(stop_words='english', max_features=5000)
# Fit on the training split only, then reuse the fitted vectoriser on the
# test split.
X_train_tfidf = tfidf.fit_transform(X_train)
X_test_tfidf = tfidf.transform(X_test)

### ML Models

In [None]:
# Logistic regression on the TF-IDF features, with the solver's iteration
# cap set to 500.
log_model = LogisticRegression(max_iter=500)
log_model.fit(X_train_tfidf, y_train)

In [None]:
# Score the logistic-regression model on the held-out split.
y_pred = log_model.predict(X_test_tfidf)
accuracy = accuracy_score(y_true=y_test, y_pred=y_pred)
print(f"Accuracy: {accuracy:.4f}")

Accuracy: 0.8802


In [None]:
# Random Forest Model

from sklearn.ensemble import RandomForestClassifier

# Random forest of 100 trees, seeded for reproducibility.
rnd_model = RandomForestClassifier(random_state=42, n_estimators=100)
rnd_model.fit(X_train_tfidf, y_train)

In [None]:
# Score the random forest on the held-out split.
y_pred = rnd_model.predict(X_test_tfidf)
accuracy = accuracy_score(y_true=y_test, y_pred=y_pred)
print(f"Accuracy: {accuracy:.4f}")

Accuracy: 0.8421


In [None]:
#Decision Tree Model
from sklearn.tree import DecisionTreeClassifier

# Single decision tree with default settings, seeded for reproducibility.
dt_model = DecisionTreeClassifier(random_state=42)
dt_model.fit(X_train_tfidf, y_train)

In [None]:
# Score the decision tree on the held-out split.
y_pred = dt_model.predict(X_test_tfidf)
accuracy = accuracy_score(y_true=y_test, y_pred=y_pred)
print(f"Accuracy: {accuracy:.4f}")

Accuracy: 0.7129


In [None]:
# XGBoost Model
from xgboost import XGBClassifier

# Gradient-boosted trees evaluated with log-loss, seeded for reproducibility.
# `use_label_encoder` is deliberately not passed: the library reports it as
# unused ('Parameters: { "use_label_encoder" } are not used.'), so it had no
# effect other than emitting that warning on fit.
xgb_model = XGBClassifier(eval_metric='logloss', random_state=42)
xgb_model.fit(X_train_tfidf, y_train)

Parameters: { "use_label_encoder" } are not used.

  bst.update(dtrain, iteration=i, fobj=obj)


In [None]:
# Score the XGBoost model on the held-out split.
y_pred = xgb_model.predict(X_test_tfidf)
accuracy = accuracy_score(y_true=y_test, y_pred=y_pred)
print(f"Accuracy: {accuracy:.4f}")

Accuracy: 0.8480


In [None]:
#KNN Model
from sklearn.neighbors import KNeighborsClassifier

# k-nearest-neighbours classifier voting over the 5 closest training samples.
knn_model = KNeighborsClassifier(n_neighbors=5)
knn_model.fit(X_train_tfidf, y_train)


In [None]:
# Score the KNN model on the held-out split.
y_pred = knn_model.predict(X_test_tfidf)
accuracy = accuracy_score(y_true=y_test, y_pred=y_pred)
print(f"Accuracy: {accuracy:.4f}")

Accuracy: 0.7353


In [None]:
#SVM Model
from sklearn.svm import SVC

# Support vector classifier with a linear kernel, seeded for reproducibility.
svm_model = SVC(random_state=42, kernel='linear')
svm_model.fit(X_train_tfidf, y_train)

In [None]:
# Score the SVM on the held-out split.
y_pred = svm_model.predict(X_test_tfidf)
accuracy = accuracy_score(y_true=y_test, y_pred=y_pred)
print(f"Accuracy: {accuracy:.4f}")

Accuracy: 0.8762


### DL tf-Keras Model

In [None]:
import tensorflow as tf

In [None]:
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense, Embedding, LSTM
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences

In [None]:
# Split the whole frame (text and labels together) 80/20 for the
# deep-learning model, using the same seed as the earlier split.
train_data, test_data = train_test_split(
    df,
    test_size=0.2,
    random_state=42,
)

In [None]:
# Tokenize text data

# The tokenizer is fitted on the training reviews only; num_words=5000 caps
# the vocabulary used when texts are converted to integer sequences.
tokenizer = Tokenizer(num_words=5000)
tokenizer.fit_on_texts(train_data["review"])

# Each review becomes an integer sequence padded/truncated to length 200.
X_train = pad_sequences(tokenizer.texts_to_sequences(train_data["review"]), maxlen=200)
# Fix: the test matrix must come from test_data. It was previously built from
# train_data, which made X_test a copy of X_train and left it misaligned with
# Y_test (different rows and a different row count).
X_test = pad_sequences(tokenizer.texts_to_sequences(test_data["review"]), maxlen=200)

In [None]:
# Preparing target variables: the sentiment labels of each split.
Y_train, Y_test = train_data["sentiment"], test_data["sentiment"]

In [None]:
# Building LSTM model

# The input shape is declared with an explicit Input layer. Passing
# `input_shape` to Embedding is deprecated in current Keras and emits a
# warning when the layer is constructed.
# NOTE(review): a non-zero recurrent_dropout reportedly prevents the fast
# cuDNN LSTM kernel from being used — confirm against the Keras LSTM docs if
# training speed matters.
model = Sequential([
    tf.keras.Input(shape=(200,)),                   # 200 = padded sequence length
    Embedding(input_dim=5000, output_dim=128),      # 5000 = tokenizer num_words
    LSTM(128, dropout=0.2, recurrent_dropout=0.2),
    Dense(1, activation="sigmoid"),                 # single sigmoid output for the 0/1 label
])

  super().__init__(**kwargs)


In [None]:
# Compiling the model: Adam optimiser, binary cross-entropy loss, and
# accuracy reported as the metric.
model.compile(
    loss="binary_crossentropy",
    optimizer="adam",
    metrics=["accuracy"],
)

In [None]:
# Training the model: 8 epochs, batches of 100 samples, with 20% of the
# training data held out for validation.
# NOTE(review): in the recorded run val_loss is lowest at epoch 2 (0.3158) and
# rises afterwards while training accuracy keeps climbing — consider early
# stopping or fewer epochs.
model.fit(
    X_train,
    Y_train,
    batch_size=100,
    epochs=8,
    validation_split=0.2,
)

Epoch 1/8
[1m202/202[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m152s[0m 753ms/step - accuracy: 0.8804 - loss: 0.2971 - val_accuracy: 0.8479 - val_loss: 0.3528
Epoch 2/8
[1m202/202[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m152s[0m 751ms/step - accuracy: 0.9063 - loss: 0.2458 - val_accuracy: 0.8733 - val_loss: 0.3158
Epoch 3/8
[1m202/202[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m152s[0m 752ms/step - accuracy: 0.9254 - loss: 0.2047 - val_accuracy: 0.8677 - val_loss: 0.3240
Epoch 4/8
[1m202/202[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m151s[0m 749ms/step - accuracy: 0.9364 - loss: 0.1793 - val_accuracy: 0.8681 - val_loss: 0.3358
Epoch 5/8
[1m202/202[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m152s[0m 753ms/step - accuracy: 0.9359 - loss: 0.1728 - val_accuracy: 0.8618 - val_loss: 0.3580
Epoch 6/8
[1m202/202[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m151s[0m 748ms/step - accuracy: 0.9567 - loss: 0.1294 - val_accuracy: 0.8614 - val_loss: 0.3689
Epoch 7/8


<keras.src.callbacks.history.History at 0x7ae118a4ce90>