In [223]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import nltk 
import scipy.sparse as sp
from nltk.corpus import stopwords
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics import accuracy_score, classification_report


In [224]:
df = pd.read_csv('IMDB Dataset.csv')
df.head(5)

Unnamed: 0,review,sentiment
0,One of the other reviewers has mentioned that ...,positive
1,A wonderful little production. <br /><br />The...,positive
2,I thought this was a wonderful way to spend ti...,positive
3,Basically there's a family where a little boy ...,negative
4,"Petter Mattei's ""Love in the Time of Money"" is...",positive


In [225]:
df.sample(10)

Unnamed: 0,review,sentiment
38361,"Endless repetition about the evil World Bank, ...",negative
25104,This is the biggest piece of crap ever. It loo...,negative
38661,I saw this as a child in the late eighties and...,positive
32835,I MAY have seen an episode or 2 when the show ...,positive
5796,This show is totally worth watching. It has th...,positive
34354,Why am I so convinced there's actually another...,positive
16567,"This movie was awful, plain and simple! The an...",negative
39248,The Last Command (1928) is a silent film direc...,positive
5055,I had never seen such an incredible acting job...,positive
6966,"This film is shoddily-made, unoriginal garbage...",negative


In [226]:
df.info() # get the information of the data

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 50000 entries, 0 to 49999
Data columns (total 2 columns):
 #   Column     Non-Null Count  Dtype 
---  ------     --------------  ----- 
 0   review     50000 non-null  object
 1   sentiment  50000 non-null  object
dtypes: object(2)
memory usage: 781.4+ KB


In [227]:
import re
# change the ojbect to lower case and remove the sepcial characters
def preprocess_text(text):
    text = text.lower()
    text = re.sub(r'<.*?>', '', text)
    text = re.sub(r'[^\w\s]', '', text)
    return text

df['review']=df['review'].apply(preprocess_text)
df.head(5)

Unnamed: 0,review,sentiment
0,one of the other reviewers has mentioned that ...,positive
1,a wonderful little production the filming tech...,positive
2,i thought this was a wonderful way to spend ti...,positive
3,basically theres a family where a little boy j...,negative
4,petter matteis love in the time of money is a ...,positive


In [228]:
nltk.download('stopwords') # download the stopwords
stop_words = set(stopwords.words('english'))

[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\aravi\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


In [229]:
# Function to remove stopwords
def remove_stopwords(text):
    words = re.findall(r'\w+', text.lower())  # Tokenize text
    filtered_words = [word for word in words if word not in stop_words]
    return ' '.join(filtered_words)

# Apply stopword removal
df['review'] = df['review'].apply(remove_stopwords)

df.head()   

Unnamed: 0,review,sentiment
0,one reviewers mentioned watching 1 oz episode ...,positive
1,wonderful little production filming technique ...,positive
2,thought wonderful way spend time hot summer we...,positive
3,basically theres family little boy jake thinks...,negative
4,petter matteis love time money visually stunni...,positive


In [230]:
df['sentiment'].unique() # get the unique values of the sentiment

array(['positive', 'negative'], dtype=object)

In [231]:
# change the sentiment to 1 and 0
df['sentiment'] = df['sentiment'].map({'positive': 1, 'negative': 0})
df.head()

Unnamed: 0,review,sentiment
0,one reviewers mentioned watching 1 oz episode ...,1
1,wonderful little production filming technique ...,1
2,thought wonderful way spend time hot summer we...,1
3,basically theres family little boy jake thinks...,0
4,petter matteis love time money visually stunni...,1


In [232]:
# Random Forest Classifier
from sklearn.ensemble import RandomForestClassifier

# Apply TF-IDF vectorization with a limited feature size to reduce memory usage
vectorizer = TfidfVectorizer(max_features=5000)
X = vectorizer.fit_transform(df['review'])

x=X
y=df['sentiment']

# Split the data into training and testing sets
x_train, x_test, y_train, y_test = train_test_split(x, y, test_size=0.2, random_state=42)

# Train a Random Forest Classifier
clf = RandomForestClassifier(n_estimators=100, random_state=42)
clf.fit(x_train, y_train)

# Predict the sentiment of the test set
y_pred = clf.predict(x_test)

# Calculate the accuracy of the model
accuracy = accuracy_score(y_test, y_pred)
print("Accuracy: {:.2f}%".format(accuracy * 100))

# Classification report
print("\nClassification Report:\n", classification_report(y_test, y_pred))


Accuracy: 85.02%

Classification Report:
               precision    recall  f1-score   support

           0       0.84      0.86      0.85      4961
           1       0.86      0.84      0.85      5039

    accuracy                           0.85     10000
   macro avg       0.85      0.85      0.85     10000
weighted avg       0.85      0.85      0.85     10000



In [233]:
# classification using logistic regression
from sklearn.linear_model import LogisticRegression

x=X
y=df['sentiment']

# Split the data into training and testing sets
x_train, x_test, y_train, y_test = train_test_split(x, y, test_size=0.2, random_state=42)

# Train a Logistic Regression Classifier
clf = LogisticRegression()
clf.fit(x_train, y_train)

# Predict the sentiment of the test set
y_pred = clf.predict(x_test)

# Calculate the accuracy of the model
accuracy = accuracy_score(y_test, y_pred)
print("Accuracy: {:.2f}%".format(accuracy * 100))

# Classification report
print("\nClassification Report:\n", classification_report(y_test, y_pred))

Accuracy: 88.79%

Classification Report:
               precision    recall  f1-score   support

           0       0.90      0.87      0.89      4961
           1       0.88      0.90      0.89      5039

    accuracy                           0.89     10000
   macro avg       0.89      0.89      0.89     10000
weighted avg       0.89      0.89      0.89     10000



In [234]:

import tensorflow as tf
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import LSTM, Dense, Bidirectional, Dropout, SimpleRNN, Reshape
from sklearn.metrics import classification_report

In [239]:
# Build rnn model
rnn_model = Sequential([
    Reshape((x_train.shape[1], 1), input_shape=(x_train.shape[1],)),
    SimpleRNN(128, return_sequences=False),  # Set return_sequences=False
    Dropout(0.5),
    Dense(1, activation='sigmoid')
])


  super().__init__(**kwargs)


In [236]:
#Build lstm model
lstm_model = Sequential(
    [
        Reshape((x_train.shape[1], 1), input_shape=(x_train.shape[1],)),
        LSTM(128, return_sequences=True),
        Dropout(0.5),
        Dense(1, activation='sigmoid')
    ]
)

In [237]:
# Bidirectional LSTM model
bidirectional_lstm_model = Sequential(
    [
        Reshape((x_train.shape[1], 1), input_shape=(x_train.shape[1],)),
        Bidirectional(LSTM(128, return_sequences=True)),
        Dropout(0.5),
        Dense(1, activation='sigmoid')
    ]
)

In [240]:
rnn_model.compile(optimizer='adam', loss='binary_crossentropy', metrics=['accuracy'])

# Train the model
rnn_model.fit(x_train, y_train, epochs=5, batch_size=32, validation_data=(x_test, y_test), verbose=1)

# Predict the sentiment of the test set
pred = rnn_model.predict(x_test)
pred = (pred > 0.5).astype(int)  # Convert probabilities to 0 or 1

# Calculate accuracy
accuracy = accuracy_score(y_test, pred)
print("Accuracy: {:.2f}%".format(accuracy * 100))

# Classification report
print("\nClassification Report:\n", classification_report(y_test, pred))

Epoch 1/5
[1m1250/1250[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1017s[0m 811ms/step - accuracy: 0.4987 - loss: 0.7381 - val_accuracy: 0.5039 - val_loss: 0.6939
Epoch 2/5
[1m 851/1250[0m [32m━━━━━━━━━━━━━[0m[37m━━━━━━━[0m [1m5:31[0m 831ms/step - accuracy: 0.5025 - loss: 0.6977

KeyboardInterrupt: 

In [None]:
#  Evaluate the lstm model

lstm_model.compile(optimizer='adam', loss='binary_crossentropy', metrics=['accuracy'])  
lstm_model.fit(x_train, y_train, epochs=5, batch_size=32, validation_data=(x_test, y_test), verbose=1)

# Predict the sentiment of the test set
pred = lstm_model.predict(x_test)                
pred=(y_pred > 0.5, 1, 0)

# Calculate the accuracy of the model
accuracy = accuracy_score(y_test, y_pred)
print("Accuracy: {:.2f}%".format(accuracy * 100))

# Classification report
print("\nClassification Report:\n", classification_report(y_test, y_pred))


Epoch 1/5


ValueError: Arguments `target` and `output` must have the same shape. Received: target.shape=(32, 1), output.shape=(32, 5000)

In [None]:
# Evaluate the bidirectional lstm model
model = bidirectional_lstm_model
model.compile(optimizer='adam', loss='binary_crossentropy', metrics=['accuracy'])
model.fit(x_train, y_train, epochs=5, batch_size=32, validation_data=(x_test, y_test), verbose=1)

# Predict the sentiment of the test set
pred = model.predict(x_test)
pred=(y_pred > 0.5, 1, 0)

# Calculate the accuracy of the model
accuracy = accuracy_score(y_test, y_pred)
print("Accuracy: {:.2f}%".format(accuracy * 100))

# Classification report
print("\nClassification Report:\n", classification_report(y_test, y_pred))

Epoch 1/5


ValueError: Arguments `target` and `output` must have the same shape. Received: target.shape=(32, 1), output.shape=(32, 5000)

In [None]:
#Deep learning model with FastText

df_p = df[df['sentiment'] == 1].iloc(0:2500)
df_n = df[df['sentiment'] == 0].iloc(0:2500)

df = pd.concat([df_p, df_n],axis=1)

In [None]:
import fasttext

model = fasttext.train_supervised(input='train.txt', lr=0.01, epoch=25, wordNgrams=2, bucket=200000, dim=50, loss='hs')
model.save_model("model_filename.bin")
print('Model trained and saved successfully!')

In [None]:
#load the BEST model
# Load model directly
from transformers import AutoTokenizer, AutoModelForMaskedLM

tokenizer = AutoTokenizer.from_pretrained("distilbert/distilbert-base-uncased")
model = AutoModelForMaskedLM.from_pretrained("distilbert/distilbert-base-uncased")

In [None]:
from transformers import AutoTokenizer

# Load BERT tokenizer
tokenizer = AutoTokenizer.from_pretrained("distilbert-base-uncased")

# Tokenization function
def tokenize_function(examples):
    return tokenizer(examples["text"], padding="max_length", truncation=True)

# Apply tokenization
tokenized_dataset = dataset.map(tokenize_function, batched=True)

# Rename columns for compatibility
tokenized_dataset = tokenized_dataset.rename_column("label", "labels")
tokenized_dataset = tokenized_dataset.remove_columns(["text"])
tokenized_dataset.set_format("torch")


In [None]:
from transformers import TrainingArguments, Trainer

# Define training parametersgmail
training_args = TrainingArguments(
    output_dir="./bert-custom",
    evaluation_strategy="epoch",
    save_strategy="epoch",
    per_device_train_batch_size=8,from tra
    per_device_eval_batch_size=8,
    num_train_epochs=3,
    weight_decay=0.01,
    logging_dir="./logs",
)

# Initialize Trainer
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=tokenized_dataset["train"],
    eval_dataset=tokenized_dataset["test"],
    tokenizer=tokenizer,
)

# Train the model
trainer.train()

In [None]:
trainer.evaluate()

In [None]:
def predict_sentiment(text):
    inputs = tokenizer(text, return_tensors="pt", padding=True, truncation=True, max_length=512)
    outputs = model(**inputs)
    prediction = outputs.logits.argmax().item()
    return "Positive" if prediction == 1 else "Negative"

# Example reviews
reviews = [
    "Amazing movie with a great storyline!",
    "I didn't enjoy this film at all.",
    "It was okay, nothing special."
]

for review in reviews:
    print(f"Review: {review}\nPredicted Sentiment: {predict_sentiment(review)}\n")


In [241]:
import pickle

# Save model and vectorizer
with open("random_forest_sentiment.pkl", "wb") as f:
    pickle.dump((clf, vectorizer), f)

print("Model saved successfully!")


Model saved successfully!
