# Data Preprocessing

#### Libraries

In [20]:
# Importing libraries
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler, LabelEncoder
from scipy import stats
import matplotlib.pyplot as plt
import seaborn as sns

#### Data Loading

In [21]:
# Load the IMDB reviews CSV.
# A forward slash is used as the path separator: it works on Windows, Linux and
# macOS, and it avoids the invalid "\I" escape sequence that the
# backslash-separated string literal contained.
df = pd.read_csv("data/IMDB Dataset.csv")

In [22]:
# Preview the first five rows of the raw data
print(df.head(n=5))

                                              review sentiment
0  One of the other reviewers has mentioned that ...  positive
1  A wonderful little production. <br /><br />The...  positive
2  I thought this was a wonderful way to spend ti...  positive
3  Basically there's a family where a little boy ...  negative
4  Petter Mattei's "Love in the Time of Money" is...  positive


#### Data Cleaning

In [23]:
# Check dataset structure.
# DataFrame.info() writes its report to stdout itself and returns None, so it is
# called directly; wrapping it in print() appended a stray "None" line to the output.
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 50000 entries, 0 to 49999
Data columns (total 2 columns):
 #   Column     Non-Null Count  Dtype 
---  ------     --------------  ----- 
 0   review     50000 non-null  object
 1   sentiment  50000 non-null  object
dtypes: object(2)
memory usage: 781.4+ KB
None


In [24]:
# Count missing values per column
print(df.isna().sum())

review       0
sentiment    0
dtype: int64


In [25]:
# Count the rows that exactly repeat an earlier row
duplicate_count = df.duplicated().sum()
print(f"Duplicate rows: {duplicate_count}")

Duplicate rows: 418


In [26]:
# Remove rows that are exact duplicates of an earlier row.
# inplace=True mutates `df` itself, so every later cell sees the de-duplicated frame.
df.drop_duplicates(inplace=True)

In [27]:
# Summary statistics. Both columns are object (text) dtype, so describe() reports
# count / unique / top / freq rather than numerical statistics.
print(df.describe())

                                                   review sentiment
count                                               49582     49582
unique                                              49582         2
top     Haven't seen the film since first released, bu...  positive
freq                                                    1     24884


In [28]:
import re

def clean_text(text):
    """Normalize a raw review into lowercase, letters-only, single-spaced text.

    Args:
        text: Raw review string, possibly containing HTML tags such as ``<br />``.

    Returns:
        The cleaned string: HTML tags removed, every character other than
        ASCII letters and whitespace removed, lowercased, and runs of
        whitespace collapsed to single spaces with no leading/trailing space.
    """
    # Replace HTML tags with a space rather than with nothing, so that words
    # separated only by a tag ("great.<br /><br />The") are not fused into one token.
    text = re.sub(r'<.*?>', ' ', text)
    # Remove special characters and digits
    text = re.sub(r'[^a-zA-Z\s]', '', text)
    # Convert text to lowercase
    text = text.lower()
    # Collapse runs of whitespace (including the spaces introduced for tags)
    text = ' '.join(text.split())
    return text

# Run every raw review through clean_text and store the result in a new column
df['cleaned_review'] = df['review'].map(clean_text)


In [29]:
# Show the raw and cleaned text side by side for the first five reviews
print(df.loc[:, ['review', 'cleaned_review']].head())

                                              review  \
0  One of the other reviewers has mentioned that ...   
1  A wonderful little production. <br /><br />The...   
2  I thought this was a wonderful way to spend ti...   
3  Basically there's a family where a little boy ...   
4  Petter Mattei's "Love in the Time of Money" is...   

                                      cleaned_review  
0  one of the other reviewers has mentioned that ...  
1  a wonderful little production the filming tech...  
2  i thought this was a wonderful way to spend ti...  
3  basically theres a family where a little boy j...  
4  petter matteis love in the time of money is a ...  


In [30]:
import nltk
from nltk.tokenize import word_tokenize

# word_tokenize relies on NLTK's Punkt tokenizer data, which is downloaded
# separately from the library. Fetch it here so this cell also runs on a fresh
# environment. Both resource names are requested because the name depends on the
# installed NLTK release ('punkt' on older ones, 'punkt_tab' on newer ones).
for resource in ('punkt', 'punkt_tab'):
    nltk.download(resource, quiet=True)

# Tokenize the reviews
df['tokenized_review'] = df['cleaned_review'].apply(word_tokenize)
print(df[['cleaned_review', 'tokenized_review']].head())


                                      cleaned_review  \
0  one of the other reviewers has mentioned that ...   
1  a wonderful little production the filming tech...   
2  i thought this was a wonderful way to spend ti...   
3  basically theres a family where a little boy j...   
4  petter matteis love in the time of money is a ...   

                                    tokenized_review  
0  [one, of, the, other, reviewers, has, mentione...  
1  [a, wonderful, little, production, the, filmin...  
2  [i, thought, this, was, a, wonderful, way, to,...  
3  [basically, theres, a, family, where, a, littl...  
4  [petter, matteis, love, in, the, time, of, mon...  


In [31]:
import nltk
from nltk.corpus import stopwords

# Fetch the stopword corpus (reported as up-to-date when it is already present)
nltk.download('stopwords')

# English stopwords, held in a set for fast membership tests
stop_words = set(stopwords.words('english'))

# Keep only the tokens that are not stopwords
df['cleaned_review_no_stopwords'] = df['tokenized_review'].map(
    lambda tokens: [token for token in tokens if token not in stop_words]
)

# Compare the token lists before and after stopword removal
print(df[['tokenized_review', 'cleaned_review_no_stopwords']].head())


[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\user\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


                                    tokenized_review  \
0  [one, of, the, other, reviewers, has, mentione...   
1  [a, wonderful, little, production, the, filmin...   
2  [i, thought, this, was, a, wonderful, way, to,...   
3  [basically, theres, a, family, where, a, littl...   
4  [petter, matteis, love, in, the, time, of, mon...   

                         cleaned_review_no_stopwords  
0  [one, reviewers, mentioned, watching, oz, epis...  
1  [wonderful, little, production, filming, techn...  
2  [thought, wonderful, way, spend, time, hot, su...  
3  [basically, theres, family, little, boy, jake,...  
4  [petter, matteis, love, time, money, visually,...  


In [32]:
from nltk.stem import WordNetLemmatizer

# Fetch the WordNet corpus that the lemmatizer looks words up in
nltk.download('wordnet')

# One shared lemmatizer instance for all reviews
lemmatizer = WordNetLemmatizer()

# Lemmatize each stopword-free token list, token by token
df['lemmatized_review'] = df['cleaned_review_no_stopwords'].map(
    lambda tokens: list(map(lemmatizer.lemmatize, tokens))
)

# Compare the token lists before and after lemmatization
print(df[['cleaned_review_no_stopwords', 'lemmatized_review']].head())


[nltk_data] Downloading package wordnet to
[nltk_data]     C:\Users\user\AppData\Roaming\nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


                         cleaned_review_no_stopwords  \
0  [one, reviewers, mentioned, watching, oz, epis...   
1  [wonderful, little, production, filming, techn...   
2  [thought, wonderful, way, spend, time, hot, su...   
3  [basically, theres, family, little, boy, jake,...   
4  [petter, matteis, love, time, money, visually,...   

                                   lemmatized_review  
0  [one, reviewer, mentioned, watching, oz, episo...  
1  [wonderful, little, production, filming, techn...  
2  [thought, wonderful, way, spend, time, hot, su...  
3  [basically, there, family, little, boy, jake, ...  
4  [petter, matteis, love, time, money, visually,...  


#### Feature Engineering, Train/Test Split & Training

In [33]:
import pickle

from sklearn.ensemble import RandomForestClassifier
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics import classification_report
from sklearn.model_selection import train_test_split

# Features: lemmatized tokens joined back into one space-separated string per review
X = df['lemmatized_review'].map(' '.join)
# Target: the sentiment label column
y = df['sentiment']

# Hold out 20% of the reviews for evaluation (fixed seed for repeatability)
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
)

# Fit TF-IDF on the training split only, then reuse the fitted vocabulary on the test split
vectorizer = TfidfVectorizer(max_features=5000)
X_train_tfidf = vectorizer.fit_transform(X_train)
X_test_tfidf = vectorizer.transform(X_test)

# Random Forest classifier on the TF-IDF features
model = RandomForestClassifier(n_estimators=100, random_state=42)
model.fit(X_train_tfidf, y_train)

# Evaluate on the held-out reviews
y_pred = model.predict(X_test_tfidf)
print("Classification Report:\n", classification_report(y_test, y_pred))

# Persist the fitted model and vectorizer (pickle needs binary mode)
with open('random_forest_model.pkl', 'wb') as model_file:
    pickle.dump(model, model_file)
with open('tfidf_vectorizer.pkl', 'wb') as vectorizer_file:
    pickle.dump(vectorizer, vectorizer_file)

Classification Report:
               precision    recall  f1-score   support

    negative       0.84      0.83      0.84      4939
    positive       0.84      0.84      0.84      4978

    accuracy                           0.84      9917
   macro avg       0.84      0.84      0.84      9917
weighted avg       0.84      0.84      0.84      9917



In [34]:
# Load the saved model and vectorizer.
# NOTE: pickle.load can execute arbitrary code from the file; only load pickles
# that were written by this notebook or come from another trusted source.
with open('random_forest_model.pkl', 'rb') as model_file:
    model = pickle.load(model_file)

with open('tfidf_vectorizer.pkl', 'rb') as vectorizer_file:
    vectorizer = pickle.load(vectorizer_file)


def preprocess_review(text):
    """Apply the training-time text pipeline to one raw review string.

    Runs clean_text, word_tokenize, stopword removal and lemmatization in the
    same order as the cells that built df['lemmatized_review'], then joins the
    tokens with single spaces, which is the form the vectorizer was fitted on.

    Args:
        text: Raw review string.

    Returns:
        The preprocessed review as one space-separated string.
    """
    tokens = word_tokenize(clean_text(text))
    tokens = [lemmatizer.lemmatize(token) for token in tokens if token not in stop_words]
    return ' '.join(tokens)


# Sample input for prediction
sample_review = ["I absolutely loved this movie! The acting was fantastic, and the storyline was so heartwarming."]

# The vectorizer was fitted on cleaned, stopword-free, lemmatized text, so raw input
# goes through the same steps before being transformed.
sample_review_tfidf = vectorizer.transform(
    [preprocess_review(review) for review in sample_review]
)

# Predict the sentiment using the trained model
predicted_sentiment = model.predict(sample_review_tfidf)

# Output the prediction
print("Predicted Sentiment:", predicted_sentiment[0])


Predicted Sentiment: positive


In [36]:
import pickle
from pathlib import Path

from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LogisticRegression

# Directory that receives the pickled artifacts. It is defined once so it can be
# changed in a single place when the notebook runs on another machine.
OUTPUT_DIR = Path("D:/Sentimental_Analysis")

# Sample training data
texts = [
    "I loved the movie, it was amazing!",
    "What a great experience, really enjoyed it!",
    "Horrible film, I hated it.",
    "Worst movie ever. Waste of time.",
    "Absolutely fantastic!",
    "Not good. Very boring.",
    "Wonderful plot and great acting.",
    "Terrible movie with a stupid story."
]

labels = [
    "positive", "positive", "negative", "negative",
    "positive", "negative", "positive", "negative"
]

# NOTE: the assignments below rebind `vectorizer`, `X` and `model`, replacing the
# TF-IDF / Random Forest objects created earlier in the notebook.

# Vectorize text
vectorizer = TfidfVectorizer()
X = vectorizer.fit_transform(texts)

# Train model
model = LogisticRegression()
model.fit(X, labels)

# Create the output directory if it does not exist yet, so the save below does
# not fail with FileNotFoundError.
OUTPUT_DIR.mkdir(parents=True, exist_ok=True)

# Save vectorizer and model
with open(OUTPUT_DIR / "vectorizer.pkl", "wb") as f:
    pickle.dump(vectorizer, f)

with open(OUTPUT_DIR / "model.pkl", "wb") as f:
    pickle.dump(model, f)

print("✅ Model and vectorizer saved successfully!")

✅ Model and vectorizer saved successfully!
