In [1]:
# Mount Google Drive into the Colab filesystem at /content/drive so the
# project files stored there can be read by the cells below.
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [2]:
import os

# Root folder of the project inside the mounted Google Drive.
project_path = "/content/drive/My Drive/Sentiment_analysis_project"

# Location of the dataset CSV. The components are passed separately so that
# os.path.join inserts exactly one separator between them.
dataset_path = os.path.join(project_path, "Dataset", "IMDB-Dataset.csv")

# Confirm the file is reachable before the later cells try to read it.
print("Dataset Found:", os.path.exists(dataset_path))

Dataset Found: True


In [3]:
#installing required libraries
!pip install scikit-learn nltk joblib



In [4]:
#importing required libraries
import os
import re

import joblib
import nltk
import numpy as np
import pandas as pd
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from sklearn import metrics
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import train_test_split
from sklearn.naive_bayes import MultinomialNB
from sklearn.pipeline import Pipeline

In [5]:
# Download NLTK resources: the 'punkt' tokenizer models and the 'stopwords' corpus
nltk.download('punkt')
nltk.download('stopwords')

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.
[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.


True

In [6]:
# Set project path (root folder of the project on the mounted Google Drive)
project_path = "/content/drive/My Drive/Sentiment_analysis_project"

In [7]:
# Load dataset
# The path components are passed separately so os.path.join supplies the
# separators, instead of embedding a doubled "//" in one component.
dataset_path = os.path.join(project_path, "Dataset", "IMDB-Dataset.csv")
df = pd.read_csv(dataset_path)

In [8]:
# Keep only relevant columns: the review text and its sentiment label
df = df[['review', 'sentiment']]

In [9]:
# Confirm which columns the frame holds after the selection
print(df.columns)

Index(['review', 'sentiment'], dtype='object')


In [10]:
# Text Preprocessing
# Build a set from NLTK's English stopword list.
# NOTE(review): no cell visible in this section reads stop_words afterwards —
# confirm whether it was meant to be passed to the vectorizer or can be removed.
stop_words = set(stopwords.words("english"))

In [11]:
# Compiled once so the pattern is not re-parsed for every review.
_NON_WORD_RUN = re.compile(r'\W+')


def preprocess_text(text):
    """Normalise one review before it is vectorised.

    Lower-cases the text and replaces every run of non-word characters
    (anything other than letters, digits and underscore) with a single space.

    Args:
        text: The raw review. Missing values (``None`` or a float NaN) are
            treated as an empty review; any other non-string value is
            converted with ``str()`` first.

    Returns:
        str: The cleaned text. A leading or trailing run of separators is
        kept as a single space rather than stripped.
    """
    # NaN is the only float that is not equal to itself; like None it has no
    # .lower(), so both are mapped to an empty review instead of raising.
    if text is None or (isinstance(text, float) and text != text):
        return ""
    if not isinstance(text, str):
        text = str(text)
    return _NON_WORD_RUN.sub(' ', text.lower())

In [12]:
# Run preprocess_text over every raw review and store the result in a new column.
# NOTE(review): the following cells reassign this same column from df["review"],
# so this first pass is recomputed rather than built upon.
df["cleaned_review"] = df["review"].apply(preprocess_text)

In [13]:
#handling missing values
# fillna("") swaps missing reviews for empty strings before preprocess_text runs;
# the column is rebuilt from the raw df["review"] text.
df["cleaned_review"] = df["review"].fillna("").apply(preprocess_text)

In [14]:
# Preview the first rows to compare the raw and cleaned review text
df.head()

Unnamed: 0,review,sentiment,cleaned_review
0,One of the other reviewers has mentioned that ...,positive,one of the other reviewers has mentioned that ...
1,A wonderful little production. <br /><br />The...,positive,a wonderful little production br br the filmin...
2,I thought this was a wonderful way to spend ti...,positive,i thought this was a wonderful way to spend ti...
3,Basically there's a family where a little boy ...,negative,basically there s a family where a little boy ...
4,"Petter Mattei's ""Love in the Time of Money"" is...",positive,petter mattei s love in the time of money is a...


In [15]:
# Coerce every review to str before cleaning. Missing values are filled first
# because astype(str) would otherwise turn NaN into the literal text "nan",
# which would then be learned as an ordinary token.
df["cleaned_review"] = df["review"].fillna("").astype(str).apply(preprocess_text)

In [16]:
# Hold out 20% of the rows for evaluation; the fixed seed makes the split repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    df["cleaned_review"],
    df["sentiment"],
    test_size=0.2,
    random_state=42,
)

In [17]:
# Two-stage text classifier: TF-IDF features (vocabulary capped at 5000 terms)
# feeding a multinomial Naive Bayes model.
model_pipeline = Pipeline(
    steps=[
        ("tfidf", TfidfVectorizer(max_features=5000)),
        ("classifier", MultinomialNB()),
    ]
)

In [18]:
# Train the model: fit both pipeline steps on the training split only
model_pipeline.fit(X_train, y_train)

In [19]:
#model evaluation
# Predict labels for the held-out test reviews and report the accuracy score
# of those predictions against y_test.
y_pred = model_pipeline.predict(X_test)
print("Accuracy:", metrics.accuracy_score(y_test, y_pred))

Accuracy: 0.8515


In [20]:
#saving model
# Persist the whole fitted pipeline (vectorizer + classifier) to a single file.
# NOTE(review): the file name is relative, so it is written to the current
# working directory rather than under project_path — confirm that is intended,
# since the Colab runtime's local disk presumably does not persist like Drive.
joblib.dump(model_pipeline, "sentiment_model.pkl")

['sentiment_model.pkl']