In [1]:
import pandas as pd
import numpy as np
import re
import string
import pickle
import nltk

from nltk.corpus import stopwords
from nltk.stem import PorterStemmer
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import train_test_split
from sklearn.naive_bayes import MultinomialNB
from sklearn.preprocessing import LabelEncoder
from sklearn.metrics import classification_report, confusion_matrix

# Fetch the NLTK stop-word corpus required by stopwords.words('english') in the
# text-cleaning cell; the call reports "already up-to-date" when it is present.
nltk.download('stopwords')


[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\varti\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


True

In [3]:
import pandas as pd

# Quick schema check: read the reviews file (or your actual file path)
# and list the column names it provides.
df = pd.read_csv(filepath_or_buffer='Reviews.csv')
print(df.columns)


Index(['product_name', 'product_price', 'Rate', 'Review', 'Summary',
       'Sentiment'],
      dtype='object')


In [5]:
# Columns the rest of the notebook relies on.
columns = ['product_name', 'product_price', 'Rate', 'Review', 'Summary', 'Sentiment']

# Load CSV (replace with actual file name if different), keep only the relevant
# columns and drop rows with missing values in any of them.
# Built as one non-inplace chain: dropna() returns a fresh frame, so re-running
# the cell is idempotent and later column assignments on `df` do not act on a
# slice of the raw frame (which can raise SettingWithCopyWarning).
df = pd.read_csv('Reviews.csv')[columns].dropna()

# Preview
df.head()


Unnamed: 0,product_name,product_price,Rate,Review,Summary,Sentiment
0,Candes 12 L Room/Personal Air Cooler??????(Whi...,3999,5,super!,great cooler excellent air flow and for this p...,positive
1,Candes 12 L Room/Personal Air Cooler??????(Whi...,3999,5,awesome,best budget 2 fit cooler nice cooling,positive
2,Candes 12 L Room/Personal Air Cooler??????(Whi...,3999,3,fair,the quality is good but the power of air is de...,positive
3,Candes 12 L Room/Personal Air Cooler??????(Whi...,3999,1,useless product,very bad product its a only a fan,negative
4,Candes 12 L Room/Personal Air Cooler??????(Whi...,3999,3,fair,ok ok product,neutral


In [7]:
stop_words = set(stopwords.words('english'))
ps = PorterStemmer()

# Anything that is not an ASCII letter is treated as a separator.
_non_letters = re.compile('[^a-zA-Z]')


def clean_text(text):
    """Normalize one raw review string for vectorization.

    Steps: replace every non-letter character with a space, lowercase,
    split on whitespace, drop English stop words, and Porter-stem what remains.

    Parameters
    ----------
    text : str or any scalar
        Raw text. Missing values (None / NaN) are treated as empty text
        instead of being stringified into the tokens "none" / "nan".

    Returns
    -------
    str
        Space-joined stemmed tokens (possibly the empty string).
    """
    if not isinstance(text, str):
        text = '' if pd.isna(text) else str(text)
    tokens = _non_letters.sub(' ', text).lower().split()
    # NOTE(review): stop-word removal presumably also drops negations such as
    # "not"/"no" -- confirm this is acceptable for sentiment classification.
    return ' '.join(ps.stem(word) for word in tokens if word not in stop_words)


# Combine summary + reviews and clean text.
# Both columns are filled before concatenation: a missing value on either side
# would otherwise make the whole concatenated string NaN.
df['text'] = (df['Summary'].fillna('') + ' ' + df['Review'].fillna('')).apply(clean_text)


In [8]:
# Turn the string sentiment labels into integer class ids.
# Resulting mapping: 0=negative, 1=neutral, 2=positive
le = LabelEncoder()
le.fit(df['Sentiment'])
df['sentiment_encoded'] = le.transform(df['Sentiment'])


In [9]:
# Build TF-IDF features over the cleaned text, capped at the 5000 most frequent terms.
tfidf = TfidfVectorizer(max_features=5000)

# Keep the matrix sparse: fit_transform returns a scipy sparse matrix, and both
# train_test_split and MultinomialNB accept it directly. Densifying with
# .toarray() would allocate rows x 5000 float64 values (several GB for a corpus
# of this size) without changing the model.
# NOTE(review): the vectorizer is fitted on all rows before the train/test split,
# so its vocabulary/IDF statistics include the test text -- consider fitting on
# the training portion only.
X = tfidf.fit_transform(df['text'])
y = df['sentiment_encoded']


In [10]:
# Hold out 20% of the rows for evaluation; stratify on the label so each
# split keeps the same class proportions, and fix the seed for repeatability.
splits = train_test_split(X, y, test_size=0.2, random_state=42, stratify=y)
X_train, X_test, y_train, y_test = splits


In [11]:
# Multinomial Naive Bayes classifier trained on the TF-IDF training features.
model = MultinomialNB()
model.fit(X=X_train, y=y_train)


In [12]:
# Predict labels for the held-out split.
y_pred = model.predict(X_test)

# Confusion matrix (rows = true class, columns = predicted class) and
# per-class precision/recall/F1, labelled with the original sentiment names.
conf_mat = confusion_matrix(y_test, y_pred)
report = classification_report(y_test, y_pred, target_names=le.classes_)

print("Confusion Matrix:\n", conf_mat)
print("\nClassification Report:\n", report)


Confusion Matrix:
 [[ 3884    33   963]
 [  353   261  1148]
 [  541    65 28828]]

Classification Report:
               precision    recall  f1-score   support

    negative       0.81      0.80      0.80      4880
     neutral       0.73      0.15      0.25      1762
    positive       0.93      0.98      0.95     29434

    accuracy                           0.91     36076
   macro avg       0.82      0.64      0.67     36076
weighted avg       0.91      0.91      0.90     36076



In [13]:
# Persist the fitted classifier, vectorizer and label encoder so they can be
# reloaded together for inference. Each object goes to its own pickle file.
artifacts = {
    'naive_bayes_model.pkl': model,
    'tfidf_vectorizer.pkl': tfidf,
    'label_encoder.pkl': le,
}

for filename, obj in artifacts.items():
    with open(filename, 'wb') as f:
        pickle.dump(obj, f)

print("Model, vectorizer, and encoder saved successfully!")


Model, vectorizer, and encoder saved successfully!
