In [65]:
import pandas as pd
import numpy as np
import re
import nltk

from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, confusion_matrix, classification_report

nltk.download('punkt')
nltk.download('stopwords')
nltk.download('wordnet')

df = pd.read_csv("C:/Users/vijay/Downloads/archive (1)/Dataset-SA.csv")

print("Columns:", df.columns.tolist())
print(df.head())



[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\vijay\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\vijay\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package wordnet to
[nltk_data]     C:\Users\vijay\AppData\Roaming\nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


Columns: ['product_name', 'product_price', 'Rate', 'Review', 'Summary', 'Sentiment']
                                        product_name product_price Rate  \
0  Candes 12 L Room/Personal Air Cooler??????(Whi...          3999    5   
1  Candes 12 L Room/Personal Air Cooler??????(Whi...          3999    5   
2  Candes 12 L Room/Personal Air Cooler??????(Whi...          3999    3   
3  Candes 12 L Room/Personal Air Cooler??????(Whi...          3999    1   
4  Candes 12 L Room/Personal Air Cooler??????(Whi...          3999    3   

            Review                                            Summary  \
0           super!  great cooler excellent air flow and for this p...   
1          awesome              best budget 2 fit cooler nice cooling   
2             fair  the quality is good but the power of air is de...   
3  useless product                  very bad product its a only a fan   
4             fair                                      ok ok product   

  Sentiment  
0  positive

In [66]:
text_col = 'Review'     
rating_col = 'Rate'   

df = df[df[text_col].notnull()]
df = df[df[text_col].str.strip() != ""]

df = df.drop_duplicates(subset=[text_col])


df[rating_col] = pd.to_numeric(df[rating_col], errors='coerce')
df = df[ df[rating_col].notnull() ]  

df = df[df[rating_col] != 3]  
df['sentiment'] = np.where(df[rating_col] >= 4, 'Positive', 'Negative')

print("After labeling, distribution:")
print(df['sentiment'].value_counts())


After labeling, distribution:
Positive    968
Negative    241
Name: sentiment, dtype: int64


In [67]:
stop_words = set(stopwords.words('english'))
lemmatizer = WordNetLemmatizer()

def preprocess_text(text):
    text = text.lower()
    text = re.sub(r"[^a-z\s]", " ", text)
    tokens = nltk.word_tokenize(text)
    tokens = [t for t in tokens if t not in stop_words and len(t) > 2]
    
    tokens = [lemmatizer.lemmatize(t) for t in tokens]
    return " ".join(tokens)

df['clean_review'] = df[text_col].apply(preprocess_text)

vectorizer = TfidfVectorizer(max_features=10000, ngram_range=(1,2))
X = vectorizer.fit_transform(df['clean_review'])
y = df['sentiment']

X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42, stratify=y
)

model = LogisticRegression(max_iter=1000)
model.fit(X_train, y_train)

y_pred = model.predict(X_test)


In [68]:

print("Accuracy:", accuracy_score(y_test, y_pred))
print("Precision (Positive):", precision_score(y_test, y_pred, pos_label='Positive'))
print("Recall (Positive):", recall_score(y_test, y_pred, pos_label='Positive'))
print("F1 Score (Positive):", f1_score(y_test, y_pred, pos_label='Positive'))
print("\nClassification Report:\n", classification_report(y_test, y_pred))
print("Confusion Matrix:\n", confusion_matrix(y_test, y_pred))


Accuracy: 0.8264462809917356
Precision (Positive): 0.8220338983050848
Recall (Positive): 1.0
F1 Score (Positive): 0.9023255813953489

Classification Report:
               precision    recall  f1-score   support

    Negative       1.00      0.12      0.22        48
    Positive       0.82      1.00      0.90       194

    accuracy                           0.83       242
   macro avg       0.91      0.56      0.56       242
weighted avg       0.86      0.83      0.77       242

Confusion Matrix:
 [[  6  42]
 [  0 194]]
