In [None]:
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, classification_report
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import train_test_split

In [None]:
# Path of the reviews CSV, kept in one named constant so it is easy to change.
DATASET_PATH = "/content/Dataset-SA.csv"
data = pd.read_csv(DATASET_PATH)

In [None]:
# Preview the freshly loaded frame (a bare expression uses the notebook's rich display).
data

Unnamed: 0,product_name,product_price,Rate,Review,Summary,Sentiment
0,Candes 12 L Room/Personal Air Cooler??????(Whi...,3999,5,super!,great cooler excellent air flow and for this p...,positive
1,Candes 12 L Room/Personal Air Cooler??????(Whi...,3999,5,awesome,best budget 2 fit cooler nice cooling,positive
2,Candes 12 L Room/Personal Air Cooler??????(Whi...,3999,3,fair,the quality is good but the power of air is de...,positive
3,Candes 12 L Room/Personal Air Cooler??????(Whi...,3999,1,useless product,very bad product its a only a fan,negative
4,Candes 12 L Room/Personal Air Cooler??????(Whi...,3999,3,fair,ok ok product,neutral
...,...,...,...,...,...,...
205047,cello Pack of 18 Opalware Cello Dazzle Lush Fi...,1299,5,must buy!,good product,positive
205048,cello Pack of 18 Opalware Cello Dazzle Lush Fi...,1299,5,super!,nice,positive
205049,cello Pack of 18 Opalware Cello Dazzle Lush Fi...,1299,3,nice,very nice and fast delivery,positive
205050,cello Pack of 18 Opalware Cello Dazzle Lush Fi...,1299,5,just wow!,awesome product,positive


In [None]:
# Summarise column dtypes and non-null counts to spot columns with missing data.
data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 120552 entries, 0 to 120551
Data columns (total 6 columns):
 #   Column         Non-Null Count   Dtype 
---  ------         --------------   ----- 
 0   product_name   120552 non-null  object
 1   product_price  120552 non-null  object
 2   Rate           120552 non-null  object
 3   Review         95888 non-null   object
 4   Summary        120542 non-null  object
 5   Sentiment      120551 non-null  object
dtypes: object(6)
memory usage: 5.5+ MB


In [None]:
# Count the null entries in every column and print them after a short label.
missing_per_column = data.isnull().sum()
print("Missing values in columns:", missing_per_column)

Missing values in columns: product_name         0
product_price        0
Rate                 0
Review           24664
Summary             10
Sentiment            1
dtype: int64


In [None]:
# Drop rows with missing 'Review' or 'Sentiment' values.
# Rebinding to an explicit copy (rather than inplace=True) leaves `data` as an
# independent frame, so later column assignments write to a real DataFrame and
# do not raise SettingWithCopyWarning.
data = data.dropna(subset=['Review', 'Sentiment']).copy()

In [None]:
# Encode the sentiment strings as numeric labels: 'positive' -> 1, 'negative' -> 0.
# Any value that is not a key of the mapping (e.g. 'neutral') becomes NaN.
SENTIMENT_CODES = {'negative': 0, 'positive': 1}
data['Sentiment'] = data['Sentiment'].map(SENTIMENT_CODES)

In [None]:
# Re-display the frame to inspect the encoded 'Sentiment' column.
data

Unnamed: 0,product_name,product_price,Rate,Review,Summary,Sentiment
0,Candes 12 L Room/Personal Air Cooler??????(Whi...,3999,5,super!,great cooler excellent air flow and for this p...,1.0
1,Candes 12 L Room/Personal Air Cooler??????(Whi...,3999,5,awesome,best budget 2 fit cooler nice cooling,1.0
2,Candes 12 L Room/Personal Air Cooler??????(Whi...,3999,3,fair,the quality is good but the power of air is de...,1.0
3,Candes 12 L Room/Personal Air Cooler??????(Whi...,3999,1,useless product,very bad product its a only a fan,0.0
4,Candes 12 L Room/Personal Air Cooler??????(Whi...,3999,3,fair,ok ok product,
...,...,...,...,...,...,...
120546,boAt Aavante Bar 2000 160 W Bluetooth Soundbar...,9999,5,excellent,sound quality is superb 160w is more than enou...,1.0
120547,boAt Aavante Bar 2000 160 W Bluetooth Soundbar...,9999,5,best in the market!,i like this product and the packing was nice t...,1.0
120548,boAt Aavante Bar 2000 160 W Bluetooth Soundbar...,9999,5,terrific purchase,excellent product and excellent music bass is ...,1.0
120549,boAt Aavante Bar 2000 160 W Bluetooth Soundbar...,9999,5,fabulous!,superb sound quality excellent bass overall ve...,1.0


In [None]:
# Remove rows whose 'Sentiment' is NaN (labels that were not mapped to 0/1).
# The explicit .copy() makes `data` an independent frame, so the column
# assignments in the following cells no longer trigger SettingWithCopyWarning.
data = data.dropna(subset=['Sentiment']).copy()


In [None]:
# Ensure that 'Sentiment' column is in integer format.
# .assign() builds a new frame instead of writing into a frame that pandas may
# treat as a slice, which avoids the SettingWithCopyWarning this cell used to emit.
data = data.assign(Sentiment=data['Sentiment'].astype('int'))

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  data['Sentiment'] = data['Sentiment'].astype('int')


In [None]:
# Preprocess the text data (convert to lowercase).
# .assign() returns a new frame, so the write does not raise
# SettingWithCopyWarning the way direct column assignment did here.
data = data.assign(Review=data['Review'].str.lower())


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  data['Review'] = data['Review'].str.lower()


In [None]:
# Turn the review text into TF-IDF features (English stop words removed,
# vocabulary capped at 500 terms) and pick the encoded sentiment as the target.
# NOTE(review): the vectorizer is fitted on every review before the train/test
# split, so its vocabulary/IDF statistics also see the test rows. Consider
# fitting on the training reviews only.
vectorizer = TfidfVectorizer(max_features=500, stop_words='english')
y = data['Sentiment']  # target value
X = vectorizer.fit_transform(data['Review'])

In [None]:
# Hold out 20% of the rows for evaluation; the fixed seed keeps the split reproducible.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
)

In [None]:
# Sanity-check the dimensions of the four pieces produced by the split.
print(*(part.shape for part in (X_train, y_train, X_test, y_test)))

(73143, 500) (73143,) (18286, 500) (18286,)


#Logistic Regression

In [None]:
# Logistic regression with L1 regularization, solved with liblinear.
# liblinear shuffles the data internally, so the seed is fixed (matching the
# random_state=42 used by the other models) to make the fit reproducible.
logreg = LogisticRegression(penalty='l1', solver='liblinear', random_state=42)
logreg.fit(X_train, y_train)

In [None]:
# Predict sentiment labels for the held-out test features with the fitted logistic regression.
y_pred = logreg.predict(X_test)

In [None]:
# Accuracy of the logistic regression on the test split.
logreg.score(X_test, y_test)

0.9663677130044843

In [None]:
# Confusion matrix: counts of correct and incorrect predictions per class
# for the logistic-regression predictions.
from sklearn.metrics import confusion_matrix
cm = confusion_matrix(y_true=y_test, y_pred=y_pred)
cm

array([[ 2126,   460],
       [  155, 15545]])

In [None]:
# Per-class precision / recall / F1 for the logistic-regression predictions.
from sklearn.metrics import classification_report
cr = classification_report(y_true=y_test, y_pred=y_pred)
print(cr)

              precision    recall  f1-score   support

           0       0.93      0.82      0.87      2586
           1       0.97      0.99      0.98     15700

    accuracy                           0.97     18286
   macro avg       0.95      0.91      0.93     18286
weighted avg       0.97      0.97      0.97     18286



#DECISION TREE

In [None]:
# Decision tree classifier, seeded for reproducibility, fitted on the training split.
from sklearn.tree import DecisionTreeClassifier
dt_model = DecisionTreeClassifier(random_state=42).fit(X_train, y_train)
dt_model

In [None]:
# Predict test-set labels with the fitted decision tree.
y_pred_dt = dt_model.predict(X_test)

In [None]:
# Show the decision-tree predictions.
y_pred_dt

array([1, 1, 1, ..., 1, 1, 1])

In [None]:
# Accuracy of the decision tree on the test split.
dt_model.score(X_test, y_test)

0.9665317729410478

In [None]:
# Confusion matrix for the decision-tree predictions.
from sklearn.metrics import confusion_matrix
cm = confusion_matrix(y_true=y_test, y_pred=y_pred_dt)
cm

array([[ 2134,   452],
       [  160, 15540]])

In [None]:
# Per-class precision / recall / F1 for the decision-tree predictions.
from sklearn.metrics import classification_report
cr = classification_report(y_true=y_test, y_pred=y_pred_dt)
print(cr)

              precision    recall  f1-score   support

           0       0.93      0.83      0.87      2586
           1       0.97      0.99      0.98     15700

    accuracy                           0.97     18286
   macro avg       0.95      0.91      0.93     18286
weighted avg       0.97      0.97      0.97     18286



#RANDOM FOREST

In [None]:
# Random forest: an ensemble of multiple decision trees, seeded for
# reproducibility and fitted on the training split.
from sklearn.ensemble import RandomForestClassifier
rf_model = RandomForestClassifier(random_state=42).fit(X_train, y_train)
rf_model

In [None]:
# Predict test-set labels with the fitted random forest.
y_pred_rf = rf_model.predict(X_test)

In [None]:
# Show the random-forest predictions.
y_pred_rf

array([1, 1, 1, ..., 1, 1, 1])

In [None]:
# Accuracy of the random forest on the test split.
rf_model.score(X_test, y_test)

0.966586459586569

In [None]:
# Confusion matrix for the random-forest predictions.
from sklearn.metrics import confusion_matrix
cm = confusion_matrix(y_true=y_test, y_pred=y_pred_rf)
cm

array([[ 2132,   454],
       [  157, 15543]])

In [None]:
# Per-class precision / recall / F1 for the random-forest predictions.
from sklearn.metrics import classification_report
cr = classification_report(y_true=y_test, y_pred=y_pred_rf)
print(cr)

              precision    recall  f1-score   support

           0       0.93      0.82      0.87      2586
           1       0.97      0.99      0.98     15700

    accuracy                           0.97     18286
   macro avg       0.95      0.91      0.93     18286
weighted avg       0.97      0.97      0.97     18286



#K-NEAREST NEIGHBORS

In [None]:
# K-Nearest Neighbors classifier using the 5 nearest training points,
# fitted on the training split.
from sklearn.neighbors import KNeighborsClassifier
knn_model = KNeighborsClassifier(n_neighbors=5).fit(X_train, y_train)
knn_model

In [None]:
# Predict test-set labels with the fitted KNN model.
y_pred_knn = knn_model.predict(X_test)

In [None]:
# Show the KNN predictions.
y_pred_knn

array([1, 1, 1, ..., 1, 1, 1])

In [None]:
# Accuracy of the KNN model on the test split.
knn_model.score(X_test, y_test)

0.9660942797768785

In [None]:
# Confusion matrix for the KNN predictions.
from sklearn.metrics import confusion_matrix
cm = confusion_matrix(y_true=y_test, y_pred=y_pred_knn)
cm

array([[ 2127,   459],
       [  161, 15539]])

In [None]:
# Per-class precision / recall / F1 for the KNN predictions.
from sklearn.metrics import classification_report
cr = classification_report(y_true=y_test, y_pred=y_pred_knn)
print(cr)

              precision    recall  f1-score   support

           0       0.93      0.82      0.87      2586
           1       0.97      0.99      0.98     15700

    accuracy                           0.97     18286
   macro avg       0.95      0.91      0.93     18286
weighted avg       0.97      0.97      0.97     18286



#NAIVE BAYES

In [None]:
# Multinomial Naive Bayes classifier fitted on the TF-IDF training features.
from sklearn.naive_bayes import MultinomialNB
nb = MultinomialNB().fit(X_train, y_train)
nb



In [None]:
# Predict test-set labels with the fitted Naive Bayes model.
# Note: this rebinds `y_pred`, which the logistic-regression cells also used.
y_pred = nb.predict(X_test)

In [None]:
# Show the Naive Bayes predictions.
y_pred

array([1, 1, 1, ..., 1, 1, 1])

In [None]:
# Accuracy of the Naive Bayes model on the test split.
nb.score(X_test, y_test)

0.9594771956688177

In [None]:
# Confusion matrix for the Naive Bayes predictions.
from sklearn.metrics import confusion_matrix
cm = confusion_matrix(y_true=y_test, y_pred=y_pred)
cm

array([[ 2013,   573],
       [  168, 15532]])

In [None]:
# Per-class precision / recall / F1 for the Naive Bayes predictions.
from sklearn.metrics import classification_report
cr = classification_report(y_true=y_test, y_pred=y_pred)
print(cr)

              precision    recall  f1-score   support

           0       0.92      0.78      0.84      2586
           1       0.96      0.99      0.98     15700

    accuracy                           0.96     18286
   macro avg       0.94      0.88      0.91     18286
weighted avg       0.96      0.96      0.96     18286



#K-MEANS CLUSTERING

In [None]:
from sklearn.cluster import KMeans
from sklearn.metrics import confusion_matrix

In [None]:
# K-Means clustering is unsupervised, so it is fitted on the features alone
# (no labels). Two clusters are requested to mirror the binary sentiment target.
kmeans = KMeans(n_clusters=2, random_state=42).fit(X)
kmeans

In [None]:
# Assign every row of the full feature matrix to one of the fitted clusters.
# The result holds cluster ids, not sentiment labels; it also rebinds `y_pred`.
y_pred = kmeans.predict(X)

In [None]:
# K-Means cluster ids are arbitrary: id 1 is not guaranteed to be the
# "positive" cluster. The previous mapping (1 -> 1, everything else -> 0) was
# an identity no-op. Compare the ids against the true labels and swap 0/1 when
# the swapped assignment agrees with `y` better.
cluster_ids = np.asarray(y_pred)
true_labels = np.asarray(y)
if (cluster_ids == true_labels).mean() < 0.5:
    y_pred_mapped = 1 - cluster_ids  # swap the two cluster ids
else:
    y_pred_mapped = cluster_ids

In [None]:
# Confusion matrix of the true sentiment labels against the mapped cluster labels.
cm = confusion_matrix(y_true=y, y_pred=y_pred_mapped)
cm

array([[12814,    57],
       [73728,  4830]])

In [None]:
# Per-class precision / recall / F1 for the K-Means clusters.
# The report is built from `y_pred_mapped` (not the raw cluster ids in
# `y_pred`) so it describes the same labels as the confusion matrix, which is
# also computed from `y_pred_mapped`.
from sklearn.metrics import classification_report
cr = classification_report(y, y_pred_mapped)
print(cr)

              precision    recall  f1-score   support

           0       0.15      1.00      0.26     12871
           1       0.99      0.06      0.12     78558

    accuracy                           0.19     91429
   macro avg       0.57      0.53      0.19     91429
weighted avg       0.87      0.19      0.14     91429



#ADA BOOSTING

In [None]:
from sklearn.ensemble import AdaBoostClassifier


In [None]:
# AdaBoost classifier with 100 estimators, seeded for reproducibility and
# fitted on the training split.
ada = AdaBoostClassifier(n_estimators=100, random_state=42).fit(X_train, y_train)
ada



In [None]:
# Predict test-set labels with the fitted AdaBoost model (rebinds `y_pred`).
y_pred = ada.predict(X_test)

In [None]:
# Show the AdaBoost predictions.
y_pred

array([1, 1, 1, ..., 1, 1, 1])

In [None]:
# Accuracy of the AdaBoost model on the test split.
ada.score(X_test, y_test)

0.9663677130044843

In [None]:
# Classification report: per-class precision / recall / F1 for the AdaBoost predictions.
from sklearn.metrics import classification_report
cr = classification_report(y_true=y_test, y_pred=y_pred)
print(cr)

              precision    recall  f1-score   support

           0       0.93      0.82      0.87      2586
           1       0.97      0.99      0.98     15700

    accuracy                           0.97     18286
   macro avg       0.95      0.91      0.93     18286
weighted avg       0.97      0.97      0.97     18286

