<a href="https://colab.research.google.com/github/Razib99/Amazon_Sales_Sentiment_Analysis_CSE445_ML/blob/main/Amazon_Sales_Sentiment_Analysis_CSE445.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
import pandas as pd
import numpy as np
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from sklearn.model_selection import train_test_split
from sklearn.naive_bayes import MultinomialNB
from sklearn.svm import SVC
from sklearn.neighbors import KNeighborsClassifier
from sklearn.ensemble import RandomForestClassifier
from xgboost import XGBClassifier
from sklearn.preprocessing import LabelEncoder
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix
import nltk
from nltk.corpus import stopwords
from nltk.stem.porter import PorterStemmer
import re

In [4]:
# Loading dataset
# NOTE(review): `drive` is imported but not used in this cell — confirm whether a later cell needs it.
from google.colab import drive
# Absolute path under /content; the CSV has to be present there before this cell runs.
path = "/content/amazon.csv"
df = pd.read_csv(path)

In [5]:
# Displaying initial information
print(df.head(5))
# DataFrame.info() writes its summary to stdout itself and returns None, so it
# is called directly; wrapping it in print() would add a stray "None" line.
df.info()
# Number of missing values per column.
print(df.isnull().sum())

   product_id                                       product_name  \
0  B07JW9H4J1  Wayona Nylon Braided USB to Lightning Fast Cha...   
1  B098NS6PVG  Ambrane Unbreakable 60W / 3A Fast Charging 1.5...   
2  B096MSW6CT  Sounce Fast Phone Charging Cable & Data Sync U...   
3  B08HDJ86NZ  boAt Deuce USB 300 2 in 1 Type-C & Micro USB S...   
4  B08CF3B7N1  Portronics Konnect L 1.2M Fast Charging 3A 8 P...   

                                            category discounted_price  \
0  Computers&Accessories|Accessories&Peripherals|...             ₹399   
1  Computers&Accessories|Accessories&Peripherals|...             ₹199   
2  Computers&Accessories|Accessories&Peripherals|...             ₹199   
3  Computers&Accessories|Accessories&Peripherals|...             ₹329   
4  Computers&Accessories|Accessories&Peripherals|...             ₹154   

  actual_price discount_percentage rating rating_count  \
0       ₹1,099                 64%    4.2       24,269   
1         ₹349                 43%  

In [8]:
# Preprocessing the data
# The stop-word corpus must be downloaded before stopwords.words() can read it.
nltk.download('stopwords')
# Kept as a set so the membership test in clean_text is a hash lookup.
stop_words = set(stopwords.words('english'))
# Stemmer instance shared by every clean_text call.
ps = PorterStemmer()

def clean_text(text):
    """Normalise a piece of review text for vectorisation.

    Every non-letter character is replaced by a space, the text is lower-cased
    and split on whitespace, English stop words are dropped and the remaining
    tokens are Porter-stemmed.

    Args:
        text: Raw text. Non-string values (e.g. the float NaN pandas uses for a
            missing cell) are treated as empty text.

    Returns:
        str: The stemmed tokens joined by single spaces; '' when nothing remains.
    """
    # re.sub raises TypeError on non-strings, so a single missing cell would
    # abort the whole column-wise apply; map such values to empty text instead.
    if not isinstance(text, str):
        return ''
    letters_only = re.sub('[^a-zA-Z]', ' ', text)
    tokens = letters_only.lower().split()
    stems = [ps.stem(word) for word in tokens if word not in stop_words]
    return ' '.join(stems)

# Run clean_text over every review title and keep the result in a new column
# next to the raw text.
df['cleaned_reviews'] = df['review_title'].map(clean_text)

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


In [9]:
# Converting 'rating' column to float type, coerce errors to NaN
# errors='coerce' turns any value that cannot be parsed as a number into NaN
# instead of raising, so malformed rating strings survive this step as missing values.
df['rating'] = pd.to_numeric(df['rating'], errors='coerce')

In [None]:
# NOTE(review): this cell repeats the nltk/re imports from the first cell and the
# stopwords download from the preprocessing cell; it looks redundant — confirm
# and consider removing it.
import nltk
nltk.download('stopwords')
from nltk.corpus import stopwords
from nltk.stem.porter import PorterStemmer
import re

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.


In [10]:
# Filling NaN values with a default value, for example, 0
# Assign the result back instead of calling fillna(..., inplace=True) on the
# column selection: the in-place form acts on an intermediate object, emits a
# FutureWarning on recent pandas releases and leaves df unchanged once
# Copy-on-Write is enabled.
# NOTE(review): a filled-in 0 is labelled like any other low rating by
# assign_sentiment — confirm that is intended for missing ratings.
df['rating'] = df['rating'].fillna(0)

In [11]:
# Rounding 'rating' column to nearest integer
# astype(int) requires the column to be free of NaN, which the preceding fill step provides.
# NOTE(review): exact .5 values are presumably rounded half-to-even (NumPy semantics) —
# confirm this is acceptable for the rating thresholds applied afterwards.
df['rating'] = df['rating'].round().astype(int)

In [12]:
# Applying the 'assign_sentiment' function to each element in the 'rating' column
def assign_sentiment(rating):
    """Map a numeric rating to a sentiment label.

    Args:
        rating: Numeric rating value.

    Returns:
        int: 1 (positive) when rating >= 4, 0 (neutral) when rating == 3,
        and -1 (negative) for every other value.
    """
    if rating == 3:
        return 0  # Neutral
    # Everything that is not exactly 3 is either positive (>= 4) or negative.
    return 1 if rating >= 4 else -1

# Label every row by passing its rating through assign_sentiment.
df['sentiment'] = df['rating'].map(assign_sentiment)

In [13]:
# Feature extraction
# Vectorise the cleaned (stop-word-free, stemmed) titles produced by clean_text
# rather than the raw 'review_title' text, so the preprocessing step actually
# feeds the models.
# NOTE(review): the vectoriser is fitted on the full dataset before the
# train/test split, so its vocabulary and IDF weights also see the test rows —
# consider fitting on the training portion only.
vectorizer = TfidfVectorizer(max_features=5000)
X = vectorizer.fit_transform(df['cleaned_reviews']).toarray()
y = df['sentiment']

In [14]:
# Splitting the data
# Hold out 20% of the rows for testing; stratify on y so both parts keep the
# same class proportions, and fix the seed so the split is repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    stratify=y,
    random_state=42,
)

In [15]:
# Encoding the labels
# Learn the label -> index mapping from the training labels only, then apply
# that same mapping to both splits.
label_encoder = LabelEncoder().fit(y_train)
y_train_encoded = label_encoder.transform(y_train)
y_test_encoded = label_encoder.transform(y_test)

In [16]:
# Models to train
# random_state is pinned on every estimator that accepts one so repeated runs
# give the same scores, in line with the seeded train/test split.
models = {
    'Naive Bayes': MultinomialNB(),
    'SVM': SVC(probability=True, random_state=42),
    'KNN': KNeighborsClassifier(n_neighbors=3),
    'Random Forest': RandomForestClassifier(random_state=42),
    # The target has three classes, so the multi-class log loss ('mlogloss') is
    # the matching metric. use_label_encoder is no longer passed: it is
    # deprecated in current XGBoost releases and the labels are already
    # integer-encoded by LabelEncoder.
    'XGBoost': XGBClassifier(eval_metric='mlogloss', random_state=42)
}

# Filled by the training loop: model name -> dict of evaluation outputs.
results = {}

In [17]:
# Fitting the models and storing the results
for name, model in models.items():
    # Train on the encoded training labels, then predict the held-out split.
    model.fit(X_train, y_train_encoded)
    y_pred = model.predict(X_test)

    confusion = confusion_matrix(y_test_encoded, y_pred)
    # zero_division=0 is passed so metrics with a zero denominator are reported as 0.
    report = classification_report(y_test_encoded, y_pred, zero_division=0)
    accuracy = accuracy_score(y_test_encoded, y_pred)

    results[name] = dict(
        accuracy=accuracy,
        classification_report=report,
        confusion_matrix=confusion,
    )

In [18]:
# Printing the results
for name, result in results.items():
    # One print call per model; sep="\n" puts each field on its own line and
    # the trailing "\n" on the last field leaves a blank line between models.
    print(
        f"Model: {name}",
        f"Accuracy: {result['accuracy']}",
        f"Classification Report:\n{result['classification_report']}",
        f"Confusion Matrix:\n{result['confusion_matrix']}\n",
        sep="\n",
    )

Model: Naive Bayes
Accuracy: 0.9692832764505119
Classification Report:
              precision    recall  f1-score   support

           0       0.00      0.00      0.00         1
           1       0.00      0.00      0.00         8
           2       0.97      1.00      0.98       284

    accuracy                           0.97       293
   macro avg       0.32      0.33      0.33       293
weighted avg       0.94      0.97      0.95       293

Confusion Matrix:
[[  0   0   1]
 [  0   0   8]
 [  0   0 284]]

Model: SVM
Accuracy: 0.9692832764505119
Classification Report:
              precision    recall  f1-score   support

           0       0.00      0.00      0.00         1
           1       0.00      0.00      0.00         8
           2       0.97      1.00      0.98       284

    accuracy                           0.97       293
   macro avg       0.32      0.33      0.33       293
weighted avg       0.94      0.97      0.95       293

Confusion Matrix:
[[  0   0   1]
 [  0 