In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import warnings
warnings.filterwarnings('ignore')

### 1. Data Description

In [2]:
#Load the dataset
df_OG = pd.read_csv('Amazon_Reviews.csv')

In [3]:
#Display first five rows
df_OG.head()

Unnamed: 0,ProductId,UserId,ProfileName,HelpfulnessNumerator,HelpfulnessDenominator,Ratings,Date,Summary
0,B001E4KFG0,A3SGXH7AUHU8GW,delmartian,1,1,5,4/27/2011,Good Quality Dog Food
1,B00813GRG4,A1D87F6ZCVE5NK,dll pa,0,0,1,9/7/2012,Not as Advertised
2,B000LQOCH0,ABXLMWJIXXAIN,"Natalia Corres ""Natalia Corres""",1,1,4,8/18/2008,"""Delight"" says it all"
3,B000UA0QIQ,A395BORC6FGVXV,Karl,3,3,2,6/13/2011,Cough Medicine
4,B006K2ZZ7K,A1UQRSCLF8GW1T,"Michael D. Bigham ""M. Wassir""",0,0,5,10/21/2012,Great taffy


In [4]:
#Display Last five rows
df_OG.tail()

Unnamed: 0,ProductId,UserId,ProfileName,HelpfulnessNumerator,HelpfulnessDenominator,Ratings,Date,Summary
516,B000G6RYNE,A38KP1POQ191WT,"Judy Schinske ""Veronica""",0,1,1,7/14/2010,"I have had better ""Jalapeno Kettle Chips"""
517,B000G6RYNE,A1AVLKAXW55QQA,M. Simpson,0,1,4,2/1/2010,Spicy but good
518,B000G6RYNE,A2DQBXU2LEVWWA,V. Lowe,0,1,3,12/25/2009,boulder salt and malt vinegar chips are way be...
519,B000G6RYNE,A4NDPXCYKRCY2,Janet R. Miles,0,1,2,11/19/2009,POTATO CHIPS
520,B000G6RYNE,A3TK2IOP8UQ087,"Alabaster Jones ""perpenhopher""",0,1,5,10/11/2009,"Lightly Salted, Heavily Delicious!"


In [5]:
#Shape of Data
df_OG.shape

(521, 8)

In [6]:
df_OG.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 521 entries, 0 to 520
Data columns (total 8 columns):
 #   Column                  Non-Null Count  Dtype 
---  ------                  --------------  ----- 
 0   ProductId               521 non-null    object
 1   UserId                  521 non-null    object
 2   ProfileName             521 non-null    object
 3   HelpfulnessNumerator    521 non-null    int64 
 4   HelpfulnessDenominator  521 non-null    int64 
 5   Ratings                 521 non-null    int64 
 6   Date                    521 non-null    object
 7   Summary                 521 non-null    object
dtypes: int64(3), object(5)
memory usage: 32.7+ KB


### 2. Data Cleaning & Preprocessing

In [7]:
#Checking for null values
df_OG.isna().sum()

ProductId                 0
UserId                    0
ProfileName               0
HelpfulnessNumerator      0
HelpfulnessDenominator    0
Ratings                   0
Date                      0
Summary                   0
dtype: int64

In [8]:
#Checking for duplicate values
df_OG.duplicated().sum()

np.int64(0)

### 3. Feature Engineering

In [9]:
# Creating Sentiment column from Ratings for sentiment analysis
def get_sentiment(rating):
    if rating >= 4:
        return 'positive'
    elif rating == 3:
        return 'neutral'
    else:
        return 'negative'

df_OG['Sentiment'] = df_OG['Ratings'].apply(get_sentiment)

In [10]:
df_OG.head()

Unnamed: 0,ProductId,UserId,ProfileName,HelpfulnessNumerator,HelpfulnessDenominator,Ratings,Date,Summary,Sentiment
0,B001E4KFG0,A3SGXH7AUHU8GW,delmartian,1,1,5,4/27/2011,Good Quality Dog Food,positive
1,B00813GRG4,A1D87F6ZCVE5NK,dll pa,0,0,1,9/7/2012,Not as Advertised,negative
2,B000LQOCH0,ABXLMWJIXXAIN,"Natalia Corres ""Natalia Corres""",1,1,4,8/18/2008,"""Delight"" says it all",positive
3,B000UA0QIQ,A395BORC6FGVXV,Karl,3,3,2,6/13/2011,Cough Medicine,negative
4,B006K2ZZ7K,A1UQRSCLF8GW1T,"Michael D. Bigham ""M. Wassir""",0,0,5,10/21/2012,Great taffy,positive


In [11]:
# Keep only necessary columns
df = df_OG[['Summary', 'Ratings', 'Sentiment']]

In [12]:
import re

In [13]:
# Manual stopwords list
stop_words = set([
    'i', 'me', 'my', 'myself', 'we', 'our', 'ours', 'ourselves', 'you', 'your',
    'yours', 'yourself', 'yourselves', 'he', 'him', 'his', 'himself', 'she',
    'her', 'hers', 'herself', 'it', 'its', 'itself', 'they', 'them', 'their',
    'theirs', 'themselves', 'what', 'which', 'who', 'whom', 'this', 'that',
    'these', 'those', 'am', 'is', 'are', 'was', 'were', 'be', 'been', 'being',
    'have', 'has', 'had', 'having', 'do', 'does', 'did', 'doing', 'a', 'an',
    'the', 'and', 'but', 'if', 'or', 'because', 'as', 'until', 'while', 'of',
    'at', 'by', 'for', 'with', 'about', 'against', 'between', 'into', 'through',
    'during', 'before', 'after', 'above', 'below', 'to', 'from', 'up', 'down',
    'in', 'out', 'on', 'off', 'over', 'under', 'again', 'further', 'then',
    'once', 'here', 'there', 'when', 'where', 'why', 'how', 'all', 'any', 'both',
    'each', 'few', 'more', 'most', 'other', 'some', 'such', 'no', 'nor', 'not',
    'only', 'own', 'same', 'so', 'than', 'too', 'very', 'can', 'will', 'just',
    'don', 'should', 'now'
])

In [14]:
# Clean text
def clean_text(text):
    text = text.lower()
    text = re.sub(r"http\S+|www\S+", "", text)
    text = re.sub(r"[^a-zA-Z\s]", "", text)
    text = " ".join(word for word in text.split() if word not in stop_words)
    return text.strip()

df["Cleaned_Summary"] = df["Summary"].apply(clean_text)

In [15]:
df.head()

Unnamed: 0,Summary,Ratings,Sentiment,Cleaned_Summary
0,Good Quality Dog Food,5,positive,good quality dog food
1,Not as Advertised,1,negative,advertised
2,"""Delight"" says it all",4,positive,delight says
3,Cough Medicine,2,negative,cough medicine
4,Great taffy,5,positive,great taffy


### 4. Text Vectorization and Data Balancing

In [16]:
from sklearn.model_selection import train_test_split
from imblearn.over_sampling import SMOTE
from sklearn.feature_extraction.text import TfidfVectorizer

In [17]:
# TF-IDF Vectorization
vectorizer = TfidfVectorizer(ngram_range=(1,2), max_features=2000)
X_vectorized = vectorizer.fit_transform(df["Cleaned_Summary"])

# Extract target
y_target = df["Sentiment"]

In [18]:
# Apply SMOTE for balancing target 
smote = SMOTE(random_state=42)
X, y = smote.fit_resample(X_vectorized, y_target)

In [19]:
# Splitting dataset
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, stratify=y, random_state=42)

### 5. Sentiment Classification Models and Evaluation Metrics

**Logistic Regression**

In [20]:
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score
from sklearn.linear_model import LogisticRegression

In [21]:
LR_model = LogisticRegression(max_iter=1000)

In [22]:
LR_model.fit(X_train, y_train)

In [23]:
y_predLR = LR_model.predict(X_test)

In [24]:
Accuracy_LR = accuracy_score(y_test, y_predLR)
Precision_LR = precision_score(y_test, y_predLR, average='macro', zero_division=0)
Recall_LR = recall_score(y_test, y_predLR, average='macro', zero_division=0)
F1_Score_LR = f1_score(y_test, y_predLR, average='macro', zero_division=0)

**Naive Bayes**

In [25]:
from sklearn.naive_bayes import MultinomialNB

In [26]:
NB_model = MultinomialNB()

In [27]:
NB_model.fit(X_train, y_train)

In [28]:
y_predNB = NB_model.predict(X_test)

In [29]:
Accuracy_NB = accuracy_score(y_test, y_predNB)
Precision_NB = precision_score(y_test, y_predNB, average='macro', zero_division=0)
Recall_NB = recall_score(y_test, y_predNB, average='macro', zero_division=0)
F1_Score_NB = f1_score(y_test, y_predNB, average='macro', zero_division=0)

**Random Forest**

In [30]:
from sklearn.ensemble import RandomForestClassifier

In [31]:
RF_model = RandomForestClassifier(n_estimators=100)

In [32]:
RF_model.fit(X_train, y_train)

In [33]:
y_predRF = RF_model.predict(X_test)

In [34]:
Accuracy_RF = accuracy_score(y_test, y_predRF)
Precision_RF = precision_score(y_test, y_predRF, average='macro', zero_division=0)
Recall_RF = recall_score(y_test, y_predRF, average='macro', zero_division=0)
F1_Score_RF = f1_score(y_test, y_predRF, average='macro', zero_division=0)

**Gradient Boosting**

In [35]:
from sklearn.ensemble import GradientBoostingClassifier

In [36]:
GB_model = GradientBoostingClassifier()

In [37]:
GB_model.fit(X_train, y_train)

In [38]:
y_predGB = GB_model.predict(X_test)

In [39]:
Accuracy_GB = accuracy_score(y_test, y_predGB)
Precision_GB = precision_score(y_test, y_predGB, average='macro', zero_division=0)
Recall_GB = recall_score(y_test, y_predGB, average='macro', zero_division=0)
F1_Score_GB = f1_score(y_test, y_predGB, average='macro', zero_division=0)

### 6. Model Evaluation

In [40]:
# Comparing models evaluation metrics
results = { 'Model': ['Logistic Regression', 'Naive Bayes', 'Random Forest', 'Gradient Boosting'],
            'Accuracy': [Accuracy_LR, Accuracy_NB, Accuracy_RF, Accuracy_GB],
            'Precision': [Precision_LR, Precision_NB, Precision_RF, Precision_GB],
            'Recall': [Recall_LR, Recall_NB, Recall_RF, Recall_GB],
            'F1 Score': [F1_Score_LR, F1_Score_NB, F1_Score_RF, F1_Score_GB]
        }

In [41]:
# Create a summary table
res_df = pd.DataFrame(results)
res_df.sort_values(by="F1 Score", ascending=False).reset_index(drop=True)

Unnamed: 0,Model,Accuracy,Precision,Recall,F1 Score
0,Logistic Regression,0.937008,0.937984,0.936741,0.936497
1,Gradient Boosting,0.901575,0.908934,0.901261,0.901841
2,Naive Bayes,0.889764,0.902025,0.889916,0.887274
3,Random Forest,0.885827,0.905032,0.886181,0.884247


### Interpretations

1. **Logistic Regression outperforms all other models**, achieving the highest accuracy (93.7%) and the most balanced precision, recall, and F1 score — making it the best choice for this sentiment classification task.

2. **Gradient Boosting performs well**, with over 90% in all metrics, but slightly trails behind Logistic Regression. It’s still a strong model, especially if feature interactions are important.

3. **Random Forest and Naive Bayes show similar results**, both around 89% accuracy, but they lag in F1 score, which indicates they’re slightly less balanced in handling true positives and false positives/negatives.

4. **Precision and recall are well-balanced** across all models, which is ideal in sentiment analysis where we want to correctly identify both positive and negative sentiments.

5. **Naive Bayes, while simple and fast**, performs decently but makes strong independence assumptions that may not hold true in real-world text data, affecting its accuracy slightly.

6. **Overall, Logistic Regression offers the best trade-off** between simplicity, interpretability, and performance — making it the most practical model for deployment in this scenario.

### 7. Saving Models to .pkl file

In [42]:
import pickle

In [43]:
# Store models in a dictionary
models = {
    'Logistic_Regression': LR_model,
    'Naive Bayes': NB_model,
    'Random_Forest': RF_model,
    'Gradient_Boosting': GB_model
}

# Save the model to a .pkl file
for model_name, model in models.items():
    with open(f'{model_name}.pkl', 'wb') as file:
        pickle.dump(model, file)
    
    print(f"{model_name} saved to {model_name}.pkl")

Logistic_Regression saved to Logistic_Regression.pkl
Naive Bayes saved to Naive Bayes.pkl
Random_Forest saved to Random_Forest.pkl
Gradient_Boosting saved to Gradient_Boosting.pkl


### 8. Saving Preprocessed dataset to CSV file for dashboarding

In [44]:
df_OG.to_csv('Cleaned_reviews.csv', index=False)

print('Dataset saved as Cleaned_reviews.csv')

Dataset saved as Cleaned_reviews.csv
