# Sentiment-Based Product Recommendation System #

In [None]:

# Import Libraries

import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

# NLP & Preprocessing
import re
import string
import nltk
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer

# Feature Extraction
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer

# ML Models
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from xgboost import XGBClassifier
from sklearn.naive_bayes import MultinomialNB

# Model Evaluation
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix

# Recommendation System
from sklearn.metrics.pairwise import cosine_similarity

# Ignore warnings
import warnings
warnings.filterwarnings("ignore")


 Data Cleaning & Preprocessing

In [None]:
# Load dataset
# NOTE(review): absolute Google Drive path — presumably only resolves in Colab with Drive mounted.
df = pd.read_csv("/content/drive/MyDrive/Capstone-Projects/sample30.csv")

# Basic Info
print("Shape of dataset:", df.shape)
# DataFrame.info() prints its own report and returns None, so wrapping it in
# print() only appended a stray "None" line.
df.info()
print(df.isnull().sum())

# Drop irrelevant columns
drop_cols = ['id','reviews_userCity','reviews_userProvince',
             'reviews_didPurchase','reviews_doRecommend']
df = df.drop(columns=drop_cols)

# Handle Missing Values
# Assign the filled column back instead of `df[col].fillna(..., inplace=True)`:
# the in-place call operates on a temporary selection and is not guaranteed to
# update `df` (it is a no-op under pandas Copy-on-Write).
df['brand'] = df['brand'].fillna("Unknown")
df['manufacturer'] = df['manufacturer'].fillna("Unknown")

# Drop rows with missing critical text/ratings/usernames, and rows without a
# sentiment label: an unlabelled row would become a NaN target once the label
# is mapped to 0/1 for model training.
df = df.dropna(subset=['reviews_text','reviews_rating','reviews_username','user_sentiment'])

# Convert datatypes
df['reviews_date'] = pd.to_datetime(df['reviews_date'], errors='coerce')  # unparseable dates become NaT
df['reviews_rating'] = df['reviews_rating'].astype(int)
df['user_sentiment'] = df['user_sentiment'].astype('category')

df.head()

Shape of dataset: (30000, 15)
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 30000 entries, 0 to 29999
Data columns (total 15 columns):
 #   Column                Non-Null Count  Dtype 
---  ------                --------------  ----- 
 0   id                    30000 non-null  object
 1   brand                 30000 non-null  object
 2   categories            30000 non-null  object
 3   manufacturer          29859 non-null  object
 4   name                  30000 non-null  object
 5   reviews_date          29954 non-null  object
 6   reviews_didPurchase   15932 non-null  object
 7   reviews_doRecommend   27430 non-null  object
 8   reviews_rating        30000 non-null  int64 
 9   reviews_text          30000 non-null  object
 10  reviews_title         29810 non-null  object
 11  reviews_userCity      1929 non-null   object
 12  reviews_userProvince  170 non-null    object
 13  reviews_username      29937 non-null  object
 14  user_sentiment        29999 non-null  object
dtypes: int

Unnamed: 0,brand,categories,manufacturer,name,reviews_date,reviews_rating,reviews_text,reviews_title,reviews_username,user_sentiment
0,Universal Music,"Movies, Music & Books,Music,R&b,Movies & TV,Mo...",Universal Music Group / Cash Money,Pink Friday: Roman Reloaded Re-Up (w/dvd),2012-11-30 06:21:45+00:00,5,i love this album. it's very good. more to the...,Just Awesome,joshua,Positive
1,Lundberg,"Food,Packaged Foods,Snacks,Crackers,Snacks, Co...",Lundberg,Lundberg Organic Cinnamon Toast Rice Cakes,2017-07-09 00:00:00+00:00,5,Good flavor. This review was collected as part...,Good,dorothy w,Positive
2,Lundberg,"Food,Packaged Foods,Snacks,Crackers,Snacks, Co...",Lundberg,Lundberg Organic Cinnamon Toast Rice Cakes,2017-07-09 00:00:00+00:00,5,Good flavor.,Good,dorothy w,Positive
3,K-Y,"Personal Care,Medicine Cabinet,Lubricant/Sperm...",K-Y,K-Y Love Sensuality Pleasure Gel,2016-01-06 00:00:00+00:00,1,I read through the reviews on here before look...,Disappointed,rebecca,Negative
4,K-Y,"Personal Care,Medicine Cabinet,Lubricant/Sperm...",K-Y,K-Y Love Sensuality Pleasure Gel,2016-12-21 00:00:00+00:00,1,My husband bought this gel for us. The gel cau...,Irritation,walker557,Negative


Text Preprocessing

In [None]:
# Fetch the NLTK corpora that the stopword list and the lemmatizer rely on
for corpus_name in ('stopwords', 'wordnet'):
    nltk.download(corpus_name)

# Shared helpers for the cleaning function below: a WordNet lemmatizer and the
# English stopword list held as a set
lemmatizer = WordNetLemmatizer()
stop_words = set(stopwords.words('english'))

def clean_text(text):
    """Normalize one raw review into a space-joined string of lemmatized tokens.

    Steps: lowercase, strip URLs, replace ASCII punctuation and digits with
    spaces, split on whitespace, drop tokens found in the module-level
    ``stop_words`` and lemmatize the rest with the module-level ``lemmatizer``.

    Parameters
    ----------
    text : str
        Raw review text. Non-string values (e.g. NaN/None for a missing
        review) are treated as having no text.

    Returns
    -------
    str
        The cleaned tokens joined by single spaces; ``""`` when nothing is left.
    """
    # Missing or non-text values have nothing to clean; avoid crashing on .lower()
    if not isinstance(text, str):
        return ""
    # lowercase
    text = text.lower()
    # remove urls
    text = re.sub(r'http\S+', '', text)
    # remove punctuation & numbers.
    # string.punctuation contains `\`, `]`, `^` and `-`, which are special inside
    # a character class; unescaped, the `\]` pair is read as an escaped bracket
    # and backslashes are never stripped. re.escape makes every character literal.
    text = re.sub(f"[{re.escape(string.punctuation)}0-9]", " ", text)
    # tokenize
    tokens = text.split()
    # remove stopwords + lemmatize
    tokens = [lemmatizer.lemmatize(w) for w in tokens if w not in stop_words]
    return " ".join(tokens)

# Run the cleaner over every review, then preview raw vs. cleaned text side by side
df['clean_review'] = df['reviews_text'].map(clean_text)
df.loc[:, ['reviews_text', 'clean_review']].head()


[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.
[nltk_data] Downloading package wordnet to /root/nltk_data...


Unnamed: 0,reviews_text,clean_review
0,i love this album. it's very good. more to the...,love album good hip hop side current pop sound...
1,Good flavor. This review was collected as part...,good flavor review collected part promotion
2,Good flavor.,good flavor
3,I read through the reviews on here before look...,read review looking buying one couple lubrican...
4,My husband bought this gel for us. The gel cau...,husband bought gel u gel caused irritation fel...


Task 3: Feature Extraction (TF-IDF)

In [None]:
# Libraries
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer

# Model input is the cleaned review text; the target is the sentiment label
# encoded as Negative -> 0, Positive -> 1
X = df['clean_review']
y = df['user_sentiment'].map({'Negative': 0, 'Positive': 1})

# Hold out 20% of the rows for evaluation (fixed seed keeps the split repeatable)
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Fit the TF-IDF vocabulary (capped at 5000 terms) on the training split only,
# then project the test split onto that same vocabulary
tfidf = TfidfVectorizer(max_features=5000)
X_train_vec = tfidf.fit_transform(X_train)
X_test_vec = tfidf.transform(X_test)

# Sanity checks: matrix shapes and the first few vocabulary terms
print("TF-IDF Vectorization Complete!")
for caption, value in (
    ("Train shape:", X_train_vec.shape),
    ("Test shape:", X_test_vec.shape),
    ("Sample features:", tfidf.get_feature_names_out()[:10]),
):
    print(caption, value)


✅ TF-IDF Vectorization Complete!
Train shape: (23948, 5000)
Test shape: (5988, 5000)
Sample features: ['aaron' 'ability' 'able' 'abrasive' 'absence' 'absolute' 'absolutely'
 'absolutly' 'absorb' 'absorbed']


Model Building

In [None]:
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from sklearn.naive_bayes import MultinomialNB
from xgboost import XGBClassifier
from sklearn.metrics import classification_report, accuracy_score


def _fit_and_report(label, model):
    """Fit `model` on the TF-IDF training split and print its test-set report.

    Parameters
    ----------
    label : str
        Human-readable model name used in the printed heading.
    model : estimator
        Unfitted classifier exposing ``fit`` and ``predict``; it is fitted in place.

    Returns
    -------
    Predictions of the fitted model for ``X_test_vec``.
    """
    model.fit(X_train_vec, y_train)
    predictions = model.predict(X_test_vec)
    print(f"\nðŸ”¹ {label}:\n", classification_report(y_test, predictions))
    return predictions


# Logistic Regression
log_model = LogisticRegression(max_iter=300)
y_pred_log = _fit_and_report("Logistic Regression", log_model)

# Random Forest
rf_model = RandomForestClassifier(n_estimators=200, random_state=42)
y_pred_rf = _fit_and_report("Random Forest", rf_model)

# Naive Bayes
nb_model = MultinomialNB()
y_pred_nb = _fit_and_report("Naive Bayes", nb_model)

# XGBoost
xgb_model = XGBClassifier(use_label_encoder=False, eval_metric='logloss', random_state=42)
y_pred_xgb = _fit_and_report("XGBoost", xgb_model)

# Compare Accuracies
print("\nModel Accuracies:")
print("Logistic Regression:", accuracy_score(y_test, y_pred_log))
print("Random Forest:", accuracy_score(y_test, y_pred_rf))
print("Naive Bayes:", accuracy_score(y_test, y_pred_nb))
print("XGBoost:", accuracy_score(y_test, y_pred_xgb))

# Later cells (sentiment re-ranking, model export) use `best_model`, which was
# never assigned anywhere in the notebook, so a fresh Restart & Run All failed
# with NameError. The export cell's message names XGBoost as the best model, so
# bind it explicitly here.
# NOTE(review): accuracy is dominated by the majority (positive) class; confirm
# the choice against minority-class recall / macro-F1 in the reports above.
best_model = xgb_model



ðŸ”¹ Logistic Regression:
               precision    recall  f1-score   support

           0       0.89      0.32      0.47       658
           1       0.92      1.00      0.96      5330

    accuracy                           0.92      5988
   macro avg       0.91      0.66      0.71      5988
weighted avg       0.92      0.92      0.90      5988


ðŸ”¹ Random Forest:
               precision    recall  f1-score   support

           0       0.86      0.39      0.54       658
           1       0.93      0.99      0.96      5330

    accuracy                           0.93      5988
   macro avg       0.89      0.69      0.75      5988
weighted avg       0.92      0.93      0.91      5988


ðŸ”¹ Naive Bayes:
               precision    recall  f1-score   support

           0       0.67      0.05      0.10       658
           1       0.90      1.00      0.94      5330

    accuracy                           0.89      5988
   macro avg       0.78      0.53      0.52      5988
weig

Build User-Based (UBCF) & Item-Based (IBCF) Collaborative Filtering

In [None]:
# User x product rating matrix: one row per reviewer, one column per product
# name; user/product pairs with no review are filled with 0
user_item_matrix = pd.pivot_table(
    df,
    values='reviews_rating',
    index='reviews_username',
    columns='name',
).fillna(0)

# User-based CF: cosine similarity between the rows (users) of the matrix
user_similarity = cosine_similarity(user_item_matrix)
user_labels = user_item_matrix.index
user_sim_df = pd.DataFrame(user_similarity, index=user_labels, columns=user_labels)

# Item-based CF: same measure on the transposed matrix, so products are compared
item_similarity = cosine_similarity(user_item_matrix.transpose())
item_labels = user_item_matrix.columns
item_sim_df = pd.DataFrame(item_similarity, index=item_labels, columns=item_labels)


Recommend Top-20 Products

In [None]:
def recommend_user_based(user, top_n=20, n_neighbors=5, min_rating=4):
    """Recommend products for `user` from the reviews of their most similar users.

    Neighbours are taken from the module-level ``user_sim_df`` and candidate
    products from the module-level ``df``.

    Parameters
    ----------
    user : str
        Username; must be a column label of ``user_sim_df``.
    top_n : int, default 20
        Maximum number of product names to return.
    n_neighbors : int, default 5
        Maximum number of similar users to draw candidates from.
    min_rating : int, default 4
        Only neighbour reviews with ``reviews_rating >= min_rating`` count as
        "rated highly". NOTE(review): threshold assumes a 1-5 rating scale —
        confirm. Pass 0 to disable the filter.

    Returns
    -------
    list of str
        Product names ordered by how many qualifying neighbour reviews they
        have; may be shorter than `top_n`, or empty.

    Raises
    ------
    KeyError
        If `user` is not present in ``user_sim_df``.
    """
    if user not in user_sim_df.columns:
        raise KeyError(f"Unknown user: {user!r}")

    # Remove the user explicitly rather than slicing off the first sorted entry:
    # another user with similarity 1.0 can sort ahead of them, in which case the
    # slice kept the user and discarded a genuine neighbour.
    neighbor_scores = user_sim_df[user].drop(labels=user).sort_values(ascending=False)
    # Users with zero similarity share no signal with `user`; taking them would
    # just pad the neighbourhood with arbitrary reviewers.
    similar_users = neighbor_scores[neighbor_scores > 0].head(n_neighbors).index

    # Products the user has already reviewed are not useful recommendations
    already_reviewed = set(df.loc[df['reviews_username'] == user, 'name'])

    # recommend products rated highly by similar users
    candidate_mask = (
        df['reviews_username'].isin(similar_users)
        & (df['reviews_rating'] >= min_rating)
        & ~df['name'].isin(already_reviewed)
    )
    recommended_items = df[candidate_mask]
    return recommended_items['name'].value_counts().head(top_n).index.tolist()

# Pick one reviewer for the demo. The fixed seed makes the pick (and therefore
# every downstream recommendation output) repeatable across Restart & Run All.
username = df['reviews_username'].sample(1, random_state=42).values[0]
top20_products = recommend_user_based(username, top_n=20)
print(f"\nTop 20 Recommendations for {username}:\n", top20_products)



Top 20 Recommendations for greyguy:
 ['Just For Men Touch Of Gray Gray Hair Treatment, Black T-55']


 Fine-Tuning with Sentiment

In [None]:
# For each recommended product, score all of its cleaned reviews with the
# sentiment model and record the percentage predicted positive (label 1)
product_sentiments = {}
for item_name in top20_products:
    item_reviews = df.loc[df['name'] == item_name, 'clean_review']
    if item_reviews.empty:
        continue  # nothing to score for this product
    predicted_labels = best_model.predict(tfidf.transform(item_reviews))
    product_sentiments[item_name] = np.mean(predicted_labels) * 100

# Keep the five products with the highest positive share
ranked_by_share = sorted(product_sentiments.items(), key=lambda pair: pair[1], reverse=True)
top5_products = [item_name for item_name, _share in ranked_by_share][:5]
print("\nTop 5 Final Recommendations with Sentiment:")
for item_name in top5_products:
    print(f"{item_name} --> {product_sentiments[item_name]:.2f}% positive reviews")




Top 5 Final Recommendations with Sentiment:
Planes: Fire Rescue (2 Discs) (includes Digital Copy) (blu-Ray/dvd) --> 95.98% positive reviews
Godzilla 3d Includes Digital Copy Ultraviolet 3d/2d Blu-Ray/dvd --> 94.47% positive reviews
Clorox Disinfecting Bathroom Cleaner --> 92.25% positive reviews
Clorox Disinfecting Wipes Value Pack Scented 150 Ct Total --> 91.17% positive reviews
Mike Dave Need Wedding Dates (dvd + Digital) --> 91.15% positive reviews


Deployment with Flask

In [None]:
import joblib
import os

# Create models folder. exist_ok=True makes this a no-op when the folder is
# already there, replacing the separate exists()-then-create check.
os.makedirs("models", exist_ok=True)

# Save the trained model and vectorizer so the serving app can load both;
# the vectorizer must be the same fitted instance the model was trained with.
joblib.dump(best_model, "models/sentiment_model.pkl")
joblib.dump(tfidf, "models/tfidf.pkl")

print("Best model (XGBoost) and TF-IDF saved successfully!")


✅ Best model (XGBoost) and TF-IDF saved successfully!


Download These Files

In [None]:
from google.colab import files

# Trigger a browser download for each saved artifact (Colab-only helper)
for artifact_path in ("models/sentiment_model.pkl", "models/tfidf.pkl"):
    files.download(artifact_path)


<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

Save the models