In [6]:
# ----- Imports & settings -----
import os
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from tqdm import tqdm

# NLP / sklearn
import nltk
from vaderSentiment.vaderSentiment import SentimentIntensityAnalyzer
from textblob import TextBlob
from sklearn.model_selection import train_test_split, cross_val_score
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix, ConfusionMatrixDisplay

sns.set(style='whitegrid')
plt.rcParams['figure.figsize'] = (9,6)
tqdm.pandas()


In [7]:
try:
    nltk.data.find('tokenizers/punkt')
    nltk.data.find('corpora/stopwords')
    nltk.data.find('sentiment/vader_lexicon')
except Exception:
    nltk.download('punkt')
    nltk.download('stopwords')
    nltk.download('vader_lexicon')

try:
    _ = TextBlob("test").sentiment
except Exception as e:
    from textblob import download_corpora; download_corpora.download_all()
    

[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\Pratham\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\Pratham\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package vader_lexicon to
[nltk_data]     C:\Users\Pratham\AppData\Roaming\nltk_data...
[nltk_data]   Package vader_lexicon is already up-to-date!


In [8]:
# ----- Load data -----
DATA_DIR = '../Local Business Review Project NT/data/processed'
MASTER_PATHS = [
    os.path.join(DATA_DIR, 'master_sheet_cleaned.csv'),
]

def load_first(paths):
    for p in paths:
        if os.path.exists(p):
            print("Loading", p)
            return pd.read_csv(p)
    raise FileNotFoundError("Master file not found. Put master_sheet_combined.csv in ../data/processed or /mnt/data")

df = load_first(MASTER_PATHS)
print("Loaded shape:", df.shape)
df.head()


Loading ../Local Business Review Project NT/data/processed\master_sheet_cleaned.csv
Loaded shape: (6627, 8)


Unnamed: 0,business_name,address,city,category,rating,review_text,review_length,review_words
0,Bihari Express,"bihari express, 40/3, 2nd Main Rd, Somasundara...","BTM,Bangalore",restaurant,5.0,"Perfect meal, delicious and served quickly. Lo...",53,8
1,Cinnamon - Continental Delicacies By Ambrosia,"Cinnamon - Continental Delicacies by Ambrosia,...","Whitefield,Bangalore",restaurant,5.0,Top-notch food and service. Best place in town.,47,8
2,Slurpy Shakes,"Slurpy Shakes, Hotel Regenta Central Antarim, ...","Navrangpura,Ahmedabad",restaurant,4.2,Nice place with tasty food. Slight issues with...,54,9
3,Shree Swaminarayan Food Center,"Shree Swaminarayan Food Center, a-1\011 sun vi...","Ghatlodia,Ahmedabad",restaurant,5.0,Amazing food and great service! Highly recomme...,51,7
4,Brindhavana Nithya Bhojanalaya,"BRINDHAVANA NITHYA BHOJANALAYA, 181, 1st cross...","Kumaraswamy Layout & Uttarahalli,Bangalore",restaurant,4.0,Good experience overall. Worth a repeat visit.,46,7


In [9]:
# ----- Create sentiment label from numeric rating -----
# mapping rule: 4-5 -> positive, 3 -> neutral, 1-2 -> negative
def rating_to_sent(r):
    try:
        r = float(r)
    except:
        return np.nan
    if r >= 4.0:
        return 'positive'
    if r == 3.0:
        return 'neutral'
    return 'negative'

df['rating_sentiment'] = df['rating'].apply(rating_to_sent)

print("Counts of rating-based sentiment:")
display(df['rating_sentiment'].value_counts(dropna=False))


Counts of rating-based sentiment:


rating_sentiment
positive    4506
negative    1276
neutral      845
Name: count, dtype: int64

In [10]:
# ----- Rule-based baselines ----- 
analyzer = SentimentIntensityAnalyzer()

def vader_label(text):
    s = analyzer.polarity_scores(str(text))['compound']
    if s >= 0.05:
        return 'positive'
    if s <= -0.05:
        return 'negative'
    return 'neutral'

def textblob_label(text):
    pol = TextBlob(str(text)).sentiment.polarity
    if pol > 0.05:
        return 'positive'
    if pol < -0.05:
        return 'negative'
    return 'neutral'

compute_df = df.copy()
compute_df['vader_label'] = compute_df['review_text'].progress_apply(vader_label)
compute_df['textblob_label'] = compute_df['review_text'].progress_apply(textblob_label)

cmp = compute_df[compute_df['rating_sentiment'].notna()]
print("Baseline comparison (accuracy vs rating_sentiment):")
v_acc = accuracy_score(cmp['rating_sentiment'], cmp['vader_label'])
t_acc = accuracy_score(cmp['rating_sentiment'], cmp['textblob_label'])
print(f"VADER accuracy: {v_acc:.3f}")
print(f"TextBlob accuracy: {t_acc:.3f}")

display(pd.crosstab(cmp['rating_sentiment'], cmp['vader_label'], normalize='index'))
display(pd.crosstab(cmp['rating_sentiment'], cmp['textblob_label'], normalize='index'))


100%|████████████████████████████████████████████████████████████████████████████| 6627/6627 [00:06<00:00, 1035.90it/s]
100%|████████████████████████████████████████████████████████████████████████████| 6627/6627 [00:05<00:00, 1118.64it/s]


Baseline comparison (accuracy vs rating_sentiment):
VADER accuracy: 0.778
TextBlob accuracy: 0.742


vader_label,negative,neutral,positive
rating_sentiment,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
negative,0.518025,0.075235,0.40674
neutral,0.191716,0.07929,0.728994
positive,0.010874,0.00577,0.983356


textblob_label,negative,neutral,positive
rating_sentiment,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
negative,0.469436,0.174765,0.355799
neutral,0.100592,0.043787,0.855621
positive,0.008877,0.040834,0.950289


In [11]:
# ----- Prepare dataset for supervised learning -----
ml_df = df[df['rating_sentiment'].notna()].copy()
print("Rows available for supervised training:", ml_df.shape[0])

MAX_ROWS = None
if MAX_ROWS is not None and len(ml_df) > MAX_ROWS:
    ml_df = ml_df.sample(MAX_ROWS, random_state=42).reset_index(drop=True)

X = ml_df['review_text'].astype(str).values
y = ml_df['rating_sentiment'].values

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, stratify=y, random_state=42)
print("Train size:", len(X_train), "Test size:", len(X_test))


Rows available for supervised training: 6627
Train size: 5301 Test size: 1326


In [12]:
# ----- TF-IDF vectorizer -----
tfidf = TfidfVectorizer(max_features=20000, ngram_range=(1,2), stop_words='english')
X_train_tfidf = tfidf.fit_transform(X_train)
X_test_tfidf = tfidf.transform(X_test)

print("TF-IDF shapes:", X_train_tfidf.shape, X_test_tfidf.shape)


TF-IDF shapes: (5301, 20000) (1326, 20000)


In [13]:
# ----- Train Logistic Regression -----
clf = LogisticRegression(max_iter=1000, class_weight='balanced', solver='saga', random_state=42)
clf.fit(X_train_tfidf, y_train)

cv_scores = cross_val_score(clf, X_train_tfidf, y_train, cv=3, scoring='accuracy')
print("CV accuracy (3-fold):", cv_scores.mean())


CV accuracy (3-fold): 0.8345595170722505


In [14]:
# ----- Evaluation -----
y_pred = clf.predict(X_test_tfidf)

acc = accuracy_score(y_test, y_pred)
print("Test accuracy:", acc)

print("\nClassification report:")
print(classification_report(y_test, y_pred, digits=3))


Test accuracy: 0.832579185520362

Classification report:
              precision    recall  f1-score   support

    negative      0.615     0.757     0.678       255
     neutral      0.820     0.781     0.800       169
    positive      0.915     0.864     0.889       902

    accuracy                          0.833      1326
   macro avg      0.783     0.801     0.789      1326
weighted avg      0.845     0.833     0.837      1326



In [15]:
# ----- Save predictions -----
OUT_DIR = '../Local Business Review Project NT/data/processed'
os.makedirs(OUT_DIR, exist_ok=True)

test_df = pd.DataFrame({
    'review_text': X_test,
    'rating_sentiment_true': y_test,
    'predicted_sentiment': y_pred
})

test_preds_path = os.path.join(OUT_DIR, 'sentiment_test_predictions.csv')
test_df.to_csv(test_preds_path, index=False)
print("Saved test predictions to", test_preds_path)


Saved test predictions to ../Local Business Review Project NT/data/processed\sentiment_test_predictions.csv


In [17]:
import joblib
import os

# Path where files will be stored
OUT_DIR = "../Local Business Review Project NT/data/processed"
os.makedirs(OUT_DIR, exist_ok=True)

joblib.dump(clf, os.path.join(OUT_DIR, "sentiment_model.joblib"))
print("Saved: sentiment_model.joblib")

vec_name = None
for name in ["vectorizer", "tfidf_vectorizer", "tfidf", "sentiment_vectorizer"]:
    if name in globals():
        vec_name = name
        break

if vec_name is None:
    raise NameError("Vectorizer variable not found. Please check your vectorizer variable name.")

joblib.dump(globals()[vec_name], os.path.join(OUT_DIR, "sentiment_tfidf_vectorizer.joblib"))
print(f"Saved: sentiment_tfidf_vectorizer.joblib (using variable '{vec_name}')")

print("\n✔ Sentiment model & vectorizer saved successfully!")


Saved: sentiment_model.joblib
Saved: sentiment_tfidf_vectorizer.joblib (using variable 'tfidf')

✔ Sentiment model & vectorizer saved successfully!
