# **Assignment #3: Feature Vectorization and Sentiment Analysis**

In [None]:
# Pin versions that work with gensim on Colab.
# The install (and the kernel restart that follows it) only happens when the
# environment does not already match the pins, so re-running this cell after
# the restart is a no-op instead of killing the kernel again.
import sys, subprocess, os
from importlib import metadata

PINNED_VERSIONS = {"numpy": "1.26.4", "scipy": "1.11.4", "gensim": "4.3.2"}


def _installed_version(dist_name):
    """Return the installed version string of `dist_name`, or None if it is not installed."""
    try:
        return metadata.version(dist_name)
    except metadata.PackageNotFoundError:
        return None


if any(_installed_version(name) != ver for name, ver in PINNED_VERSIONS.items()):
    subprocess.check_call([
        sys.executable, "-m", "pip", "install", "-q", "--no-cache-dir",
        *(f"{name}=={ver}" for name, ver in PINNED_VERSIONS.items()),
    ])
    # Hard restart so SciPy/NumPy wheels actually load
    os.kill(os.getpid(), 9)


In [None]:
import numpy as np, scipy
from scipy import linalg

# Report the NumPy/SciPy builds loaded in this kernel and whether scipy.linalg exposes `triu`.
triu_available = hasattr(linalg, "triu")
print(f"NumPy: {np.__version__} | SciPy: {scipy.__version__} | has triu: {triu_available}")

import gensim, gensim.downloader as api
print(f"Gensim: {gensim.__version__}")


NumPy: 1.26.4 | SciPy: 1.11.4 | has triu: True
Gensim: 4.3.2


In [None]:
# Run this FIRST after reconnect
import numpy, scipy
from scipy import linalg
print("NumPy:", numpy.__version__, "| SciPy:", scipy.__version__, "| has triu:", hasattr(linalg,"triu"))

# Versions this notebook is pinned to (same pins as the install cell at the top).
_PINS = {"numpy": "1.26.4", "scipy": "1.11.4", "gensim": "4.3.2"}

try:
    import gensim
except ImportError as err:
    # ImportError covers both "gensim is not installed" and "gensim is installed but
    # cannot import against the NumPy/SciPy builds in this kernel". Reinstalling gensim
    # alone cannot fix the second case, so install all three pins together.
    import sys, subprocess
    subprocess.check_call([
        sys.executable, "-m", "pip", "install", "-q", "--no-cache-dir",
        *(f"{name}=={ver}" for name, ver in _PINS.items()),
    ])
    if (numpy.__version__, scipy.__version__) != (_PINS["numpy"], _PINS["scipy"]):
        # The old NumPy/SciPy modules are already loaded in this process; a fresh
        # interpreter is needed before the newly installed wheels can be used.
        raise RuntimeError(
            "Pinned numpy/scipy/gensim were installed, but this kernel still has "
            f"NumPy {numpy.__version__} / SciPy {scipy.__version__} loaded. "
            "Restart the runtime and run this cell again."
        ) from err
    import gensim

print("Gensim:", gensim.__version__)


NumPy: 2.3.3 | SciPy: 1.16.2 | has triu: False


ImportError: cannot import name 'triu' from 'scipy.linalg' (/usr/local/lib/python3.11/dist-packages/scipy/linalg/__init__.py)

## **Section 1**

### **Setup (imports / warnings)**

In [None]:
# Section 1: Setup
import warnings, os, re
warnings.filterwarnings("ignore")

import pandas as pd
import numpy as np


### **Drive Mount and Data Load**

In [None]:
# Mount Google Drive when running on Colab; otherwise fall back to a local copy of the CSV
try:
    from google.colab import drive
    drive.mount('/content/drive', force_remount=True)
    DATA_PATH = "/content/drive/MyDrive/Mydata/Restaurant_reviews_R.csv"
except Exception:
    # Import or mount failed: look for the file in the working directory instead
    DATA_PATH = "Restaurant_reviews_R.csv"

col_names = ['Cust_Rating','Datetime','Review','Restaurant','City','State','Zipcode','Business_Rating_Score']
zip_pos = col_names.index("Zipcode")
try:
    # Peek at the header only, to learn what the file calls the column in the Zipcode
    # position. That column is then read as text so postal codes keep leading zeros
    # and alphanumeric codes do not end up mixed with integers in one column.
    header_cols = pd.read_csv(DATA_PATH, header=0, nrows=0).columns
    zip_dtype = {header_cols[zip_pos]: str} if len(header_cols) >= 8 else None
    df_raw = pd.read_csv(DATA_PATH, header=0, dtype=zip_dtype)

    # Keep the first eight columns and give them the canonical names
    if len(df_raw.columns) >= 8:
        df_raw = df_raw.iloc[:, :8]
        df_raw.columns = col_names
except Exception:
    # Fallback: treat the file as header-less and apply the canonical names directly
    df_raw = pd.read_csv(DATA_PATH, header=None, names=col_names, dtype={"Zipcode": str})

# sanity check
df_raw.head(3)


Mounted at /content/drive


Unnamed: 0,Cust_Rating,Datetime,Review,Restaurant,City,State,Zipcode,Business_Rating_Score
0,3.0,2013-12-06 23:22:26,"This place is an interesting combo. The chef, ...",La Mongerie Bakery & Bistro,Atlanta,GA,30308,3.0
1,5.0,2008-11-16 09:44:04,Pizza Hut is great! You get huge pizzas for yo...,Pizza Hut,Orlando,FL,32819,3.5
2,5.0,2016-04-11 17:00:34,Always a great place to taste some tea with fr...,Teavana,Atlanta,GA,30326,3.0


### **spaCy install/load**

In [None]:
import sys, subprocess
def _pip_install(pkg):
    try:
        __import__(pkg.split('==')[0].replace('-', '_'))
    except:
        subprocess.check_call([sys.executable, "-m", "pip", "install", pkg])

_pip_install("spacy==3.7.2")

import spacy
try:
    nlp = spacy.load("en_core_web_sm")
except OSError:
    # spacy.load raises OSError when the model package cannot be found;
    # download it once and retry. Any other error is left to surface.
    subprocess.check_call([sys.executable, "-m", "spacy", "download", "en_core_web_sm"])
    nlp = spacy.load("en_core_web_sm")


### **Lemmatize, drop empties**

In [None]:
def to_lemmas(text):
    """Return the space-joined lemmas of the alphabetic, non-stop-word tokens in `text`.

    The text is lower-cased before being passed to the module-level `nlp` pipeline.
    Non-string or blank input yields an empty string.
    """
    is_blank = not isinstance(text, str) or not text.strip()
    if is_blank:
        return ""
    kept = []
    for tok in nlp(text.lower()):
        if tok.is_alpha and not tok.is_stop:
            kept.append(tok.lemma_)
    return " ".join(kept).strip()

df = df_raw.copy()
# Pass the raw values straight to to_lemmas: it already maps non-string input
# (e.g. NaN for a missing review) to "". Casting with astype(str) first would turn
# NaN into the literal text "nan", which is alphabetic and would therefore survive
# both the cleaning step and the empty-row filter below.
df["Review_cleaned"] = df["Review"].apply(to_lemmas)
# Drop reviews that have no tokens left after cleaning
df = df[df["Review_cleaned"].str.len() > 0].reset_index(drop=True)

len(df)  # final cleaned row count


63206

In [None]:
# Load the cleaned file you saved earlier
CLEAN_PATH = "/content/drive/MyDrive/Mydata/cleaned_restaurant_reviews_Lab3.csv"
import pandas as pd
# Read Zipcode as text so postal codes keep leading zeros and alphanumeric codes
# are not mixed with integers in the same column.
df = pd.read_csv(CLEAN_PATH, dtype={"Zipcode": str})

# final cleaned row count (record this in your submission)
len(df)


63206

### **Preview (first 5 rows)**

In [None]:
# Show the metadata columns alongside the cleaned text for the first five rows
preview_cols = [
    "Cust_Rating", "Datetime", "Review", "Restaurant", "City", "State",
    "Zipcode", "Business_Rating_Score", "Review_cleaned",
]
df.loc[:, preview_cols].head()


Unnamed: 0,Cust_Rating,Datetime,Review,Restaurant,City,State,Zipcode,Business_Rating_Score,Review_cleaned
0,3.0,2013-12-06 23:22:26,"This place is an interesting combo. The chef, ...",La Mongerie Bakery & Bistro,Atlanta,GA,30308,3.0,place interesting combo chef david incredible ...
1,5.0,2008-11-16 09:44:04,Pizza Hut is great! You get huge pizzas for yo...,Pizza Hut,Orlando,FL,32819,3.5,pizza hut great huge pizza money service quick...
2,5.0,2016-04-11 17:00:34,Always a great place to taste some tea with fr...,Teavana,Atlanta,GA,30326,3.0,great place taste tea friend staff friendly pushy
3,4.0,2015-05-26 06:47:56,Delish! Stopped in on a random Sunday and so g...,Native Foods Cafe,Happy Valley,OR,97086,4.5,delish stop random sunday glad normally eat he...
4,2.0,2014-02-23 23:30:56,The 2 stars are for great variety. I would hav...,Waltham India Market,Waltham,MA,2453,3.0,star great variety give star remove failure in...


In [None]:
# Write the cleaned reviews to Drive so they can be reloaded later
cleaned_csv_path = "/content/drive/MyDrive/Mydata/cleaned_restaurant_reviews_Lab3.csv"
df.to_csv(cleaned_csv_path, index=False)


## **Section 2: Pre-trained FastText embeddings (300d)**

### **Install deps + load FastText**

In [None]:
import gensim, gensim.downloader as api

# Pre-trained vectors fetched through gensim's downloader (returned as KeyedVectors)
FASTTEXT_MODEL_NAME = "fasttext-wiki-news-subwords-300"
ft = api.load(FASTTEXT_MODEL_NAME)

# Embedding dimensionality, reused when naming the vector columns
ft_dim = ft.vector_size  # 300
ft_dim


ModuleNotFoundError: No module named 'gensim'

### **Frequent token list (>1% docs) ∩ FastText vocab**

In [None]:
from collections import Counter
import numpy as np

docs = df["Review_cleaned"].tolist()
N = len(docs)

# Document frequency: each review contributes at most 1 per distinct token
dfreq = Counter()
for d in docs:
    dfreq.update(set(d.split()))  # document presence

threshold = int(np.floor(0.01 * N))  # > 1% of docs
# Keep tokens that appear in more than 1% of reviews AND exist in the FastText vocabulary.
# Sorted so the list order is reproducible: the Counter's insertion order comes from
# iterating sets of strings, which can differ between interpreter sessions.
freq_terms = sorted(
    w for w, c in dfreq.items()
    if c > threshold and w in ft.key_to_index
)
len(freq_terms)


698

### **Per-review averaged FastText vectors**

In [None]:
import numpy as np

freq_set = set(freq_terms)

def doc_avg_vec(text, kv, vocab=None):
    """Average the word vectors of the tokens in `text` that are in the allowed vocabulary.

    Parameters
    ----------
    text : str
        Whitespace-separated tokens. Non-string input (e.g. NaN read back from a CSV)
        is treated as having no tokens.
    kv : object
        Must expose `key_to_index`, `get_vector(token)` and `vector_size`.
    vocab : container of str, optional
        Tokens allowed to contribute. Defaults to the module-level `freq_set`,
        which is what the original two-argument call used.

    Returns
    -------
    numpy.ndarray
        1-D array of length `kv.vector_size`; float32 zeros when no token qualifies.
    """
    if not isinstance(text, str):
        return np.zeros(kv.vector_size, dtype=np.float32)
    allowed = freq_set if vocab is None else vocab
    toks = [t for t in text.split() if (t in allowed and t in kv.key_to_index)]
    if not toks:
        return np.zeros(kv.vector_size, dtype=np.float32)
    mat = np.vstack([kv.get_vector(t) for t in toks])
    return mat.mean(axis=0)

# One averaged FastText vector per cleaned review, stacked row-wise
review_vectors = [doc_avg_vec(review, ft) for review in df["Review_cleaned"]]
X = np.vstack(review_vectors)
X.shape  # (rows, 300)


(63206, 300)

### **Assemble dataframe (metadata + vectors) & preview**

In [None]:
meta_cols = ["Cust_Rating","Datetime","Review","Restaurant","City","State","Zipcode","Business_Rating_Score"]
vec_cols = ["ft_%d" % i for i in range(ft_dim)]

# Metadata on the left, embedding columns on the right; both parts use a fresh
# 0..n-1 index so the column-wise concat lines rows up by position.
meta_part = df[meta_cols].reset_index(drop=True)
vector_part = pd.DataFrame(X, columns=vec_cols)
df_ft = pd.concat([meta_part, vector_part], axis=1)

df_ft.head(5)
#df_ft.to_csv("/content/drive/MyDrive/Mydata/df_fasttext_review_vectors.csv", index=False)


Unnamed: 0,Cust_Rating,Datetime,Review,Restaurant,City,State,Zipcode,Business_Rating_Score,ft_0,ft_1,...,ft_290,ft_291,ft_292,ft_293,ft_294,ft_295,ft_296,ft_297,ft_298,ft_299
0,3.0,2013-12-06 23:22:26,"This place is an interesting combo. The chef, ...",La Mongerie Bakery & Bistro,Atlanta,GA,30308,3.0,-0.028984,-0.037984,...,0.004486,0.032442,-0.024005,-0.006096,0.002617,-0.008675,0.00683,0.022984,-0.027191,-0.006683
1,5.0,2008-11-16 09:44:04,Pizza Hut is great! You get huge pizzas for yo...,Pizza Hut,Orlando,FL,32819,3.5,-0.02347,-0.014872,...,0.015916,0.024685,-0.00345,-0.015848,0.02122,-0.019094,0.032824,0.034016,-0.009846,-0.013066
2,5.0,2016-04-11 17:00:34,Always a great place to taste some tea with fr...,Teavana,Atlanta,GA,30326,3.0,-0.016991,-0.042579,...,-0.021573,0.058634,-0.019549,0.00898,0.000755,0.003944,0.033365,0.042965,-0.018749,0.00613
3,4.0,2015-05-26 06:47:56,Delish! Stopped in on a random Sunday and so g...,Native Foods Cafe,Happy Valley,OR,97086,4.5,-0.012254,0.006191,...,0.008091,0.01141,-0.026101,-0.010715,0.013988,-0.021434,0.023488,0.008143,-0.019713,-0.001002
4,2.0,2014-02-23 23:30:56,The 2 stars are for great variety. I would hav...,Waltham India Market,Waltham,MA,2453,3.0,0.000793,-0.003037,...,-0.004901,0.031194,-0.016404,0.002264,-0.001636,-0.024458,0.01453,0.005403,-0.013201,-0.006987


## **Section 3: Bigrams + Custom Word2Vec**

### **Tokenize reviews & build bigrams**

In [None]:
from gensim.models.phrases import Phrases, Phraser

# Split each cleaned review on whitespace -> list[list[str]]
cleaned_reviews = df["Review_cleaned"].tolist()
tokens = [review.split() for review in cleaned_reviews]

# Fit the phrase model on the unigram token lists (the delimiter is a str, not bytes)
bigram = Phrases(tokens, min_count=20, threshold=10.0, delimiter="_")
bigram_phraser = Phraser(bigram)

# Re-tokenize every review with the fitted phraser
tokens_bi = [bigram_phraser[review_tokens] for review_tokens in tokens]

# Sanity check: first 20 tokens of the first review, and the total token count
tokens_bi[0][:20], sum(map(len, tokens_bi))



NameError: name 'df' is not defined

### **Top 20 unigrams and top 20 bigrams**

In [None]:
from collections import Counter

# Corpus-wide counts: every token of the unigram lists, and only the
# "_"-joined tokens of the phrased lists.
uni_counter = Counter()
for doc in tokens:
    uni_counter.update(doc)

bi_counter = Counter()
for doc in tokens_bi:
    bi_counter.update(w for w in doc if "_" in w)

top20_uni = uni_counter.most_common(20)
top20_bi = bi_counter.most_common(20)

top20_uni, top20_bi


([('food', 46477),
  ('good', 44839),
  ('place', 37341),
  ('order', 35217),
  ('time', 24904),
  ('like', 24834),
  ('great', 24331),
  ('come', 22600),
  ('service', 21267),
  ('try', 17593),
  ('get', 17105),
  ('go', 16434),
  ('chicken', 15061),
  ('restaurant', 14686),
  ('eat', 13570),
  ('pizza', 12481),
  ('love', 12292),
  ('nice', 11317),
  ('wait', 10768),
  ('want', 10567)],
 [('customer_service', 2554),
  ('feel_like', 1844),
  ('highly_recommend', 1761),
  ('ice_cream', 1522),
  ('staff_friendly', 1409),
  ('wait_minute', 1162),
  ('happy_hour', 1158),
  ('look_forward', 826),
  ('friendly_staff', 818),
  ('super_friendly', 772),
  ('take_minute', 721),
  ('gluten_free', 693),
  ('parking_lot', 691),
  ('dim_sum', 671),
  ('decide_try', 646),
  ('little_bit', 614),
  ('mac_cheese', 602),
  ('price_reasonable', 592),
  ('minute_later', 586),
  ('year_ago', 575)])

### **Train Word2Vec (CBOW, min_count=30, window=7, size=300, workers=2, epochs=20)**

In [None]:
from gensim.models import Word2Vec

# Hyper-parameters as listed in the section heading; sg=0 selects CBOW
w2v_params = dict(
    vector_size=300,
    window=7,
    min_count=30,
    workers=2,
    sg=0,
    epochs=20,
)
w2v = Word2Vec(sentences=tokens_bi, **w2v_params)

# Handle on the trained keyed vectors, used by the later cells
w2v_kv = w2v.wv
w2v_kv.vector_size


300

### **Most-similar examples (unigram + bigram)**

In [None]:
def try_most_similar(model, word, topn=10):
    """Return `model.most_similar(word, topn=topn)`, or a message string if `word` is out of vocabulary.

    `model` must expose `key_to_index` and `most_similar`.
    """
    if word not in model.key_to_index:
        return f"{word} not in vocabulary"
    return model.most_similar(word, topn=topn)

# Two unigram seeds plus one candidate bigram; the bigram may be absent from the
# vocabulary, in which case try_most_similar returns a message string instead.
seed_terms = ["food", "service", "great_service"]
examples = {term: try_most_similar(w2v_kv, term, 10) for term in seed_terms}
examples


{'food': [('service', 0.415457546710968),
  ('restaurant', 0.34035512804985046),
  ('meal', 0.33094701170921326),
  ('speedy', 0.3276204764842987),
  ('extremely_slow', 0.3148893713951111),
  ('sushi', 0.31467074155807495),
  ('server_attentive', 0.306128591299057),
  ('chip_salsa', 0.30594488978385925),
  ('time', 0.3056448698043823),
  ('place', 0.30493712425231934)],
 'service': [('customer_service', 0.5901476740837097),
  ('costumer_service', 0.5624114274978638),
  ('waitstaff', 0.5535829067230225),
  ('staff', 0.4810692071914673),
  ('service_prompt', 0.4769642651081085),
  ('server_attentive', 0.47683045268058777),
  ('food', 0.4154576063156128),
  ('experience', 0.40347057580947876),
  ('refill_drink', 0.39816126227378845),
  ('server', 0.39810675382614136)],
 'great_service': 'great_service not in vocabulary'}

### **FastText nearest neighbors (same seeds)**

In [None]:
# Load pretrained FastText (300d) only when it is not already in memory, so that
# re-running this cell does not load the large model a second time.
try:
    ft
except NameError:
    ft = api.load("fasttext-wiki-news-subwords-300")  # ~958MB if not already cached

def ft_neighbors(word, topn=10):
    """Return the `topn` nearest FastText neighbours of `word`, or a message string if it is out of vocabulary."""
    if word not in ft.key_to_index:
        return f"{word} not in FastText vocab"
    return ft.most_similar(word, topn=topn)

# Use the SAME seed words you used for custom Word2Vec
ft_examples = {
    "food (FT)":    ft_neighbors("food", 10),
    "service (FT)": ft_neighbors("service", 10),
    # If your chosen bigram isn't in FT, swap to a frequent phrase that exists, e.g. "customer_service"
    "customer_service (FT)": ft_neighbors("customer_service", 10),
}
ft_examples




{'food (FT)': [('foods', 0.7766500115394592),
  ('food-', 0.7599727511405945),
  ('foodstuff', 0.735816240310669),
  ('foodless', 0.7320124506950378),
  ('food.', 0.7302034497261047),
  ('foodstuffs', 0.7282736897468567),
  ('healthfood', 0.7277587652206421),
  ('dogfood', 0.7183505892753601),
  ('catfood', 0.716797411441803),
  ('non-food', 0.7167892456054688)],
 'service (FT)': [('services', 0.7911463975906372),
  ('non-service', 0.7496036291122437),
  ('service-', 0.7314968109130859),
  ('servic', 0.7274243235588074),
  ('service-type', 0.7205984592437744),
  ('post-service', 0.7153797745704651),
  ('sevice', 0.7121303081512451),
  ('service--and', 0.7114466428756714),
  ('cross-service', 0.7097828984260559),
  ('service-wide', 0.6980454325675964)],
 'customer_service (FT)': 'customer_service not in FastText vocab'}

### **Compare similarities: FastText vs. custom Word2Vec**

In [None]:
import numpy as np
import pandas as pd

def cos_sim(a, b):
    """Cosine similarity of vectors `a` and `b` as a Python float.

    Returns NaN when the product of the two norms is zero.
    """
    norm_product = np.linalg.norm(a) * np.linalg.norm(b)
    if norm_product == 0:
        return np.nan
    similarity = np.dot(a, b) / norm_product
    return float(similarity)

# Word pairs drawn from frequent terms, so both models are likely to contain them
pairs = [
    ("food", "service"),
    ("food", "price"),
    ("good", "bad"),
    ("wait", "time"),
]

rows = []
for a, b in pairs:
    # A pair is compared only when both words exist in both vocabularies
    in_w2v = (a in w2v_kv.key_to_index) and (b in w2v_kv.key_to_index)
    in_ft = (a in ft.key_to_index) and (b in ft.key_to_index)
    if not (in_w2v and in_ft):
        continue
    w2v_sim = cos_sim(w2v_kv.get_vector(a), w2v_kv.get_vector(b))
    ft_sim = cos_sim(ft.get_vector(a), ft.get_vector(b))
    rows.append({
        "term_a": a,
        "term_b": b,
        "sim_w2v": w2v_sim,
        "sim_fasttext": ft_sim,
    })

sim_df = pd.DataFrame(rows)
sim_df


Unnamed: 0,term_a,term_b,sim_w2v,sim_fasttext
0,food,service,0.415458,0.48361
1,food,price,0.236316,0.467126
2,good,bad,0.31915,0.850309
3,wait,time,0.460444,0.552057


### **Comparison: (pre-trained FastText vs custom Word2Vec)**

FastText retrieves broader, morphology-heavy neighbors (e.g., plural/variant forms and general terms), while the custom Word2Vec surfaces domain-specific co-occurrences tied to restaurant reviews (e.g., staff/service phrases and common collocations). In our cosine table, FastText tends to assign higher similarity to broadly related pairs (e.g., good–bad), whereas the custom model keeps stronger separation that better reflects sentiment polarity in this corpus. Net: FastText offers robust coverage; the custom model captures dataset-specific usage that’s often more useful for downstream sentiment/restaurant tasks.

## **Section 4: TF-IDF + Sentiment Modeling**

### **Create labels (binary sentiment) & train/test split**

In [None]:
# Binary sentiment labels from the star rating: 4-5 stars -> 1, 1-2 stars -> 0.
# Rows with any other rating (e.g. 3) are left out.
import pandas as pd
from sklearn.model_selection import train_test_split

polar_ratings = [1.0, 2.0, 4.0, 5.0]
df_sent = df.loc[df["Cust_Rating"].isin(polar_ratings)].copy()
df_sent["y"] = df_sent["Cust_Rating"].ge(4.0).astype(int)

X_text = df_sent["Review_cleaned"].astype(str).values
y = df_sent["y"].values

# Stratified 80/20 split with a fixed seed
split = train_test_split(X_text, y, test_size=0.2, random_state=42, stratify=y)
X_train, X_test, y_train, y_test = split

len(df_sent), y.mean()


(55100, np.float64(0.6790018148820327))

### **TF-IDF vectorization (unigrams+bigrams)**

In [None]:
from sklearn.feature_extraction.text import TfidfVectorizer

# Unigram + bigram TF-IDF, capped at 5000 features
tfidf_params = dict(
    ngram_range=(1, 2),
    min_df=5,
    max_df=0.9,
    max_features=5000,
)
tfidf = TfidfVectorizer(**tfidf_params)

# Fit on the training split only, then apply the fitted vocabulary to the test split
Xtr = tfidf.fit_transform(X_train)
Xte = tfidf.transform(X_test)

Xtr.shape, Xte.shape


((44080, 5000), (11020, 5000))

### **Train baseline classifier (Logistic Regression)**

In [None]:
from sklearn.linear_model import LogisticRegression

# Baseline classifier on the TF-IDF features
logreg_params = dict(
    max_iter=200,
    n_jobs=None,
    solver="liblinear",  # stable on sparse TF-IDF
)
clf = LogisticRegression(**logreg_params)
clf.fit(Xtr, y_train)
clf


### **Evaluate (accuracy, ROC-AUC, classification report)**

In [None]:
from sklearn.metrics import accuracy_score, roc_auc_score, classification_report

# Hard labels feed accuracy and the report; the class-1 probability feeds ROC-AUC
pred = clf.predict(Xte)
proba = clf.predict_proba(Xte)[:, 1]

acc = accuracy_score(y_test, pred)
auc = roc_auc_score(y_test, proba)
report = classification_report(y_test, pred, digits=3)

print("Accuracy:", round(acc, 4), "| ROC-AUC:", round(auc, 4))
print()
print(report)


Accuracy: 0.9411 | ROC-AUC: 0.9827

              precision    recall  f1-score   support

           0      0.936     0.876     0.905      3537
           1      0.943     0.972     0.957      7483

    accuracy                          0.941     11020
   macro avg      0.940     0.924     0.931     11020
weighted avg      0.941     0.941     0.941     11020



### **Top weighted features (positive vs negative)**

In [None]:
import numpy as np
import pandas as pd

feat_names = np.array(tfidf.get_feature_names_out())
coefs = clf.coef_.ravel()

# Indices ordered by ascending weight: the head holds the most negative
# coefficients, the reversed tail the most positive ones.
ranked = np.argsort(coefs)
top_neg_idx = ranked[:20]
top_pos_idx = ranked[::-1][:20]


def _weights_frame(idx):
    """Feature names and weights for the given coefficient indices."""
    return pd.DataFrame({"feature": feat_names[idx], "weight": coefs[idx]})


top_pos = _weights_frame(top_pos_idx)
top_neg = _weights_frame(top_neg_idx)

top_pos.head(20), top_neg.head(20)


(       feature     weight
 0    delicious  11.721131
 1        great   9.856034
 2      amazing   8.709173
 3         good   8.592095
 4         love   6.777239
 5      awesome   6.617407
 6    excellent   6.559201
 7      perfect   6.206605
 8     friendly   5.513953
 9     favorite   5.227923
 10  definitely   5.211392
 11   fantastic   5.119513
 12       tasty   5.110974
 13       fresh   4.599006
 14       yummy   4.447897
 15   wonderful   4.308957
 16       enjoy   4.187020
 17        nice   4.168607
 18        glad   4.018619
 19   perfectly   4.008606,
           feature    weight
 0           bland -7.301219
 1             bad -7.245744
 2            rude -7.058052
 3   disappointing -6.407037
 4        horrible -6.245298
 5        terrible -6.243522
 6        mediocre -5.944460
 7    disappointed -5.900560
 8           awful -5.102435
 9              ok -5.005473
 10           poor -4.874378
 11            dry -4.867549
 12          gross -4.746351
 13           will -4.7153

### **Quick predictions on a few samples**

In [None]:
samples = [
    "loved the food and super friendly staff",
    "terrible service, waited forever and food was cold",
    "okay experience, portions small but tasty"
]
# The vectorizer was fitted on lemmatized, stop-word-free text (Review_cleaned), so raw
# sentences must go through the same cleaning first; otherwise inflected forms such
# as "loved" will generally not match the fitted vocabulary.
samples_cleaned = [to_lemmas(s) for s in samples]
pred_s = clf.predict(tfidf.transform(samples_cleaned))
list(zip(samples, pred_s))


[('loved the food and super friendly staff', np.int64(1)),
 ('terrible service, waited forever and food was cold', np.int64(0)),
 ('okay experience, portions small but tasty', np.int64(0))]

### **Save vectorizer + model for reuse**

In [None]:
import joblib, os

# Persist the fitted vectorizer and classifier next to the data on Drive
OUT_DIR = "/content/drive/MyDrive/Mydata"
os.makedirs(OUT_DIR, exist_ok=True)

tfidf_path = f"{OUT_DIR}/tfidf_lab3.joblib"
model_path = f"{OUT_DIR}/logreg_lab3.joblib"
joblib.dump(tfidf, tfidf_path)
joblib.dump(clf, model_path)


['/content/drive/MyDrive/Mydata/logreg_lab3.joblib']

### **Build per-restaurant FastText document embeddings (1075 rows)**

In [None]:
# Reload the per-review FastText vectors only when df_ft is not already in memory,
# so running the whole notebook neither overwrites the in-memory frame nor requires the CSV.
# NOTE(review): the to_csv call that would write this file is commented out in the
# Section 2 assembly cell - confirm the CSV exists before relying on this branch.
if "df_ft" not in globals():
    df_ft = pd.read_csv("/content/drive/MyDrive/Mydata/df_fasttext_review_vectors.csv")


In [None]:
import pandas as pd
import numpy as np

# vector columns come from df_ft created in Section 2
vec_cols = [c for c in df_ft.columns if c.startswith("ft_")]
meta_cols = ["Restaurant","City","State","Zipcode","Business_Rating_Score","Cust_Rating","Review"]

# The two frames are joined purely by row position, so they must describe the same
# reviews in the same order. A length mismatch would otherwise pad with NaN rows silently.
assert len(df) == len(df_ft), (
    f"df has {len(df)} rows but df_ft has {len(df_ft)}; rebuild df_ft from the current df"
)

df_with_vec = pd.concat(
    [df[meta_cols].reset_index(drop=True), df_ft[vec_cols].reset_index(drop=True)],
    axis=1
)

# One row per restaurant: mean of every embedding dimension, mean customer
# rating, and the number of (non-null) reviews.
group_cols = ["Restaurant","City","State","Zipcode","Business_Rating_Score"]
agg_map = {c: "mean" for c in vec_cols}
agg_map.update({"Cust_Rating": "mean", "Review": "count"})

df_rest_doc = (
    df_with_vec
      .groupby(group_cols, as_index=False)
      .agg(agg_map)
      .rename(columns={"Review":"review_count","Cust_Rating":"cust_rating_avg"})
      .sort_values(["State","City","Restaurant"])
      .reset_index(drop=True)
)

len(df_rest_doc), df_rest_doc.shape


(1075, (1075, 307))

### **Preview first 5 restaurants**

In [None]:
# Restaurant identifiers, the two aggregates, and the first five embedding dimensions
summary_cols = ["Restaurant","City","State","Zipcode","Business_Rating_Score",
                "review_count","cust_rating_avg"]
df_rest_doc.loc[:, summary_cols + vec_cols[:5]].head(5)


Unnamed: 0,Restaurant,City,State,Zipcode,Business_Rating_Score,review_count,cust_rating_avg,ft_0,ft_1,ft_2,ft_3,ft_4
0,IHOP,BURNABY,BC,V5H 2E6,3.5,60,3.566667,-0.006099,-0.020242,0.013036,-0.000374,-0.010482
1,Cattle Cafe,Burnaby,BC,V5H 4T2,2.5,73,2.60274,-0.011275,-0.017237,0.007729,-0.002314,-0.006924
2,Chatime,Burnaby,BC,V5H 2E8,3.0,98,2.94898,-0.006267,-0.02575,0.003499,-0.002189,-0.003196
3,Chronic Tacos,Burnaby,BC,V5H 4P1,3.0,32,3.03125,-0.017612,-0.01086,0.001035,0.003337,-0.016227
4,Cozmos Cafe + Bistro,Burnaby,BC,V5B 1S1,4.0,81,4.148148,-0.013299,-0.021173,0.013922,-5.4e-05,-0.014217


## **Section 5**

### **Per-review sentiment (VADER + TextBlob)**

In [None]:
# install once per fresh runtime
import sys, subprocess
subprocess.check_call([sys.executable, "-m", "pip", "install", "-q", "vaderSentiment==3.3.2", "textblob==0.18.0"])

from vaderSentiment.vaderSentiment import SentimentIntensityAnalyzer
from textblob import TextBlob
analyzer = SentimentIntensityAnalyzer()

# compound ∈ [-1,1]; TextBlob polarity ∈ [-1,1]; subjectivity ∈ [0,1]
review_texts = df["Review_cleaned"].astype(str)
df["nltk_compound"] = review_texts.map(lambda t: analyzer.polarity_scores(t)["compound"])

# Run TextBlob once per review and read both scores from the same result,
# instead of analysing every review twice.
tb_scores = [TextBlob(t).sentiment for t in review_texts]
df["tb_polarity"] = [s.polarity for s in tb_scores]
df["tb_subjectivity"] = [s.subjectivity for s in tb_scores]

df[["Review","nltk_compound","tb_polarity","tb_subjectivity"]].head(3)


Unnamed: 0,Review,nltk_compound,tb_polarity,tb_subjectivity
0,"This place is an interesting combo. The chef, ...",0.9506,0.369913,0.499026
1,Pizza Hut is great! You get huge pizzas for yo...,0.9623,0.555208,0.725
2,Always a great place to taste some tea with fr...,0.8555,0.5875,0.625


### **Aggregate by restaurant (→ 1,075 rows) + preview**

In [None]:
group_cols = ["Restaurant","City","State","Zipcode","Business_Rating_Score"]

# Output column -> (source column, aggregation) for the per-restaurant summary
sentiment_aggs = dict(
    review_count=("Review", "count"),
    polarity=("tb_polarity", "mean"),
    subjectivity=("tb_subjectivity", "mean"),
    nltk_compound=("nltk_compound", "mean"),
    cust_rating_avg=("Cust_Rating", "mean"),
)

per_restaurant = df.groupby(group_cols, as_index=False).agg(**sentiment_aggs)
df_rest_sa = (
    per_restaurant
      .sort_values(["State","City","Restaurant"])
      .reset_index(drop=True)
)

len(df_rest_sa), df_rest_sa.shape, df_rest_sa.head(5)


(1075,
 (1075, 10),
              Restaurant     City State  Zipcode  Business_Rating_Score  \
 0                  IHOP  BURNABY    BC  V5H 2E6                    3.5   
 1           Cattle Cafe  Burnaby    BC  V5H 4T2                    2.5   
 2               Chatime  Burnaby    BC  V5H 2E8                    3.0   
 3         Chronic Tacos  Burnaby    BC  V5H 4P1                    3.0   
 4  Cozmos Cafe + Bistro  Burnaby    BC  V5B 1S1                    4.0   
 
    review_count  polarity  subjectivity  nltk_compound  cust_rating_avg  
 0            60  0.182035      0.576365       0.685772         3.566667  
 1            73  0.041021      0.588100       0.360289         2.602740  
 2            98  0.110853      0.563575       0.572917         2.948980  
 3            32  0.130774      0.574958       0.573644         3.031250  
 4            81  0.351373      0.618779       0.872700         4.148148  )

### **Save for later tasks**

In [None]:
# Write the per-restaurant sentiment table to Drive (index column omitted)
sentiment_csv_path = "/content/drive/MyDrive/Mydata/section5_restaurant_sentiment.csv"
df_rest_sa.to_csv(sentiment_csv_path, index=False)


In [None]:

import os
from google.colab import drive

# Re-mount Drive and make sure the output folder exists before writing
drive.mount('/content/drive', force_remount=True)
os.makedirs("/content/drive/MyDrive/Mydata", exist_ok=True)

# Save the Section 4 per-restaurant document embeddings
out_path = "/content/drive/MyDrive/Mydata/section4_restaurant_doc_embeddings.csv"
df_rest_doc.to_csv(out_path, index=False)

# Confirm the write by reporting the file size on disk
print(f"Saved: {out_path} | size: {os.path.getsize(out_path)} bytes")


Mounted at /content/drive
Saved: /content/drive/MyDrive/Mydata/section4_restaurant_doc_embeddings.csv | size: 6778309 bytes
