This notebook contains:

- Basic NLP features (length, polarity, subjectivity)
- TF-IDF + BoW
- Sentence-BERT embeddings
- Aspect-Based Sentiment Analysis
- Topic modeling (LDA, NMF, BERTopic)
- Aggregation to product-level features

In [26]:
import os
import re
import json
import math
import time
import pickle
from collections import Counter, defaultdict


import numpy as np
import pandas as pd
from tqdm import tqdm
from sklearn.feature_extraction.text import TfidfVectorizer, CountVectorizer
import scipy.sparse


# NLP libs
import nltk
from nltk import sent_tokenize
from nltk.sentiment import SentimentIntensityAnalyzer
from textblob import TextBlob
import spacy


# Topic modeling / LDA
import gensim
from gensim import corpora


# Embeddings
from sentence_transformers import SentenceTransformer

In [2]:
from pathlib import Path

# Use the current working directory as the base
project_root = Path.cwd().parent.parent  # adjust if needed to point to ML_DB_PROJECT

# Output directory
out_dir = project_root / "data/ml/reviews/reviews_preprocessed"
out_dir.mkdir(parents=True, exist_ok=True)

DF_PATH = out_dir / "reviews_preprocessed.parquet"
df = pd.read_parquet(DF_PATH)

print("Rows:", len(df))

Rows: 300000


Basic feature: lengths

In [7]:
print("Computing basic length features...")
df['n_words'] = df['text_for_training'].fillna("").str.split().apply(len)
df['n_chars'] = df['text_for_training'].fillna("").str.len()

#print(df['n_words'])
#print(df['n_chars'])

Computing basic length features...


Sentiment: VADER (baseline) + TextBlob subjectivity

In [9]:
nltk.download('vader_lexicon')

[nltk_data] Downloading package vader_lexicon to
[nltk_data]     C:\Users\ND.COM\AppData\Roaming\nltk_data...


True

In [10]:
print("Computing VADER sentiment scores and TextBlob subjectivity...")
sia = SentimentIntensityAnalyzer()

Computing VADER sentiment scores and TextBlob subjectivity...


In [11]:
df['vader_compound'] = df['text_for_training'].apply(lambda x: sia.polarity_scores(str(x))['compound'])
# Normalize to 0..1
df['vader_compound_norm'] = (df['vader_compound'] + 1.0) / 2.0

TextBlob polarity & subjectivity

In [13]:
def tb_polarity_subjectivity(text):
    try:
        tb = TextBlob(str(text))
        return tb.polarity, tb.subjectivity
    except Exception:
        return 0.0, 0.0

In [14]:
pol_sub = df['text_for_training'].fillna("").map(tb_polarity_subjectivity)

In [15]:
df['tb_polarity'] = pol_sub.map(lambda x: x[0])
df['tb_subjectivity'] = pol_sub.map(lambda x: x[1])

In [16]:
df.to_parquet(out_dir / "reviews_fe_step1.parquet", index=False)
print("Saved step1 features")

Saved step1 features


TF-IDF (global) and Bag-of-Words

In [17]:
print("Computing TF-IDF matrix (max_features=10000)...")
TFIDF_MAX_FEAT = 10000
vectorizer = TfidfVectorizer(stop_words='english', max_features=TFIDF_MAX_FEAT)
texts = df['text_for_training'].fillna("").tolist()

Computing TF-IDF matrix (max_features=10000)...


In [18]:
tfidf_matrix = vectorizer.fit_transform(texts)

In [19]:
scipy.sparse.save_npz(os.path.join(out_dir, 'reviews_tfidf_full.npz'), tfidf_matrix)

with open(os.path.join(out_dir, 'reviews_tfidf_vectorizer_full.pkl'), 'wb') as f:
    pickle.dump(vectorizer, f)
print("TF-IDF saved. shape:", tfidf_matrix.shape)

TF-IDF saved. shape: (300000, 279)


saving CountVectorizer for ngrams if needed

In [20]:
cv = CountVectorizer(ngram_range=(1,2), max_features=20000, stop_words='english')
cv_matrix = cv.fit_transform(texts)

In [21]:
scipy.sparse.save_npz(os.path.join(out_dir, 'reviews_cv_ngram.npz'), cv_matrix)
with open(os.path.join(out_dir, 'reviews_cv_ngram.pkl'), 'wb') as f:
    pickle.dump(cv, f)

Sentence-BERT embeddings (all-MiniLM-L6-v2)

In [23]:
EMB_MODEL = 'all-MiniLM-L6-v2'
emb_path = os.path.join(out_dir, 'reviews_bert_embeddings_full.npy')
print("Loading sentence-transformer model:", EMB_MODEL)
model = SentenceTransformer(EMB_MODEL)

Loading sentence-transformer model: all-MiniLM-L6-v2


In [24]:
# Encode in batches
batch_size = 512
n = len(texts)
embeddings = np.memmap(emb_path, dtype='float32', mode='w+', shape=(n, model.get_sentence_embedding_dimension()))

In [25]:
print("Encoding embeddings in batches... total rows:", n)
for start in tqdm(range(0, n, batch_size)):
    end = min(n, start + batch_size)
    batch_texts = texts[start:end]
    batch_emb = model.encode(batch_texts, batch_size=64, show_progress_bar=False, convert_to_numpy=True)
    embeddings[start:end] = batch_emb

Encoding embeddings in batches... total rows: 300000


100%|██████████| 586/586 [41:36<00:00,  4.26s/it]  


In [27]:
# flush to disk
embeddings.flush()
print("Saved embeddings to", emb_path)

Saved embeddings to c:\Users\ND.COM\Desktop\ML DB Project\data\ml\reviews\reviews_preprocessed\reviews_bert_embeddings_full.npy


Aspect-Based Sentiment Analysis(simple rule-based + sentence aggregation)

In [28]:
print("Running simple ABSA extraction (rule-based + VADER sentence sentiment)...")
spacy_nlp = spacy.load('en_core_web_sm')
ASPECTS = ['fit','size','quality','material','color','price','delivery','packaging','service','comfort']

Running simple ABSA extraction (rule-based + VADER sentence sentiment)...


In [29]:
def extract_aspect_sentiments(text):
    """
    For each pre-defined aspect, search sentences that contain the aspect token (exact or lemma)
    and compute an average VADER sentiment for those sentences. Returns dict aspect->score
    """
    out = {}
    try:
        sents = sent_tokenize(str(text))
    except Exception:
        sents = [str(text)]
    for asp in ASPECTS:
        asp_sents = [s for s in sents if re.search(r'' + re.escape(asp) + r's?', s, flags=re.IGNORECASE)]
        if not asp_sents:
            out[asp] = None
    else:
        scores = [sia.polarity_scores(s)['compound'] for s in asp_sents]
        out[asp] = np.mean(scores)
    return out

In [30]:
# Compute for all reviews (may be slow) — use tqdm
aspect_dicts = []
for t in tqdm(texts, desc='ABSA'):
    aspect_dicts.append(extract_aspect_sentiments(t))

  return _methods._mean(a, axis=axis, dtype=dtype,
  ret = ret.dtype.type(ret / rcount)
ABSA: 100%|██████████| 300000/300000 [00:19<00:00, 15685.24it/s]


In [53]:
print(aspect_dicts[:2] )

[{'fit': None, 'size': None, 'quality': None, 'material': None, 'color': None, 'price': None, 'delivery': None, 'packaging': None, 'service': None, 'comfort': nan}, {'fit': None, 'size': None, 'quality': None, 'material': None, 'color': None, 'price': None, 'delivery': None, 'packaging': None, 'service': None, 'comfort': nan}]


Topic Modeling: LDA (Gensim) on tokenized reviews

In [31]:
print("Preparing tokens for LDA (gensim)...")
import gensim.utils as gu

# simple tokenization & stopword removal
stoplist = set(nltk.corpus.stopwords.words('english'))

Preparing tokens for LDA (gensim)...


In [32]:
def tokenize_for_lda(text):
    tokens = [w for w in re.findall(r"\b[a-z]{2,}\b", str(text).lower()) 
              if w not in stoplist]
    return tokens

In [33]:
texts_tokens = [tokenize_for_lda(t) for t in tqdm(texts, desc='tokenizing')]

tokenizing: 100%|██████████| 300000/300000 [00:03<00:00, 80207.20it/s] 


In [34]:
# Create dictionary & corpus
dictionary = corpora.Dictionary(texts_tokens)
dictionary.filter_extremes(no_below=5, no_above=0.5, keep_n=50000)
corpus = [dictionary.doc2bow(text) for text in texts_tokens]

In [35]:
# Train LDA model
NUM_TOPICS = 30
print("Training LDA model with", NUM_TOPICS, "topics... This may take a while.")
lda_model = gensim.models.LdaModel(corpus=corpus, id2word=dictionary, num_topics=NUM_TOPICS, passes=5, random_state=42)
lda_model.save(os.path.join(out_dir, 'lda_model_full'))

Training LDA model with 30 topics... This may take a while.


In [36]:
# Compute topic distribution per document
print("Computing topic distributions for each document...")
lda_topic_dist = np.zeros((n, NUM_TOPICS), dtype=float)
for i, bow in enumerate(tqdm(corpus, desc='lda infer')):
    topics = lda_model.get_document_topics(bow, minimum_probability=0)
    for tid, prob in topics:
        lda_topic_dist[i, tid] = prob

Computing topic distributions for each document...


lda infer: 100%|██████████| 300000/300000 [01:56<00:00, 2575.55it/s]


In [37]:
np.save(os.path.join(out_dir, 'lda_topic_dist.npy'), lda_topic_dist)
print("Saved LDA topic distributions")

Saved LDA topic distributions


BERTopic

In [None]:
try:
    #from bertopic import BERTopic
    print("Running BERTopic (this requires significant memory)...")
    # load embeddings from memmap
    emb = np.load(emb_path, mmap_mode='r')
    topic_model = BERTopic(nr_topics='auto', language='english')
    topics, probs = topic_model.fit_transform(texts, emb)
    topic_model.save(os.path.join(out_dir, 'bertopic_model'))
    # save topic assignments
    df['bertopic_topic'] = topics
    print("BERTopic done")
except Exception as e:
    print("BERTopic not run (missing package or OOM). Reason:", str(e))

Review-to-Product Feature Linking (Aggregation)

In [38]:
print("Aggregating review-level features to product level...")

# choose features to aggregate
agg_funcs = {
    'vader_compound_norm': ['mean', 'std'],
    'tb_polarity': ['mean'],
    'n_words': ['mean'],
    'rating': ['mean','std','count']
}

Aggregating review-level features to product level...


In [39]:
# aspect columns normalized
aspect_norm_cols = [c for c in df.columns if c.startswith('aspect_') and c.endswith('_norm')]
for c in aspect_norm_cols:
    agg_funcs[c] = ['mean']

In [51]:
df.columns

Index(['review_id', 'customer_id', 'article_id', 'category_id', 'rating',
       'review_text', 'created_at', 'verified_purchase', 'helpful_votes',
       'synthetic_sentiment_label', 'aspect_terms', 'language',
       'review_source', 'review_age_days', 'clean_text', 'vader_score',
       'vader_label', 'aspect_terms_list', 'tokens', 'lemmas', 'sentences',
       'language_detected', 'translated_text', 'is_spam', 'final_text_for_ml',
       'text_for_training', 'n_words', 'n_chars', 'vader_compound',
       'vader_compound_norm', 'tb_polarity', 'tb_subjectivity'],
      dtype='object')

In [40]:
# topic proportions (LDA)
# we'll compute average topic probability per product
topic_cols = [f'topic_{i}' for i in range(NUM_TOPICS)]
lda_df = pd.DataFrame(lda_topic_dist, columns=topic_cols)

In [41]:
# merge lda topic cols into df for aggregation
df_topics = pd.concat([df.reset_index(drop=True), lda_df.reset_index(drop=True)], axis=1)

In [42]:
# product aggregation
prod_agg = df_topics.groupby('article_id').agg(agg_funcs)
# flatten multiindex
prod_agg.columns = ['_'.join(col).strip() for col in prod_agg.columns.values]
prod_agg.reset_index(inplace=True)

In [43]:
# topic averages
topic_agg = df_topics.groupby('article_id')[topic_cols].mean().reset_index()
prod_features = prod_agg.merge(topic_agg, on='article_id', how='left')

In [45]:
# additional product features
prod_features['pct_negative'] = df_topics.groupby('article_id').apply(lambda g: (g['rating']<=2).sum()/len(g)).values
#prod_features['review_freshness'] = df_topics.groupby('article_id').apply(lambda g: np.exp(- (pd.to_datetime('now') - pd.to_datetime(g['created_at'])).dt.days.mean()/30)).values
prod_features['controversy_score'] = prod_features['rating_std']

  prod_features['pct_negative'] = df_topics.groupby('article_id').apply(lambda g: (g['rating']<=2).sum()/len(g)).values


Most common complaints: top TF-IDF terms among negative reviews for each product

In [46]:
print("Computing top complaint keywords per product...")

neg = df_topics[df_topics['rating']<=2]

Computing top complaint keywords per product...


In [47]:
prod_complaints = {}
for aid, group in tqdm(neg.groupby('article_id'), desc='complaints'):
    texts_group = group['text_for_training'].tolist()
    if len(texts_group) < 3:
        prod_complaints[aid] = []
        continue
    v = TfidfVectorizer(stop_words='english', max_features=50)
    m = v.fit_transform(texts_group)
    scores = np.asarray(m.sum(axis=0)).ravel()
    terms = v.get_feature_names_out()
    top_idx = np.argsort(scores)[-10:][::-1]
    prod_complaints[aid] = [terms[i] for i in top_idx]

complaints: 100%|██████████| 4459/4459 [00:22<00:00, 195.53it/s]


In [48]:
# convert to dataframe
prod_complaints_df = pd.DataFrame([{'article_id':k, 'complaints':v} for k,v in prod_complaints.items()])
prod_features = prod_features.merge(prod_complaints_df, on='article_id', how='left')

In [49]:
# Save product-level features
prod_features.to_parquet(os.path.join(out_dir, 'product_review_features.parquet'), index=False)
print("Saved product-level features")

Saved product-level features


In [50]:
df_topics.to_parquet(os.path.join(out_dir, 'reviews_features_full.parquet'), index=False)
print("Saved full review-level features")

Saved full review-level features
