In [1]:
import pandas as pd
import numpy as np
import json
from pprint import pprint
from pathlib import Path

project_root = Path.cwd().parent.parent
# Output directory
ML_DIR = project_root / "data/ml/reviews/reviews_preprocessed"
ML_DIR.mkdir(parents=True, exist_ok=True)


In [2]:
reviews = pd.read_parquet(f"{ML_DIR}/reviews_features_full.parquet")
print("Loaded reviews_features_full.parquet")
print("Shape:", reviews.shape)
print("\nColumns:\n", reviews.columns.tolist())

Loaded reviews_features_full.parquet
Shape: (300000, 62)

Columns:
 ['review_id', 'customer_id', 'article_id', 'category_id', 'rating', 'review_text', 'created_at', 'verified_purchase', 'helpful_votes', 'synthetic_sentiment_label', 'aspect_terms', 'language', 'review_source', 'review_age_days', 'clean_text', 'vader_score', 'vader_label', 'aspect_terms_list', 'tokens', 'lemmas', 'sentences', 'language_detected', 'translated_text', 'is_spam', 'final_text_for_ml', 'text_for_training', 'n_words', 'n_chars', 'vader_compound', 'vader_compound_norm', 'tb_polarity', 'tb_subjectivity', 'topic_0', 'topic_1', 'topic_2', 'topic_3', 'topic_4', 'topic_5', 'topic_6', 'topic_7', 'topic_8', 'topic_9', 'topic_10', 'topic_11', 'topic_12', 'topic_13', 'topic_14', 'topic_15', 'topic_16', 'topic_17', 'topic_18', 'topic_19', 'topic_20', 'topic_21', 'topic_22', 'topic_23', 'topic_24', 'topic_25', 'topic_26', 'topic_27', 'topic_28', 'topic_29']


In [3]:
# Preview a random enriched review
print("\n--- Sample Enriched Review ---")
sample = reviews.sample(1).iloc[0]
pprint({
    "review_id": sample["review_id"],
    "article_id": sample["article_id"],
    "rating": sample["rating"],
    "text": sample["text_for_training"][:200] + "...",
    "vader_norm": sample["vader_compound_norm"],
    "tb_polarity": sample["tb_polarity"],
    "subjectivity": sample["tb_subjectivity"],
})


--- Sample Enriched Review ---
{'article_id': '699622001',
 'rating': 4,
 'review_id': 2187511,
 'subjectivity': 0.5,
 'tb_polarity': 0.3125,
 'text': 'exactly as described fits well and the color is rich...',
 'vader_norm': 0.8453999999999999}


In [6]:
print("Aspect sentiments:")
aspect_cols = [c for c in reviews.columns if c.startswith("aspect_")]
for c in aspect_cols[:8]:
    print(f"{c}: {sample[c]}")

Aspect sentiments:
aspect_terms: ['packaging' 'color' 'material']
aspect_terms_list: ['packaging' 'color' 'material']


Inspect LDA topic distributions

In [7]:
NUM_TOPICS = len([c for c in reviews.columns if c.startswith("topic_")])
print("\nTotal LDA topics detected:", NUM_TOPICS)



Total LDA topics detected: 30


In [8]:
topic_means = reviews[[f"topic_{i}" for i in range(NUM_TOPICS)]].mean().sort_values(ascending=False)
print("\n--- Top Topics by Mean Probability ---")
print(topic_means.head(10))


--- Top Topics by Mean Probability ---
topic_15    0.115321
topic_12    0.069816
topic_9     0.062094
topic_14    0.059773
topic_20    0.055659
topic_27    0.048012
topic_24    0.044957
topic_17    0.041566
topic_3     0.040181
topic_29    0.037988
dtype: float64


Show top words for each LDA topic

In [10]:
from gensim.models import LdaModel
from gensim.corpora import Dictionary

print("\nLoading LDA model...")
lda = LdaModel.load(f"{ML_DIR}/lda_model_full")
#dictionary = Dictionary.load(f"{ML_DIR}/lda_dictionary.dict")


Loading LDA model...


In [11]:
print("\n--- LDA Topic Key Words ---")
for i in range(min(NUM_TOPICS, 10)):   # show top 10 topics
    print(f"\nTopic {i}:")
    pprint(lda.show_topic(i, topn=10))


--- LDA Topic Key Words ---

Topic 0:
[('satisfied', 0.4996268),
 ('purchase', 0.4996268),
 ('neither', 6.7225915e-06),
 ('time', 6.7225915e-06),
 ('break', 6.7225915e-06),
 ('mediocre', 6.7225915e-06),
 ('look', 6.7225915e-06),
 ('job', 6.7225915e-06),
 ('buy', 6.7225915e-06),
 ('things', 6.7225915e-06)]

Topic 1:
[('customer', 0.14328744),
 ('service', 0.1432378),
 ('quickly', 0.10106688),
 ('help', 0.08940032),
 ('returned', 0.08940032),
 ('okay', 0.08006607),
 ('query', 0.05385076),
 ('handled', 0.053850736),
 ('special', 0.053834535),
 ('nothing', 0.053697612)]

Topic 2:
[('price', 0.12357457),
 ('packaging', 0.12350252),
 ('happy', 0.11299767),
 ('really', 0.11299387),
 ('totally', 0.112993754),
 ('worth', 0.11291197),
 ('neat', 0.11091781),
 ('secure', 0.1109176),
 ('comfort', 0.023505023),
 ('material', 0.011854288)]

Topic 3:
[('feels', 0.18465863),
 ('buy', 0.16468386),
 ('loved', 0.16468327),
 ('great', 0.16468237),
 ('premium', 0.09309568),
 ('cheap', 0.091427036),
 ('pack

Load product-level features

In [12]:
prod = pd.read_parquet(f"{ML_DIR}/product_review_features.parquet")
print("\nLoaded product_review_features.parquet")
print("Shape:", prod.shape)


Loaded product_review_features.parquet
Shape: (6292, 41)


In [25]:
print("--- Product Feature Columns ---")
pprint(prod.columns.tolist())

--- Product Feature Columns ---
['article_id',
 'vader_compound_norm_mean',
 'vader_compound_norm_std',
 'tb_polarity_mean',
 'n_words_mean',
 'rating_mean',
 'rating_std',
 'rating_count',
 'topic_0',
 'topic_1',
 'topic_2',
 'topic_3',
 'topic_4',
 'topic_5',
 'topic_6',
 'topic_7',
 'topic_8',
 'topic_9',
 'topic_10',
 'topic_11',
 'topic_12',
 'topic_13',
 'topic_14',
 'topic_15',
 'topic_16',
 'topic_17',
 'topic_18',
 'topic_19',
 'topic_20',
 'topic_21',
 'topic_22',
 'topic_23',
 'topic_24',
 'topic_25',
 'topic_26',
 'topic_27',
 'topic_28',
 'topic_29',
 'pct_negative',
 'controversy_score',
 'complaints']


In [24]:
print("--- Top 5 Products by Average Rating ---")
print(prod.sort_values("rating_mean", ascending=False)[["article_id","rating_mean","rating_count"]].head())


--- Top 5 Products by Average Rating ---
     article_id  rating_mean  rating_count
983   567742002          5.0             2
407   500364001          5.0             1
5237  816215002          5.0             3
3094  691210001          5.0             1
4481  769729001          5.0             1


In [23]:
print("--- Worst 5 Products by Average Rating ---")
print(prod.sort_values("rating_mean", ascending=True)[["article_id","rating_mean","rating_count"]].head())


--- Worst 5 Products by Average Rating ---
     article_id  rating_mean  rating_count
5805  868597004          1.0             1
907   563189004          1.0             1
348   493814025          1.0             1
6218  914363001          1.0             1
5760  867334001          1.0             1


Complaint Keywords

In [22]:
print("--- Sample Complaint Keywords ---")
complaints = prod[["article_id", "complaints"]].dropna().sample(5)
for _, row in complaints.iterrows():
    print(f"\nArticle: {row['article_id']}")
    print("Complaints:", row["complaints"])

--- Sample Complaint Keywords ---

Article: 888700006
Complaints: ['better' 'stitching' 'perfect' 'fit' 'price' 'cheap' 'feels' 'delivery'
 'great' 'service']

Article: 542695005
Complaints: ['service' 'price' 'great' 'faster' 'feels' 'didn' 'returned' 'customer'
 'help' 'small']

Article: 687704039
Complaints: ['experience' 'use' 'terrible' 'broke' 'service' 'price' 'fit' 'stitching'
 'perfect' 'better']

Article: 537612027
Complaints: []

Article: 613826002
Complaints: ['delivery' 'quality' 'service' 'packaging' 'used' 'daily' 'issues' 'week'
 'late' 'bad']


Controversy

In [21]:
print("--- High Controversy Products (Large Rating Std) ---")
print(prod.sort_values("rating_std", ascending=False)[["article_id","rating_mean","rating_std"]].head())


--- High Controversy Products (Large Rating Std) ---
     article_id  rating_mean  rating_std
3905  735833002          3.0    2.828427
1938  636454001          3.0    2.828427
4150  749823001          3.0    2.828427
1488  613952001          3.0    2.828427
5794  868444002          3.0    2.828427


In [20]:
print("Sanity checks complete.")

Sanity checks complete.
