Phase 1: Data Preparation


In [None]:
import sys
!{sys.executable} -m pip install datasets==2.17.0 -q



[1m[[0m[34;49mnotice[0m[1;39;49m][0m[39;49m A new release of pip is available: [0m[31;49m25.0[0m[39;49m -> [0m[32;49m26.0.1[0m
[1m[[0m[34;49mnotice[0m[1;39;49m][0m[39;49m To update, run: [0m[32;49mpip install --upgrade pip[0m
✅ Done. Now restart the kernel before continuing.


## Step 0 — Install Dependencies

In [2]:
# Run this once. If you're using a virtual env, activate it first in the VS Code terminal:
# python -m venv venv
# Windows  : venv\Scripts\activate
# Mac/Linux: source venv/bin/activate

import sys
!{sys.executable} -m pip install datasets pandas tqdm -q
print('✅ Dependencies installed.')


[1m[[0m[34;49mnotice[0m[1;39;49m][0m[39;49m A new release of pip is available: [0m[31;49m25.0[0m[39;49m -> [0m[32;49m26.0.1[0m
[1m[[0m[34;49mnotice[0m[1;39;49m][0m[39;49m To update, run: [0m[32;49mpip install --upgrade pip[0m
✅ Dependencies installed.


## Step 1 — Create Local Folder Structure

In [4]:
import os
from pathlib import Path

# All paths are relative to wherever this notebook lives
BASE_DIR  = Path('.')          # shopsense/
DATA_DIR  = BASE_DIR / 'data'  # shopsense/data/
DATA_DIR.mkdir(parents=True, exist_ok=True)

# Create .gitignore so large data files don't get pushed to GitHub
gitignore_path = BASE_DIR / '.gitignore'
if not gitignore_path.exists():
    gitignore_path.write_text(
        '# Data files (too large for GitHub)\n'
        'data/\n'
        'models/\n'
        '__pycache__/\n'
        '.env\n'
        'venv/\n'
        '*.pyc\n'
    )
    print('✅ .gitignore created')

print(f'✅ Folder structure ready')
print(f'   BASE : {BASE_DIR.resolve()}')
print(f'   DATA : {DATA_DIR.resolve()}')

✅ Folder structure ready
   BASE : /Users/rithikabaskaran/MyFiles/ShopSense
   DATA : /Users/rithikabaskaran/MyFiles/ShopSense/data


## Step 2 — Configuration
Only thing you might want to change is `CATEGORY`.

In [5]:
# ── CONFIG ──────────────────────────────────────────────────
CATEGORY              = 'Home_and_Kitchen'  
MAX_PRODUCTS          = 10_000
MAX_REVIEWS_PER_PRODUCT = 10
MIN_REVIEWS           = 5
MIN_DESCRIPTION_LEN   = 50
PRODUCT_SCAN_LIMIT    = 150_000
REVIEW_SCAN_LIMIT     = 500_000
# ────────────────────────────────────────────────────────────

print(f'Category    : {CATEGORY}')
print(f'Max products: {MAX_PRODUCTS:,}')

Category    : Home_and_Kitchen
Max products: 10,000


## Step 3 — Load Product Metadata (Streaming)
Streaming means it never loads the full dataset into RAM — safe for local machines.

In [6]:
import sys, shutil, os
from datasets import load_dataset
import datasets
print(f'datasets version: {datasets.__version__}')  # should show 2.17.0

meta_dataset = load_dataset(
    'McAuley-Lab/Amazon-Reviews-2023',
    f'raw_meta_{CATEGORY}',
    split='full',
    streaming=True,
    trust_remote_code=True   # needed again with 2.17.0
)

print('✅ Product stream ready.')

datasets version: 2.17.0


Downloading builder script: 39.6kB [00:00, 53.7MB/s]
Downloading readme: 30.3kB [00:00, 33.1MB/s]


✅ Product stream ready.


## Step 4 — Extract & Clean Product Records

In [9]:
from tqdm import tqdm


def extract_description(item):
    """Combine title + bullet features + description into one rich text field."""
    parts = []

    title = item.get('title', '') or ''
    parts.append(title.strip())

    features = item.get('features', []) or []
    if features:
        parts.append(' '.join(str(f) for f in features[:5]))

    description = item.get('description', []) or []
    if description:
        parts.append(' '.join(str(d) for d in description[:3]))

    return ' '.join(parts).strip()


def safe_float(val, default=None):
    try:
        return float(str(val).replace('$', '').replace(',', '').strip())
    except Exception:
        return default


records   = []
seen_asins = set()

print(f'Scanning up to {PRODUCT_SCAN_LIMIT:,} raw records to find {MAX_PRODUCTS:,} clean products...')

for i, item in enumerate(tqdm(meta_dataset, total=PRODUCT_SCAN_LIMIT)):
    if i >= PRODUCT_SCAN_LIMIT or len(records) >= MAX_PRODUCTS:
        break

    asin = item.get('parent_asin') or item.get('asin', '')
    if not asin or asin in seen_asins:
        continue

    title = (item.get('title') or '').strip()
    if not title:
        continue

    description = extract_description(item)
    if len(description) < MIN_DESCRIPTION_LEN:
        continue

    price        = safe_float(item.get('price'))
    rating       = safe_float(item.get('average_rating'))
    rating_count = item.get('rating_number') or 0

    if not rating or not rating_count or rating_count < MIN_REVIEWS:
        continue

    seen_asins.add(asin)
    records.append({
        'asin'         : asin,
        'title'        : title,
        'description'  : description,
        'category'     : CATEGORY.replace('_', ' & '),
        'price'        : price,
        'rating'       : rating,
        'rating_count' : int(rating_count),
        'store'        : item.get('store', ''),
        'main_image'   : '',  # skipping image field — not needed for embeddings
    })

print(f'\n✅ Collected {len(records):,} clean product records.')

Scanning up to 150,000 raw records to find 10,000 clean products...


  8%|▊         | 12599/150000 [00:03<00:37, 3638.15it/s]


✅ Collected 10,000 clean product records.





## Step 5 — Inspect Products

In [10]:
import pandas as pd

df_products = pd.DataFrame(records)

print(f'Shape : {df_products.shape}')
print(f'Columns: {df_products.columns.tolist()}')
df_products.head(3)

Shape : (10000, 9)
Columns: ['asin', 'title', 'description', 'category', 'price', 'rating', 'rating_count', 'store', 'main_image']


Unnamed: 0,asin,title,description,category,price,rating,rating_count,store,main_image
0,B07R3DYMH6,Set of 4 Irish Coffee Glass Mugs Footed 10.5 o...,Set of 4 Irish Coffee Glass Mugs Footed 10.5 o...,Home & and & Kitchen,24.95,4.6,18,LavoHome,
1,B0BNZ8Q7YT,Foaming Soap Dispenser Thick Ceramic Foam Hand...,Foaming Soap Dispenser Thick Ceramic Foam Hand...,Home & and & Kitchen,24.99,4.4,135,rejomiik,
2,B00KKU8HTG,jersey seating 2 x Vinyl Air Lift Adjustable S...,jersey seating 2 x Vinyl Air Lift Adjustable S...,Home & and & Kitchen,,4.3,167,jersey seating®,


In [11]:
print('=== Key Stats ===')
print(f"Products            : {len(df_products):,}")
print(f"With price          : {df_products['price'].notna().sum():,} ({df_products['price'].notna().mean()*100:.1f}%)")
print(f"Price range         : ${df_products['price'].min():.2f} — ${df_products['price'].max():.2f}")
print(f"Avg rating          : {df_products['rating'].mean():.2f}")
print(f"Avg description len : {df_products['description'].str.len().mean():.0f} chars")

print('\n=== Price Distribution ===')
bins   = [0, 10, 25, 50, 100, 200, float('inf')]
labels = ['<$10', '$10-25', '$25-50', '$50-100', '$100-200', '$200+']
df_products['price_bucket'] = pd.cut(df_products['price'], bins=bins, labels=labels)
print(df_products['price_bucket'].value_counts().sort_index())

=== Key Stats ===
Products            : 10,000
With price          : 5,792 (57.9%)
Price range         : $1.53 — $2699.00
Avg rating          : 4.31
Avg description len : 1073 chars

=== Price Distribution ===
price_bucket
<$10         737
$10-25      2460
$25-50      1450
$50-100      615
$100-200     331
$200+        199
Name: count, dtype: int64


## Step 6 — Save Products CSV

In [12]:
df_save = df_products.drop(columns=['price_bucket'])

products_path = DATA_DIR / 'products_clean.csv'
df_save.to_csv(products_path, index=False)

size_mb = products_path.stat().st_size / (1024 * 1024)
print(f'✅ products_clean.csv saved ({size_mb:.1f} MB) → {products_path.resolve()}')

✅ products_clean.csv saved (12.1 MB) → /Users/rithikabaskaran/MyFiles/ShopSense/data/products_clean.csv


## Step 7 — Load Reviews for These Products

In [13]:
valid_asins = set(df_products['asin'].tolist())
print(f'Will filter reviews to {len(valid_asins):,} product ASINs')

print(f'\nOpening review stream for: {CATEGORY}')
review_dataset = load_dataset(
    'McAuley-Lab/Amazon-Reviews-2023',
    f'raw_review_{CATEGORY}',
    split='full',
    streaming=True,
    trust_remote_code=True
)
print('✅ Review stream ready.')

Will filter reviews to 10,000 product ASINs

Opening review stream for: Home_and_Kitchen
✅ Review stream ready.


In [14]:
review_records = []
review_counts  = {}   # asin → count
MIN_REVIEW_LEN = 30

print(f'Scanning up to {REVIEW_SCAN_LIMIT:,} review records...')

for i, review in enumerate(tqdm(review_dataset, total=REVIEW_SCAN_LIMIT)):
    if i >= REVIEW_SCAN_LIMIT:
        break

    asin = review.get('parent_asin') or review.get('asin', '')
    if asin not in valid_asins:
        continue
    if review_counts.get(asin, 0) >= MAX_REVIEWS_PER_PRODUCT:
        continue

    text = (review.get('text') or '').strip()
    if len(text) < MIN_REVIEW_LEN:
        continue

    review_records.append({
        'asin'             : asin,
        'review_title'     : (review.get('title') or '').strip(),
        'review_text'      : text,
        'rating'           : safe_float(review.get('rating'), default=0),
        'helpful_vote'     : review.get('helpful_vote', 0) or 0,
        'verified_purchase': review.get('verified_purchase', False)
    })
    review_counts[asin] = review_counts.get(asin, 0) + 1

print(f'\n✅ Collected {len(review_records):,} reviews across {len(review_counts):,} products.')

Scanning up to 500,000 review records...


100%|██████████| 500000/500000 [00:07<00:00, 63298.74it/s]


✅ Collected 5,566 reviews across 2,003 products.





## Step 8 — Inspect & Save Reviews

In [15]:
df_reviews = pd.DataFrame(review_records)

print('=== Review Stats ===')
print(f"Total reviews       : {len(df_reviews):,}")
print(f"Products covered    : {df_reviews['asin'].nunique():,} / {len(valid_asins):,}")
print(f"Avg reviews/product : {len(df_reviews) / df_reviews['asin'].nunique():.1f}")
print(f"Avg review length   : {df_reviews['review_text'].str.len().mean():.0f} chars")
print(f"Verified purchases  : {df_reviews['verified_purchase'].mean()*100:.1f}%")
print(f"\nRating distribution:")
print(df_reviews['rating'].value_counts().sort_index())

=== Review Stats ===
Total reviews       : 5,566
Products covered    : 2,003 / 10,000
Avg reviews/product : 2.8
Avg review length   : 268 chars
Verified purchases  : 82.2%

Rating distribution:
rating
1.0     299
2.0     239
3.0     427
4.0     808
5.0    3793
Name: count, dtype: int64


In [16]:
reviews_path = DATA_DIR / 'reviews_clean.csv'
df_reviews.to_csv(reviews_path, index=False)

size_mb = reviews_path.stat().st_size / (1024 * 1024)
print(f'✅ reviews_clean.csv saved ({size_mb:.1f} MB) → {reviews_path.resolve()}')

✅ reviews_clean.csv saved (1.7 MB) → /Users/rithikabaskaran/MyFiles/ShopSense/data/reviews_clean.csv


## Step 9 — Save Product ASINs Index

In [17]:
import json

asin_list  = df_products['asin'].tolist()
asin_path  = DATA_DIR / 'product_ids.json'

with open(asin_path, 'w') as f:
    json.dump(asin_list, f)

print(f'✅ {len(asin_list):,} ASINs saved → {asin_path.resolve()}')

✅ 10,000 ASINs saved → /Users/rithikabaskaran/MyFiles/ShopSense/data/product_ids.json


## Step 10 — Final Summary

In [18]:
print('=' * 55)
print('         PHASE 1 COMPLETE — SUMMARY')
print('=' * 55)
print(f'  Category           : {CATEGORY}')
print(f'  Products saved     : {len(df_products):,}')
print(f'  Reviews saved      : {len(df_reviews):,}')
print(f'  Products w/ reviews: {df_reviews["asin"].nunique():,}')
print()
print('  Files created:')
for fname in ['products_clean.csv', 'reviews_clean.csv', 'product_ids.json']:
    fpath = DATA_DIR / fname
    size_kb = fpath.stat().st_size / 1024
    print(f'    ✅ data/{fname} ({size_kb:.0f} KB)')
print()
print('  ➡️  Next: Phase 2 — Semantic Retrieval with FAISS')
print('=' * 55)

         PHASE 1 COMPLETE — SUMMARY
  Category           : Home_and_Kitchen
  Products saved     : 10,000
  Reviews saved      : 5,566
  Products w/ reviews: 2,003

  Files created:
    ✅ data/products_clean.csv (12402 KB)
    ✅ data/reviews_clean.csv (1726 KB)
    ✅ data/product_ids.json (137 KB)

  ➡️  Next: Phase 2 — Semantic Retrieval with FAISS


## Quick Sample Peek

In [19]:
print('=== 3 Sample Products ===\n')
for _, row in df_products.sample(3, random_state=42).iterrows():
    print(f"ASIN   : {row['asin']}")
    print(f"Title  : {row['title'][:80]}")
    print(f"Price  : {'$'+str(row['price']) if row['price'] else 'N/A'}")
    print(f"Rating : {row['rating']} ⭐ ({row['rating_count']:,} reviews)")
    print(f"Desc   : {row['description'][:150]}...")
    print('-' * 60)

=== 3 Sample Products ===

ASIN   : B08RP93QSX
Title  : Cubiker Computer Home Office Desk with Drawers, 40 Inch Small Desk Study Writing
Price  : $99.99
Rating : 4.5 ⭐ (2,857 reviews)
Desc   : Cubiker Computer Home Office Desk with Drawers, 40 Inch Small Desk Study Writing Table, Modern Simple PC Desk, Black Modern Confident Style: Cubiker o...
------------------------------------------------------------
ASIN   : B09Y31JYHK
Title  : Halloween Spider Net Tree Skirt, Seasonal Tree Mat Holiday Party Supplies Orname
Price  : $14.99
Rating : 4.8 ⭐ (106 reviews)
Desc   : Halloween Spider Net Tree Skirt, Seasonal Tree Mat Holiday Party Supplies Ornaments Indoor Outdoor Decorations for Trees 48 Inches (Silver, 48 in) Uni...
------------------------------------------------------------
ASIN   : B0BLSB5LVH
Title  : Swiss Ortho Sleep, Bamboo 12" Inch Certified Independently & Individually Wrappe
Price  : $259.99
Rating : 4.2 ⭐ (5,183 reviews)
Desc   : Swiss Ortho Sleep, Bamboo 12" Inch Certified I