In [None]:
#setup & constants
import os
from pathlib import Path

# Anchor every path at the project root (Retail Chatbot/): the parent of the
# directory holding this file when __file__ is defined, otherwise the parent
# of the current working directory.
if "__file__" in globals():
    PROJECT_ROOT = Path(__file__).resolve().parents[1]
else:
    PROJECT_ROOT = Path.cwd().parent

# Dataset selection; CATEGORY is also used to build the raw file name below.
DATASET = "McAuley-Lab/Amazon-Reviews-2023"
CATEGORY = "Electronics"

# Raw downloads and processed outputs live under <root>/data/.
RAW_DIR = PROJECT_ROOT.joinpath("data", "raw")
PROC_DIR = PROJECT_ROOT.joinpath("data", "processed")

# Create both folders up front; existing folders are left untouched.
RAW_DIR.mkdir(parents=True, exist_ok=True)
PROC_DIR.mkdir(parents=True, exist_ok=True)

RAW_JSONL = RAW_DIR.joinpath(f"meta_{CATEGORY}.jsonl")
print("Raw path will be:", RAW_JSONL)

In [None]:
# Download raw JSONL (skips if already exists)
import requests

hf_url = f"https://huggingface.co/datasets/{DATASET}/resolve/main/raw/meta_categories/meta_{CATEGORY}.jsonl"

if RAW_JSONL.exists() and RAW_JSONL.stat().st_size > 0:
    print(f"[skip] Raw file already exists: {RAW_JSONL}")
else:
    print(f"[download] {hf_url} -> {RAW_JSONL}")
    # Stream into a sibling ".part" file and rename only after the transfer
    # finishes. Writing straight to RAW_JSONL would leave a truncated,
    # non-empty file behind on failure, and the skip check above would then
    # accept that corrupt file on every later run.
    partial_path = RAW_JSONL.with_name(RAW_JSONL.name + ".part")
    try:
        # timeout=(connect, read) in seconds: without it a stalled connection
        # would block this cell forever.
        with requests.get(hf_url, stream=True, timeout=(10, 120)) as r:
            r.raise_for_status()
            with open(partial_path, "wb") as f:
                for chunk in r.iter_content(chunk_size=1024 * 1024):
                    if chunk:
                        f.write(chunk)
        partial_path.replace(RAW_JSONL)
    finally:
        # After a successful rename the partial file no longer exists; after a
        # failure this removes the incomplete leftovers.
        if partial_path.exists():
            partial_path.unlink()
    print(f"[done] Saved raw JSONL: {RAW_JSONL} ({RAW_JSONL.stat().st_size/1e6:.1f} MB)")


In [None]:
#helpers + precise category matcher (segment-level) with title positives
import re, json

def clean_price(price):
    """Extract the first number found in ``price`` and return it as a float.

    The value is converted with ``str()`` and searched for an optionally
    signed run of digits that may contain thousands separators (commas) and a
    decimal point; commas are stripped before conversion.

    Returns:
        The parsed float, or ``None`` when no number is found or parsing fails.
    """
    try:
        m = re.search(r"[-+]?\d[\d,]*\.?\d*", str(price))
        return float(m.group(0).replace(",", "")) if m else None
    except Exception:
        # Best-effort parsing: ordinary errors mean "no usable price".
        # Unlike a bare ``except:``, this lets KeyboardInterrupt / SystemExit
        # propagate so a long scan can still be interrupted.
        return None

def clean_float(x):
    """Convert ``x`` with ``float()``; return ``None`` when it cannot be converted."""
    try:
        return float(x)
    except (TypeError, ValueError, OverflowError):
        # Only conversion failures are treated as "missing"; a bare
        # ``except:`` would also swallow KeyboardInterrupt / SystemExit.
        return None

def clean_int(x):
    """Convert ``x`` to an int via ``float()`` (so "12.0" and 12.7 both work).

    The fractional part is truncated by ``int()``. Returns ``None`` when the
    value cannot be converted, including NaN and infinity.
    """
    try:
        return int(float(x))
    except (TypeError, ValueError, OverflowError):
        # int(nan) raises ValueError and int(inf) raises OverflowError; both
        # are treated as missing. A bare ``except:`` would also swallow
        # KeyboardInterrupt / SystemExit.
        return None

def clean_string(x):
    """Return ``x`` as a stripped string, or ``None`` for empty-like values.

    Falsy inputs, whitespace-only strings and the placeholders "n/a" / "na"
    (case-insensitive) all yield ``None``.
    """
    if not x:
        return None
    text = str(x).strip()
    if not text:
        return None
    if text.lower() in {"n/a","na"}:
        return None
    return text

def clean_text(x):
    """Flatten ``x`` into a single stripped string.

    - str: returned stripped.
    - list: truthy items are stringified, stripped and joined with spaces.
    - dict: same as list, applied to the dict's values.
    - anything else: ``None``.
    """
    if isinstance(x, str):
        return x.strip()
    if isinstance(x, list):
        parts = x
    elif isinstance(x, dict):
        parts = x.values()
    else:
        return None
    return " ".join(str(part).strip() for part in parts if part)

def get_image(images):
    """Pick an image reference from the first entry of ``images``.

    Only a non-empty list is considered. A string entry is returned as-is; a
    dict entry yields the first truthy value among "hi_res" and "large",
    falling back to whatever is stored under "url". Everything else gives
    ``None``.
    """
    if not isinstance(images, list) or not images:
        return None
    first = images[0]
    if isinstance(first, str):
        return first
    if isinstance(first, dict):
        for key in ("hi_res", "large"):
            candidate = first.get(key)
            if candidate:
                return candidate
        return first.get("url")
    return None

# Title "positive" patterns: whole-word, case-insensitive keywords a product
# title must contain (lightweight, avoids most accessories).
TITLE_POS_LAPTOP = re.compile(
    r"\b(laptop|notebook|chromebook|macbook|ultrabook|thinkpad|ideapad|vivobook|zenbook|inspiron|xps|latitude|yoga|omen|pavilion|envy|surface)\b",
    flags=re.IGNORECASE,
)
TITLE_POS_HEADPHONE = re.compile(
    r"\b(headphone|headphones|headset|earbud|earbuds|earphone|earphones|tws|anc)\b",
    flags=re.IGNORECASE,
)

def title_has(title: str, patt: re.Pattern) -> bool:
    """Return True when ``patt`` matches anywhere in ``title`` (a falsy title counts as "")."""
    haystack = title if title else ""
    return patt.search(haystack) is not None

# category segment parsing
def split_segments(cat_path: str):
    """Split a ">"-delimited category path into stripped, lower-cased, non-empty segments."""
    segments = []
    for piece in cat_path.split(">"):
        piece = piece.strip()
        if piece:
            segments.append(piece.lower())
    return segments

#Splits a segment (e.g., "Headphones, Earbuds & Accessories") into lower-cased tokens: ["headphones", "earbuds", "accessories"]
def segment_tokens(seg: str):
    """Break a segment on any of ``, & / | ; +`` and return the stripped, lower-cased, non-empty parts."""
    tokens = []
    for part in re.split(r"[,&/|;+]", seg):
        part = part.strip()
        if part:
            tokens.append(part.lower())
    return tokens

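#True only for segments that contain a 'laptops' token or end with ' laptops', and that have no standalone 'accessories' token (e.g. 'laptops & accessories' is rejected; 'laptop accessories' never passes the 'laptops' test in the first place).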
def is_laptops_segment(seg: str) -> bool:
    """Tell whether ``seg`` names a laptops category.

    A segment qualifies when one of its tokens is exactly "laptops" or the
    segment text ends with " laptops", provided none of its tokens is exactly
    "accessories".
    """
    toks = segment_tokens(seg)
    if "accessories" in toks:
        return False
    return "laptops" in toks or seg.endswith(" laptops")
    
#Accept segments where at least one token is exactly a headphone/headset/earbud/earphone word (singular or plural forms as listed below).
def is_headphones_segment(seg: str) -> bool:
    """Tell whether any token of ``seg`` is exactly one of the headphone keywords."""
    keywords = {"headphone", "headphones", "headset", "earbud", "earbuds", "earphone", "earphones"}
    # "Some token is a keyword" is the same as "the two collections overlap".
    return not keywords.isdisjoint(segment_tokens(seg))

#Keep only Laptops or Headphones.
# - Laptops: must have a laptops segment (not 'laptop accessories') AND a laptop-positive title.
# - Headphones: must have a headphones segment AND a headphones-positive title.
def classify_category(cat_path: str, title: str):
    """Map a product to "Laptops", "Headphones" or ``None``.

    A label is assigned only when both conditions hold: some segment of
    ``cat_path`` passes the label's segment test, and ``title`` matches the
    label's title pattern. Laptops is checked before Headphones. An empty
    ``cat_path`` always gives ``None``.
    """
    if not cat_path:
        return None
    segs = split_segments(cat_path)

    # (label, segment predicate, title pattern), evaluated in priority order.
    rules = (
        ("Laptops", is_laptops_segment, TITLE_POS_LAPTOP),
        ("Headphones", is_headphones_segment, TITLE_POS_HEADPHONE),
    )
    for label, segment_matches, title_pattern in rules:
        if any(segment_matches(seg) for seg in segs) and title_has(title, title_pattern):
            return label

    return None


In [None]:
#scan raw JSONL, enforce price>1 & rating_number>=1, classify (strict), dedupe, rank, save
import pandas as pd
import numpy as np
import pyarrow as pa
import pyarrow.parquet as pq

rows, scanned, kept = [], 0, 0
print(f"[read] Scanning {RAW_JSONL} ...")

# Read the raw file one JSON object per line, keeping only records that have
# the required fields and classify as Laptops or Headphones.
with open(RAW_JSONL, "r", encoding="utf-8") as f:
    for line in f:
        if not line.strip():
            continue
        scanned += 1
        item = json.loads(line)

        # Prefer the parent ASIN when present; it is the dedupe key further down.
        asin = item.get("parent_asin") or item.get("asin")
        if not asin:
            continue

        title = clean_string(item.get("title"))
        brand = clean_string(item.get("brand") or item.get("store"))

        # When the category field is a list, join it with " > " so each entry
        # becomes its own segment. clean_text() would join with plain spaces,
        # which removes the ">" boundaries split_segments() relies on and turns
        # the whole path into one segment (e.g. "electronics headphones
        # over-ear headphones" then fails the headphones token test).
        raw_categories = item.get("category_path") or item.get("categories")
        if isinstance(raw_categories, list):
            cat_path = " > ".join(str(c).strip() for c in raw_categories if c)
        else:
            cat_path = clean_text(raw_categories)

        price = clean_price(item.get("price"))
        avg_rating = clean_float(item.get("average_rating") or item.get("rating"))
        rating_num = clean_int(item.get("rating_number") or item.get("ratings_total"))
        desc = clean_text(item.get("details") or item.get("description"))
        img = get_image(item.get("images"))

        # required fields
        if not title or not brand:
            continue
        if price is None or price <= 1:
            continue
        if rating_num is None or rating_num < 1:
            continue

        # strict classification (ONLY laptops/headphones, with title positives)
        canon = classify_category(cat_path or "", title)
        if not canon:
            continue

        rows.append({
            "asin": asin,
            "title": title,
            "brand": brand,
            "category_path": cat_path,
            "price": price,
            "average_rating": avg_rating,
            "rating_number": rating_num,
            "description": desc,
            "image_url": img,
            "filtered_category": canon
        })
        kept += 1

print(f"[done] scanned={scanned:,} kept={kept:,}")

# Keep the first occurrence of each ASIN.
df = pd.DataFrame(rows).drop_duplicates(subset=["asin"], keep="first").reset_index(drop=True)
print("shape after dedupe:", df.shape)

# popularity score for ranking: log(1 + number of ratings) * average rating,
# with missing values counted as 0.
df["popularity"] = np.log1p(df["rating_number"].fillna(0)) * df["average_rating"].fillna(0)


In [None]:
# Optional cap to limit the number of products per category (None = keep all)
CAP_PER_CATEGORY = None

saved_total = 0
for cat in sorted(df["filtered_category"].unique()):
    # Rank each category's products from most to least popular.
    sub = df[df["filtered_category"] == cat].copy()
    sub = sub.sort_values("popularity", ascending=False)
    if CAP_PER_CATEGORY is not None:
        sub = sub.head(CAP_PER_CATEGORY)
    out_path = PROC_DIR / f"products_{cat.lower()}_2023.parquet"
    # preserve_index=False: after filtering and sorting, the pandas index is
    # just shuffled row labels; without this flag it would be written to the
    # file as an extra "__index_level_0__" column.
    pq.write_table(pa.Table.from_pandas(sub, preserve_index=False), out_path)
    print(f"[save] {cat}: {len(sub):,}")
    saved_total += len(sub)

print(f"[summary] total saved across categories: {saved_total:,}")
try:
    display(df.head(10))
except NameError:
    # display() only exists inside IPython/Jupyter; fall back to plain text
    # elsewhere. Other errors are real problems and are not hidden.
    print(df.head(10))

In [None]:
# Count values whose string form is empty, per text column.
for col in ["title","brand","description","image_url","category_path"]:
    empties = df[col].astype(str).str.len().eq(0).sum()
    print(f"{col:15} empty strings: {empties:,}")

# Range check on the numeric columns to spot odd values.
print("price min/max:", df["price"].min(), df["price"].max())
print("average_rating min/max:", df["average_rating"].min(), df["average_rating"].max())
print("rating_number min/max:", df["rating_number"].min(), df["rating_number"].max())

# How many products ended up in each category.
print(df["filtered_category"].value_counts())

In [None]:
#adding search_text + price_bucket (category-aware) to parquet files
import pandas as pd

# Processed per-category files that get enriched in place below.
files = [
    PROC_DIR / file_name
    for file_name in (
        "products_headphones_2023.parquet",
        "products_laptops_2023.parquet",
    )
]

def price_bucket(row):
    """Label a product row as "budget", "midrange", "premium" or "unknown".

    ``row`` must support ``.get``. The category is read from
    "filtered_category" (falling back to "category") and selects the price
    cut-offs:

    - contains "laptop":     < 500 budget, < 1200 midrange, else premium
    - contains "headphone":  < 50 budget,  < 200 midrange,  else premium
    - anything else:         < 100 budget, < 300 midrange,  else premium

    A missing or NaN price yields "unknown".
    """
    amount = row.get("price")
    if amount is None or pd.isna(amount):
        return "unknown"

    label = (row.get("filtered_category") or row.get("category") or "").lower()
    if "laptop" in label:
        budget_below, midrange_below = 500, 1200
    elif "headphone" in label:
        budget_below, midrange_below = 50, 200
    else:
        # fallback thresholds for any other category
        budget_below, midrange_below = 100, 300

    if amount < budget_below:
        return "budget"
    return "midrange" if amount < midrange_below else "premium"

# Enrich each processed file in place with a combined text field and a price tier.
for fp in files:
    dfp = pd.read_parquet(fp)

    # Normalise the text columns: missing values become "", the rest is stripped.
    for col in ["title","brand","description"]:
        dfp[col] = dfp[col].fillna("").astype(str).str.strip()

    # search_text = "title . brand . description" with whitespace runs collapsed.
    joined_text = dfp[["title","brand","description"]].agg(" . ".join, axis=1)
    dfp["search_text"] = joined_text.str.replace(r"\s+", " ", regex=True).str.strip()

    dfp["price_bucket"] = dfp.apply(price_bucket, axis=1)

    # Overwrite the original file, without the pandas index.
    dfp.to_parquet(fp, index=False)
    print(f"Updated {fp.name}: {len(dfp):,} rows — added search_text + price_bucket")


In [None]:
# Inspect after adding search_text + price_bucket
import pandas as pd

# Reload both enriched files from disk.
df_laptops = pd.read_parquet(PROC_DIR.joinpath("products_laptops_2023.parquet"))
df_headphones = pd.read_parquet(PROC_DIR.joinpath("products_headphones_2023.parquet"))

print("Laptops:", df_laptops.shape)
print("Headphones:", df_headphones.shape)

# Show the first three rows of each frame, laptops first.
display(df_laptops.head(3), df_headphones.head(3))

# Stack both categories into one frame for quick analysis.
df_all = pd.concat((df_laptops, df_headphones), ignore_index=True)
print("Combined total:", df_all.shape)

# Row counts for every (category, price bucket) pair.
print("\nPrice buckets by category:")
bucket_counts = df_all.groupby(["filtered_category","price_bucket"]).size()
print(bucket_counts)
