In [17]:
# If running fresh (uncomment to install):
# %pip install -q pandas sentence-transformers faiss-cpu langchain langchain-community

import numpy as np
import pandas as pd
from datetime import datetime, timedelta

from langchain_community.embeddings import HuggingFaceEmbeddings
from langchain_community.vectorstores import FAISS


In [18]:
# Path to your original catalog CSV
CSV_PATH = "./dataset/Nykaa_Product_Review_Cleaned.csv"  # update if needed

df = pd.read_csv(CSV_PATH)

def safe(x):
    return "" if pd.isna(x) else str(x)

texts = (
    df["Product Name"].map(safe) + " | " +
    df["Product Category"].map(safe) + " | " +
    df["Product Brand"].map(safe) + " | " +
    df["Product Tags"].map(safe) + " | " +
    df["Product Contents"].map(safe) + " | " +
    df["Product Description"].map(safe)
).tolist()

metadatas = [{
    "product_id": safe(r["Product Id"]),
    "brand_code": safe(r["Product Brand Code"]),
    "retailer": safe(r["Retailer"]),
    "category": safe(r["Product Category"]),
    "brand": safe(r["Product Brand"]),
    "name": safe(r["Product Name"]),
    "price": float(r["Product Price"]) if pd.notna(r["Product Price"]) else 0.0,
    "url": safe(r["Product Url"]),
    "market": safe(r["Market"]),
    "currency": safe(r["Product Currency"]),
    "image_url": safe(r["Product Image Url"]),
    "tags": safe(r["Product Tags"]),
    "contents": safe(r["Product Contents"]),
    "rating": float(r["Product Rating"]) if pd.notna(r["Product Rating"]) else 0.0,
    "reviews_count": int(r["Product Reviews Count"]) if pd.notna(r["Product Reviews Count"]) else 0,
    "exp_cat_count": int(r["Expected Category Count"]) if pd.notna(r["Expected Category Count"]) else 0,
    "exp_brand_count": int(r["Expected Brand Count"]) if pd.notna(r["Expected Brand Count"]) else 0,
} for _, r in df.iterrows()]


In [19]:
MODEL_NAME = "sentence-transformers/all-MiniLM-L6-v2"
INDEX_DIR = "faiss_index"

emb = HuggingFaceEmbeddings(model_name=MODEL_NAME)

# Build once (uncomment to build)
# vs = FAISS.from_texts(texts=texts, embedding=emb, metadatas=metadatas)
# vs.save_local(INDEX_DIR)

# Load for repeated runs
vs = FAISS.load_local(INDEX_DIR, emb, allow_dangerous_deserialization=True)


In [20]:
# # Pick 5–8 real product ids from the catalog to reference (replace with actual ids from your CSV)
# sample_product_ids = df["Product Id"].dropna().astype(str).sample(6, random_state=1).tolist()

# # Build a single user's synthetic interactions (ensure product_id values exist in df["Product Id"])
# user_id = "u_0001"
# now_iso = datetime.utcnow().isoformat() + "Z"

# interactions = [
#     {"user_id": user_id, "product_id": sample_product_ids[0], "event_type":"view",         "event_weight":0.2, "timestamp":now_iso, "session_id":"s_01"},
#     {"user_id": user_id, "product_id": sample_product_ids[1], "event_type":"view",         "event_weight":0.2, "timestamp":now_iso, "session_id":"s_01"},
#     {"user_id": user_id, "product_id": sample_product_ids[1], "event_type":"click",        "event_weight":1.0, "timestamp":now_iso, "session_id":"s_01"},
#     {"user_id": user_id, "product_id": sample_product_ids[2], "event_type":"view",         "event_weight":0.2, "timestamp":now_iso, "session_id":"s_02"},
#     {"user_id": user_id, "product_id": sample_product_ids[3], "event_type":"click",        "event_weight":1.0, "timestamp":now_iso, "session_id":"s_02"},
#     {"user_id": user_id, "product_id": sample_product_ids[3], "event_type":"add_to_cart",  "event_weight":2.0, "timestamp":now_iso, "session_id":"s_02"},
#     {"user_id": user_id, "product_id": sample_product_ids[4], "event_type":"view",         "event_weight":0.2, "timestamp":now_iso, "session_id":"s_03"},
#     {"user_id": user_id, "product_id": sample_product_ids[4], "event_type":"purchase",     "event_weight":4.0, "timestamp":now_iso, "session_id":"s_03"}
# ]

# interactions_df = pd.DataFrame(interactions)
# interactions_df.head()


In [21]:
# Popularity prior for fallback ordering
def popularity_prior_from_df(df):
    reviews = pd.to_numeric(df["Product Reviews Count"], errors="coerce").fillna(0.0)
    ratings = pd.to_numeric(df["Product Rating"], errors="coerce").fillna(0.0)
    r_norm = ratings / 5.0
    rv_norm = np.tanh(reviews / 1000.0)
    prior = 0.7 * r_norm + 0.3 * rv_norm
    p = prior.clip(lower=1e-12)
    return prior, (p / p.sum()).values

pop_prior, pop_probs = popularity_prior_from_df(df)

# Sample single user's preferences from catalog distribution
rng = np.random.default_rng(7)
user_id = "u_0001"

cats = df["Product Category"].dropna().astype(str)
brands = df["Product Brand"].dropna().astype(str)
fav_cats = list(cats.value_counts(normalize=True).sample(rng.integers(1,3), random_state=7).index)
fav_brands = list(brands.value_counts(normalize=True).sample(rng.integers(1,3), random_state=7).index)

prices = pd.to_numeric(df["Product Price"], errors="coerce").fillna(0.0).clip(lower=0)
q25, q50, q75 = np.percentile(prices, [25, 50, 75])
band_choice = rng.choice(["low","mid","high","very_high"], p=[0.3,0.4,0.2,0.1])
if band_choice == "low":
    price_band = (0.0, q25)
elif band_choice == "mid":
    price_band = (q25, q75)
elif band_choice == "high":
    price_band = (q75, prices.max())
else:
    price_band = (prices.quantile(0.9), prices.max())

# Controls for sparsity and strength
EVENT_W = {"view":0.2, "click":1.0, "add_to_cart":2.0, "purchase":4.0}
base_view_prob = 0.02
match_boost = 0.15
heavy_event_prob = 0.10
max_events = int(0.12 * len(df))
days = 30

# Generate interactions over catalog
rows, now = [], datetime.utcnow()
for _, r in df.iterrows():
    pid = str(r["Product Id"])
    cat = str(r.get("Product Category", ""))
    brand = str(r.get("Product Brand", ""))
    price = float(pd.to_numeric(r.get("Product Price"), errors="coerce") or 0.0)

    is_cat = cat in fav_cats
    is_brand = brand in fav_brands
    in_band = (price_band[0] <= price <= price_band[1]) if price_band[1] >= price_band[0] else False

    p_view = base_view_prob
    if is_cat:   p_view += match_boost * 0.5
    if is_brand: p_view += match_boost * 0.3
    if in_band:  p_view += match_boost * 0.2

    if rng.random() < p_view:
        ts = now - timedelta(days=float(rng.random()*days))
        sess = f"s_{rng.integers(1000,9999)}"
        rows.append({"user_id":user_id,"product_id":pid,"event_type":"view","event_weight":EVENT_W["view"],"timestamp":ts.isoformat()+"Z","session_id":sess})
        if (is_cat or is_brand) and rng.random() < heavy_event_prob:
            et = rng.choice(["click","add_to_cart","purchase"], p=[0.6,0.25,0.15])
            rows.append({"user_id":user_id,"product_id":pid,"event_type":et,"event_weight":EVENT_W[et],
                         "timestamp":(ts+timedelta(seconds=rng.integers(30,600))).isoformat()+"Z","session_id":sess})
    if len(rows) >= max_events:
        break

interactions_df = pd.DataFrame(rows)
interactions_df.head(), len(interactions_df)


  rows, now = [], datetime.utcnow()


(  user_id                        product_id event_type  event_weight  \
 0  u_0001  f7f76573099db0058ef5264c35d9d02e       view           0.2   
 1  u_0001  c545b5953dd5220a0a448dca87fa543f       view           0.2   
 2  u_0001  562360a77dac7186e553a5d151950e6b       view           0.2   
 3  u_0001  a33f156a70a0f02abd87453222cabc44       view           0.2   
 4  u_0001  6526f3983c14d16e91e68d826dfe4704       view           0.2   
 
                      timestamp session_id  
 0  2025-09-17T08:09:55.188065Z     s_2182  
 1  2025-09-26T12:43:47.426370Z     s_8172  
 2  2025-10-06T04:55:12.891303Z     s_9720  
 3  2025-09-17T01:48:55.532867Z     s_7227  
 4  2025-09-19T09:18:21.559188Z     s_7687  ,
 14)

In [22]:
def build_user_profile(interactions_df, products_df, user_id):
    u = interactions_df[interactions_df["user_id"]==user_id].copy()
    if u.empty:
        return {}, {}, (None, None)
    joined = u.merge(products_df[["Product Id","Product Brand","Product Category","Product Price"]],
                     left_on="product_id", right_on="Product Id", how="left")
    brand_aff = joined.groupby("Product Brand")["event_weight"].sum().to_dict()
    cat_aff = joined.groupby("Product Category")["event_weight"].sum().to_dict()
    sb = sum(brand_aff.values()); sc = sum(cat_aff.values())
    if sb: brand_aff = {k:v/sb for k,v in brand_aff.items()}
    if sc: cat_aff = {k:v/sc for k,v in cat_aff.items()}
    prices_hist = pd.to_numeric(joined["Product Price"], errors="coerce").dropna()
    if len(prices_hist):
        med = prices_hist.median()
        price_band_user = (float(med*0.75), float(med*1.25))
    else:
        price_band_user = (None, None)
    return brand_aff, cat_aff, price_band_user

brand_aff, cat_aff, user_price_band = build_user_profile(interactions_df, df, user_id)
brand_aff, cat_aff, user_price_band


({'Clinique': 0.058823529411764705,
  'Estee Lauder': 0.058823529411764705,
  'Garnier': 0.058823529411764705,
  'Guerlain': 0.058823529411764705,
  'Himalaya': 0.058823529411764705,
  "L'Oreal Paris": 0.058823529411764705,
  'Nykaa Cosmetics': 0.11764705882352941,
  'O3+': 0.058823529411764705,
  'Paco Rabanne': 0.058823529411764705,
  'Paese Cosmetics': 0.058823529411764705,
  'Pahadi Local': 0.11764705882352941,
  'Plix': 0.11764705882352941,
  'Vaadi Herbals': 0.058823529411764705,
  'Versace': 0.058823529411764705},
 {'Brand > Himalaya': 0.058823529411764705,
  'Health & Wellness > Weight Management > Weight Gain': 0.058823529411764705,
  'Makeup > Face > Bronzer': 0.058823529411764705,
  'Makeup > Lips > Lip Stain': 0.11764705882352941,
  "Men's Store > Wellness > Sports Nutrition": 0.058823529411764705,
  'NFBA 2020 Nominees Online Sale': 0.058823529411764705,
  'Natural > Shop By Concern > Anti Aging': 0.058823529411764705,
  'Nykaa Luxe > Fragrance > Perfumes (EDP/EDT)': 0.176

In [23]:
def search_candidates(query: str, k: int = 50, category: str | None = None):
    docs_scores = vs.similarity_search_with_score(query, k=k)
    if category:
        docs_scores = [(d, s) for d, s in docs_scores if d.metadata.get("category","").lower()==category.lower()]
    return docs_scores


In [24]:
def popularity_prior(m):
    rating = float(m.get("rating", 0.0))
    reviews = float(m.get("reviews_count", 0.0))
    return 0.7*(rating/5.0) + 0.3*(np.tanh(reviews/1000.0))

def price_match(m_price, band):
    lo, hi = band
    if lo is None or hi is None or m_price is None:
        return 0.0
    return 1.0 if (lo <= float(m_price) <= hi) else 0.0


In [25]:
def rerank_with_behavior(query_docs_scores, brand_aff, cat_aff, price_band, top_k=5):
    out = []
    for doc, base_sim in query_docs_scores:
        m = doc.metadata
        b = brand_aff.get(m.get("brand",""), 0.0)
        c = cat_aff.get(m.get("category",""), 0.0)
        pm = price_match(m.get("price", 0.0), price_band)
        pop = popularity_prior(m)
        w_pop = 0.20 if (b+c+pm) > 0 else 0.35
        score = 0.55*base_sim + 0.15*b + 0.15*c + w_pop*pop + 0.05*pm
        out.append((doc, score))
    out.sort(key=lambda x: x[1], reverse=True)
    return out[:top_k]


In [26]:
def top_key(d):
    return max(d.items(), key=lambda x: x[1])[0] if d else "N/A"

def build_explanation(query, m, brand_aff, cat_aff, price_band):
    tb = top_key(brand_aff)
    tc = top_key(cat_aff)
    lo, hi = price_band
    price_txt = f"₹{int(lo)}–₹{int(hi)}" if (lo is not None and hi is not None) else "usual range"
    return (
        f"Recommended for '{query}' due to interest in {tc} and {tb}, "
        f"fits the price range ({price_txt}), and is well-rated/popular."
    )


In [33]:
def group_candidates(candidates:list, llm_cat_1:str|None, llm_cat_2:str|None, llm_cat_3:str|None):
    grouped = {"exact_match": [], "relevant": [], "others": []}
    
    for doc, score in candidates:
        cat = doc.metadata.get("category","Unknown")
        
        if llm_cat_1 and cat.lower() == llm_cat_1.lower():
            grouped["exact_match"].append((doc, score))
        elif llm_cat_2 and cat.lower() == llm_cat_2.lower():
            grouped["relevant"].append((doc, score))
        elif llm_cat_3 and cat.lower() == llm_cat_3.lower():
            grouped["relevant"].append((doc, score))
        else:
            grouped["others"].append((doc, score))
    
    return grouped

In [None]:
query = "lipstick"          # try variations
category_filter = None      # e.g., "Makeup" if you want a category constraint

cands = search_candidates(query, k=50, category=category_filter)
final = rerank_with_behavior(cands, brand_aff, cat_aff, price_band, top_k=5)

for rank, (doc, score) in enumerate(final, start=1):
    m = doc.metadata
    reason = build_explanation(query, m, brand_aff, cat_aff, price_band)
    print(f"{rank}. {m['name']} | {m['brand']} | {m['category']} | ₹{m['price']} | rating {m['rating']} | score {score:.3f}")
    print(f"   Why: {reason}\n")


1. Faces Canada Ultime Pro Matte Lip Crayon With Free Sharpener - Peach Me 08 | Faces Canada | Makeup > Lips > Lipstick | ₹719.0 | rating 4.5 | score 0.795
   Why: Recommended for 'lipstick' due to interest in Nykaa Luxe > Fragrance > Perfumes (EDP/EDT) and Nykaa Cosmetics, fits the price range (₹1307–₹9500), and is well-rated/popular.

2. SUGAR Smudge Me Not Liquid Lipstick - 20 Cocoa Ammo | SUGAR | Makeup > Lips > Liquid Lipstick | ₹499.0 | rating 4.2 | score 0.781
   Why: Recommended for 'lipstick' due to interest in Nykaa Luxe > Fragrance > Perfumes (EDP/EDT) and Nykaa Cosmetics, fits the price range (₹1307–₹9500), and is well-rated/popular.

3. Maybelline New York Baby Lips Color Candy Rush Lip Balm - Cotton Candy | Maybelline New York | Makeup > Lips > Lip Balm | ₹190.0 | rating 4.4 | score 0.776
   Why: Recommended for 'lipstick' due to interest in Nykaa Luxe > Fragrance > Perfumes (EDP/EDT) and Nykaa Cosmetics, fits the price range (₹1307–₹9500), and is well-rated/popular.

4. 

In [34]:
query = "lipstick"          # try variations
category_filter = None      # e.g., "Makeup" if you want a category constraint
llm_cat_1 = "Makeup > Lips > Lipstick"
llm_cat_2 = "Makeup > Lips > Liquid Lipstick"
llm_cat_3 = "Makeup > Lips > Lip Gloss"

cands = search_candidates(query, k=50, category=category_filter)
grp_cands = group_candidates(cands, llm_cat_1, llm_cat_2, llm_cat_3)
grp1 = grp_cands['exact_match']
grp2 = grp_cands['relevant']
grp3 = grp_cands['others']

final = []
final += rerank_with_behavior(grp1, brand_aff, cat_aff, price_band, top_k=5)
final += rerank_with_behavior(grp2, brand_aff, cat_aff, price_band, top_k=5)
final += rerank_with_behavior(grp3, brand_aff, cat_aff, price_band, top_k=5)
# rerank_with_behavior(cands, brand_aff, cat_aff, price_band, top_k=5)

for rank, (doc, score) in enumerate(final, start=1):
    m = doc.metadata
    reason = build_explanation(query, m, brand_aff, cat_aff, price_band)
    print(f"{rank}. {m['name']} | {m['brand']} | {m['category']} | ₹{m['price']} | rating {m['rating']} | score {score:.3f}")
    print(f"   Why: {reason}\n")


1. Faces Canada Ultime Pro Matte Lip Crayon With Free Sharpener - Peach Me 08 | Faces Canada | Makeup > Lips > Lipstick | ₹719.0 | rating 4.5 | score 0.795
   Why: Recommended for 'lipstick' due to interest in Nykaa Luxe > Fragrance > Perfumes (EDP/EDT) and Nykaa Cosmetics, fits the price range (₹1307–₹9500), and is well-rated/popular.

2. Maybelline New York Color Sensational The Loaded Bolds Lipstick - 08 Sunny Coral | Maybelline New York | Makeup > Lips > Lipstick | ₹238.0 | rating 4.4 | score 0.766
   Why: Recommended for 'lipstick' due to interest in Nykaa Luxe > Fragrance > Perfumes (EDP/EDT) and Nykaa Cosmetics, fits the price range (₹1307–₹9500), and is well-rated/popular.

3. Maybelline New York Color Sensational Creamy Matte Lipstick The Bricks-City Heat Collection - 4 Soho Nudes | Maybelline New York | Makeup > Lips > Lipstick | ₹299.0 | rating 4.3 | score 0.763
   Why: Recommended for 'lipstick' due to interest in Nykaa Luxe > Fragrance > Perfumes (EDP/EDT) and Nykaa Cosmet

In [35]:
final

[(Document(id='b517d20b-b7ba-4a46-b14c-aebfa5a2057f', metadata={'product_id': '92db2c86993a8188314620532b6e00ce', 'brand_code': 'BZ1000', 'retailer': 'nykaa.com', 'category': 'Makeup > Lips > Lipstick', 'brand': 'Faces Canada', 'name': 'Faces Canada Ultime Pro Matte Lip Crayon With Free Sharpener - Peach Me 08', 'price': 719.0, 'url': 'https://www.nykaa.com/c/p/28720?skuId=28722', 'market': 'IN', 'currency': 'INR', 'image_url': 'https://images-static.nykaa.com/media/catalog/product/tr:h-800,w-800,cm-pad_resize/8/9/8903380158649_1_1.jpg|https://images-static.nykaa.com/media/catalog/product/tr:h-800,w-800,cm-pad_resize/8/9/8903380158649_2.jpg|https://images-static.nykaa.com/media/catalog/product/tr:h-800,w-800,cm-pad_resize/n/y/nyfaces_canada_acert_4_139.jpg', 'tags': 'Faces Canada Ultime Pro Matte Lip Crayon - Peach Me 08, Makeup, Lips, Lpistick', 'contents': 'Red 7 lake, Mica, Titanium Dioxide, Yellow 5 Lake, Iron Oxides, Fragrance, Isododecane, Diisostearyl Malate, Synthetic Wax, Demi

In [29]:
import json

def to_json_results(final_results, query, brand_aff, cat_aff, price_band):
    items = []
    for rank, (doc, score) in enumerate(final_results, start=1):
        m = doc.metadata
        items.append({
            "rank": rank,
            "name": m.get("name"),
            "brand": m.get("brand"),
            "category": m.get("category"),
            "price": m.get("price"),
            "rating": m.get("rating"),
            "score": round(float(score), 3),
            "url": m.get("url"),
            "image_url": m.get("image_url"),
            "why": build_explanation(query, m, brand_aff, cat_aff, price_band)
        })
    return items

query = "perfume"
category_filter = None

cands = search_candidates(query, k=50, category=category_filter)
final = rerank_with_behavior(cands, brand_aff, cat_aff, user_price_band, top_k=5)

json_payload = to_json_results(final, query, brand_aff, cat_aff, user_price_band)

# In a notebook: print as JSON string
# print(json.dumps(json_payload, ensure_ascii=False, indent=2))


with open("temp_recommendations.json", "w", encoding="utf-8") as f:
    json.dump(json_payload, f, ensure_ascii=False, indent=4)


In [30]:
interactions_df.to_csv("./data/interactions_u0001.csv", index=False)

profile_row = {
    "user_id": user_id,
    "brand_affinity": brand_aff,
    "category_affinity": cat_aff,
    "price_band_low": user_price_band[0],
    "price_band_high": user_price_band[1],
    "updated_at": datetime.utcnow().isoformat()+"Z"
}
with open("./data/user_profile_u0001.json", "w") as f:
    json.dump(profile_row, f, indent=2)


  "updated_at": datetime.utcnow().isoformat()+"Z"
