In [None]:
# ================================
# PHASE 3 — REVIEW & PRODUCT EMBEDDINGS (KAGGLE 2025 — FULLY FIXED)
# Fixes NumPy 2.2.6 binary incompatibility + creates reusable dataset
# Runtime: ~15–20 min on 300k reviews (GPU)
# ================================

# %% [markdown]
# ## 1) CRITICAL FIX: Downgrade NumPy to 1.26.4 (solves dtype size error)
# Kaggle's NumPy 2.2.6 breaks sklearn/transformers — this forces 1.x compatibility
# %%
!pip uninstall -y numpy scipy scikit-learn transformers sentence-transformers torch torchvision torchaudio
!pip install "numpy==1.26.4" "scipy==1.13.1" "scikit-learn==1.3.2" --no-cache-dir --force-reinstall

# %% [markdown]
# ## 2) Install the rest (now compatible)
# %%
!pip install -q sentence-transformers==2.3.1 faiss-cpu==1.8.0 pandas==2.1.4 tqdm

# %% [markdown]
# ## 3) FORCE KERNEL RESTART (required after downgrade)
# Run this cell — Kaggle will restart automatically
# %%
import os
os.kill(os.getpid(), 9)  # Clean restart to load new NumPy

Found existing installation: numpy 1.26.4
Uninstalling numpy-1.26.4:
  Successfully uninstalled numpy-1.26.4
Found existing installation: scipy 1.15.3
Uninstalling scipy-1.15.3:
  Successfully uninstalled scipy-1.15.3
Found existing installation: scikit-learn 1.2.2
Uninstalling scikit-learn-1.2.2:
  Successfully uninstalled scikit-learn-1.2.2
Found existing installation: transformers 4.53.3
Uninstalling transformers-4.53.3:
  Successfully uninstalled transformers-4.53.3
Found existing installation: sentence-transformers 4.1.0
Uninstalling sentence-transformers-4.1.0:
  Successfully uninstalled sentence-transformers-4.1.0
Found existing installation: torch 2.6.0+cu124
Uninstalling torch-2.6.0+cu124:
  Successfully uninstalled torch-2.6.0+cu124
Found existing installation: torchvision 0.21.0+cu124
Uninstalling torchvision-0.21.0+cu124:
  Successfully uninstalled torchvision-0.21.0+cu124
Found existing installation: torchaudio 2.6.0+cu124
Uninstalling torchaudio-2.6.0+cu124:
  Successfull

In [5]:
# PHASE 3 — REVIEW & PRODUCT EMBEDDINGS (KAGGLE VERSION)
# Generates: review embeddings + product embeddings + FAISS indices
# Output: Ready-to-use dataset for recommendations, similarity, clustering
# Runtime: ~12–18 min on 300k reviews (GPU)
# ================================

# %% [markdown]
# ## Install dependencies
# %%
!pip install -q sentence-transformers faiss-cpu # faiss-gpu is faster on Kaggle


[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m23.7/23.7 MB[0m [31m64.4 MB/s[0m eta [36m0:00:00[0m:00:01[0m00:01[0m
[?25h

In [2]:
pip install faiss-cpu

Collecting faiss-cpu
  Downloading faiss_cpu-1.13.1-cp310-abi3-manylinux_2_27_x86_64.manylinux_2_28_x86_64.whl.metadata (7.6 kB)
Downloading faiss_cpu-1.13.1-cp310-abi3-manylinux_2_27_x86_64.manylinux_2_28_x86_64.whl (23.7 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m23.7/23.7 MB[0m [31m59.6 MB/s[0m eta [36m0:00:00[0m:00:01[0m00:01[0m
[?25hInstalling collected packages: faiss-cpu
Successfully installed faiss-cpu-1.13.1
Note: you may need to restart the kernel to use updated packages.


In [3]:
import os
from pathlib import Path
import numpy as np
import pandas as pd
from tqdm import tqdm
from sentence_transformers import SentenceTransformer
import faiss

# Auto-find your preprocessed reviews (from Phase 2)
INPUT_DIR = Path("/kaggle/input")
WORKING_DIR = Path("/kaggle/working")
OUTPUT_DIR = WORKING_DIR / "embeddings_output"
OUTPUT_DIR.mkdir(exist_ok=True)

# Find the folder containing reviews_features_full.parquet
parquet_files = list(INPUT_DIR.rglob("reviews_features_full.parquet"))
if not parquet_files:
    raise FileNotFoundError("reviews_features_full.parquet not found! Add your Phase 2 dataset.")
DATA_DIR = parquet_files[0].parent
print(f"Found data at: {DATA_DIR}")

Found data at: /kaggle/input/your-reviews-features-full


In [4]:
# ## 1) Load reviews
# %%
df = pd.read_parquet(DATA_DIR / "reviews_features_full.parquet")
print(f"Loaded {len(df):,} reviews")

texts = df['text_for_training'].fillna('').astype(str).tolist()
article_ids = df['article_id'].astype(str).tolist()
review_ids = df['review_id'].astype(str).tolist()  # keep as string to avoid overflow

print("Sample text:", texts[0][:200])


Loaded 300,000 reviews
Sample text: average product feels okay might be fine for everyday use


In [5]:
# %% [markdown]
# ## 2) Load Sentence-BERT Model (best speed/quality)
# %%
model = SentenceTransformer('all-MiniLM-L6-v2')  # uses GPU automatically
print("Model loaded: all-MiniLM-L6-v2 (384-dim)")


modules.json:   0%|          | 0.00/349 [00:00<?, ?B/s]

config_sentence_transformers.json:   0%|          | 0.00/116 [00:00<?, ?B/s]

README.md: 0.00B [00:00, ?B/s]

sentence_bert_config.json:   0%|          | 0.00/53.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/612 [00:00<?, ?B/s]



model.safetensors:   0%|          | 0.00/90.9M [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/350 [00:00<?, ?B/s]

vocab.txt: 0.00B [00:00, ?B/s]

tokenizer.json: 0.00B [00:00, ?B/s]

special_tokens_map.json:   0%|          | 0.00/112 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/190 [00:00<?, ?B/s]

Model loaded: all-MiniLM-L6-v2 (384-dim)


In [6]:
# ================================


# %% [markdown]
# ## 3) Compute Review Embeddings (batched + fast)
# %%
batch_size = 512
embeddings_list = []

print("Encoding reviews in batches...")
for i in tqdm(range(0, len(texts), batch_size), desc="Encoding"):
    batch = texts[i:i + batch_size]
    batch_emb = model.encode(
        batch,
        batch_size=batch_size,
        show_progress_bar=False,
        convert_to_numpy=True,
        normalize_embeddings=True  # crucial for cosine similarity
    )
    embeddings_list.append(batch_emb)

review_embeddings = np.vstack(embeddings_list)
print(f"Review embeddings shape: {review_embeddings.shape}")

# Save
review_emb_path = OUTPUT_DIR / "review_embeddings.npy"
np.save(review_emb_path, review_embeddings)
print(f"Saved to {review_emb_path}")




Encoding reviews in batches...


Encoding: 100%|██████████| 586/586 [22:28<00:00,  2.30s/it]


Review embeddings shape: (300000, 384)
Saved to /kaggle/working/embeddings_output/review_embeddings.npy


In [7]:
# %% [markdown]
# ## 4) Build FAISS Index for Reviews (instant similarity search)
# %%
d = review_embeddings.shape[1]
index_reviews = faiss.IndexFlatIP(d)  # Inner Product = Cosine (because normalized)
index_reviews.add(review_embeddings.astype('float32'))

faiss.write_index(index_reviews, str(OUTPUT_DIR / "faiss_review_index.index"))
print("FAISS review index built and saved")

FAISS review index built and saved


In [9]:
# %% [markdown]
# ## 5) Product-Level Embeddings (average pooling) — FIXED & MEMORY SAFE
# %%
import gc

print("Computing product-level embeddings (average of review embeddings)...")

# Free up RAM first (review_embeddings can be huge)
if 'review_embeddings' in globals():
    del review_embeddings
    gc.collect()

# We will use the saved .npy file in memory-mapped mode → no OOM
review_emb_path = OUTPUT_DIR / "review_embeddings.npy"
review_embeddings_mmap = np.load(review_emb_path, mmap_mode='r')  # ← loads without eating RAM

# Create a mapping: article_id → list of row indices in the embeddings file
df_with_idx = pd.DataFrame({
    'article_id': article_ids,
    'emb_idx': np.arange(len(article_ids))   # row index = position in embeddings file
})

# Group indices per product
grouped_indices = df_with_idx.groupby('article_id')['emb_idx'].apply(list)

print(f"Found {len(grouped_indices):,} unique products → averaging reviews...")

product_embeddings_list = []
product_ids_list = []

for article_id, idx_list in tqdm(grouped_indices.items(), desc="Averaging per product"):
    # Load only the embeddings for this product (still memory-mapped → very cheap)
    product_embs = review_embeddings_mmap[idx_list]           # shape: (n_reviews, 384)
    avg_emb = np.mean(product_embs, axis=0).astype('float32') # average → 384-dim
    product_embeddings_list.append(avg_emb)
    product_ids_list.append(article_id)

# Convert to final matrix
product_matrix = np.stack(product_embeddings_list)  # shape: (n_products, 384)
product_ids = product_ids_list

print(f"Product embeddings ready → shape: {product_matrix.shape}")

# Save
np.save(OUTPUT_DIR / "product_embeddings.npy", product_matrix)
pd.DataFrame({'article_id': product_ids}).to_parquet(OUTPUT_DIR / "product_ids.parquet", index=False)
print("Product embeddings + IDs saved!")



Computing product-level embeddings (average of review embeddings)...
Found 6,292 unique products → averaging reviews...


Averaging per product: 6292it [00:00, 14634.86it/s]

Product embeddings ready → shape: (6292, 384)
Product embeddings + IDs saved!





In [10]:
# %% [markdown]
# ## 6) FAISS Index for Products
# %%
index_products = faiss.IndexFlatIP(d)
index_products.add(product_matrix)

faiss.write_index(index_products, str(OUTPUT_DIR / "faiss_product_index.index"))
print("FAISS product index saved")


FAISS product index saved


In [11]:
# %% [markdown]
# ## 7) Demo: Similar Products
# %%
def get_similar_products(article_id, k=6):
    if article_id not in product_ids:
        return f"Product {article_id} not found"
    idx = product_ids.index(article_id)
    query = product_matrix[idx:idx+1]
    scores, indices = index_products.search(query, k+1)  # +1 to skip itself
    similar_ids = [product_ids[i] for i in indices[0][1:]]  # skip first (itself)
    return similar_ids


In [12]:
# Test
sample_id = product_ids[0]
print(f"Sample product: {sample_id}")
print("Similar products:", get_similar_products(sample_id, k=5))

# %% [markdown]
# ## 8) Save Everything as a Reusable Kaggle Dataset
# %%
print("Creating final dataset...")

!mkdir -p /kaggle/working/final_embeddings_dataset
!cp -r {OUTPUT_DIR}/* /kaggle/working/final_embeddings_dataset/

Sample product: 176209023
Similar products: ['564474001', '557248016', '554126001', '493814030', '488555002']
Creating final dataset...


huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)


In [13]:
# Create the dataset (run once!)
!kaggle datasets create -p /kaggle/working/final_embeddings_dataset --dir-mode tar -m "Review + Product embeddings + FAISS indices (all-MiniLM-L6-v2)"

print("\nDONE! Your dataset is created.")
print("Files included:")
!ls -lh /kaggle/working/final_embeddings_dataset/

# %% [markdown]
# ## What You Get (add this dataset to any future notebook)
# ├── review_embeddings.npy
# ├── product_embeddings.npy
# ├── product_ids.parquet
# ├── faiss_review_index.index
# └── faiss_product_index.index
# 
# → Instant "You might also like", user personalization, cold-start solution, clustering, etc.
# 

huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)


Traceback (most recent call last):
  File "/usr/local/bin/kaggle", line 4, in <module>
    from kaggle.cli import main
  File "/usr/local/lib/python3.11/dist-packages/kaggle/__init__.py", line 6, in <module>
    api.authenticate()
  File "/usr/local/lib/python3.11/dist-packages/kaggle/api/kaggle_api_extended.py", line 434, in authenticate
    raise IOError('Could not find {}. Make sure it\'s located in'
OSError: Could not find kaggle.json. Make sure it's located in /root/.config/kaggle. Or use the environment method. See setup instructions at https://github.com/Kaggle/kaggle-api/

DONE! Your dataset is created.
Files included:
total 898M
-rw-r--r-- 1 root root 9.3M Dec  8 17:56 faiss_product_index.index
-rw-r--r-- 1 root root 440M Dec  8 17:56 faiss_review_index.index
-rw-r--r-- 1 root root 9.3M Dec  8 17:56 product_embeddings.npy
-rw-r--r-- 1 root root  42K Dec  8 17:56 product_ids.parquet
-rw-r--r-- 1 root root 440M Dec  8 17:56 review_embeddings.npy


huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)


In [4]:
# %% [markdown]
# CREATE review_ids.parquet — Maps embedding row index → original review_id
# Run this in the same notebook where you generated review_embeddings.npy
# %%

import pandas as pd
import numpy as np
from pathlib import Path

# ———————— CONFIG: UPDATE THESE PATHS FOR YOUR NOTEBOOK ————————
# Change these only if your files are in different locations

# 1) Path to your preprocessed reviews (the one used for embedding generation)
PREPROCESSED_PATH = Path("/kaggle/input/your-reviews-features-full/reviews_features_full.parquet")
# Common alternatives:
# PREPROCESSED_PATH = Path("/kaggle/input/hm-reviews-preprocessed/reviews_preprocessed.parquet")
# PREPROCESSED_PATH = Path("/kaggle/working/data/ml/reviews_preprocessed.parquet")

# 2) Where to save the mapping (same folder as your embeddings)
OUTPUT_DIR = Path("/kaggle/working/embeddings")        # ← most common
# Or if you used a different folder:
# OUTPUT_DIR = Path("/kaggle/working/embeddings_output")

OUTPUT_DIR.mkdir(parents=True, exist_ok=True)

# ———————————————————————————————————————————————————————————————

# Load the dataframe that was used to generate embeddings
print(f"Loading reviews from: {PREPROCESSED_PATH}")
df = pd.read_parquet(PREPROCESSED_PATH)

print(f"Loaded {len(df):,} reviews")

# Ensure review_id exists and is unique
if "review_id" not in df.columns:
    raise ValueError("review_id column not found! Check your parquet file.")
if df["review_id"].duplicated().any():
    print("Warning: review_id has duplicates — taking first occurrence")

# Create mapping: embedding row index (0, 1, 2, ...) → review_id
mapping_df = pd.DataFrame({
    "idx": np.arange(len(df)),           # This matches the row order in review_embeddings.npy
    "review_id": df["review_id"].astype(str).values
})

# Optional: also save article_id if useful later
mapping_df["article_id"] = df["article_id"].astype(str).values

# Save
output_path = OUTPUT_DIR / "review_ids.parquet"
mapping_df.to_parquet(output_path, index=False)

print(f"Saved mapping → {output_path}")
print(f"Shape: {mapping_df.shape}")
print("\nFirst 5 rows:")
print(mapping_df.head())

# Bonus: Add to your final dataset so it's always together
!cp {output_path} /kaggle/working/final_embeddings_dataset/ 2>/dev/null || echo "Skip copy if folder doesn't exist"

Loading reviews from: /kaggle/input/your-reviews-features-full/reviews_features_full.parquet
Loaded 300,000 reviews
Saved mapping → /kaggle/working/embeddings/review_ids.parquet
Shape: (300000, 3)

First 5 rows:
   idx review_id article_id
0    0   2116198  399061015
1    1   2116199  789274001
2    2   2116200  399223001
3    3   2116201  393447016
4    4   2116202  399223034
Skip copy if folder doesn't exist


In [7]:
emb = np.load("/kaggle/input/model-emb/review_embeddings.npy")
mapdf = pd.read_parquet("/kaggle/working/embeddings/review_ids.parquet")

print(len(emb), len(mapdf))
assert len(emb) == len(mapdf), "Mismatch! Wrong dataframe used."


300000 300000
