In [1]:
import pandas as pd
import numpy as np
from pathlib import Path
import yaml

# Load the pipeline configuration.
# The encoding is pinned to UTF-8 so the file is decoded the same way on every
# platform instead of depending on the locale default that open() falls back to.
with open('config.yaml', 'r', encoding='utf-8') as f:
    config = yaml.safe_load(f)

# Resolve every artifact path relative to the configured output directory.
output_cfg = config['output']
data_path = Path(output_cfg['base_path'])
train_file = data_path / output_cfg['train_file']
val_file = data_path / output_cfg['val_file']
test_file = data_path / output_cfg['test_file']
metadata_file = data_path / output_cfg['metadata_file']
embeddings_file = data_path / output_cfg['embeddings_file']

# Report which expected artifacts exist on disk. This only prints a status
# line per file; a missing file does not raise here.
print("Checking file existence:")
for file in [train_file, val_file, test_file, metadata_file, embeddings_file]:
    print(f"✓ {file.name}" if file.exists() else f"✗ {file.name} (missing)")


Checking file existence:
✓ train_data.parquet
✓ val_data.parquet
✓ test_data.parquet
✓ metadata.parquet
✓ title_embeddings.parquet


In [2]:
# Load and verify data files
print("\nVerifying data files:")


def _print_split_summary(label: str, df: pd.DataFrame) -> None:
    """Print the record, unique-user and unique-item counts for one split.

    Args:
        label: Split name used in the heading line (e.g. "Train").
        df: Split DataFrame; must contain 'user_id' and 'parent_asin' columns.
    """
    print(f"\n{label} data:")
    print(f"- Records: {len(df)}")
    print(f"- Unique users: {df['user_id'].nunique()}")
    print(f"- Unique items: {df['parent_asin'].nunique()}")


# Load train data; additionally show its schema and a two-row sample.
train_df = pd.read_parquet(train_file)
_print_split_summary("Train", train_df)
print("\nColumns:", train_df.columns.tolist())
print("\nSample data:")
display(train_df.head(2))

# Load validation data
val_df = pd.read_parquet(val_file)
_print_split_summary("Validation", val_df)

# Load test data
test_df = pd.read_parquet(test_file)
_print_split_summary("Test", test_df)

# Report how many users are shared between each pair of splits.
# NOTE(review): this measures user overlap; it does not assert that the splits
# are disjoint. The splits look like per-user hold-outs (TODO confirm), in
# which case a non-zero count here is expected rather than a sign of leakage.
train_users = set(train_df['user_id'])
val_users = set(val_df['user_id'])
test_users = set(test_df['user_id'])

print("\nVerifying user overlap between splits:")
print(f"- Train-Val overlap: {len(train_users & val_users)} users")
print(f"- Train-Test overlap: {len(train_users & test_users)} users")
print(f"- Val-Test overlap: {len(val_users & test_users)} users")



Verifying data files:

Train data:
- Records: 2029
- Unique users: 253
- Unique items: 341

Columns: ['user_id', 'parent_asin', 'rating', 'timestamp', 'history']

Sample data:


Unnamed: 0,user_id,parent_asin,rating,timestamp,history
0,AFSKPY37N3C43SOI5IEXEK5JSIYA,B07J3GH1W1,5.0,1547589356557,
1,AFSKPY37N3C43SOI5IEXEK5JSIYA,B07W397QG4,5.0,1593352422858,B07J3GH1W1



Validation data:
- Records: 253
- Unique users: 253
- Unique items: 168

Test data:
- Records: 253
- Unique users: 253
- Unique items: 138

Verifying user overlap between splits:
- Train-Val overlap: 253 users
- Train-Test overlap: 253 users
- Val-Test overlap: 253 users


In [3]:
# Verify metadata and embeddings
print("\nVerifying metadata and embeddings:")

# Load metadata and show its size, schema and a two-row sample.
metadata_df = pd.read_parquet(metadata_file)
print("\nMetadata:")
print(f"- Records: {len(metadata_df)}")
print(f"- Unique parent ASINs: {metadata_df['parent_asin'].nunique()}")
print("\nColumns:", metadata_df.columns.tolist())
print("\nSample data:")
display(metadata_df.head(2))

# Load embeddings. The reported dimension is the length of the first row's
# vector only; other rows are not inspected here.
embeddings_df = pd.read_parquet(embeddings_file)
print("\nEmbeddings:")
print(f"- Records: {len(embeddings_df)}")
print(f"- Unique parent ASINs: {embeddings_df['parent_asin'].nunique()}")
print(f"- Embedding dimension: {len(embeddings_df['embedding'].iloc[0])}")
print("\nColumns:", embeddings_df.columns.tolist())

# Verify coverage: every item referenced by any split should have both a
# metadata row and an embedding row. Only the 'parent_asin' column is
# concatenated, so the remaining columns of the three splits are not copied.
all_parent_asins = set(
    pd.concat(
        [train_df['parent_asin'], val_df['parent_asin'], test_df['parent_asin']]
    ).unique()
)
metadata_asins = set(metadata_df['parent_asin'])
embeddings_asins = set(embeddings_df['parent_asin'])

print("\nVerifying coverage:")
print(f"- Total unique parent ASINs in data: {len(all_parent_asins)}")
print(f"- Parent ASINs with metadata: {len(metadata_asins)} ({len(metadata_asins & all_parent_asins)} in data)")
print(f"- Parent ASINs with embeddings: {len(embeddings_asins)} ({len(embeddings_asins & all_parent_asins)} in data)")

# Check for missing titles or embeddings
missing_metadata = all_parent_asins - metadata_asins
missing_embeddings = all_parent_asins - embeddings_asins

# Sets have no stable iteration order, so the preview is sorted to make the
# warning identical on every run. key=str keeps the sort from failing if a
# null value sits next to string ASINs.
if missing_metadata:
    print("\nWarning: Missing metadata for ASINs:", sorted(missing_metadata, key=str)[:5], "...")
if missing_embeddings:
    print("\nWarning: Missing embeddings for ASINs:", sorted(missing_embeddings, key=str)[:5], "...")



Verifying metadata and embeddings:

Metadata:
- Records: 356
- Unique parent ASINs: 356

Columns: ['title', 'parent_asin']

Sample data:


Unnamed: 0,title,parent_asin
0,Organic Sweet Almond Oil and Fractionated Coco...,B08LYT4Q2X
1,"Empty Brown Glass Spray Bottles2-Pack, Refilla...",B089CSF11Y



Embeddings:
- Records: 356
- Unique parent ASINs: 356
- Embedding dimension: 3072

Columns: ['parent_asin', 'title', 'embedding']

Verifying coverage:
- Total unique parent ASINs in data: 356
- Parent ASINs with metadata: 356 (356 in data)
- Parent ASINs with embeddings: 356 (356 in data)


In [4]:
# Visual spot-check of the embeddings table: the bare expression is rendered
# as the cell's result (the rendered table is truncated for long frames).
embeddings_df

Unnamed: 0,parent_asin,title,embedding
0,B08LYT4Q2X,Organic Sweet Almond Oil and Fractionated Coco...,"[0.007331471424549818, -0.009003796614706516, ..."
1,B089CSF11Y,"Empty Brown Glass Spray Bottles2-Pack, Refilla...","[-0.019447868689894676, -0.003274099435657263,..."
2,B081ZN3TD5,JPNK 4PCS Anti-Static Detangling Fine & Wide T...,"[-0.020500581711530685, 0.004741370212286711, ..."
3,B08WF29DM9,TULA Probiotic Skin Care Supersize 24-7 Moistu...,"[-0.04734344780445099, -0.010425296612083912, ..."
4,B08HMLXW65,"Claw Hair Clips, IKOCO 6 Pack Jaw Clips Stylis...","[-0.004870643373578787, -0.025788137689232826,..."
...,...,...,...
351,B08GJJ5RV9,"Facial Skincare,Cucumber Essence Hydrating Moi...","[-0.0002557237748987973, 0.01148942206054926, ..."
352,B086ST4W1C,"4D Silk Fiber Lash Mascara Black, Natural Wate...","[-0.007494322489947081, 0.001616129418835044, ..."
353,B07DFNPVSF,Dr. Denese SkinScience Essential Lipid Anti Ag...,"[-0.007007236126810312, -0.0027843439020216465..."
354,B08F7877DC,"KISSIO Eyeshadow Set Of 9 Colors, Practical Co...","[-0.044635046273469925, 0.02385716885328293, -..."


In [5]:
# Visual spot-check of the training interactions: the bare expression is
# rendered as the cell's result (the rendered table is truncated for long frames).
train_df

Unnamed: 0,user_id,parent_asin,rating,timestamp,history
0,AFSKPY37N3C43SOI5IEXEK5JSIYA,B07J3GH1W1,5.0,1547589356557,
1,AFSKPY37N3C43SOI5IEXEK5JSIYA,B07W397QG4,5.0,1593352422858,B07J3GH1W1
2,AFSKPY37N3C43SOI5IEXEK5JSIYA,B07KG1TWP5,5.0,1596473351088,B07J3GH1W1 B07W397QG4
3,AFSKPY37N3C43SOI5IEXEK5JSIYA,B08JTNQFZY,5.0,1617904219785,B07J3GH1W1 B07W397QG4 B07KG1TWP5
4,AFSKPY37N3C43SOI5IEXEK5JSIYA,B07SLFWZKN,3.0,1619737501209,B07J3GH1W1 B07W397QG4 B07KG1TWP5 B08JTNQFZY
...,...,...,...,...,...
2024,AFKZESU3PTCQ2UVDBDSFVFTXBFNQ_2,B07Z4CVTRP,5.0,1582502307827,B07TLMZL3T B07J1LYVHC B07PHWX88W B07Z3NRMBS B0...
2025,AG3GU5MHHM662AATYNDWYOKOZP7A,B07KXM94BT,5.0,1549678927748,
2026,AG3GU5MHHM662AATYNDWYOKOZP7A,B07XVNJFNF,5.0,1582912683415,B07KXM94BT
2027,AG3GU5MHHM662AATYNDWYOKOZP7A,B0813ZQG3T,5.0,1584133894509,B07KXM94BT B07XVNJFNF
