In [4]:
import pandas as pd
import numpy as np
from datasets import load_dataset
from huggingface_hub import HfApi
from pathlib import Path

# Repository information
REPO_ID = "ChernovAndrei/reco-fm-data"
SUBSET = "5core_last_out_w_his_All_Beauty"

# Initialize HuggingFace API
api = HfApi()


In [16]:
# Load all splits
print("Loading data from HuggingFace Hub...")

# Get the list of files in the repository
repo_files = api.list_repo_files(REPO_ID, repo_type="dataset")

# Function to load a specific split
def load_split(file_pattern):
    matching_files = [f for f in repo_files if file_pattern in f]
    if not matching_files:
        raise ValueError(f"No files found matching pattern: {file_pattern}")
    return load_dataset(REPO_ID, data_files=matching_files[0], split='train')

# Load interaction data (train/val/test)
train_data = load_split("train/train_data.parquet")
val_data = load_split("val/val_data.parquet")
test_data = load_split("test/test_data.parquet")

# Load metadata and embeddings
metadata = load_split("meta/metadata.parquet")
embeddings = load_split("embeddings/title_embeddings.parquet")

print("\nDataset loaded successfully!")


Loading data from HuggingFace Hub...

Dataset loaded successfully!


In [25]:
np.unique(train_data['user_id'])

array(['AE23ZBUF2YVBQPH2NN6F5XSA3QYQ', 'AE3335XF4PMHSXKTW5B7N7EALG3Q',
       'AE3KLVXGZPANXE5XLXYKHTVAZ3FQ', 'AE3LUVAAITFJIUTWBMRPHDQOCOFQ',
       'AE3PLZHW6NXWBMZ76TDVFQG2MJFA', 'AE3QBGRRHKT3GFDPRXFEN7JICEZQ',
       'AE3UKETTR77J4LM2ZE4AEUC4L6KA', 'AE53TOMIUB7ENP3RD44KDAARU6AQ',
       'AE5ESL52LWWBJTSFOAXSFZA3XCGQ', 'AE5GH4VM5Q5HNUL4EOAYRLNB7RVA',
       'AE5IMGWRBJA7JQFBQTBK25HDYGVA', 'AEAT2QOOIXWFLBQESCVLAVXLK3RQ',
       'AEAXAJACFMXIAAH4WOHRMXPSZWFA', 'AEBWGXGGL3Q5DSTMUQSTVUJDWSMA',
       'AEC6IZVI2NGFBCIYVD4X5MDGB6DQ_2', 'AECADZLPUNH3BDNACLFF7PSHN5MQ',
       'AECIT3NMW6RKABFS6YOCYX4YUYOQ', 'AECOPBDL3PHOESNB2RXKPZSXGBOA',
       'AECQQBG6YRYCOJL2NCB2H3V6LD6Q', 'AECTRGMRKOGAYIV3YXX73CQEQCSQ',
       'AEE4M36AZAKURLEYGV23TM3BE7OQ', 'AEFRTLVCVRALKXBED77KHPIXEPWQ',
       'AEFU6XKGH6J4ZIYTVVBGEDUCTZHQ', 'AEGKJTQWHGBIOTO34OB3GKIGCN7A',
       'AEGTJSI4X2EZHAL5VWJV3RCJIO4A', 'AEH7RAIDBU7QALXTMWAA73PTL4JA',
       'AEHGY7TZA4IMOXXBGPTZC7UPM6UQ', 'AEHLKY7Q5O3D3E6YEV67JIBVFNFA',
    

In [18]:
print("Dataset Statistics:")
print(f"Train set size: {len(train_data)}")
print(f"Validation set size: {len(val_data)}")
print(f"Test set size: {len(test_data)}")
print(f"Metadata entries: {len(metadata)}")
print(f"Embeddings entries: {len(embeddings)}")

print("\nFeatures in each split:")
print("\nTrain data features:")
print(train_data.features)

print("\nMetadata features:")
print(metadata.features)

print("\nEmbeddings features:")
print(embeddings.features)


Dataset Statistics:
Train set size: 2029
Validation set size: 253
Test set size: 253
Metadata entries: 356
Embeddings entries: 356

Features in each split:

Train data features:
{'user_id': Value(dtype='string', id=None), 'parent_asin': Value(dtype='string', id=None), 'rating': Value(dtype='string', id=None), 'timestamp': Value(dtype='string', id=None), 'history': Value(dtype='string', id=None)}

Metadata features:
{'main_category': Value(dtype='string', id=None), 'title': Value(dtype='string', id=None), 'average_rating': Value(dtype='float64', id=None), 'rating_number': Value(dtype='int64', id=None), 'features': Sequence(feature=Value(dtype='string', id=None), length=-1, id=None), 'description': Sequence(feature=Value(dtype='string', id=None), length=-1, id=None), 'price': Value(dtype='string', id=None), 'images': {'hi_res': Sequence(feature=Value(dtype='string', id=None), length=-1, id=None), 'large': Sequence(feature=Value(dtype='string', id=None), length=-1, id=None), 'thumb': Sequ

In [19]:
# Convert to pandas for easier analysis
train_df = train_data.to_pandas()
val_df = val_data.to_pandas()
test_df = test_data.to_pandas()
metadata_df = metadata.to_pandas()
embeddings_df = embeddings.to_pandas()

# Collect all unique product IDs from interaction data
all_products = set(pd.concat([
    train_df['parent_asin'],
    val_df['parent_asin'],
    test_df['parent_asin']
]).unique())

# Check metadata coverage
metadata_products = set(metadata_df['parent_asin'])
embeddings_products = set(embeddings_df['parent_asin'])

print("Data Coverage Analysis:")
print(f"Total unique products in interactions: {len(all_products)}")
print(f"Products with metadata: {len(metadata_products)} ({len(metadata_products & all_products)} in interactions)")
print(f"Products with embeddings: {len(embeddings_products)} ({len(embeddings_products & all_products)} in interactions)")

# Check for missing information
missing_metadata = all_products - metadata_products
missing_embeddings = all_products - embeddings_products

if missing_metadata:
    print(f"\nProducts missing metadata: {len(missing_metadata)}")
    print("Example missing ASINs:", list(missing_metadata)[:5])

if missing_embeddings:
    print(f"\nProducts missing embeddings: {len(missing_embeddings)}")
    print("Example missing ASINs:", list(missing_embeddings)[:5])


Data Coverage Analysis:
Total unique products in interactions: 356
Products with metadata: 356 (356 in interactions)
Products with embeddings: 356 (356 in interactions)


In [20]:
# Check embedding dimensions
first_embedding = embeddings_df['embedding'].iloc[0]
print(f"Embedding dimension: {len(first_embedding)}")

# Verify embeddings are normalized
embedding_norm = np.linalg.norm(first_embedding)
print(f"Embedding norm: {embedding_norm:.6f}")

# Example: Find similar products
def find_similar_products(query_asin, top_k=5):
    if query_asin not in embeddings_products:
        print(f"Product {query_asin} not found in embeddings")
        return
    
    # Get query embedding
    query_idx = embeddings_df[embeddings_df['parent_asin'] == query_asin].index[0]
    query_embedding = embeddings_df['embedding'].iloc[query_idx]
    
    # Calculate similarities
    similarities = np.dot(np.stack(embeddings_df['embedding']), query_embedding)
    
    # Get top-k similar products
    top_indices = np.argsort(similarities)[-top_k-1:][::-1]
    
    print(f"\nSimilar products to {query_asin}:")
    for idx in top_indices[1:]:  # Skip the first one (self-similarity)
        product = embeddings_df.iloc[idx]
        meta = metadata_df[metadata_df['parent_asin'] == product['parent_asin']].iloc[0]
        print(f"\nTitle: {meta['title']}")
        print(f"ASIN: {product['parent_asin']}")
        print(f"Similarity: {similarities[idx]:.4f}")

# Test similarity search with a random product
example_asin = embeddings_df['parent_asin'].iloc[0]
find_similar_products(example_asin)


Embedding dimension: 3072
Embedding norm: 1.000000

Similar products to B08LYT4Q2X:

Title: Claireceuticals Sweet Almond Oil for Hair, Face & Skin - Organic Almond Oil for Skin - Pure Almond Oil Moisturizer for Dry Skin - Facial Moisturizer for Women - Ideal Dandruff Remover - 1.69 Fl Oz
ASIN: B08HDG9F44
Similarity: 0.6433

Title: Livordo Moroccan Argan Oil Essential Organic Cold Pressed Skin Moisturizer Rich in Vitamin E Powerful Antioxidants For Skin, Hair, Body, and Face 4 Oz
ASIN: B081632HX6
Similarity: 0.5512

Title: Argan Magic 100% Pure Argan Oil â€“ Moisturizing Oil that Infuses Skin with Enriched Hydration | Leaves Skin Restored & Refreshed | Non-Greasy | Rich in Vitamin E | Made in USA | Cruelty Free (2 oz)
ASIN: B00O2FGBJS
Similarity: 0.5380

Title: 100% PURE Coconut Body Scrub Exfoliating Deep Cleanse with Sea Salts & Nourishing Oils - Natural Deeply Moisturizing Skincare - Vibrant Radiant Skin Restore & Pore Cleanser - Vegan - 10oz
ASIN: B09KX5N1DR
Similarity: 0.5189

Titl

In [21]:
print("User Interaction Statistics:")
print(f"\nTrain set:")
print(f"Unique users: {train_df['user_id'].nunique()}")
print(f"Average interactions per user: {len(train_df) / train_df['user_id'].nunique():.2f}")

print(f"\nValidation set:")
print(f"Unique users: {val_df['user_id'].nunique()}")
print(f"Average interactions per user: {len(val_df) / val_df['user_id'].nunique():.2f}")

print(f"\nTest set:")
print(f"Unique users: {test_df['user_id'].nunique()}")
print(f"Average interactions per user: {len(test_df) / test_df['user_id'].nunique():.2f}")

# Check user overlap between splits
train_users = set(train_df['user_id'])
val_users = set(val_df['user_id'])
test_users = set(test_df['user_id'])

print("\nUser Overlap Analysis:")
print(f"Train-Val overlap: {len(train_users & val_users)} users")
print(f"Train-Test overlap: {len(train_users & test_users)} users")
print(f"Val-Test overlap: {len(val_users & test_users)} users")


User Interaction Statistics:

Train set:
Unique users: 253
Average interactions per user: 8.02

Validation set:
Unique users: 253
Average interactions per user: 1.00

Test set:
Unique users: 253
Average interactions per user: 1.00

User Overlap Analysis:
Train-Val overlap: 253 users
Train-Test overlap: 253 users
Val-Test overlap: 253 users


In [22]:
# Get a random user's history
example_user = train_df['user_id'].iloc[0]

print(f"Example user: {example_user}")
print("\nInteraction history:")

# Get all interactions
user_history = train_df[train_df['user_id'] == example_user]
for _, interaction in user_history.iterrows():
    product_meta = metadata_df[metadata_df['parent_asin'] == interaction['parent_asin']].iloc[0]
    print(f"\nProduct: {product_meta['title']}")
    print(f"ASIN: {interaction['parent_asin']}")
    print(f"Rating: {interaction['rating']}")
    print(f"Timestamp: {interaction['timestamp']}")
    
# Parse and check the history field
history_asins = interaction['history'].split()
print(f"\nNumber of previous interactions: {len(history_asins)}")
print("Previous products:")
for asin in history_asins:
    if asin in metadata_products:
        title = metadata_df[metadata_df['parent_asin'] == asin].iloc[0]['title']
        print(f"- {title} ({asin})")


Example user: AFSKPY37N3C43SOI5IEXEK5JSIYA

Interaction history:

Product: Manicure and Pedicure Nail Clipper from POWERGROOMING - Powerful Trimmer for Thick and Thin Finger Nails and Toe Nails - Included Nail File and"Catcher" for Easy Cleanup (1 Pack)
ASIN: B07J3GH1W1
Rating: 5.0
Timestamp: 1547589356557

Product: Iryasa Night Indulge Cream - Natural Face Cream for Dry Skin - Vegan Anti Aging Night Cream for Women - Firming Cream for Face and Neck - Organic Vitamin C Moisturizer for Face - 1.7oz
ASIN: B07W397QG4
Rating: 5.0
Timestamp: 1593352422858

Product: Organic Bamboo Cotton Ear Swabs by Bali Boo - 200 - Natural Wooden Qtips Cotton Swabs for Cleaning Ears, Baby or Makeup and Nails - Sustainable & Vegan Buds Sticks - Eco Friendly & Biodegradable
ASIN: B07KG1TWP5
Rating: 5.0
Timestamp: 1596473351088

Product: GAINWELL
ASIN: B08JTNQFZY
Rating: 5.0
Timestamp: 1617904219785

Product: Keratin Secrets Do It Yourself Home Keratin System
ASIN: B07SLFWZKN
Rating: 3.0
Timestamp: 1619737501