# Embedding Analysis

This notebook analyzes the embeddings for `space2vec`, `time2vec`, and `hgi` located in `../output`.
For each model it reports the embedding dimension and classifies the file as a POI embedding (every POI ID appears in exactly one row) or a Check-in embedding (POI IDs repeat across rows).

In [1]:
import os
import pandas as pd
import glob

In [2]:
# Root folder; each model's embeddings live in a sub-directory named after the model.
OUTPUT_DIR = "../output"

# Embedding models to inspect.
MODELS = [
    "space2vec",
    "time2vec",
    "hgi",
]

# State sub-directories to include. Hard-coded for now; they could instead be
# discovered from the directory listing.
STATES = [
    "alabama",
]

In [3]:
# Columns treated as identifiers/labels rather than embedding values.
METADATA_COLS = ['placeid', 'category', 'poi_id', 'id']


def _summarize_embeddings(df):
    """Summarize a single embeddings table.

    Parameters
    ----------
    df : pandas.DataFrame
        Contents of one ``embeddings.parquet`` file.

    Returns
    -------
    dict
        ``"Rows"``: number of rows in ``df``.
        ``"Embedding Dim"``: number of columns counted as embedding values.
        ``"Type"``: label derived from whether the ID column has repeated values.
        ``"Has Duplicates"``: result of that duplicate check, or ``None`` when
        neither ``placeid`` nor ``poi_id`` is present.
    """
    # Preferred: non-metadata columns whose name is purely numeric ('0', '1', ...).
    embedding_cols = [
        c for c in df.columns
        if c not in METADATA_COLS and str(c).isdigit()
    ]
    if not embedding_cols:
        # Fallback: no numeric column names, so count every non-metadata column.
        embedding_cols = [c for c in df.columns if c not in METADATA_COLS]

    # 'placeid' takes precedence over 'poi_id' as the identifier column.
    id_col = next((c for c in ('placeid', 'poi_id') if c in df.columns), None)

    if id_col is None:
        embedding_type = "Unknown (No ID col)"
        has_duplicates = None
    else:
        has_duplicates = df[id_col].duplicated().any()
        # NOTE(review): the label reads "Checking" while the notebook intro calls
        # this a "Check-in" embedding; left unchanged so existing output stays comparable.
        embedding_type = "Checking Embedding" if has_duplicates else "POI Embedding"

    return {
        "Rows": len(df),
        "Embedding Dim": len(embedding_cols),
        "Type": embedding_type,
        "Has Duplicates": has_duplicates,
    }


# One summary record per (model, state) pair whose embeddings file could be read.
results = []

for model in MODELS:
    model_dir = os.path.join(OUTPUT_DIR, model)
    # isdir (not exists): a stray file with the model's name would make listdir raise.
    if not os.path.isdir(model_dir):
        print(f"Directory not found: {model_dir}")
        continue

    # Immediate sub-directories are taken to be states; keep only the requested
    # ones, sorted so the processing order does not depend on the filesystem.
    state_dirs = sorted(
        d for d in os.listdir(model_dir)
        if d in STATES and os.path.isdir(os.path.join(model_dir, d))
    )

    for state in state_dirs:
        file_path = os.path.join(model_dir, state, "embeddings.parquet")
        if not os.path.exists(file_path):
            # Report the gap instead of skipping silently, mirroring the
            # missing-directory message above.
            print(f"Embeddings file not found: {file_path}")
            continue

        try:
            df = pd.read_parquet(file_path)
            results.append({"Model": model, "State": state, **_summarize_embeddings(df)})
        except Exception as e:
            # Best effort: one unreadable file must not abort the whole survey.
            print(f"Error processing {file_path}: {e}")


In [4]:
# Fix the column set up front: with an empty `results` list a bare
# pd.DataFrame(results) has no columns, and sorting by "Model"/"State"
# would raise KeyError.
RESULT_COLUMNS = ["Model", "State", "Rows", "Embedding Dim", "Type", "Has Duplicates"]

# Chained (non in-place) sort keeps this cell idempotent on re-run.
results_df = pd.DataFrame(results, columns=RESULT_COLUMNS).sort_values(by=["Model", "State"])
results_df

Unnamed: 0,Model,State,Rows,Embedding Dim,Type,Has Duplicates
2,hgi,alabama,11706,64,POI Embedding,False
0,space2vec,alabama,11706,64,POI Embedding,False
1,time2vec,alabama,113753,64,Checking Embedding,True
