In [27]:
import os, argparse
from collections import defaultdict

import numpy as np
import pandas as pd
from neo4j import GraphDatabase
from sklearn.preprocessing import OneHotEncoder
from tqdm import tqdm
from sklearn.decomposition import TruncatedSVD

In [28]:
# Neo4j connection settings.
# Credentials must not live in the notebook: they are read from the environment
# so the file can be shared or committed safely. Export NEO4J_PASSWORD (and,
# optionally, NEO4J_URI / NEO4J_USER) before starting the kernel. With no
# password set, PWD is empty and the server rejects the login at connect time.
URI = os.environ.get("NEO4J_URI", "bolt://localhost:7687")
USER = os.environ.get("NEO4J_USER", "neo4j")
PWD = os.environ.get("NEO4J_PASSWORD", "")

def get_driver(uri, user, pwd):
    """Create a Neo4j driver for `uri` using basic (user, password) auth.

    The caller owns the returned driver and is responsible for closing it.
    """
    credentials = (user, pwd)
    driver = GraphDatabase.driver(uri, auth=credentials)
    return driver

In [29]:

# Cypher query returning one row per (Consumer)-[BOUGHT]->(Product) relationship.
# Each row carries three groups of aliased columns:
#   * consumer properties (consumer_id, age, marital, income, homeowner,
#     hh_comp, hh_size, kid_cat)
#   * product properties  (product_id, manufacturer, department, brand,
#     commodity, sub_commodity, size_of_prod)
#   * relationship properties (ts, qty, sales_value, retail_disc, coupon)
# The alias names are load-bearing: build_node_mappings() and
# encode_edge_features() look the values up by exactly these keys.
EDGE_QUERY = """
MATCH (c:Consumer)-[r:BOUGHT]->(p:Product)
RETURN c.hh_key               AS consumer_id,
       c.AGE_DESC             AS age,
       c.MARITAL_STATUS_CODE  AS marital,
       c.INCOME_DESC          AS income,
       c.HOMEOWNER_DESC       AS homeowner,
       c.HH_COMP_DESC         AS hh_comp,
       c.HOUSEHOLD_SIZE_DESC  AS hh_size,
       c.KID_CATEGORY_DESC    AS kid_cat,

       p.PRODUCT_ID           AS product_id,
       p.MANUFACTURER         AS manufacturer,
       p.DEPARTMENT           AS department,
       p.BRAND                AS brand,
       p.COMMODITY_DESC       AS commodity,
       p.SUB_COMMODITY_DESC   AS sub_commodity,
       p.CURR_SIZE_OF_PRODUCT AS size_of_prod,

       r.t                    AS ts,
       r.qty                  AS qty,
       r.sales_value          AS sales_value,
       r.retail_disc          AS retail_disc,
       r.coupon               AS coupon
"""

In [30]:
def fetch_all_edges(driver, batch=50_000):
    """Run EDGE_QUERY and collect every result row as a plain dict.

    Parameters
    ----------
    driver : object exposing ``session()`` as a context manager
        The session must provide ``run(query)`` yielding records with ``.data()``.
    batch : int
        Currently unused; kept as a hook for flushing rows to disk in chunks
        of this size when the result does not fit in memory.

    Returns
    -------
    list[dict]
        One dict per row, keyed by the aliases defined in EDGE_QUERY.
    """
    with driver.session() as session:
        # The result is consumed lazily; tqdm only adds a progress counter.
        rows = tqdm(session.run(EDGE_QUERY), desc="Pulling edges from Neo4j")
        return [row.data() for row in rows]

In [31]:
def build_node_mappings(edge_dicts):
    """Split flat edge rows into node ids, node features and edge records.

    Consumers and products share a single integer id space; ids are handed
    out in order of first appearance. Node keys are prefixed ("C_" / "P_")
    so a consumer and a product with the same raw id never collide.

    The input rows are NOT modified: each row is copied before its node
    columns are stripped, so this function can safely be re-run on the same
    `edge_dicts` (e.g. when a notebook cell is executed twice).

    Parameters
    ----------
    edge_dicts : iterable of dict
        Rows keyed by the aliases of EDGE_QUERY (consumer_id, product_id,
        the consumer/product attribute columns, and the edge attributes).

    Returns
    -------
    node2id : dict[str, int]
        Prefixed node key -> contiguous integer id.
    features : list[dict]
        Attribute dict of each node, positioned by its integer id. Attributes
        come from the first row in which the node appears.
    edges : list[dict]
        One new dict per input row holding the remaining (edge) attributes
        plus "src" (consumer id) and "dst" (product id).

    Raises
    ------
    KeyError
        If a row lacks one of the expected consumer/product columns.
    """
    consumer_cols = ("age", "marital", "income", "homeowner",
                     "hh_comp", "hh_size", "kid_cat")
    product_cols = ("manufacturer", "department", "brand",
                    "commodity", "sub_commodity", "size_of_prod")

    node2id, features, edges = {}, [], []

    def _add(node_key, feat_dict):
        # Only the first sighting of a node registers it and its features.
        if node_key not in node2id:
            node2id[node_key] = len(node2id)
            features.append(feat_dict)

    for row in edge_dicts:
        d = dict(row)  # shallow copy: popping must not touch the caller's row

        # Consumer
        c_key = f"C_{d.pop('consumer_id')}"
        c_feat = {k: d.pop(k) for k in consumer_cols}
        c_feat["node_type"] = "consumer"
        _add(c_key, c_feat)

        # Product
        p_key = f"P_{d.pop('product_id')}"
        p_feat = {k: d.pop(k) for k in product_cols}
        p_feat["node_type"] = "product"
        _add(p_key, p_feat)

        # Whatever is left in `d` are the edge properties.
        d["src"] = node2id[c_key]
        d["dst"] = node2id[p_key]
        edges.append(d)

    return node2id, features, edges

In [32]:
def encode_node_features(node_features, out_dim=10):
    """Turn categorical node attribute dicts into a dense float32 matrix.

    The attributes are one-hot encoded (kept sparse) and then compressed with
    a truncated SVD down to `out_dim` columns.

    Parameters
    ----------
    node_features : list of dict
        One attribute dict per node, in node-id order.
    out_dim : int
        Number of SVD components, i.e. the width of the returned matrix.

    Returns
    -------
    X : np.ndarray, float32
        Dense matrix with one row per node and `out_dim` columns.
    (enc, svd) : tuple
        The fitted OneHotEncoder and TruncatedSVD, in that order.
    """
    encoder = OneHotEncoder(handle_unknown="ignore", sparse_output=True)
    reducer = TruncatedSVD(n_components=out_dim, random_state=42)

    frame = pd.DataFrame(node_features)
    one_hot = encoder.fit_transform(frame)    # wide, but stored sparse
    reduced = reducer.fit_transform(one_hot)  # dense after the projection

    return reduced.astype(np.float32), (encoder, reducer)

def encode_edge_features(edge_dicts):
    """Build the numeric edge-feature matrix from the edge records.

    Parameters
    ----------
    edge_dicts : list of dict
        Edge records containing at least "qty", "sales_value", "retail_disc"
        and "coupon".

    Returns
    -------
    feat_mat : np.ndarray, float32
        One row per edge with the four columns above, in that order; missing
        values are replaced by 0.
    df : pd.DataFrame
        The full edge table built from `edge_dicts` (all columns).
    """
    numeric_cols = ["qty", "sales_value", "retail_disc", "coupon"]

    edge_frame = pd.DataFrame(edge_dicts)
    numeric = edge_frame[numeric_cols]
    numeric = numeric.fillna(0)
    feat_mat = numeric.to_numpy(dtype=np.float32)

    return feat_mat, edge_frame

In [33]:
def write_artifacts(df_edges, edge_feat, node_feat, out_dir="data", prefix="ml_retail"):
    """Persist the temporal edge list and the feature matrices.

    Three files are written into `out_dir`:

    * ``{prefix}.csv``      - columns u, i, ts, idx, label; rows sorted by ts
    * ``{prefix}.npy``      - edge feature matrix, row k belongs to idx == k
    * ``{prefix}_node.npy`` - node feature matrix, saved as given

    The edge table is sorted chronologically before `idx` is assigned, so
    the SAME permutation is applied to `edge_feat`; otherwise `idx` would
    point at the feature row of a different edge.

    Parameters
    ----------
    df_edges : pd.DataFrame
        Edge table with at least the columns "src", "dst" and "ts". Row k
        must correspond to row k of `edge_feat`. Not modified.
    edge_feat : array-like
        Edge feature matrix with one row per row of `df_edges`.
    node_feat : array-like
        Node feature matrix.
    out_dir : str
        Output directory, created if missing.
    prefix : str
        File name stem shared by the three artifacts.

    Raises
    ------
    ValueError
        If `edge_feat` and `df_edges` have a different number of rows.
    """
    edge_feat = np.asarray(edge_feat)
    if len(edge_feat) != len(df_edges):
        raise ValueError(
            f"edge_feat has {len(edge_feat)} rows but df_edges has {len(df_edges)}"
        )

    os.makedirs(out_dir, exist_ok=True)

    # --- CSV (u,i,ts,idx,label)
    # Reset to a positional index first: after sorting, the index labels are
    # then exactly the original row positions, i.e. the permutation to apply
    # to edge_feat. A stable sort keeps ties in their original order.
    ordered = df_edges.reset_index(drop=True).sort_values("ts", kind="stable")
    order = ordered.index.to_numpy()

    ordered = ordered.reset_index(drop=True)
    ordered["idx"] = ordered.index
    ordered["label"] = 0        # binary/other labels go here if you have them
    ordered = ordered.rename(columns={"src": "u", "dst": "i"})
    ordered[["u", "i", "ts", "idx", "label"]].to_csv(
        f"{out_dir}/{prefix}.csv", index=False)

    # --- Edge feature matrix (re-ordered to match idx)
    np.save(f"{out_dir}/{prefix}.npy", edge_feat[order])

    # --- Node feature matrix
    np.save(f"{out_dir}/{prefix}_node.npy", node_feat)

In [34]:
# End-to-end export: Neo4j -> integer-indexed temporal edge list + features.
# Every step runs here so the cell works on a fresh kernel (Restart & Run All)
# instead of relying on `node_feats` / `edges` left over from an earlier run.
out = "./data"
# 1. Pull everything
# The driver is a context manager: its connections are released as soon as
# the pull finishes, even if fetching raises.
with get_driver(URI, USER, PWD) as driver:
    raw_edges = fetch_all_edges(driver)
# 2. Map nodes → ints, split edge / node info
node2id, node_feats, edges = build_node_mappings(raw_edges)
# 3. Encode
node_feat_mat, _  = encode_node_features(node_feats)
edge_feat_mat, df = encode_edge_features(edges)
# 4. Persist
write_artifacts(df, edge_feat_mat, node_feat_mat, out_dir=out)
print(f"✅  Wrote {len(df):,} edges and {len(node_feat_mat):,} nodes into '{out}'")

✅  Wrote 1,427,303 edges and 69,054 nodes into './data'
