In [1]:
import os

import numpy as np
import pandas as pd

In [None]:
## Synthetic transactions data generator

def make_synthetic_transactions(
    n_rows=10_000,
    n_unique_cards=8_000,
    n_unique_merchants=7_500,
    online_share=0.22,
    seed=42,
):
    """
    Generate a synthetic card-transaction table.

    Each merchant is labelled either "ONLINE" (with probability
    `online_share`) or with a city drawn uniformly from a fixed pool.
    Every card in the pool is used at least once; the remaining rows are
    filled by sampling cards with weights proportional to 1 / rank**0.8.
    Merchants are sampled for every row with weights proportional to
    1 / rank**0.7. Rows are shuffled before `id` is (re)assigned.

    Parameters
    ----------
    n_rows : int
        Number of transactions to generate; must not exceed 10,000.
    n_unique_cards : int
        Number of distinct card ids; must not exceed `n_rows`.
    n_unique_merchants : int
        Number of distinct merchant ids available for sampling.
    online_share : float
        Probability, in [0, 1], that a merchant is labelled "ONLINE".
    seed : int
        Seed for the NumPy generator and for the final row shuffle.

    Returns
    -------
    pd.DataFrame
        One row per transaction with columns `id` (1..n_rows),
        `card_id` (strings such as "C000001"), `merchant_id` and
        `merchant_city`.

    Raises
    ------
    ValueError
        If `n_rows` exceeds 10,000, if `n_unique_cards` exceeds `n_rows`,
        or if `online_share` is not within [0, 1].
    """
    # Sanity checks
    if n_rows > 10_000:
        raise ValueError("n_rows must be <= 10,000")
    if n_unique_cards > n_rows:
        raise ValueError("n_unique_cards cannot exceed n_rows (or you cannot realize all unique cards).")
    if not 0.0 <= online_share <= 1.0:
        raise ValueError("online_share must be a probability between 0 and 1")

    rng = np.random.default_rng(seed)

    # Candidate city pool (the "ONLINE" label is added separately below).
    # The accented name is written as an escape so that an encoding
    # round-trip of this file cannot corrupt the generated data.
    cities = [
        "Brussels", "Antwerp", "Ghent", "Charleroi", "Li\u00e8ge", "Bruges", "Leuven", "Namur",
        "Paris", "London", "Amsterdam", "Berlin", "Madrid", "Rome", "Vienna", "Zurich",
        "Barcelona", "Rotterdam", "Lille", "Toulouse"
    ]

    # Build merchant dimension: each merchant is either Online or tied to a city
    merchant_ids = np.arange(1, n_unique_merchants + 1, dtype=np.int64)

    merchant_is_online = rng.random(n_unique_merchants) < online_share
    merchant_city = np.where(
        merchant_is_online,
        "ONLINE",
        rng.choice(cities, size=n_unique_merchants, replace=True)
    )

    merchant_dim = pd.DataFrame({
        "merchant_id": merchant_ids,
        "merchant_city": merchant_city
    })

    # Build card_id pool (exact count)
    # Using strings makes IDs look realistic and avoids accidental numeric formatting issues in CSVs.
    card_pool = np.array([f"C{idx:06d}" for idx in range(1, n_unique_cards + 1)])

    # Ensure ALL cards appear at least once:
    # 1) place each card once for the first n_unique_cards rows
    # 2) sample remaining rows with a mild skew (some cards used more than others)
    base_cards = card_pool.copy()

    remaining = n_rows - n_unique_cards
    if remaining > 0:
        # Create a skewed distribution over cards (Zipf-like but bounded)
        weights = 1 / (np.arange(1, n_unique_cards + 1) ** 0.8)
        weights = weights / weights.sum()
        extra_cards = rng.choice(card_pool, size=remaining, replace=True, p=weights)
        card_ids = np.concatenate([base_cards, extra_cards])
    else:
        card_ids = base_cards

    # Sample merchants for transactions; the skew simulates "big merchants"
    merchant_weights = 1 / (np.arange(1, n_unique_merchants + 1) ** 0.7)
    merchant_weights = merchant_weights / merchant_weights.sum()
    tx_merchant_ids = rng.choice(merchant_ids, size=n_rows, replace=True, p=merchant_weights)

    # Assemble transactions and attach merchant_city
    df = pd.DataFrame({
        "id": np.arange(1, n_rows + 1, dtype=np.int64),
        "card_id": card_ids,
        "merchant_id": tx_merchant_ids
    }).merge(merchant_dim, on="merchant_id", how="left")

    # Shuffle rows so the leading n_unique_cards rows aren't "one-per-card" in order
    df = df.sample(frac=1, random_state=seed).reset_index(drop=True)

    # Recreate id after shuffle to keep it clean and sequential
    df["id"] = np.arange(1, n_rows + 1, dtype=np.int64)

    return df


In [None]:
## Instantiate logical tasks


from enum import unique
def create_task_unique(df: pd.DataFrame, column: str) -> pd.DataFrame:
    """
    Label each row by whether its `column` value is a one-off.

    A column called `unique_<column>` is written onto `df` itself (the
    frame is modified in place and also returned):
      - 1 when the row's value occurs exactly once in `column`
      - 0 in every other case
    """
    frequencies = df[column].value_counts()
    # Values whose total frequency is exactly one.
    singletons = frequencies[frequencies == 1].index
    df[f"unique_{column}"] = df[column].isin(singletons).astype(int)
    return df

def create_task_count(df: pd.DataFrame, column: str, k: int, greater_than: bool = True) -> pd.DataFrame:
    """
    Label each row by how often its `column` value occurs, relative to `k`.

    The label is written onto `df` itself (modified in place and returned)
    under the name:
      - `count_gt_<k>_<column>` when `greater_than` is True
      - `count_eq_<k>_<column>` when `greater_than` is False

    Label values:
      - 1 when the row's value occurs more than `k` times
        (`greater_than=True`) or exactly `k` times (`greater_than=False`)
      - 0 otherwise
    """
    # Frequency of each row's own value, aligned to the frame's rows.
    occurrences = df[column].map(df[column].value_counts())

    if greater_than:
        tag, satisfied = "gt", occurrences > k
    else:
        tag, satisfied = "eq", occurrences == k

    df[f"count_{tag}_{k}_{column}"] = satisfied.astype(int)
    return df

import pandas as pd

def create_task_double(df: pd.DataFrame, col_1: str, col_2: str, anchor) -> pd.DataFrame:
    """
    Label each row by whether a DIFFERENT row shares its `col_1` value
    and has `col_2 == anchor`.

    The label is written onto `df` itself (modified in place and returned)
    as `double_<col_1>_<col_2>_<anchor>`:
      - 1 when such another row exists
      - 0 otherwise
    """
    is_anchor = df[col_2].eq(anchor)
    not_anchor = df[col_2].ne(anchor)

    # Number of anchor rows in each col_1 group, broadcast back onto the rows.
    anchors_by_key = is_anchor.groupby(df[col_1]).sum()
    anchors_for_row = df[col_1].map(anchors_by_key).fillna(0)

    # A non-anchor row needs one anchor in its group; an anchor row needs a
    # second one, because it must not count itself.
    found_from_outside = not_anchor & (anchors_for_row >= 1)
    found_besides_self = is_anchor & (anchors_for_row >= 2)

    df[f"double_{col_1}_{col_2}_{anchor}"] = (found_from_outside | found_besides_self).astype(int)
    return df


def create_task_diamond(df: pd.DataFrame, col_1: str, col_2: str, strict=False) -> pd.DataFrame:
    """
    Label each row by how often its (`col_1`, `col_2`) pair occurs.

    The label is written onto `df` itself (modified in place and returned).

    With `strict` falsy, the column is `duplicate_<col_1>_<col_2>`:
      - 1 when the pair occurs more than once
      - 0 otherwise

    With `strict` truthy, the column is `duplicate_<col_1>_<col_2>_strict`:
      - 1 when the pair occurs exactly twice
      - 0 otherwise
    """
    # Size of the (col_1, col_2) group each row belongs to.
    pair_size = df.groupby([col_1, col_2])[col_1].transform("size")
    base_name = f"duplicate_{col_1}_{col_2}"

    if not strict:
        df[base_name] = (pair_size > 1).astype(int)
        return df

    df[f"{base_name}_strict"] = (pair_size == 2).astype(int)
    return df



In [None]:
OUTPUT_PATH = 'data/processed/'

# Every split is generated with its own explicit seed. The validation and
# test calls share all other parameters, so leaving both on the generator's
# default seed would produce two identical frames.
df_train = make_synthetic_transactions(
    n_rows=8_000,
    n_unique_cards=2_500,
    n_unique_merchants=3_500,
    online_share=0.15,
    seed=42,
)

df_val = make_synthetic_transactions(
    n_rows=1_000,
    n_unique_cards=350,
    n_unique_merchants=300,
    online_share=0.12,
    seed=43,
)

df_test = make_synthetic_transactions(
    n_rows=1_000,
    n_unique_cards=350,
    n_unique_merchants=300,
    online_share=0.12,
    seed=44,
)

# Guard against the held-out splits collapsing into the same sample.
assert not df_val.equals(df_test), "validation and test splits must differ"

# The create_task_* helpers write their label column onto the frame they are
# given, so df_train / df_val / df_test are updated directly by these calls.
for df in [df_train, df_val, df_test]:
    create_task_unique(df, 'merchant_id')
    create_task_count(df, 'card_id', 12)
    create_task_count(df, 'card_id', 3, greater_than=False)
    create_task_double(df, 'card_id', 'merchant_city', "ONLINE")
    create_task_diamond(df, 'card_id', 'merchant_city')
    # strict=True adds the "exactly two occurrences" variant; repeating the
    # non-strict call would only overwrite the column written just above.
    create_task_diamond(df, 'card_id', 'merchant_city', strict=True)


In [None]:
# to_csv does not create missing parent directories, so make sure the output
# folder exists before writing (no-op when it is already there).
os.makedirs(OUTPUT_PATH, exist_ok=True)

# NOTE(review): with the default index=True the row index is written as an
# unnamed first column next to `id`; kept so existing readers of these files
# see the same layout.
df_train.to_csv(os.path.join(OUTPUT_PATH, 'synth-df_train.csv'))
df_val.to_csv(os.path.join(OUTPUT_PATH, 'synth-df_val.csv'))
df_test.to_csv(os.path.join(OUTPUT_PATH, 'synth-df_test.csv'))