# L2C Workbook -- Local to Cloud Migration

Migrate your local **Apache Polaris** PoC to **AWS S3 + Snowflake External Iceberg Tables**.

This workbook walks through the full lifecycle:

1. Inspect local Polaris tables
2. Review migration infrastructure (AWS + Snowflake)
3. Verify synced data on S3
4. Query from Snowflake
5. Reset and re-demo
6. Incremental update flow

> **Prerequisites:** Run `task start WORK_DIR=<your-project>` before opening this notebook.
> L2C sections require `./bin/plf l2c setup` to have been run.

## How L2C Works

```mermaid
flowchart LR
    subgraph local [Local Environment]
        Polaris[Apache Polaris]
        RustFS[RustFS S3]
    end
    subgraph cloud [Cloud]
        S3[AWS S3 Bucket]
        SF[Snowflake External Iceberg Table]
    end
    RustFS -->|"1. sync"| S3
    S3 -->|"2. register"| SF
    local_change[Local Data Change] -->|"3. update --force"| S3
    S3 -->|"4. refresh"| SF
```

The `plf l2c` CLI orchestrates each step. This workbook lets you inspect
the state at each stage and verify end-to-end.

## Setup

Load project configuration from `.env` and L2C state from `.snow-utils/l2c-state.json`.
Credentials are read from `work/principal.txt` and are never printed.

In [None]:
import json
import sys
from pathlib import Path

from dotenv import dotenv_values

# Every path below is resolved against the parent of the current working
# directory, which is expected to be the project root.
project_root = Path("..").resolve()

# --- .env (required) ---
env_file = project_root / ".env"
if not env_file.exists():
    print(f"ERROR: {env_file} not found.")
    print("  Run: task prepare WORK_DIR=<your-project>")
    print("  Then: task start WORK_DIR=<your-project>")
    raise SystemExit(1)

cfg = dotenv_values(env_file)
has_env = True

# --- work/principal.txt (needs cluster running) ---
# The first three lines are read as realm, client id and client secret.
# `has_principal` is only set once all three were parsed: later cells use the
# flag to decide whether they may talk to Polaris, so a truncated file must not
# leave it True with `None` credentials.
principal_file = project_root / "work" / "principal.txt"
realm = client_id = client_secret = None
has_principal = False
if principal_file.exists():
    lines = principal_file.read_text().strip().splitlines()
    if len(lines) >= 3:
        # Strip each value so stray trailing whitespace cannot corrupt a credential.
        realm, client_id, client_secret = (part.strip() for part in lines[:3])
        has_principal = True
    else:
        print(f"WARNING: work/principal.txt is malformed (expected at least 3 lines, found {len(lines)}).")
        print("  Run: task start WORK_DIR=<your-project>")
else:
    print("WARNING: work/principal.txt not found.")
    print("  Run: task start WORK_DIR=<your-project>")

# --- .snow-utils/l2c-state.json (needs plf l2c setup) ---
# A state file that cannot be parsed into a JSON object is treated like a
# missing one, so the L2C cells skip with a hint instead of failing on `.get`.
state_file = project_root / ".snow-utils" / "l2c-state.json"
state = None
has_l2c_state = False
if state_file.exists():
    try:
        loaded_state = json.loads(state_file.read_text())
    except json.JSONDecodeError as err:
        print(f"WARNING: .snow-utils/l2c-state.json is not valid JSON ({err}).")
        print("  Run: ./bin/plf l2c setup")
    else:
        if isinstance(loaded_state, dict):
            state = loaded_state
            has_l2c_state = True
        else:
            print("WARNING: .snow-utils/l2c-state.json does not contain a JSON object.")
            print("  Run: ./bin/plf l2c setup")
else:
    print("INFO: .snow-utils/l2c-state.json not found.")
    print("  Run: ./bin/plf l2c setup")

# --- Derived config ---
# `or` (rather than a .get default) also covers keys that are present in .env
# but left blank, which would otherwise yield an empty URL or catalog name.
polaris_url = cfg.get("POLARIS_HOST") or "http://localhost:18181"
catalog_name = cfg.get("POLARIS_CATALOG_NAME") or "polardb"

# --- Summary ---
# Only non-secret settings are echoed; the client id and secret are never printed.
print(f"Project root:  {project_root}")
print(f"Polaris URL:   {polaris_url}")
print(f"Catalog:       {catalog_name}")
print(f"has_principal: {has_principal}")
print(f"has_l2c_state: {has_l2c_state}")
if has_l2c_state and state:
    aws = state.get("aws", {})
    sf = state.get("snowflake", {})
    print(f"AWS bucket:    {aws.get('bucket', 'N/A')}")
    print(f"AWS region:    {aws.get('region', 'N/A')}")
    print(f"SF database:   {sf.get('database', 'N/A')}")
    print(f"SF schema:     {sf.get('schema', 'N/A')}")
    print(f"SF SA_ROLE:    {sf.get('sa_role', 'N/A')}")

## 1. Local Inventory

Discover all namespaces and tables in the local Polaris catalog via the Iceberg REST API.
These are the tables available for migration.

> Skill action: `l2c-inventory`

In [None]:
from urllib.parse import quote

import pandas as pd
import requests

# Seconds to wait on each Polaris REST call. Without a timeout `requests`
# blocks indefinitely when the local cluster is unreachable.
REQUEST_TIMEOUT = 30


def _current_schema(metadata):
    """Return the schema dict the table currently uses, or None if it has none.

    Iceberg table metadata keeps every historical schema in `schemas` and names
    the active one through `current-schema-id`; the last list entry is only
    used as a fallback when that id is absent or matches nothing.
    """
    schemas = metadata.get("schemas", [])
    if not schemas:
        return None
    current_id = metadata.get("current-schema-id")
    if current_id is not None:
        for schema in schemas:
            if schema.get("schema-id") == current_id:
                return schema
    return schemas[-1]


# `df` has to be the last *top-level* expression of the cell for Jupyter to
# render it; an expression nested inside if/else is evaluated but never shown.
# It stays None (which renders nothing) when the cell is skipped.
df = None
if not has_principal:
    print("\u23ed\ufe0f  Skipping -- Polaris credentials not available.")
    print("   Run: task start WORK_DIR=<your-project>")
else:
    # Exchange the client credentials for a bearer token.
    token_resp = requests.post(
        f"{polaris_url}/api/catalog/v1/oauth/tokens",
        data={
            "grant_type": "client_credentials",
            "client_id": client_id,
            "client_secret": client_secret,
            "scope": "PRINCIPAL_ROLE:ALL",
        },
        headers={"Polaris-Realm": realm},
        timeout=REQUEST_TIMEOUT,
    )
    token_resp.raise_for_status()
    token = token_resp.json()["access_token"]
    headers = {"Authorization": f"Bearer {token}", "Content-Type": "application/json"}
    base = f"{polaris_url}/api/catalog"

    rows = []
    ns_resp = requests.get(
        f"{base}/v1/{catalog_name}/namespaces",
        headers=headers,
        timeout=REQUEST_TIMEOUT,
    )
    ns_resp.raise_for_status()
    for ns_parts in ns_resp.json().get("namespaces", []):
        # Dotted form is for display only. In URL paths the Iceberg REST spec
        # separates namespace levels with the 0x1F unit separator, which is
        # identical to the dotted form for single-level namespaces.
        ns_name = ".".join(ns_parts)
        ns_path = quote("\x1f".join(ns_parts), safe="")
        tbl_resp = requests.get(
            f"{base}/v1/{catalog_name}/namespaces/{ns_path}/tables",
            headers=headers,
            timeout=REQUEST_TIMEOUT,
        )
        tbl_resp.raise_for_status()
        for ident in tbl_resp.json().get("identifiers", []):
            table_name = ident["name"]
            meta_resp = requests.get(
                f"{base}/v1/{catalog_name}/namespaces/{ns_path}/tables/{quote(table_name, safe='')}",
                headers=headers,
                timeout=REQUEST_TIMEOUT,
            )
            meta_resp.raise_for_status()
            metadata = meta_resp.json().get("metadata", {})
            schema = _current_schema(metadata)
            rows.append({
                "namespace": ns_name,
                "table": table_name,
                "columns": len(schema.get("fields", [])) if schema else 0,
                "location": metadata.get("location", ""),
            })

    # Explicit columns keep the header visible even when the catalog is empty.
    df = pd.DataFrame(rows, columns=["namespace", "table", "columns", "location"])
df

## 2. Migration Status

Current state of the L2C pipeline -- AWS infrastructure, Snowflake resources,
and per-table sync/register status.

> Requires `plf l2c setup` to have been run.  
> Skill action: `l2c-status`

In [None]:
import pandas as pd

# `df` has to be the last *top-level* expression of the cell for Jupyter to
# render it; an expression nested inside if/else is evaluated but never shown.
# It stays None (which renders nothing) when there is no table status to show.
df = None
if not has_l2c_state:
    print("\u23ed\ufe0f  Skipping -- L2C not configured yet.")
    print("   Run: ./bin/plf l2c setup")
else:
    aws = state.get("aws", {})
    sf = state.get("snowflake", {})

    print("=== AWS ===")
    print(f"  Bucket:  {aws.get('bucket', 'N/A')}")
    print(f"  Region:  {aws.get('region', 'N/A')}")
    print(f"  Role:    {aws.get('role_arn', 'N/A')}")
    print()
    print("=== Snowflake ===")
    print(f"  SA_ROLE:         {sf.get('sa_role', 'N/A')}")
    print(f"  Ext Volume:      {sf.get('external_volume', 'N/A')}")
    print(f"  Catalog Int:     {sf.get('catalog_integration', 'N/A')}")
    print(f"  Database.Schema: {sf.get('database', 'N/A')}.{sf.get('schema', 'N/A')}")
    print()

    # One row per table tracked in the state file; a table with no recorded
    # sync/register step is reported as "pending".
    tables = state.get("tables", {})
    if tables:
        rows = [
            {
                "table": key,
                "namespace": info.get("namespace", ""),
                "sync_status": info.get("sync", {}).get("status", "pending"),
                "last_sync": info.get("sync", {}).get("last_sync", ""),
                "register_status": info.get("register", {}).get("status", "pending"),
            }
            for key, info in tables.items()
        ]
        df = pd.DataFrame(rows)
    else:
        print("No tables in state yet. Run: ./bin/plf l2c migrate")
df

## 3. Sync Verification

Compare object counts between local RustFS and the AWS S3 migration bucket.
A matching count indicates that all Iceberg data files and metadata were synced
(object contents are not compared).

> Requires `plf l2c migrate` to have been run.  
> Skill action: `l2c-verify`

In [None]:
import boto3
import pandas as pd
from botocore.config import Config as BotoConfig

def count_objects(s3_client, bucket_name, prefix=""):
    """Return how many objects in `bucket_name` have a key starting with `prefix`.

    Sums the `KeyCount` of every `list_objects_v2` page, so the result is not
    capped by the size of a single listing page.
    """
    paginator = s3_client.get_paginator("list_objects_v2")
    return sum(
        page.get("KeyCount", 0)
        for page in paginator.paginate(Bucket=bucket_name, Prefix=prefix)
    )


# `df` has to be the last *top-level* expression of the cell for Jupyter to
# render it; an expression nested inside if/else is evaluated but never shown.
# It stays None (which renders nothing) when the cell is skipped.
df = None
if not has_l2c_state:
    print("\u23ed\ufe0f  Skipping -- L2C not configured yet.")
    print("   Run: ./bin/plf l2c setup")
elif not state.get("aws", {}).get("bucket"):
    # An empty bucket name would only surface later as a boto3 parameter
    # validation error, so stop here with an actionable hint instead.
    print("\u23ed\ufe0f  Skipping -- no AWS bucket recorded in L2C state.")
    print("   Run: ./bin/plf l2c setup")
else:
    aws = state.get("aws", {})
    bucket = aws.get("bucket", "")
    region = aws.get("region", "us-east-1")
    profile = aws.get("profile", "default")

    # Local RustFS S3: explicit endpoint plus path-style addressing.
    rustfs_s3 = boto3.Session(
        aws_access_key_id=cfg.get("AWS_ACCESS_KEY_ID", "admin"),
        aws_secret_access_key=cfg.get("AWS_SECRET_ACCESS_KEY", "password"),
        region_name="us-east-1",
    ).client(
        "s3",
        endpoint_url=cfg.get("AWS_ENDPOINT_URL", "http://localhost:19000"),
        region_name="us-east-1",
        config=BotoConfig(s3={"addressing_style": "path"}),
    )

    # Real AWS S3, using the profile and region recorded in the L2C state.
    cloud_s3 = boto3.Session(profile_name=profile, region_name=region).client("s3", region_name=region)

    # The local bucket name is taken from POLARIS_CATALOG_NAME.
    local_bucket = cfg.get("POLARIS_CATALOG_NAME", "polardb")
    rows = []
    for key, info in state.get("tables", {}).items():
        ns = info.get("namespace", "")
        tbl = info.get("table", "")
        # The same "<namespace>/<table>/" prefix is counted on both sides.
        prefix = f"{ns}/{tbl}/"
        local_count = count_objects(rustfs_s3, local_bucket, prefix)
        s3_count = count_objects(cloud_s3, bucket, prefix)
        rows.append({
            "table": key,
            "local_objects": local_count,
            "s3_objects": s3_count,
            "match": "Yes" if local_count == s3_count else "NO",
        })

    df = pd.DataFrame(rows)
df

## 4. Query from Snowflake

Run the same count query both **locally** (PyIceberg) and in **Snowflake** to
confirm the data matches end-to-end.

Uses `snowflake-connector-python` with a named connection from
`~/.snowflake/connections.toml` -- no hardcoded credentials needed.

> Requires the local cluster to be running and `plf l2c migrate` to have been run.  
> Skill action: `l2c-query`

In [None]:
import pandas as pd

# `df` has to be the last *top-level* expression of the cell for Jupyter to
# render it; an expression nested inside if/else is evaluated but never shown.
# It stays None (which renders nothing) when the cell is skipped.
df = None
if not has_l2c_state or not has_principal:
    # The local side of the comparison reads through Polaris, so this cell
    # needs the Polaris credentials as well as the L2C state.
    missing = []
    if not has_principal:
        missing.append("Polaris credentials (run: task start)")
    if not has_l2c_state:
        missing.append("L2C state (run: ./bin/plf l2c setup)")
    print(f"\u23ed\ufe0f  Skipping -- missing: {', '.join(missing)}")
else:
    import snowflake.connector
    from pyiceberg.catalog import load_catalog

    sf = state["snowflake"]
    database = sf["database"]
    schema = sf["schema"]

    # Only tables whose Snowflake registration finished can be queried there.
    registered = {
        key: info
        for key, info in state.get("tables", {}).items()
        if info.get("register", {}).get("status") == "done"
    }

    if not registered:
        print("No registered tables found. Run: ./bin/plf l2c migrate")
    else:
        # Built once, outside the loop: the catalog handle does not depend on
        # the table being counted. The REST endpoints live under
        # `/api/catalog` (the same base the inventory cell calls), and
        # PyIceberg appends `/v1/...` to `uri` itself.
        local_catalog = load_catalog(
            "polaris",
            **{
                "type": "rest",
                "uri": f"{polaris_url}/api/catalog",
                "credential": f"{client_id}:{client_secret}",
                "warehouse": catalog_name,
                "scope": "PRINCIPAL_ROLE:ALL",
                "header.X-Iceberg-Access-Delegation": "vended-credentials",
                "header.Polaris-Realm": realm,
            },
        )

        conn = snowflake.connector.connect(
            connection_name=cfg.get("SNOWFLAKE_DEFAULT_CONNECTION_NAME", "default"),
            role=sf["sa_role"],
            database=database,
            schema=schema,
        )

        rows = []
        try:
            for key, info in registered.items():
                sf_table = key.upper()

                # Snowflake count. Identifiers come from the local L2C state
                # file, not from user input.
                with conn.cursor() as cur:
                    cur.execute(f"SELECT COUNT(*) FROM {database}.{schema}.{sf_table}")
                    sf_count = cur.fetchone()[0]

                # Local count via PyIceberg (materialises the table as Arrow).
                ns = info["namespace"]
                tbl = info["table"]
                iceberg_tbl = local_catalog.load_table(f"{ns}.{tbl}")
                local_count = len(iceberg_tbl.scan().to_arrow())

                rows.append({
                    "table": sf_table,
                    "local_count": local_count,
                    "snowflake_count": sf_count,
                    "match": "Yes" if local_count == sf_count else "NO",
                })
        finally:
            # Release the Snowflake session even when a query or scan fails.
            conn.close()

        df = pd.DataFrame(rows)
df

## 5. Reset and Reload (Demo Reset)

Drop and recreate the local catalog to regenerate sample data, then clear
the S3 bucket and Snowflake tables. This lets you re-run the full L2C
migration demo **without** tearing down the k3d cluster.

> Skill action: `l2c-reset`

In [None]:
import subprocess

# Demo reset: rebuild the local catalog, then (when L2C is configured) wipe the
# cloud side. Every CLI call uses check=True, so a failing step aborts the cell.
if has_principal:
    plf = str(project_root / "bin" / "plf")

    print("=== Step 1: Drop and recreate catalog ===")
    # Cleanup must finish before setup recreates the catalog.
    for catalog_args in (["catalog", "cleanup", "--yes"], ["catalog", "setup"]):
        subprocess.run([plf, *catalog_args], cwd=project_root, check=True)
    print("Catalog recreated with sample data.")
    print()

    if not has_l2c_state:
        print("L2C not configured -- skipping S3/Snowflake clear.")
    else:
        print("=== Step 2: Clear S3 + Snowflake tables ===")
        subprocess.run([plf, "l2c", "clear", "--yes"], cwd=project_root, check=True)
        print("S3 objects and Snowflake tables cleared.")

    print()
    print("Ready to re-run L2C migration demo.")
else:
    print("\u23ed\ufe0f  Skipping -- Polaris credentials not available.")
    print("   Run: task start WORK_DIR=<your-project>")

## 6. Incremental Update

Demonstrate the day-2 workflow:

1. Insert new rows into the **local** Polaris table
2. Run `plf l2c update --force` to sync changes to S3 and refresh Snowflake
3. Query Snowflake to confirm the new rows appear

This proves that the L2C bridge works for ongoing development, not just
one-shot migrations.

> Requires a running local cluster and at least one table registered via `plf l2c migrate`.  
> Skill action: `l2c-update`

In [None]:
import subprocess

import pandas as pd
import pyarrow as pa

# `df` has to be the last *top-level* expression of the cell for Jupyter to
# render it; an expression nested inside if/else is evaluated but never shown.
# It stays None (which renders nothing) when the cell is skipped.
df = None
if not has_l2c_state or not has_principal:
    missing = []
    if not has_principal:
        missing.append("Polaris credentials (run: task start)")
    if not has_l2c_state:
        missing.append("L2C state (run: ./bin/plf l2c setup)")
    print(f"\u23ed\ufe0f  Skipping -- missing: {', '.join(missing)}")
else:
    from pyiceberg.catalog import load_catalog
    import snowflake.connector

    sf = state["snowflake"]
    database = sf["database"]
    schema = sf["schema"]

    # Pick the first registered table for the demo.
    demo_key, demo_info = next(
        (
            (key, info)
            for key, info in state.get("tables", {}).items()
            if info.get("register", {}).get("status") == "done"
        ),
        (None, None),
    )

    if not demo_key:
        print("No registered tables found. Run: ./bin/plf l2c migrate")
    else:
        ns = demo_info["namespace"]
        tbl = demo_info["table"]
        sf_table = demo_key.upper()

        # Connect to the local catalog. The REST endpoints live under
        # `/api/catalog` (the same base the inventory cell calls), and
        # PyIceberg appends `/v1/...` to `uri` itself.
        local_catalog = load_catalog(
            "polaris",
            **{
                "type": "rest",
                "uri": f"{polaris_url}/api/catalog",
                "credential": f"{client_id}:{client_secret}",
                "warehouse": catalog_name,
                "scope": "PRINCIPAL_ROLE:ALL",
                "header.X-Iceberg-Access-Delegation": "vended-credentials",
                "header.Polaris-Realm": realm,
            },
        )
        iceberg_tbl = local_catalog.load_table(f"{ns}.{tbl}")

        # One scan serves both the "before" count and the rows to re-append.
        existing = iceberg_tbl.scan().to_arrow()
        before_count = len(existing)

        # Insert a few demo rows by re-appending up to three existing rows.
        sample = existing.slice(0, min(3, before_count))
        iceberg_tbl.append(sample)
        after_local = len(iceberg_tbl.scan().to_arrow())
        print(f"Local {ns}.{tbl}: {before_count} -> {after_local} rows")

        # Sync to cloud; check=True aborts the cell if the CLI fails.
        plf = str(project_root / "bin" / "plf")
        print("\nRunning: plf l2c update --force --yes")
        subprocess.run([plf, "l2c", "update", "--force", "--yes"], cwd=project_root, check=True)

        # Query Snowflake for the post-update row count.
        conn = snowflake.connector.connect(
            connection_name=cfg.get("SNOWFLAKE_DEFAULT_CONNECTION_NAME", "default"),
            role=sf["sa_role"],
            database=database,
            schema=schema,
        )
        try:
            with conn.cursor() as cur:
                cur.execute(f"SELECT COUNT(*) FROM {database}.{schema}.{sf_table}")
                sf_count = cur.fetchone()[0]
        finally:
            # Release the Snowflake session even when the query fails.
            conn.close()

        df = pd.DataFrame([{
            "table": sf_table,
            "before": before_count,
            "after_local": after_local,
            "after_snowflake": sf_count,
            "match": "Yes" if after_local == sf_count else "NO",
        }])
df