In [None]:
import os
from pathlib import Path
import shutil

import teehr

# Evaluate the installed teehr version as the last expression of the import cell.
teehr.__version__

In [None]:
from teehr.evaluation.spark_session_utils import create_spark_session

In [None]:
# Session settings gathered in one place so they are easy to scan and tune.
spark_session_kwargs = {
    "start_spark_cluster": True,
    "executor_instances": 20,
    "executor_memory": "16g",
    "executor_cores": 4,
}
spark = create_spark_session(**spark_session_kwargs)

### Describe tables

In [None]:
%%time
query = f"""
    DESCRIBE TABLE iceberg.teehr.primary_timeseries
"""
spark.sql(query).show()

In [None]:
%%time
query = f"""
    DESCRIBE TABLE iceberg.teehr.secondary_timeseries
"""
spark.sql(query).show()

### Expire snapshots

In [None]:
# Cutoff timestamp interpolated into the expire_snapshots calls in the cells below.
snapshot_expiry_date = "2025-12-01 12:00:00.000"

Primary

In [None]:
%%time
query = f"""
    SELECT * FROM iceberg.teehr.primary_timeseries.snapshots;
"""
sdf = spark.sql(query)
print(f"Num snapshots in primary_timeseries: {sdf.count()}")

In [None]:
%%time
query = f"""
    CALL iceberg.system.expire_snapshots('teehr.primary_timeseries', TIMESTAMP '{snapshot_expiry_date}', 1);
"""
spark.sql(query).show()

Secondary

In [None]:
%%time
query = f"""
    SELECT * FROM iceberg.teehr.secondary_timeseries.snapshots;
"""
sdf = spark.sql(query)
print(f"Num snapshots in secondary_timeseries: {sdf.count()}")

In [None]:
%%time
query = f"""
    CALL iceberg.system.expire_snapshots('teehr.secondary_timeseries', TIMESTAMP '{snapshot_expiry_date}', 1);
"""
spark.sql(query).show()

### Rewrite data files

Note: We could also pass `strategy => 'sort'` together with a `sort_order` here — either a z-order (e.g. `'zorder(col1, col2)'`) or a regular column ordering (e.g. `'col1 ASC, col2 DESC'`).

In [None]:
%%time
query = f"""
    CALL iceberg.system.rewrite_data_files('teehr.primary_timeseries');
"""
spark.sql(query)

In [None]:
%%time
query = f"""
    CALL iceberg.system.rewrite_data_files('teehr.secondary_timeseries');
"""
spark.sql(query).show()

## Other things to try (the cells below have not been run)

### Remove orphan files

In [None]:
%%time
query = f"""
    CALL iceberg.system.remove_orphan_files('teehr.primary_timeseries');
"""
spark.sql(query).show()

### Partitioning

Primary

In [None]:
%%time
query = f"""
    --ALTER TABLE iceberg.teehr.primary_timeseries ADD PARTITION FIELD location_id
    ALTER TABLE iceberg.teehr.primary_timeseries DROP PARTITION FIELD location_id
"""
spark.sql(query).show()

Secondary Timeseries

In [None]:
%%time
query = f"""
    -- ALTER TABLE iceberg.teehr.secondary_timeseries ADD PARTITION FIELD location_id
    ALTER TABLE iceberg.teehr.secondary_timeseries DROP PARTITION FIELD location_id
"""
spark.sql(query).show()

### Rewrite manifests

In [None]:
%%time
query = f"""
    CALL iceberg.system.rewrite_manifests('teehr.primary_timeseries');
"""
spark.sql(query).show()

## Explore the warehouse

In [None]:
%%time
dir_path = "/data/temp_warehouse"

ev = teehr.Evaluation(
    spark=spark,
    dir_path=dir_path,
    create_dir=False
)
ev.set_active_catalog("remote")
ev.active_catalog

In [None]:
# Count the rows of primary_timeseries, then scale by 1e-9 to express the count in billions.
primary_row_count = ev.primary_timeseries.to_sdf().count()
primary_row_count * 1e-9  # billion

In [None]:
# Count the rows of secondary_timeseries, then scale by 1e-9 to express the count in billions.
secondary_row_count = ev.secondary_timeseries.to_sdf().count()
secondary_row_count * 1e-9  # billion

## Stop Spark

In [None]:
# Stop the `spark` object returned by create_spark_session earlier in the notebook.
spark.stop()