In [None]:
import os
from pathlib import Path
import shutil

import hvplot.pandas
import holoviews as hv

import teehr

# Shared look for every hv.Curve drawn in this notebook: tinted background,
# faint white grid lines, and a 1000 x 300 frame.
curve_style = dict(
    bgcolor="#e7e9ecb8",
    show_grid=True,
    gridstyle={'grid_line_alpha': 0.5, 'grid_line_color': 'white'},
    frame_width=1000,
    frame_height=300,
)
hv.opts.defaults(hv.opts.Curve(**curve_style))

# Display the installed TEEHR version as the cell output.
teehr.__version__

In [None]:
from teehr.evaluation.spark_session_utils import create_spark_session

In [None]:
# Session used by the maintenance cells below: a Spark cluster is started with
# 20 executors, each given 4 cores and 16g of memory.
cluster_settings = {
    "start_spark_cluster": True,
    "executor_instances": 20,
    "executor_memory": "16g",
    "executor_cores": 4,
}
spark = create_spark_session(**cluster_settings)

### Describe tables

In [None]:
%%time
# Print the DESCRIBE TABLE output for the primary timeseries table.
query = """
    DESCRIBE TABLE iceberg.teehr.primary_timeseries
"""
primary_description_sdf = spark.sql(query)
primary_description_sdf.show()

In [None]:
%%time
# Print the DESCRIBE TABLE output for the secondary timeseries table.
query = """
    DESCRIBE TABLE iceberg.teehr.secondary_timeseries
"""
secondary_description_sdf = spark.sql(query)
secondary_description_sdf.show()

### Expire snapshots

In [None]:
# Cutoff that is interpolated into the TIMESTAMP literal of both
# expire_snapshots calls below (primary and secondary tables).
snapshot_expiry_date = "2025-12-01 12:00:00.000"

Primary

In [None]:
%%time
# Read the snapshots metadata table of primary_timeseries and report how many
# snapshots it currently lists.
query = """
    SELECT * FROM iceberg.teehr.primary_timeseries.snapshots;
"""
sdf = spark.sql(query)
num_primary_snapshots = sdf.count()
print(f"Num snapshots in primary_timeseries: {num_primary_snapshots}")

In [None]:
%%time
# Expire primary_timeseries snapshots using the shared cutoff date.
# Arguments are positional: table, cutoff timestamp, then 1
# (presumably Iceberg's retain_last; confirm against the procedure docs).
query = f"""
    CALL iceberg.system.expire_snapshots('teehr.primary_timeseries', TIMESTAMP '{snapshot_expiry_date}', 1);
"""
expire_primary_sdf = spark.sql(query)
expire_primary_sdf.show()

Secondary

In [None]:
%%time
# Read the snapshots metadata table of secondary_timeseries and report how
# many snapshots it currently lists.
query = """
    SELECT * FROM iceberg.teehr.secondary_timeseries.snapshots;
"""
sdf = spark.sql(query)
num_secondary_snapshots = sdf.count()
print(f"Num snapshots in secondary_timeseries: {num_secondary_snapshots}")

In [None]:
%%time
# Expire secondary_timeseries snapshots using the shared cutoff date.
# Arguments are positional: table, cutoff timestamp, then 1
# (presumably Iceberg's retain_last; confirm against the procedure docs).
query = f"""
    CALL iceberg.system.expire_snapshots('teehr.secondary_timeseries', TIMESTAMP '{snapshot_expiry_date}', 1);
"""
expire_secondary_sdf = spark.sql(query)
expire_secondary_sdf.show()

### Rewrite data files

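Note: We could also pass `strategy => 'sort'` together with a `sort_order` here, either a z-order such as `'zorder(location_id, value_time)'` or plain column orderings such as `'location_id ASC, value_time DESC'`.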

In [None]:
%%time
# Compact the data files of primary_timeseries with the procedure's default
# settings. The result is shown, matching the secondary_timeseries cell, so
# the summary row returned by the procedure is printed instead of only the
# DataFrame repr.
query = """
    CALL iceberg.system.rewrite_data_files('teehr.primary_timeseries');
"""
spark.sql(query).show()

In [None]:
%%time
# Compact the data files of secondary_timeseries with the procedure's default
# settings and print the returned summary.
query = """
    CALL iceberg.system.rewrite_data_files('teehr.secondary_timeseries');
"""
rewrite_secondary_sdf = spark.sql(query)
rewrite_secondary_sdf.show()

## Other things to try - Did not run

### Remove orphan files

In [None]:
%%time
# Call remove_orphan_files on primary_timeseries with no extra arguments and
# print whatever the procedure returns.
query = """
    CALL iceberg.system.remove_orphan_files('teehr.primary_timeseries');
"""
orphan_files_sdf = spark.sql(query)
orphan_files_sdf.show()

### Partitioning

Primary

In [None]:
%%time
# NOTE: as written, running this cell DROPs the location_id partition field
# from primary_timeseries. The ADD variant is kept, disabled, as a SQL comment
# inside the query string.
query = """
    --ALTER TABLE iceberg.teehr.primary_timeseries ADD PARTITION FIELD location_id
    ALTER TABLE iceberg.teehr.primary_timeseries DROP PARTITION FIELD location_id
"""
alter_primary_sdf = spark.sql(query)
alter_primary_sdf.show()

Secondary Timeseries

In [None]:
%%time
# NOTE: as written, running this cell DROPs the location_id partition field
# from secondary_timeseries. The ADD variant is kept, disabled, as a SQL
# comment inside the query string.
query = """
    -- ALTER TABLE iceberg.teehr.secondary_timeseries ADD PARTITION FIELD location_id
    ALTER TABLE iceberg.teehr.secondary_timeseries DROP PARTITION FIELD location_id
"""
alter_secondary_sdf = spark.sql(query)
alter_secondary_sdf.show()

### Rewrite manifests

In [None]:
%%time
# Call rewrite_manifests on primary_timeseries and print the returned summary.
query = """
    CALL iceberg.system.rewrite_manifests('teehr.primary_timeseries');
"""
rewrite_manifests_sdf = spark.sql(query)
rewrite_manifests_sdf.show()

## Explore the warehouse and test some queries

In [None]:
%%time
# Rebind `spark` to a session created with default arguments (no cluster
# options), replacing the handle to the session created at the top of the
# notebook.
# NOTE(review): the earlier session is never stopped before this call. If
# create_spark_session reuses an active session, the cluster executors may
# still be attached here. Confirm, or call spark.stop() first.
spark = create_spark_session()

In [None]:
%%time
# Open the Evaluation at an existing directory (create_dir=False), switch it
# to the "remote" catalog, and display the catalog that is now active.
dir_path = "/data/temp_warehouse"
evaluation_settings = dict(spark=spark, dir_path=dir_path, create_dir=False)

ev = teehr.Evaluation(**evaluation_settings)
ev.set_active_catalog("remote")
ev.active_catalog

In [None]:
# Row count of primary_timeseries, displayed in billions.
primary_row_count = ev.primary_timeseries.to_sdf().count()
primary_row_count * 1e-9

In [None]:
# Row count of secondary_timeseries, displayed in billions.
secondary_row_count = ev.secondary_timeseries.to_sdf().count()
secondary_row_count * 1e-9

Look at table properties

In [None]:
# Show the table properties of primary_timeseries.
# truncate=False keeps full keys and values visible: show() otherwise cuts
# every cell to 20 characters, which clips property names such as
# 'write.metadata.delete-after-commit.enabled'.
spark.sql("SHOW TBLPROPERTIES iceberg.teehr.primary_timeseries").show(truncate=False)

Note: We could also set these table properties to allow old metadata to be deleted:
```text
write.metadata.delete-after-commit.enabled=true  # default is false
write.metadata.previous-versions-max=<some number> # default is 100
```
Example (untested):
```python
spark.sql("""
ALTER TABLE iceberg.teehr.primary_timeseries SET TBLPROPERTIES (
    'write.metadata.delete-after-commit.enabled' = 'true',
    'write.metadata.previous-versions-max' = '10'
)
""")
```

In [None]:
# Hand-picked gages that seemed interesting. The bare gage numbers are listed
# once and the "usgs-" prefix is applied when building the location ids.
gage_numbers = (
    "02424000",
    "03068800",
    "01570500",
    "01347000",
    "05443500",
    "06770500",
    "08313000",
    "11421000",
    "14319500",
)
usgs_gages = [f"usgs-{number}" for number in gage_numbers]

This currently takes about 6 secs on the small VM with no spark executors:

In [None]:
%%time
# Pull the primary timeseries rows for a single location into pandas and
# display how many rows came back.
primary_filter = "location_id = 'usgs-02424000'"
primary_df = ev.primary_timeseries.filter(primary_filter).to_pandas()
len(primary_df)

This one takes about 11 secs on the small VM with no spark executors:

In [None]:
%%time
# Pull the secondary timeseries rows for one configuration and one location
# into pandas and display how many rows came back.
secondary_filters = [
    "configuration_name = 'nwm30_retrospective'",
    "location_id = 'nwm30-21661814'",
]
secondary_df = ev.secondary_timeseries.filter(secondary_filters).to_pandas()
len(secondary_df)

Take a look at a plot:

In [None]:
# Both line plots share the same axes, grouping column and legend setting;
# only the color (and the alpha of the secondary series) differs.
shared_line_options = dict(
    x='value_time',
    y='value',
    by='configuration_name',
    legend=True,
)

primary_plot = primary_df.hvplot.line(color='blue', **shared_line_options)
secondary_plot = secondary_df.hvplot.line(
    color='magenta',
    alpha=0.75,
    **shared_line_options,
)

# Overlay the two plots and title the combined figure.
timeseries_overlay = primary_plot * secondary_plot
timeseries_overlay.opts(
    title="Observed and Simulated Timeseries at usgs-02424000"
)

## Stop Spark

In [86]:
# Stop the Spark session bound to `spark` now that the notebook is finished.
spark.stop()