In [None]:
import os
from pathlib import Path
import shutil

import teehr

from teehr.evaluation.spark_session_utils import create_spark_session

In [None]:
# Create the Spark session used for every S3 read in this notebook.
# Object-store credentials are taken from the environment when set, so real
# secrets never have to be committed with the notebook; the literal fallbacks
# preserve the previous behavior when the variables are absent.
# NOTE(review): the fallbacks look like default local MinIO credentials —
# confirm they are only ever used against a local/dev object store.
spark = create_spark_session(
    aws_access_key_id=os.environ.get("MINIO_ROOT_USER", "minioadmin"),
    aws_secret_access_key=os.environ.get("MINIO_ROOT_PASSWORD", "minioadmin123"),
)

In [None]:
%%time
# Start every run from a clean warehouse directory: delete whatever is at
# dir_path (errors during removal are ignored), then build the Evaluation
# with create_dir=True.
dir_path = "/data/temp_warehouse"
shutil.rmtree(dir_path, ignore_errors=True)

ev = teehr.Evaluation(dir_path=dir_path, create_dir=True, spark=spark)

In [None]:
# Copy the local template via the evaluation's clone_template().
# NOTE(review): presumably this populates dir_path — confirm.
ev.clone_template()

In [None]:
# Run the evaluation's schema migration step before any table is written below.
ev.apply_schema_migration()

In [None]:
# Make "remote" the active catalog for this evaluation.
ev.set_active_catalog("remote")

# Bare last expression: the notebook displays the active catalog as the cell output.
ev.active_catalog

#### Attributes

In [None]:
# Reader options; later load cells in this notebook reuse this dict.
options = {"header": "true", "ignoreMissingFiles": "true"}

# Source directory and the schema taken from the evaluation's attributes table.
s3_dirpath = "s3a://ciroh-rti-public-data/teehr-data-warehouse/v0_4_evaluations/e3_usgs_hourly_streamflow/dataset/attributes/"
tbl = ev.attributes()
schema = tbl.schema_func().to_structtype()

# Read the CSV data with the explicit schema.
sdf = (
    spark.read
    .format("csv")
    .options(**options)
    .schema(schema)
    .load(s3_dirpath)
)

In [None]:
# Write the attributes DataFrame into the warehouse "attributes" table.
ev.write.to_warehouse(source_data=sdf, table_name="attributes")

#### Locations

In [None]:
# Source directory and the schema taken from the evaluation's locations table.
s3_dirpath = "s3a://ciroh-rti-public-data/teehr-data-warehouse/v0_4_evaluations/e3_usgs_hourly_streamflow/dataset/locations/"
tbl = ev.locations()
schema = tbl.schema_func().to_structtype()

# Read the parquet data with the explicit schema; `options` is the dict
# defined in the Attributes load cell.
sdf = (
    spark.read
    .format("parquet")
    .options(**options)
    .schema(schema)
    .load(s3_dirpath)
)

In [None]:
# Collect the Spark DataFrame into pandas; the subset of gages is selected
# from this pandas frame in a later cell.
df = sdf.toPandas()

In [None]:
# Handpicked sites that seemed interesting; every table loaded below is
# restricted to these gage ids (directly, or through the crosswalk).
usgs_gages = [
    "usgs-02424000", "usgs-03068800", "usgs-01570500",
    "usgs-01347000", "usgs-05443500", "usgs-06770500",
    "usgs-08313000", "usgs-11421000", "usgs-14319500",
]

In [None]:
# Keep only the rows whose "id" is one of the handpicked gages.
is_selected_gage = df["id"].isin(usgs_gages)
locs_df = df.loc[is_selected_gage]

In [None]:
# Write the filtered pandas locations frame into the warehouse "locations" table.
ev.write.to_warehouse(source_data=locs_df, table_name="locations")

#### Location attributes

In [None]:
# Source directory and the schema taken from the evaluation's
# location_attributes table.
s3_dirpath = "s3a://ciroh-rti-public-data/teehr-data-warehouse/v0_4_evaluations/e3_usgs_hourly_streamflow/dataset/location_attributes/"
tbl = ev.location_attributes()
schema = tbl.schema_func().to_structtype()

# Read the parquet data with the explicit schema; `options` is the dict
# defined in the Attributes load cell.
sdf = (
    spark.read
    .format("parquet")
    .options(**options)
    .schema(schema)
    .load(s3_dirpath)
)

In [None]:
# Restrict the rows to the handpicked gages.
gage_match = sdf["location_id"].isin(usgs_gages)
sdf = sdf.filter(gage_match)

In [None]:
# Write the filtered rows into the warehouse "location_attributes" table.
ev.write.to_warehouse(source_data=sdf, table_name="location_attributes")

#### Location crosswalks

In [None]:
# Source directory and the schema taken from the evaluation's
# location_crosswalks table.
s3_dirpath = "s3a://ciroh-rti-public-data/teehr-data-warehouse/v0_4_evaluations/e3_usgs_hourly_streamflow/dataset/location_crosswalks/"
tbl = ev.location_crosswalks()
schema = tbl.schema_func().to_structtype()

# Read the parquet data with the explicit schema; `options` is the dict
# defined in the Attributes load cell.
sdf = (
    spark.read
    .format("parquet")
    .options(**options)
    .schema(schema)
    .load(s3_dirpath)
)

In [None]:
# Keep only crosswalk rows whose primary location is one of the handpicked gages.
primary_match = sdf["primary_location_id"].isin(usgs_gages)
sdf = sdf.filter(primary_match)

In [None]:
# Write the filtered crosswalk rows into the warehouse "location_crosswalks" table.
ev.write.to_warehouse(source_data=sdf, table_name="location_crosswalks")

#### Units

In [None]:
# Source directory and the schema taken from the evaluation's units table.
s3_dirpath = "s3a://ciroh-rti-public-data/teehr-data-warehouse/v0_4_evaluations/e3_usgs_hourly_streamflow/dataset/units/"
tbl = ev.units()
schema = tbl.schema_func().to_structtype()

# Read the CSV data with the explicit schema; `options` is the dict defined
# in the Attributes load cell.
sdf = (
    spark.read
    .format("csv")
    .options(**options)
    .schema(schema)
    .load(s3_dirpath)
)

# Units are written unfiltered into the warehouse "units" table.
ev.write.to_warehouse(source_data=sdf, table_name="units")

#### Variables

In [None]:
# Source directory and the schema taken from the evaluation's variables table.
s3_dirpath = "s3a://ciroh-rti-public-data/teehr-data-warehouse/v0_4_evaluations/e3_usgs_hourly_streamflow/dataset/variables/"
tbl = ev.variables()
schema = tbl.schema_func().to_structtype()

# Read the CSV data with the explicit schema; `options` is the dict defined
# in the Attributes load cell.
sdf = (
    spark.read
    .format("csv")
    .options(**options)
    .schema(schema)
    .load(s3_dirpath)
)

# Variables are written unfiltered into the warehouse "variables" table.
ev.write.to_warehouse(source_data=sdf, table_name="variables")

#### Configurations

In [None]:
# Source directory and the schema taken from the evaluation's configurations table.
s3_dirpath = "s3a://ciroh-rti-public-data/teehr-data-warehouse/v0_4_evaluations/e3_usgs_hourly_streamflow/dataset/configurations/"
tbl = ev.configurations()
schema = tbl.schema_func().to_structtype()

# Read the CSV data with the explicit schema; `options` is the dict defined
# in the Attributes load cell.
sdf = (
    spark.read
    .format("csv")
    .options(**options)
    .schema(schema)
    .load(s3_dirpath)
)

# Configurations are written unfiltered into the warehouse "configurations" table.
ev.write.to_warehouse(source_data=sdf, table_name="configurations")

#### Primary Timeseries

In [None]:
# Source directory and the schema taken from the evaluation's
# primary_timeseries table.
s3_dirpath = "s3a://ciroh-rti-public-data/teehr-data-warehouse/v0_4_evaluations/e3_usgs_hourly_streamflow/dataset/primary_timeseries/"
tbl = ev.primary_timeseries()
schema = tbl.schema_func().to_structtype()

# Read the parquet data with the explicit schema; `options` is the dict
# defined in the Attributes load cell.
sdf = (
    spark.read
    .format("parquet")
    .options(**options)
    .schema(schema)
    .load(s3_dirpath)
)

# Restrict the timeseries to the handpicked gages before writing.
gage_match = sdf["location_id"].isin(usgs_gages)
sdf = sdf.filter(gage_match)

ev.write.to_warehouse(source_data=sdf, table_name="primary_timeseries")

#### Secondary Timeseries

In [None]:
%%time
# Source directory and the schema taken from the evaluation's
# secondary_timeseries table.
tbl = ev.secondary_timeseries()
schema = tbl.schema_func().to_structtype()
s3_dirpath = "s3a://ciroh-rti-public-data/teehr-data-warehouse/v0_4_evaluations/e3_usgs_hourly_streamflow/dataset/secondary_timeseries/"

# Read the parquet data with the explicit schema; `options` is the dict
# defined in the Attributes load cell.
sdf = (
    spark.read
    .format("parquet")
    .options(**options)
    .schema(schema)
    .load(s3_dirpath)
)

# Secondary ids to keep come from the crosswalk already stored in the
# evaluation, pulled into pandas.
# NOTE(review): the table is accessed as an attribute here but called as
# `ev.location_crosswalks()` in the crosswalk load cell — confirm both forms
# are supported.
xwalk_df = ev.location_crosswalks.to_pandas()
secondary_ids = xwalk_df["secondary_location_id"].tolist()

sdf = sdf.filter(sdf["location_id"].isin(secondary_ids))

ev.write.to_warehouse(source_data=sdf, table_name="secondary_timeseries")

In [None]:
# Sanity check after the write: count the rows of the evaluation's
# secondary_timeseries table (shown as the cell output).
ev.secondary_timeseries.to_sdf().count()