In [None]:
import os
from pathlib import Path

import pandas as pd
import botocore.session
from botocore import UNSIGNED
from botocore.config import Config

from teehr.evaluation.spark_session_utils import create_spark_session

In [None]:
%%time
# Create the Spark session with TEEHR's helper; it is used further down to run
# the Iceberg `register_table` CALL statements.
spark = create_spark_session()

In [None]:
bucket_name = "dev-teehr-iceberg-warehouse"

# Build an S3 client from the default botocore session. No explicit config or
# keys are passed to create_client, so botocore resolves credentials/region on
# its own (NOTE(review): the anonymous/UNSIGNED option below is disabled, so
# this is not anonymous "public bucket" access — confirm which is intended).
session = botocore.session.get_session()
# NOTE(review): `creds` is only referenced by the commented-out keyword
# arguments below; it is otherwise unused in the cells visible here.
creds = session.get_credentials()

s3 = session.create_client(
    's3',
    # Alternative client options, currently disabled:
    # config=Config(signature_version=UNSIGNED),
    # region_name="us-east-2",
    # aws_access_key_id=creds.access_key,
    # aws_secret_access_key=creds.secret_key
)

Get a list of all table prefixes

In [None]:
# List the "directories" directly under the `teehr/` prefix. With
# Delimiter='/', S3 groups keys below that prefix into CommonPrefixes, giving
# one entry per table. A paginator is used so the listing is not silently
# truncated to a single page of results.
prefix_pages = s3.get_paginator("list_objects_v2").paginate(
    Bucket=bucket_name,
    Prefix="teehr/",
    Delimiter='/',
)

table_prefixes = [
    common_prefix["Prefix"]
    for page in prefix_pages
    # "CommonPrefixes" is missing from a page when nothing matches the prefix.
    for common_prefix in page.get("CommonPrefixes", [])
]

table_prefixes

For each table:
- Get a dataframe of metadata .json files and associated last modified times
- Get the path to the most recent metadata .json file
- Create the SQL to register that file
- Execute the SQL with the spark session

In [None]:
# Preview how a table prefix splits into parts (namespace first, table name
# second). The first listed prefix is used because `table_prefix` is only
# defined by the loop in the following cell, so referencing it here fails on a
# fresh kernel.
table_prefixes[0].split("/") if table_prefixes else []

In [None]:
# Set to False to actually execute the CALL statements; with True the cell only
# prints the SQL it would run.
DRY_RUN = True

METADATA_SUFFIX = ".metadata.json"


def _metadata_version(key):
    """Return the Iceberg metadata version number encoded in an S3 key.

    Iceberg names table metadata files `<V>-<uuid>.metadata.json` (or
    `v<V>.metadata.json`), where `V` increases with every commit.

    Parameters
    ----------
    key : str
        Full S3 object key.

    Returns
    -------
    int or None
        The version number, or None when the key is not a table metadata file
        or carries no parseable version.
    """
    file_name = key.rsplit("/", 1)[-1]
    if not file_name.endswith(METADATA_SUFFIX):
        return None
    stem = file_name[: -len(METADATA_SUFFIX)]
    version_text = stem.split("-", 1)[0]
    if version_text.startswith("v"):
        version_text = version_text[1:]
    return int(version_text) if version_text.isdecimal() else None


metadata_paginator = s3.get_paginator("list_objects_v2")

for table_prefix in table_prefixes:
    metadata_prefix = f"{table_prefix}metadata/"

    # Paginate: a metadata directory also holds manifest files, so a single
    # 100-key listing can stop before reaching the newest metadata file.
    candidates = []
    for page in metadata_paginator.paginate(
        Bucket=bucket_name,
        Prefix=metadata_prefix,
        Delimiter='/',
    ):
        # "Contents" is missing from a page when no object matches.
        for content in page.get("Contents", []):
            version = _metadata_version(content["Key"])
            if version is not None:
                candidates.append(
                    (version, content["LastModified"], content["Key"])
                )

    if not candidates:
        print(f"Skipping {table_prefix}: no {METADATA_SUFFIX} files found")
        continue

    # Pick the highest metadata version rather than the newest S3 timestamp:
    # LastModified reflects when the object was written to S3, which need not
    # follow commit order. LastModified only breaks ties between equal versions.
    latest_json = max(candidates)[2]

    namespace, table_name = table_prefix.split("/")[:2]
    sql = (f"""
        CALL iceberg.system.register_table(
            table => '{namespace}.{table_name}',
            metadata_file => 's3://{bucket_name}/{latest_json}'
        )
    """)

    print()
    print(sql)

    if not DRY_RUN:
        spark.sql(sql)
        print(f"Registered table: {namespace}.{table_name}")

if not DRY_RUN:
    print("Registering complete!")

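Previous approach: register each table against a hand-picked metadata file. Note that these files are not necessarily the ones with the most recent `last_modified` timestamp in S3, so selecting by `last_modified` can give a different result.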

In [None]:
# Register every table against a pinned metadata file using spark.sql().
# The mapping is table name -> metadata file name inside
# s3://dev-teehr-iceberg-warehouse/teehr/<table>/metadata/ ; one CALL statement
# is issued per entry, in the order listed.
pinned_metadata_files = {
    "attributes": "00002-75df852d-cac0-4703-9c1b-5a12aaca4e1d.metadata.json",
    "configurations": "00007-db9c8db5-fb54-4e86-a544-2f0c84bdc316.metadata.json",
    "forecast_metrics_by_location": "00050-427136bb-dd56-45e3-a9e1-312b36755043.metadata.json",
    "joined_forecast_timeseries": "00049-f63daf35-34da-4a26-bb4c-c1792936eb11.metadata.json",
    "joined_simulation_timeseries": "00000-54424606-ccfa-44e9-9470-554b98604e05.metadata.json",
    "joined_timeseries": "00000-c0cabb5e-6a44-49fb-a26e-e297fc175cad.metadata.json",
    "location_attributes": "00001-13ce6096-f90a-43cc-963b-f3818c62f224.metadata.json",
    "location_crosswalks": "00004-a8f9a516-2fbe-478f-8aba-90dae9b912e6.metadata.json",
    "locations": "00001-e5ae5ca7-d25d-4361-a342-58fa503497f8.metadata.json",
    "primary_timeseries": "00132-430a003b-5fd0-4f73-a096-9b854b650659.metadata.json",
    "secondary_timeseries": "00152-f364173a-4011-4a6c-bf18-690efdd82ad3.metadata.json",
    "sim_metrics_by_location": "00001-bf895198-40cc-478e-84e5-07737b33d7ca.metadata.json",
    "units": "00002-17b48bbe-208a-4568-b68d-58ca65313368.metadata.json",
    "variables": "00002-613e7158-61bf-4f57-9d7c-315c89339744.metadata.json",
}

for pinned_table, pinned_file in pinned_metadata_files.items():
    spark.sql(f"""
CALL iceberg.system.register_table(
  table => 'teehr.{pinned_table}',
  metadata_file => 's3://dev-teehr-iceberg-warehouse/teehr/{pinned_table}/metadata/{pinned_file}'
)
""")