In [None]:
import os
from pathlib import Path
import logging
import shutil
import time
import gc
import glob
import re

from pyspark.sql import functions as F
from pyspark.sql.types import TimestampType
import holoviews as hv
import hvplot.pandas
import xarray as xr
import fsspec
import rioxarray
import rasterio
import geopandas as gpd
import numpy as np
import pandas as pd

import teehr
from teehr.evaluation.spark_session_utils import create_spark_session

# Frame dimensions shared by every line plot in this notebook.
LINE_PLOT_WIDTH = 600
LINE_PLOT_HEIGHT = 300

logger = logging.getLogger(__name__)

# Build the Curve styling once, then register it as the notebook-wide
# default so individual plots do not have to repeat these options.
curve_defaults = hv.opts.Curve(
    frame_height=LINE_PLOT_HEIGHT,
    frame_width=LINE_PLOT_WIDTH,
    gridstyle={'grid_line_alpha': 0.5, 'grid_line_color': 'white'},
    show_grid=True,
    bgcolor="#e7e9ecb8",
)
hv.opts.defaults(curve_defaults)

# Last expression of the cell: displays the installed teehr version.
teehr.__version__

In [None]:
%%time
# ~2 pods/node
NUM_EXECUTORS = 4
NUM_CORES = 6
EXECUTOR_MEMORY = "50g"

spark = create_spark_session(
    start_spark_cluster=True,
    executor_instances=NUM_EXECUTORS,
    executor_memory=EXECUTOR_MEMORY,
    executor_cores=NUM_CORES,
    aws_profile="admin-user"
)

# spark = create_spark_session()

dir_path = "/data/playground/slamont/teehr/warehouse/sedona/usgs_basins_map"

# USE EXISTING:
ev = teehr.Evaluation(
    spark=spark,
    dir_path=dir_path,
    create_dir=False
)

In [None]:
# Look up the "primary_timeseries_final" table on the evaluation, then
# convert it with to_sdf() (presumably a Spark DataFrame -- confirm).
primary_final_table = ev.table(table_name="primary_timeseries_final")
sdf = primary_final_table.to_sdf()

In [None]:
# Display the row count of sdf as a quick size check on the loaded table.
sdf.count()

In [None]:
# Preview the first 5 rows; truncate=False asks for untruncated column values.
sdf.show(n=5, truncate=False)

In [None]:
# Make "remote" the evaluation's active catalog, then display the active
# catalog to confirm the switch.
# NOTE(review): presumably later reads/writes through `ev` target this
# catalog -- confirm against the teehr documentation.
ev.set_active_catalog("remote")
ev.active_catalog

In [None]:
%%time
# Write sdf into the "primary_timeseries" table of the warehouse using
# append mode.
# NOTE(review): with write_mode="append", re-running this cell presumably
# appends the same rows a second time -- confirm whether duplicates are
# possible and whether a different write mode is more appropriate.
ev.write.to_warehouse(
    source_data=sdf,
    table_name="primary_timeseries",
    write_mode="append",
)