In [0]:
import requests
import pandas as pd
from pyspark.sql import functions as F
from datetime import timedelta

# =========================
# CONFIG
# =========================

# Zone code to ingest. It filters the "zone" column of the bootstrap CSV and
# selects which per-zone file is requested from the price API.
ZONE = "NO5"

# Landing file used for the one-time bootstrap of the bronze table.
csv_path = "/Volumes/electricity-project/landing/datasets/electricity_prices_all_zones.csv"

# Target location of the bronze table.
catalog_name = "electricity-project"
schema_name = "bronze"

# Derived from schema_name and ZONE so the table name cannot drift from the
# zone that is actually loaded (evaluates to "bronze.electricity_prices_no5"
# for the values above).
bronze_table = f"{schema_name}.electricity_prices_{ZONE.lower()}"

# =========================
# CATALOG + SCHEMA
# =========================
# Select the working catalog, make sure the target schema exists, and make it
# the current schema so that `bronze_table` resolves inside this catalog.
# Both identifiers are backtick-quoted: the catalog name contains a hyphen,
# and quoting the schema the same way keeps the statements valid if
# schema_name is ever changed to something that is not a plain identifier.
spark.sql(f"USE CATALOG `{catalog_name}`")
spark.sql(f"CREATE SCHEMA IF NOT EXISTS `{schema_name}`")
spark.sql(f"USE SCHEMA `{schema_name}`")

# =========================
# BOOTSTRAP FROM CSV (RUNS ONCE)
# =========================
if not spark.catalog.tableExists(bronze_table):
    # First run only: the bronze table does not exist yet, so seed it from the
    # landing CSV instead of calling the API.
    csv_rows = spark.read.csv(csv_path, header=True)

    # Keep only the configured zone; every CSV column is read as a string.
    zone_rows = csv_rows.filter(F.col("zone") == ZONE)

    # Project the three bronze columns, casting as we go. try_cast turns
    # unparseable prices into NULL instead of failing the load.
    df = zone_rows.select(
        F.to_timestamp("datetime").alias("datetime"),
        F.expr("try_cast(price_nok AS DOUBLE)").alias("price_nok"),
        F.current_date().alias("ingestion_date"),
    )

    df.write.format("delta").mode("overwrite").saveAsTable(bronze_table)

    # End the notebook run here so the API section below is skipped on the
    # bootstrap run.
    dbutils.notebook.exit("Bootstrap completed from CSV. API not called.")

# =========================
# API APPEND (NEW DATA ONLY)
# =========================

# Watermark: newest timestamp already stored in the bronze table.
last_dt = (
    spark.table(bronze_table)
    .agg(F.max("datetime"))
    .collect()[0][0]
)

if last_dt is None:
    # max() over an empty table (or an all-NULL column) is NULL. Without a
    # watermark there is nothing to resume from, so fail with an actionable
    # message instead of an AttributeError on `None.date()`.
    raise ValueError(
        f"{bronze_table} has no usable datetime values; "
        "drop the table and re-run the CSV bootstrap before appending from the API."
    )

# Re-request the day of the watermark itself (not the day after): if that day
# was only partially loaded, its remaining hours are picked up, and the
# `datetime > last_dt` filter below discards everything already stored.
start_date = last_dt.date()
# Timestamp.utcnow() is deprecated in recent pandas; now(tz="UTC") is the
# supported equivalent.
end_date = pd.Timestamp.now(tz="UTC").date()

rows = []

current_date = start_date
while current_date <= end_date:
    # One JSON file per day and zone: /prices/<YYYY>/<MM>-<DD>_<ZONE>.json
    url = (
        "https://www.hvakosterstrommen.no/api/v1/prices/"
        f"{current_date:%Y}/{current_date:%m-%d}_{ZONE}.json"
    )
    response = requests.get(url, timeout=20)

    if response.status_code != 200:
        # Stop at the first day that cannot be fetched. Skipping it and
        # appending later days would move the watermark past the missing day,
        # so it would never be requested again. Stopping keeps the table
        # contiguous; the next run resumes from this day.
        print(
            f"Stopping at {current_date}: HTTP {response.status_code} for {url}"
        )
        break

    rows.extend(
        {"datetime": entry["time_start"], "price_nok": entry["NOK_per_kWh"]}
        for entry in response.json()
    )

    current_date += timedelta(days=1)

if rows:
    sdf = (
        spark.createDataFrame(pd.DataFrame(rows))
        .withColumn("datetime", F.to_timestamp("datetime"))
        .withColumn("price_nok", F.expr("try_cast(price_nok AS DOUBLE)"))
        .withColumn("ingestion_date", F.current_date())
        # Keep only rows strictly newer than the watermark so re-running the
        # notebook never appends duplicates. Rows whose timestamp failed to
        # parse (NULL) are dropped by this comparison as well.
        .filter(F.col("datetime") > F.lit(last_dt))
        # Same explicit column list as the bootstrap write.
        .select("datetime", "price_nok", "ingestion_date")
    )

    sdf.write.format("delta").mode("append").saveAsTable(bronze_table)


In [0]:
%sql
-- Sanity check on the bronze table: earliest stored timestamp, latest stored
-- timestamp, and total number of rows.
SELECT min(datetime), max(datetime), count(*)
FROM `electricity-project`.bronze.electricity_prices_no5;

In [0]:
%sql
-- Preview the ten earliest rows of the bronze table (ascending is the default
-- sort direction).
SELECT datetime, price_nok
FROM `electricity-project`.bronze.electricity_prices_no5
ORDER BY datetime
LIMIT 10;