In [0]:
import requests
import pandas as pd
from pyspark.sql import functions as F
from datetime import timedelta

In [0]:


# =========================
# CONFIG
# =========================
# Frost API credentials are read from a Databricks secret scope rather than being
# embedded in the notebook source. The scope/key names below are placeholders:
# create them once with the Databricks secrets CLI/API (or adjust the names to an
# existing scope). Credentials that were previously stored here in plain text
# should be considered exposed and rotated.
FROST_SECRET_SCOPE = "frost-api"
CLIENT_ID = dbutils.secrets.get(scope=FROST_SECRET_SCOPE, key="client-id")
CLIENT_SECRET = dbutils.secrets.get(scope=FROST_SECRET_SCOPE, key="client-secret")

# Frost query parameters: which source (station) and which element to request.
STATION_ID = "SN50540"
ELEMENT = "air_temperature"

# CSV file in the landing volume used for the one-time bootstrap of the table.
csv_path = "/Volumes/electricity-project/landing/datasets/weather_data.csv"

# Unity Catalog location of the target table. The catalog name contains a
# hyphen, so it must be back-quoted wherever it is interpolated into SQL.
catalog_name = "electricity-project"
schema_name = "bronze"
bronze_table = "bronze.weather_observed"

# =========================
# CATALOG + SCHEMA
# =========================
# Select the working catalog, make sure the target schema exists, then make it
# the current schema. The statements are order-dependent: the schema is created
# inside whichever catalog is current.
for setup_statement in (
    f"USE CATALOG `{catalog_name}`",
    f"CREATE SCHEMA IF NOT EXISTS {schema_name}",
    f"USE SCHEMA {schema_name}",
):
    spark.sql(setup_statement)

# =========================
# BOOTSTRAP FROM CSV (RUNS ONCE)
# =========================
# When the bronze table does not exist yet, build it from the CSV file and stop
# the notebook; the API section below only runs on later executions, once the
# table is in place.
if not spark.catalog.tableExists(bronze_table):

    df = (
        spark.read
        .option("header", True)
        .option("delimiter", ";")
        .csv(csv_path)
        # Parse the textual timestamp column (day.month.year hour:minute).
        .withColumn(
            "datetime",
            F.to_timestamp(
                F.col("Time(norwegian mean time)"),
                "dd.MM.yyyy HH:mm"
            )
        )
        # Strip everything except digits, '.' and '-' before casting, so stray
        # characters become NULL-safe numeric values instead of cast errors.
        .withColumn(
            "temperature",
            F.expr(
                "try_cast(regexp_replace(`Air temperature`, '[^0-9\\.-]', '') AS DOUBLE)"
            )
        )
        # Both ingestion columns must be created here: the select below (and
        # the API append path) expect `ingestion_date` as well as
        # `ingestion_ts`. Without this column the select fails to resolve it.
        .withColumn("ingestion_date", F.current_date())
        .withColumn("ingestion_ts", F.current_timestamp())
        .select("datetime", "temperature", "ingestion_date", "ingestion_ts")
    )

    df.write.format("delta").option("mergeSchema", "true").mode("overwrite").saveAsTable(bronze_table)
    # Ends notebook execution here so the API section is skipped on this run.
    dbutils.notebook.exit("Bootstrap completed from CSV. API not called.")

# =========================
# API APPEND (NEW DATA ONLY)
# =========================
# Incremental load: find the newest timestamp already stored, request everything
# after it from the Frost observations endpoint, and append the result to the
# bronze table.

url = "https://frost.met.no/observations/v0.jsonld"

# Frost status codes treated as "nothing to ingest" rather than as failures.
# NOTE(review): per the Frost API reference, 404 means no data was found and 412
# means no time series is available for the query — confirm that 412 should be
# tolerated here, since it could also indicate a wrong station/element.
NO_DATA_STATUS_CODES = (404, 412)


def _fetch_frost_rows(start_time, end_time):
    """Request observations for [start_time, end_time) and flatten them to rows.

    Args:
        start_time: Interval start, formatted like "2024-01-31T13:00:00Z".
        end_time: Interval end, same format as ``start_time``.

    Returns:
        A list of dicts with keys "datetime" (the observation's referenceTime
        string) and "temperature" (the observed value as a string). Empty when
        the API reports that no data matches the interval.

    Raises:
        requests.HTTPError: for any error status other than the "no data" ones.
    """
    params = {
        "sources": STATION_ID,
        "elements": ELEMENT,
        "referencetime": f"{start_time}/{end_time}",
        "timeoffsets": "default",
        "levels": "default",
        "qualities": "0,1,2,3,4,9"
    }

    response = requests.get(
        url,
        params=params,
        auth=(CLIENT_ID, CLIENT_SECRET),
        timeout=30
    )
    if response.status_code in NO_DATA_STATUS_CODES:
        print(f"Frost returned {response.status_code} for {start_time}/{end_time}: no new observations.")
        return []
    response.raise_for_status()

    # One output row per observation entry under each reference time.
    return [
        {
            "datetime": obs["referenceTime"],
            "temperature": str(item["value"])
        }
        for obs in response.json().get("data", [])
        for item in obs["observations"]
    ]


# Stored timestamps are treated as Europe/Oslo wall-clock time and converted
# back to UTC so they can be used in the API's reference-time interval.
last_dt_utc = (
    spark.sql(f"""
        SELECT to_utc_timestamp(max(datetime), 'Europe/Oslo') AS dt
        FROM {bronze_table}
    """)
    .collect()[0][0]
)

# max() over an empty table (or one whose datetimes are all NULL) yields NULL;
# fail with an actionable message instead of a TypeError on the addition below.
if last_dt_utc is None:
    raise RuntimeError(
        f"{bronze_table} contains no non-null datetime values; "
        "cannot determine where to resume the API load."
    )

# Resume one hour after the newest stored value, truncated to the whole hour,
# and stop at the current whole hour (UTC).
start_time = (last_dt_utc + timedelta(hours=1)).strftime("%Y-%m-%dT%H:00:00Z")
end_time = pd.Timestamp.now(tz="UTC").strftime("%Y-%m-%dT%H:00:00Z")

# Both strings share one fixed-width ISO layout, so string comparison is
# chronological. When no full hour has elapsed the interval would be empty or
# inverted, so the request is skipped.
if start_time < end_time:
    rows = _fetch_frost_rows(start_time, end_time)
else:
    rows = []
    print(f"Table is up to date (next start {start_time} >= {end_time}); API not called.")

if rows:
    sdf = (
        spark.createDataFrame(pd.DataFrame(rows))
        # API reference times are UTC; shift them to Europe/Oslo wall-clock time
        # (the inverse of the to_utc_timestamp conversion used above).
        .withColumn(
            "datetime",
            F.from_utc_timestamp(
                F.to_timestamp("datetime"),
                "Europe/Oslo"
            )
        )
        # Same sanitising cast as the CSV bootstrap: keep digits, '.' and '-'.
        .withColumn(
            "temperature",
            F.expr(
                "try_cast(regexp_replace(temperature, '[^0-9\\.-]', '') AS DOUBLE)"
            )
        )
        .withColumn("ingestion_date", F.current_date())
        .withColumn("ingestion_ts", F.current_timestamp())
    )

    sdf.write.format("delta").option("mergeSchema", "true").mode("append").saveAsTable(bronze_table)
    


In [0]:
# spark.table("`electricity-project`.bronze.weather_observed") \
#      .orderBy(F.col("datetime").asc()) \
#      .limit(5) \
#      .show(truncate=False)

In [0]:
#spark.read.option("header", True).csv(csv_path).printSchema()


In [0]:
# spark.table("`electricity-project`.bronze.weather_observed") \
#      .orderBy(F.col("datetime").desc()) \
#      .limit(10) \
#      .show(truncate=False)

In [0]:
# spark.sql("""
# SELECT
#   max(datetime) AS max_dt,
#   to_utc_timestamp(max(datetime), 'Europe/Oslo') AS max_dt_utc
# FROM `electricity-project`.bronze.weather_observed
# """).show(truncate=False)

In [0]:
##spark.sql("DROP TABLE IF EXISTS bronze.weather_observed")

In [0]:
# spark.table("bronze.weather_observed") \
#      .select(F.max("datetime")) \
#      .show()

In [0]:
# %sql
# SELECT
#   min(datetime),
#   max(datetime),
#   count(*)
# FROM `electricity-project`.bronze.weather_observed


In [0]:
# %sql
# SELECT
#   datetime, temperature
  
# FROM `electricity-project`.bronze.weather_observed
# ORDER BY datetime
# LIMIT 10