In [None]:
%run oeai_py

In [None]:
# Instantiate the OEAI helper with its default arguments; the resulting `oeai`
# object is used below to read secrets from the key vault.
# NOTE(review): no platform ("Synapse" or "Fabric") is passed here — presumably
# OEAI, loaded by the `%run oeai_py` cell, detects or defaults it; confirm there.
oeai = OEAI()

In [None]:
# CHANGE VALUES FOR YOUR KEY VAULT
keyvault = "INSERT_YOUR_KEYVAULT_NAME_HERE" # Fabric requires full URL eg "https://key_vault_name.vault.azure.net/"
keyvault_linked_service = "INSERT_YOUR_LINKED_SERVICE_NAME_HERE"  # Not required for Fabric.


# Synapse OEA environment paths
# Every value below is fetched by secret name through oeai.get_secret, using the
# key vault settings defined above.
bronze_path = oeai.get_secret(spark, "wonde-bronze", keyvault_linked_service, keyvault)
silver_path = oeai.get_secret(spark, "wonde-silver", keyvault_linked_service, keyvault)
school_ids_secret = oeai.get_secret(spark, "wonde-school-ids", keyvault_linked_service, keyvault)

# The secret is a single comma-separated string. Strip whitespace around each ID
# and drop empty entries (e.g. from a trailing comma) so that a value such as
# "A, B," yields ["A", "B"] rather than ["A", " B", ""], which would produce
# malformed file paths when the IDs are used to locate bronze files.
school_ids = [raw_id.strip() for raw_id in school_ids_secret.split(",") if raw_id.strip()]

# NOTE(review): APIkey is not referenced by any later cell visible in this
# notebook; kept in case it is needed elsewhere.
APIkey = oeai.get_secret(spark, "weather-apikey", keyvault_linked_service, keyvault)

In [None]:
# Read the bronze JSON files into a single data frame.
# The original cell interpolated `school_id`, a name that is never defined in
# this notebook (only the list `school_ids` is), so it failed with NameError on
# a fresh kernel. Build one path per configured school instead and let Spark
# read them all in one call; spark.read.json accepts a list of paths.
json_paths = [
    f"{bronze_path}/{school_id.strip()}.json"
    for school_id in school_ids
    if school_id.strip()  # ignore blank entries, e.g. from a trailing comma
]

# Fail early with a clear message rather than handing Spark an empty path list.
if not json_paths:
    raise ValueError("No school IDs configured: the 'wonde-school-ids' secret is empty")

df = spark.read.json(json_paths)
df.printSchema()

In [None]:
from pyspark.sql.functions import when, col

# Weather IDs that this notebook classes as bad weather
bad_weather_ids = [
    200, 201, 202, 210, 211, 212, 221, 230, 231, 232,
    302,
    501, 502, 503, 504, 511, 522, 531,
    601, 602, 611, 616, 622,
    771, 781,
]

# Flag expression: 1 when the row's weather_id is in the list, 0 otherwise
bad_weather_flag = when(col("weather_id").isin(bad_weather_ids), 1).otherwise(0)
df = df.withColumn("Is_bad_weather", bad_weather_flag)

# Display the result so the new column can be checked by eye
df.show(truncate=False)


In [None]:
# Keep only the rows whose Is_bad_weather flag is 1
bad_weather_df = df.where(col("Is_bad_weather") == 1)

# Display the flagged rows without truncating column values
bad_weather_df.show(truncate=False)

In [None]:
# Count the rows currently in df, then report the total
record_count = df.count()
print(f"Total number of records in the DataFrame: {record_count}")

In [None]:
from pyspark.sql.functions import col

# Number of rows for each distinct weather_id
weather_id_counts = df.groupBy("weather_id").count()

# First display: groups in whatever order Spark returns them
weather_id_counts.show(truncate=False)

# Second display: the same counts, most frequent weather_id first
weather_id_counts.sort(col("count").desc()).show(truncate=False)


In [None]:
from pyspark.sql.functions import from_unixtime, col, date_format

# Assumes 'timestamp' holds Unix timestamp values; both derived columns are
# formatted from the same from_unixtime conversion.
event_time = from_unixtime(col("timestamp"))

df = (
    df
    # Calendar date, formatted as yyyy-MM-dd
    .withColumn("calendar_date", date_format(event_time, "yyyy-MM-dd"))
    # Time of day, formatted as HH:mm:ss
    .withColumn("time", date_format(event_time, "HH:mm:ss"))
)

# Display the source column next to the two derived columns for verification
df.select("timestamp", "calendar_date", "time").show(truncate=False)


In [None]:
from pyspark.sql.functions import col

# Assumes the 'temperature' column is in Kelvin; subtracting 273.15 converts it
# to Celsius, stored in the new 'temp_celsius' column.
temperature_kelvin = col("temperature")
df = df.withColumn("temp_celsius", temperature_kelvin - 273.15)

# Display the original and converted values side by side for verification
df.select("temperature", "temp_celsius").show(truncate=False)


In [None]:
# Destination of the Delta table: the dim_weather folder under the silver path
delta_table_path = f"{silver_path}/dim_weather"

# Write df in Delta format; "overwrite" mode replaces any data already stored
# at that location.
(
    df.write
    .format("delta")
    .mode("overwrite")
    .save(delta_table_path)
)


