In [None]:
%run oeai_py

In [None]:
# Create an instance of the OEAI helper class (expected to be defined by the `%run oeai_py` cell above).
# NOTE(review): the original note says the platform ("Synapse" or "Fabric") is set here, but no
# platform argument is passed — confirm how OEAI() determines the platform.
oeai = OEAI()

In [None]:
# CHANGE VALUES FOR YOUR KEY VAULT
keyvault = "INSERT_YOUR_KEYVAULT_NAME_HERE" # Fabric requires full URL eg "https://key_vault_name.vault.azure.net/"
keyvault_linked_service = "INSERT_YOUR_LINKED_SERVICE_NAME_HERE"  # Not required for Fabric.


# Storage locations, read from the key vault through the OEAI helper
bronze_path = oeai.get_secret(spark, "wonde-bronze", keyvault_linked_service, keyvault)
silver_path = oeai.get_secret(spark, "wonde-silver", keyvault_linked_service, keyvault)

# School IDs are stored as a single comma-separated secret. Surrounding whitespace is
# stripped and empty entries (e.g. from a trailing comma) are dropped, because each ID
# is later used verbatim in output paths and as a column value.
school_ids_secret = oeai.get_secret(spark, "wonde-school-ids", keyvault_linked_service, keyvault)
school_ids = [school.strip() for school in school_ids_secret.split(",") if school.strip()]

# API key for the weather provider
APIkey = oeai.get_secret(spark, "weather-apikey", keyvault_linked_service, keyvault)


In [None]:
def get_weather_data(lat: str, long: str, part: str, APIkey: str) -> dict:
    """
    Fetch weather data for one location from the OpenWeatherMap One Call API.

    Args:
        lat (str): Latitude, sent as the ``lat`` query parameter.
        long (str): Longitude, sent as the ``lon`` query parameter.
        part (str): Value for the ``exclude`` query parameter.
        APIkey (str): Value for the ``appid`` query parameter.

    Returns:
        dict: The parsed JSON body of the last successful response. An empty
        dict is returned when the very first request fails.

    Notes:
        Relies on ``requests``, ``oeai``, ``spark`` and ``error_log_path`` being
        available in the notebook namespace (they are not defined in this cell).
    """
    #Uses openweathermap - replace as required
    # The longitude parameter is named `long`; the API expects it under `lon`.
    url = f"https://api.openweathermap.org/data/3.0/onecall?lat={lat}&lon={long}&exclude={part}&appid={APIkey}"

    # Pre-set so a failure on the first request still yields a well-defined result.
    response_data = {}
    next_url = url

    while next_url:
        response = requests.get(next_url)
        print(response)
        # Check if the request was successful
        if response.status_code != 200:
            # No exception is active here, so report the HTTP status and body.
            # The URL is deliberately left out of the log because it embeds the API key.
            error_message = f"Error: weather API returned HTTP {response.status_code}: {response.text}"
            oeai.log_error(spark, error_message, error_log_path)
            break

        response_data = response.json()

        # Follow a pagination link if the payload carries one; otherwise stop.
        next_url = response_data.get("meta", {}).get("pagination", {}).get("next")
    print(response_data)
    return response_data

In [None]:
def load_bronze(spark, lat: str, long: str, part: str, APIkey: str, limit=None, audit_log_file="audit_log.json", school_id=None):
    """
    Fetch weather data for one location and write it to the Bronze layer for a school.

    Args:
        spark (SparkSession): Active SparkSession for DataFrame operations.
        lat (str): Latitude passed to get_weather_data.
        long (str): Longitude passed to get_weather_data.
        part (str): Value passed to get_weather_data as its ``part`` argument.
        APIkey (str): API key passed to get_weather_data.
        limit (int, optional): Currently unused. Defaults to None.
        audit_log_file (str, optional): Currently unused. Defaults to "audit_log.json".
        school_id (str, optional): School the data belongs to. When omitted, the
            notebook-level ``school_id`` variable is used, which is what existing
            callers rely on.

    Returns:
        pandas.DataFrame: Always an empty pandas DataFrame; the fetched data is
        written to storage rather than returned.

    Raises:
        ValueError: If no school ID is passed and none is defined at notebook level.

    Notes:
        Relies on ``bronze_path``, ``oeai``, ``lit``, ``pd``, ``os``, ``traceback`` and
        ``error_log_path`` being available in the notebook namespace.
    """
    # Existing callers do not pass the school ID, so fall back to the notebook variable.
    if school_id is None:
        school_id = globals().get("school_id")
    if school_id is None:
        raise ValueError("load_bronze needs a school_id argument or a notebook-level school_id variable")

    df = pd.DataFrame()

    r = get_weather_data(lat, long, part, APIkey)

    # get_weather_data returns a single dict (empty on failure); treat a non-empty
    # response as one record so that it is actually written out below.
    data_list = [r] if r else []

    # Construct the directory path
    # NOTE(review): bronze_path and school_id are concatenated without a separator —
    # confirm bronze_path ends with "/" and is a path the os module can reach.
    school_folder = bronze_path + school_id

    # Check and create directory if it doesn't exist
    if not os.path.exists(school_folder):
        os.makedirs(school_folder)

    if not data_list:
        oeai.save_empty_json(spark, school_folder + ".json")
    else:
        try:
            flattened_data_list = [oeai.flatten_json(item) for item in data_list]

            # Convert the list of dictionaries to a Pandas DataFrame
            pandas_df = pd.DataFrame(flattened_data_list)

            # Convert the Pandas DataFrame to a PySpark DataFrame
            r_df = spark.createDataFrame(pandas_df)

            # Add school_id and unique_key to the DataFrame
            r_df = r_df.withColumn("school_id", lit(school_id))
            r_df = r_df.withColumn("unique_key", lit(school_id).cast("string"))

            # Save the DataFrame to a JSON file
            r_df.write.mode("overwrite").json(school_folder + ".json")

        # Best effort: a failure while converting or writing is logged, not raised.
        except Exception:
            error_message = f"Error: {traceback.format_exc()}"
            oeai.log_error(spark, error_message, error_log_path)

    return df

In [None]:
'''
  BRONZE PROCESS
'''
# introduce a limit for testing or leave as None for Live
# NOTE(review): `Limit` and `query` are not referenced by any code visible in this notebook.
Limit = None
query = ""

# Set the location parameters for your school.
# `lat` and `lon` are the coordinates; `part` is sent to the API as its `exclude` parameter.
# All three are empty placeholders and must be filled in before running.

lat = "" 
lon = "" 
part = ""



# Set up the daily Jobs list with required endpoints
# The same (lat, lon, part, APIkey) job is built for every school ID.
# NOTE: `school_id` remains defined after this loop (holding the last ID) and later
# cells read it, so re-ordering or removing this cell changes their behaviour.
for school_id in school_ids:
    daily_jobs = [
        (lat, lon, part, APIkey),
        ]

    # call load bronze for each of the daily jobs
    # The school ID is not passed here; load_bronze picks up the notebook-level
    # `school_id` set by the enclosing loop.
    for job in daily_jobs:
        load_bronze(spark, job[0], job[1], job[2], job[3])


In [None]:
def get_weather_data_for_timestamp(lat: str, lon: str, dt: int, APIkey: str, school_id: str) -> dict:
    """
    Request the weather for one location at one point in time and tag it with the school.

    Args:
        lat (str): Latitude, sent as the ``lat`` query parameter.
        lon (str): Longitude, sent as the ``lon`` query parameter.
        dt (int): Timestamp, sent as the ``dt`` query parameter.
        APIkey (str): Value for the ``appid`` query parameter.
        school_id (str): Stored in the result under the ``school_id`` key.

    Returns:
        dict: The parsed JSON response with ``school_id`` added, or an empty
        dict when the API does not answer with HTTP 200.
    """
    reply = requests.get(
        f"https://api.openweathermap.org/data/3.0/onecall/timemachine?lat={lat}&lon={lon}&dt={dt}&appid={APIkey}"
    )

    if reply.status_code == 200:
        payload = reply.json()
        # Tag the payload so downstream rows can be attributed to a school.
        payload['school_id'] = school_id
        return payload

    # Any other status: report the body and hand back an empty result.
    print(f"Error retrieving data: {reply.text}")
    return {}

In [None]:
import datetime
import calendar

def generate_daily_timestamps(start_date: str, end_date: str, hour_utc: int):
    """
    Build one Unix timestamp per calendar day, at a fixed UTC hour, over an inclusive date range.

    Args:
        start_date (str): First day, formatted as YYYY-MM-DD.
        end_date (str): Last day (included), formatted as YYYY-MM-DD.
        hour_utc (int): Hour of the day, in UTC, used for every timestamp.

    Returns:
        list[int]: Timestamps in ascending order; empty when start_date is after end_date.
    """
    date_format = '%Y-%m-%d'
    first_day = datetime.datetime.strptime(start_date, date_format)
    last_day = datetime.datetime.strptime(end_date, date_format)

    # Both bounds parse to midnight, so their difference is a whole number of days;
    # a negative span (start after end) simply yields an empty range below.
    day_span = (last_day - first_day).days

    return [
        # calendar.timegm() interprets the time tuple as UTC.
        int(calendar.timegm(
            (first_day + datetime.timedelta(days=offset))
            .replace(hour=hour_utc, minute=0, second=0, microsecond=0)
            .timetuple()
        ))
        for offset in range(day_span + 1)
    ]


In [None]:
# Introduce parameters for your school
# Empty placeholders: fill in the coordinates before running.
lat = ""  
lon = ""  

# Define start and end date in format: YYYY-MM-DD
# Both must be filled in; generate_daily_timestamps parses them with '%Y-%m-%d'.
start_date = ""
end_date = ""
hour_utc = 8  # 8 AM UTC

# Generate timestamps
timestamps = generate_daily_timestamps(start_date, end_date, hour_utc)

# Collect data
# NOTE(review): `school_id` is not assigned in this cell; it is whatever value the
# earlier loop over `school_ids` left behind (its last element).
# A failed request comes back as an empty dict and is still appended to `weather_data`.
weather_data = []
for timestamp in timestamps:
    data = get_weather_data_for_timestamp(lat, lon, timestamp, APIkey, school_id)
    print(data)  # See exactly what's being returned
    weather_data.append(data)



In [None]:
from pyspark.sql.types import StructType, StructField, ArrayType, StringType, DoubleType, LongType, IntegerType, StructType, FloatType
from pyspark.sql.functions import explode

# Define the schema corresponding to the JSON structure
schema = StructType([
    StructField("lat", FloatType()),
    StructField("lon", FloatType()),
    StructField("timezone", StringType()),
    StructField("timezone_offset", LongType()),
    StructField("data", ArrayType(StructType([
        StructField("dt", LongType()),
        StructField("sunrise", LongType()),
        StructField("sunset", LongType()),
        StructField("temp", FloatType()),
        StructField("feels_like", FloatType()),
        StructField("pressure", IntegerType()),
        StructField("humidity", IntegerType()),
        StructField("dew_point", FloatType()),
        StructField("clouds", IntegerType()),
        StructField("visibility", IntegerType()),
        StructField("wind_speed", FloatType()),
        StructField("wind_deg", IntegerType()),
        StructField("weather", ArrayType(StructType([
            StructField("id", IntegerType()),
            StructField("main", StringType()),
            StructField("description", StringType()),
            StructField("icon", StringType())
        ])))
    ]))),
    StructField("school_id", StringType())
])


def _coerce_fields_to_float(mapping, field_names):
    """Convert the named entries of ``mapping`` to float in place; absent or None entries become None."""
    for field_name in field_names:
        raw_value = mapping.get(field_name)
        mapping[field_name] = float(raw_value) if raw_value is not None else None


# Failed API calls are stored as empty dicts; leave them out so they neither break
# the conversion below nor turn into all-null rows.
loaded_records = [record for record in weather_data if record]

for record in loaded_records:
    # The schema declares these as FloatType, so whole-number values that arrive
    # as integers are converted to float before the DataFrame is built.
    _coerce_fields_to_float(record, ("lat", "lon"))
    for item in record.get('data') or []:
        # Convert integer temperature and other related fields to float
        _coerce_fields_to_float(item, ("temp", "feels_like", "dew_point", "wind_speed"))

# Create DataFrame using the defined schema
df = spark.createDataFrame(loaded_records, schema=schema)

# Show the DataFrame to verify the correct loading of data
df.show(truncate=False)


In [None]:
# Optionally, explode the data array to flatten it for further analysis or storage.
# explode() produces one output row per element of `data`, held in the new `data_exploded` column.
# NOTE: `df` is reassigned here, and the following cell reads the `data_exploded`
# column added by this line, so this cell must run before it.
df = df.withColumn("data_exploded", explode("data"))
df.select("school_id", "data_exploded.*").show(truncate=False)

In [None]:
from pyspark.sql.functions import explode, col

# `df` is expected to carry the `data_exploded` struct column at this point.
# Fan out the nested `weather` array so that every entry in it becomes a row of its own.
df = df.withColumn("weather_exploded", explode("data_exploded.weather"))

# (source field, output column name) pairs, listed in output column order
column_aliases = [
    ("data_exploded.dt", "timestamp"),
    ("data_exploded.sunrise", "sunrise"),
    ("data_exploded.sunset", "sunset"),
    ("data_exploded.temp", "temperature"),
    ("data_exploded.feels_like", "feels_like"),
    ("data_exploded.pressure", "pressure"),
    ("data_exploded.humidity", "humidity"),
    ("data_exploded.dew_point", "dew_point"),
    ("data_exploded.clouds", "clouds"),
    ("data_exploded.visibility", "visibility"),
    ("data_exploded.wind_speed", "wind_speed"),
    ("data_exploded.wind_deg", "wind_deg"),
    ("weather_exploded.id", "weather_id"),
    ("weather_exploded.main", "weather_main"),
    ("weather_exploded.description", "weather_description"),
    ("weather_exploded.icon", "weather_icon"),
]

# Project everything into a single flat row: the school first, then each aliased field.
flattened_df = df.select(
    col("school_id"),
    *[col(source).alias(alias) for source, alias in column_aliases],
)

# Inspect the resulting structure
flattened_df.printSchema()

# Inspect a sample of the rows
flattened_df.show(truncate=False)


In [None]:
# Write the flattened weather rows as JSON under the bronze path, at a location named after school_id.
# mode('overwrite') replaces whatever already exists at that location.
# NOTE(review): `school_id` is whatever value earlier cells left in the notebook namespace.
# NOTE(review): this path is joined with "/", whereas load_bronze concatenates
# bronze_path + school_id directly — confirm which form matches how bronze_path is stored.
output_path = f"{bronze_path}/{school_id}"
flattened_df.write.mode('overwrite').json(output_path)