### Function to create and load data into the fact table for hourly weather

In [0]:
@logger
def load_hourly_weather_data(clean_weather_df):
    """Build the hourly-weather fact rows from a cleaned weather DataFrame.

    Steps:
      1. Add ``Date`` (``created_on`` cast to a date) and ``timeID`` (hour of
         ``created_on``) columns.
      2. Best-effort: delete rows from ``fact_hourly_weather`` that match the
         ``timeID``/``Date`` of the batch's first row. Failures of this step
         are logged and ignored.
      3. Join the batch to ``dim_date_table`` on ``full_date == Date`` and
         project the fact columns.

    The resulting DataFrame is returned, not written, by this function.
    Relies on a global ``spark`` session being in scope.

    NOTE(review): only the first row's hour/date drives the delete, so this
    assumes one batch covers a single hour -- confirm against the caller.

    Args:
        clean_weather_df: Spark DataFrame with at least ``created_on``, ``dt``
            and the measure columns selected below. ``dt`` is passed to
            ``datetime.fromtimestamp`` and is therefore assumed to hold epoch
            seconds.

    Returns:
        Tuple ``(fact_weather_df, start, end)`` where ``start``/``end`` are the
        naive local-time datetimes of the smallest and largest ``dt`` values.

    Raises:
        ValueError: if the batch has no non-null ``dt`` value (e.g. it is
            empty), so no load window can be derived.
    """
    # importing required libraries
    import logging
    from datetime import datetime

    from pyspark.sql.functions import col, hour

    clean_weather_df = clean_weather_df.withColumn(
        "Date", col("created_on").cast('Date')
    ).withColumn("timeID", hour('created_on'))

    # The delete is deliberately best-effort (for example, the fact table may
    # not exist yet), but the failure is logged instead of silently discarded.
    try:
        first_row = clean_weather_df.select('timeID', 'Date').first()
        if first_row is not None:  # empty batch: nothing to delete
            timeID, date = first_row
            dateID = str(date).replace('-', '')
            query = f"delete from fact_hourly_weather where timeID='{timeID}' and date_key='{dateID}';"
            spark.sql(query)
    except Exception as exc:
        logging.getLogger(__name__).warning(
            "Skipping pre-load delete from fact_hourly_weather: %s", exc
        )

    date_df = spark.table("dim_date_table")
    fact_weather_df = clean_weather_df.join(
        date_df, date_df.full_date == clean_weather_df.Date
    ).select(
        clean_weather_df.timeID,
        date_df.date_key,
        clean_weather_df.city_id,
        clean_weather_df.temperature,
        clean_weather_df.minimum_temperature,
        clean_weather_df.maximum_temperature,
        clean_weather_df.pressure,
        clean_weather_df.humidity,
        clean_weather_df.visibility,
        clean_weather_df.wind_speed,
        clean_weather_df.wind_degree,
        clean_weather_df.wind_gust,
        clean_weather_df.clouds_all,
    )

    # One aggregation (a single Spark action) yields both bounds.
    min_dt, max_dt = clean_weather_df.selectExpr("min(dt)", "max(dt)").first()
    if min_dt is None or max_dt is None:
        raise ValueError(
            "clean_weather_df has no non-null 'dt' values; "
            "cannot derive the load window"
        )
    start = datetime.fromtimestamp(min_dt)
    end = datetime.fromtimestamp(max_dt)

    return fact_weather_df, start, end
    

### Function to create and load data into the fact table for daily weather

In [0]:
@logger
def load_daily_weather_data():
    """Aggregate the latest day of hourly weather facts into daily rows.

    Reads the rows of ``fact_hourly_weather`` whose ``date_key`` equals the
    table's maximum ``date_key``, attempts (best-effort, failures are logged)
    to delete that same ``date_key`` from ``fact_daily_weather``, and groups
    the hourly rows by ``city_id`` and ``date_key``:

      * mean of ``temperature``, ``wind_gust`` and ``clouds_all``;
      * min of ``minimum_temperature``, max of ``maximum_temperature``;
      * mean cast to int for ``pressure``, ``humidity``, ``visibility``,
        ``wind_speed`` and ``wind_degree``.

    The resulting DataFrame is returned, not written, by this function.
    Relies on a global ``spark`` session being in scope.

    ``date_key`` is parsed as a ``YYYYMMDD`` value, matching how the hourly
    loader in this file builds the key it filters ``fact_hourly_weather`` by
    (the ISO date with the dashes removed). It is not an epoch timestamp.

    Returns:
        Tuple ``(daily_weather_df, start, end)`` where ``start``/``end`` are
        midnight datetimes for the smallest and largest ``date_key`` read
        (identical here, since only the latest day is selected).

    Raises:
        ValueError: if ``fact_hourly_weather`` yields no ``date_key`` (empty
            table) or the key is not in ``YYYYMMDD`` form.
    """
    import logging
    from datetime import datetime

    # Namespace import avoids shadowing the builtin ``max``/``min``.
    from pyspark.sql import functions as F

    hourly_weather_df = spark.sql("SELECT\
                                      *\
                                  FROM\
                                      fact_hourly_weather\
                                  WHERE\
                                      date_key =\
                                      (\
                                         SELECT max(date_key) FROM fact_hourly_weather\
                                      );\
                                   ").drop('load_run_id', 'created_on', 'created_by', 'timeID')

    # The delete is deliberately best-effort (for example, the daily table may
    # not exist yet), but the failure is logged instead of silently discarded.
    try:
        spark.sql("delete from fact_daily_weather where date_key = (select max(date_key) from fact_hourly_weather);")
    except Exception as exc:
        logging.getLogger(__name__).warning(
            "Skipping pre-load delete from fact_daily_weather: %s", exc
        )

    daily_weather_df = hourly_weather_df.groupby(
        F.col('city_id'), F.col('date_key')
    ).agg(
        F.mean("temperature").alias('temperature'),
        F.min('minimum_temperature').alias('minimum_temperature'),
        F.max('maximum_temperature').alias('maximum_temperature'),
        F.mean('pressure').cast('int').alias('pressure'),
        F.mean('humidity').cast('int').alias('humidity'),
        F.mean('visibility').cast('int').alias('visibility'),
        F.mean('wind_speed').cast('int').alias('wind_speed'),
        F.mean('wind_degree').cast('int').alias('wind_degree'),
        F.mean('wind_gust').alias('wind_gust'),
        F.mean('clouds_all').alias('clouds_all'),
    )

    # One aggregation (a single Spark action) yields both bounds.
    min_key, max_key = hourly_weather_df.selectExpr(
        "min(date_key)", "max(date_key)"
    ).first()
    if min_key is None or max_key is None:
        raise ValueError(
            "fact_hourly_weather returned no date_key; "
            "cannot derive the load window"
        )
    # date_key is a YYYYMMDD calendar key, so parse it as a date rather than
    # treating it as seconds since the epoch.
    start = datetime.strptime(str(min_key), "%Y%m%d")
    end = datetime.strptime(str(max_key), "%Y%m%d")

    return daily_weather_df, start, end