In [None]:
def aggregate_stock_data(df, aggr_window: str):
    """Aggregate the stock dataframe to one row per day (first open, last close).

    Parameters
    ----------
    df : pyspark dataframe
        Dataframe of our stock data. The query reads its ``timestamp``,
        ``open`` and ``close`` columns.

    aggr_window : str
        Requested aggregation window. Only ``'daily'`` is implemented;
        ``'1hour'`` and ``'12hour'`` are recognised but not implemented yet.

    Returns
    -------
    pyspark dataframe
        Result of the query: columns ``event_date``, ``first_open`` and
        ``last_close``, ordered by ``event_date``.

    Raises
    ------
    NotImplementedError
        If ``aggr_window`` is ``'1hour'`` or ``'12hour'``.
    ValueError
        If ``aggr_window`` is any other unsupported value.

    Notes
    -----
    Registers (or replaces) a temp view named ``stock_table`` and runs the
    query through the module-level ``spark`` session, which must already
    exist in the calling notebook/module.
    """
    # Validate before touching Spark so a bad argument has no side effects.
    # Previously any non-'daily' value fell through to ``return new_df`` and
    # failed with an UnboundLocalError.
    if aggr_window in ('1hour', '12hour'):
        raise NotImplementedError(
            f"aggr_window={aggr_window!r} is not implemented yet; "
            "only 'daily' is currently supported"
        )
    if aggr_window != 'daily':
        raise ValueError(
            f"Unsupported aggr_window {aggr_window!r}; "
            "expected one of '1hour', '12hour', 'daily'"
        )

    # The SQL below refers to the dataframe through this view name.
    df.createOrReplaceTempView("stock_table")

    sql_query = """
    WITH DailyFirstOpen AS (
        -- This CTE finds the first 'open' for each day
        SELECT
            DATE(timestamp) AS event_date,
            open,
            ROW_NUMBER() OVER (PARTITION BY DATE(timestamp) ORDER BY timestamp ASC) as rn_open
        FROM
            stock_table
    ),
    RankedDailyFirstOpen AS (
        SELECT event_date, open
        FROM DailyFirstOpen
        WHERE rn_open = 1
    ),
    DailyLastClose AS (
        -- This CTE finds the last 'close' for each day
        SELECT
            DATE(timestamp) AS event_date,
            close,
            ROW_NUMBER() OVER (PARTITION BY DATE(timestamp) ORDER BY timestamp DESC) as rn_close
        FROM
            stock_table
    ),
    RankedDailyLastClose AS (
        SELECT event_date, close
        FROM DailyLastClose
        WHERE rn_close = 1
    )
    
    -- Join the results to get first open and last close on the same row per day
    SELECT
        r_open.event_date,
        r_open.open AS first_open,
        r_close.close AS last_close
    FROM
        RankedDailyFirstOpen r_open
    JOIN
        RankedDailyLastClose r_close ON r_open.event_date = r_close.event_date
    ORDER BY
        r_open.event_date
    """

    return spark.sql(sql_query)
        
