In [35]:
import utils
import pandas as pd
import numpy as np
from copy import deepcopy
import os
from sklearn.preprocessing import StandardScaler
import torch
from torch.utils.data import DataLoader, TensorDataset

In [None]:
# Load every CSV of the measurement directory into one raw frame.
# Entries prefixed with "._" are mapped onto the un-prefixed file name (presumably
# macOS AppleDouble companions of the real files - TODO confirm). Collecting the
# names in a set prevents reading the same CSV twice when both "x.csv" and
# "._x.csv" are listed; sorting makes the row order independent of os.listdir().
DATA_DIR = 'hka-aqm-am/'
csv_names = sorted({name.removeprefix('._') for name in os.listdir(DATA_DIR)})
df = pd.concat([pd.read_csv(DATA_DIR + name, skiprows=1, sep=';', engine='python') for name in csv_names])


In [48]:
def get_data_for_multivarate_sequential_forecast(df: pd.DataFrame, y_feature: str='CO2', window_size: int=5, aggregation_level: str = 'half_hour', batch_size: int=utils.get_batch_size(), clean_data: bool=True, drop_columns: list=[], extend_training_data: bool=True) -> tuple:
    """Turn raw sensor readings into windowed tensors and data loaders for forecasting.

    Pipeline: clean -> add calendar features -> aggregate per device and time bucket ->
    optionally fill short gaps -> keep only gap-free runs longer than ``window_size`` ->
    chronological 80/20 split (used to fit the scaler) -> build sliding windows ->
    shuffle all windows and split 80/20 again -> wrap in datasets / loaders.

    Args:
        df: Raw readings. Must provide ``date_time`` (datetime) and ``device_id``
            after ``utils.clean_df`` has run.
        y_feature: Name of the numeric column to predict.
        window_size: Number of consecutive time steps per input sequence.
        aggregation_level: ``'hour'``, ``'half_hour'`` or ``'quarter_hour'``.
        batch_size: Batch size of both loaders. The default is evaluated once,
            when the function is defined.
        clean_data: Forwarded to ``utils.clean_df``.
        drop_columns: Currently unused; kept for interface compatibility.
        extend_training_data: If True, every device is re-indexed onto a complete
            time grid and gaps of at most ``window_size // 4`` rows are
            interpolated linearly.

    Returns:
        Tuple ``(train_dataset, test_dataset, train_loader, test_loader, scaler,
        y_test, full_preprocessed_df_unscaled, y_feature_scaler_index)`` where
        ``y_test`` holds the targets of ``test_dataset`` in the same order and
        ``y_feature_scaler_index`` is the position of ``y_feature`` among the
        columns the scaler was fitted on.

    Raises:
        ValueError: If ``aggregation_level`` is unknown or ``y_feature`` is not
            available after aggregation.
    """
    # 'min' instead of the deprecated 'T' alias; accepted by old and new pandas alike.
    freq_by_level = {'hour': '60min', 'half_hour': '30min', 'quarter_hour': '15min'}
    if aggregation_level not in freq_by_level:
        raise ValueError("Invalid aggregation_level. Please choose one of 'hour', 'half_hour', or 'quarter_hour'.")
    freq = freq_by_level[aggregation_level]

    # NOTE(review): assumes utils.clean_df returns a frame that is safe to modify - confirm.
    df_cpy = utils.clean_df(df, clean_data)

    # Flag holidays and exam periods at HKA.
    holiday_list, exam_dates = utils.get_list_of_special_dates()
    reading_dates = df_cpy['date_time'].dt.date
    df_cpy['isHoliday'] = reading_dates.isin(holiday_list).astype(int)
    df_cpy['isExamTime'] = reading_dates.isin(exam_dates).astype(int)

    # Assign every reading to its time bucket.
    df_cpy['date_time_rounded'] = df_cpy['date_time'].dt.round(freq)
    rounded = df_cpy['date_time_rounded']

    # Cyclic encodings, all derived from the bucket timestamp.
    weekday_angle = 2 * np.pi * rounded.dt.weekday / 7
    month_angle = 2 * np.pi * rounded.dt.month / 12
    # Hour AND minute come from the rounded timestamp. Mixing the rounded hour with the
    # unrounded minute encoded e.g. 10:55 (bucket 11:00) as 11:55.
    day_angle = 2 * np.pi * (rounded.dt.hour * 3600 + rounded.dt.minute * 60) / 86400.0
    df_cpy['weekday_sin'] = np.sin(weekday_angle)
    df_cpy['weekday_cos'] = np.cos(weekday_angle)
    df_cpy['month_sin'] = np.sin(month_angle)
    df_cpy['month_cos'] = np.cos(month_angle)
    df_cpy['time_sin'] = np.sin(day_angle)
    df_cpy['time_cos'] = np.cos(day_angle)

    # One-hot encode the semester the bucket falls into.
    df_cpy['semester'] = 'WS22/23'
    df_cpy.loc[rounded >= '2023-03-01', 'semester'] = 'SS23'
    df_cpy.loc[rounded >= '2023-09-01', 'semester'] = 'WS23/24'
    df_cpy = pd.get_dummies(df_cpy, columns=['semester'])

    # One row per device and bucket: mean of all numeric columns.
    df_cpy = df_cpy.groupby(['device_id', 'date_time_rounded']).mean(numeric_only=True).reset_index()

    # Fail early instead of after the expensive steps below.
    if y_feature not in df_cpy.columns:
        raise ValueError(f"y_feature '{y_feature}' is not a numeric column of the aggregated data.")

    def fill_consecutive_nans(device_df: pd.DataFrame, window_size: int):
        """Drop long runs of incomplete rows and linearly interpolate the short ones.

        A run is a maximal sequence of consecutive rows that contain at least one NaN.
        Runs longer than ``window_size // 4`` rows are removed, shorter ones are filled.
        """
        max_consecutive_nans = window_size // 4
        device_df = device_df.sort_values('date_time_rounded')

        has_nan = device_df.isna().any(axis=1)
        # The id only advances on complete rows, so incomplete rows that directly
        # follow each other share one id.
        run_id = (~has_nan).cumsum()
        nan_run_length = has_nan.groupby(run_id).transform('sum').where(has_nan, 0)

        # Rows inserted by the left merge have no device_id yet.
        device_df['device_id'] = device_df['device_id'].ffill()
        device_df = device_df.loc[nan_run_length <= max_consecutive_nans].copy()

        # Only numeric columns can be interpolated; the timestamp is left untouched.
        numeric_cols = device_df.select_dtypes(include='number').columns
        device_df[numeric_cols] = device_df[numeric_cols].interpolate(method='linear', axis=0)
        return device_df

    if extend_training_data:
        # Collect the per-device frames and concatenate once (concat inside the loop is quadratic).
        extended_frames = []
        for _, device_data in df_cpy.groupby('device_id', sort=False):
            full_range = pd.date_range(start=device_data['date_time_rounded'].min(),
                                       end=device_data['date_time_rounded'].max(),
                                       freq=freq)
            time_grid = pd.DataFrame({'date_time_rounded': full_range})
            merged_df = time_grid.merge(device_data, on='date_time_rounded', how='left')
            extended_frames.append(fill_consecutive_nans(merged_df, window_size))

        df_cpy_extended = pd.concat(extended_frames, ignore_index=True) if extended_frames else pd.DataFrame()
        print(f'extended data shape from {df_cpy.shape} to {df_cpy_extended.shape}')
        df_cpy = df_cpy_extended

    # Snapshot in the state needed as context data for later predictions.
    full_preprocessed_df_unscaled = df_cpy.copy()

    # A new run starts whenever the step to the previous row is not exactly <freq>
    # or the previous row belongs to another device.
    step_seconds = pd.to_timedelta(freq).total_seconds()
    is_consecutive = df_cpy['date_time_rounded'].diff().dt.total_seconds() == step_seconds
    device_changed = df_cpy['device_id'] != df_cpy['device_id'].shift(1)
    df_cpy['group'] = (~is_consecutive | device_changed).cumsum()

    # Keep only runs that can produce at least one (window, target) pair.
    run_length = df_cpy.groupby(['device_id', 'group'])['date_time_rounded'].transform('count')
    df_cpy = df_cpy.loc[run_length > window_size].copy()

    threshold_date = df_cpy['date_time_rounded'].quantile(0.8)
    print('training data cutoff: ', threshold_date)

    df_cpy['device_id'] = df_cpy['device_id'].astype('category').cat.codes

    df_train = df_cpy[df_cpy['date_time_rounded'] < threshold_date].copy()
    df_test = df_cpy[df_cpy['date_time_rounded'] >= threshold_date].copy()

    # Fit the scaler on the chronological training part only, then apply it to both parts.
    scaler = StandardScaler()
    columns_to_scale = [col for col in df_train.columns if col not in ['date_time_rounded', 'device_id', 'group']]
    df_train[columns_to_scale] = scaler.fit_transform(df_train[columns_to_scale])
    df_test[columns_to_scale] = scaler.transform(df_test[columns_to_scale])
    y_feature_scaler_index = columns_to_scale.index(y_feature)

    # The timestamp cannot be converted to a tensor.
    df_train = df_train.drop(columns=['date_time_rounded'])
    df_test = df_test.drop(columns=['date_time_rounded'])

    print(df_train.dtypes)

    def to_sequences(seq_size: int, obs: pd.DataFrame):
        """Build sliding windows of ``seq_size`` rows plus the following ``y_feature`` value.

        Windows never cross a ``group`` boundary. Returns ``(x, device_ids, y)`` with shapes
        ``(n, seq_size, n_features)``, ``(n,)`` and ``(n, 1)``; ``n`` may be 0.
        """
        feature_cols = [col for col in obs.columns if col not in ('device_id', 'group')]
        windows, targets, device_ids = [], [], []
        for _, group_df in obs.groupby('group', sort=False):
            # Convert once per run; slicing numpy arrays is far cheaper than slicing frames.
            features = group_df[feature_cols].to_numpy(dtype=np.float32)
            target_values = group_df[y_feature].to_numpy()
            device_codes = group_df['device_id'].to_numpy()
            for start in range(len(group_df) - seq_size):
                stop = start + seq_size
                windows.append(features[start:stop])
                device_ids.append(int(device_codes[stop - 1]))
                targets.append(float(target_values[stop]))
        if windows:
            x_array = np.stack(windows)
        else:
            x_array = np.empty((0, seq_size, len(feature_cols)), dtype=np.float32)
        return (torch.tensor(x_array, dtype=torch.float32),
                torch.tensor(device_ids, dtype=torch.long),
                torch.tensor(targets, dtype=torch.float32).view(-1, 1))

    print("Creating sequences...")
    x_train, train_device_ids, y_train = to_sequences(window_size, df_train)
    x_test, test_device_ids, y_test = to_sequences(window_size, df_test)

    print("Training data shape:", x_train.shape, train_device_ids.shape,y_train.shape)
    print("Testing data shape:", x_test.shape, test_device_ids.shape, y_test.shape)

    # Windows can only be shuffled after they were built, because each one needs its
    # chronological context. Merge both parts, shuffle, and split 80:20 again.
    # NOTE(review): neighbouring windows overlap, so after this shuffle the test split
    # shares time steps with the training split - confirm this is intended.
    x_combined = torch.cat((x_train, x_test), dim=0)
    device_ids_combined = torch.cat((train_device_ids, test_device_ids), dim=0)
    y_combined = torch.cat((y_train, y_test), dim=0)

    permutation = torch.randperm(len(x_combined))
    x_combined_shuffled = x_combined[permutation]
    device_ids_combined_shuffled = device_ids_combined[permutation]
    y_combined_shuffled = y_combined[permutation]

    split_index = int(0.8 * len(x_combined_shuffled))
    x_train_shuffled, x_test_shuffled = x_combined_shuffled[:split_index], x_combined_shuffled[split_index:]
    train_device_ids_shuffled, test_device_ids_shuffled = device_ids_combined_shuffled[:split_index], device_ids_combined_shuffled[split_index:]
    y_train_shuffled, y_test_shuffled = y_combined_shuffled[:split_index], y_combined_shuffled[split_index:]

    print("Shuffled Training data shape:", x_train_shuffled.shape, train_device_ids_shuffled.shape, y_train_shuffled.shape)
    print("Shuffled Testing data shape:", x_test_shuffled.shape, test_device_ids_shuffled.shape, y_test_shuffled.shape)

    # Data loaders for batched training / evaluation.
    train_dataset = TensorDataset(x_train_shuffled, train_device_ids_shuffled, y_train_shuffled)
    train_loader = DataLoader(train_dataset, batch_size=batch_size, shuffle=True, num_workers=8, pin_memory=True)

    test_dataset = TensorDataset(x_test_shuffled, test_device_ids_shuffled, y_test_shuffled)
    test_loader = DataLoader(test_dataset, batch_size=batch_size, shuffle=False, num_workers=8, pin_memory=True)

    # Return the targets that belong to test_dataset / test_loader (same rows, same order).
    # The chronological y_test no longer matches them after the re-shuffle above.
    return train_dataset, test_dataset, train_loader, test_loader, scaler, y_test_shuffled, full_preprocessed_df_unscaled, y_feature_scaler_index

# Build the 15-minute, 20-step CO2 forecasting data set from the raw readings.
(
    train_dataset,
    test_dataset,
    train_loader,
    test_loader,
    scaler,
    y_test,
    full_preprocessed_df_unscaled,
    y_feature_scaler_index,
) = get_data_for_multivarate_sequential_forecast(
    df,
    y_feature='CO2',
    window_size=20,
    aggregation_level='quarter_hour',
)

# Preview of the unscaled, aggregated frame that serves as prediction context.
full_preprocessed_df_unscaled.head(20)


extended data shape from (395686, 27) to (401930, 27)
training data cutoff:  2023-07-14 03:45:00
device_id              int8
tmp                 float64
hum                 float64
CO2                 float64
VOC                 float64
vis                 float64
IR                  float64
WIFI                float64
BLE                 float64
rssi                float64
channel_rssi        float64
channel_index       float64
spreading_factor    float64
bandwidth           float64
f_cnt               float64
isHoliday           float64
isExamTime          float64
weekday_sin         float64
weekday_cos         float64
month_sin           float64
month_cos           float64
time_sin            float64
time_cos            float64
semester_SS23       float64
semester_WS22/23    float64
semester_WS23/24    float64
group                 int64
dtype: object
Creating sequences...
Training data shape: torch.Size([316477, 20, 25]) torch.Size([316477]) torch.Size([316477, 1])
Testing data sha

AttributeError: 'TensorDataset' object has no attribute 'shape'