In [None]:
import numpy as np
import torch

In [None]:
# Flexible helper: pair the features at one time index with the targets at another

def extract_features_and_target(data, target_time_index=0, feature_time_index=0):
    """
    Pair the features observed at one time index with the targets observed at another.

    Rows are paired positionally: the i-th row found at the feature time index is
    combined with the i-th row found at the target time index. The data is therefore
    assumed to list the groups in the same order at every time index - only the row
    counts are checked, not the group ids.

    Parameters:
    - data (numpy.ndarray): A 2D dataset with columns for index, time index, group id, features, and target - in this order.
    - target_time_index (int): The time index whose target values are used. Negative values count back from the last time index.
    - feature_time_index (int): The time index whose feature rows are used. Negative values count back from the last time index.

    Returns:
    - new_data (numpy.ndarray): The index, time index, group id and feature columns of the rows at
      `feature_time_index`, followed by the target values taken from the rows at `target_time_index`.
      Shape: (num_selected_rows, 3 + num_features + 1)

    Raises:
    - ValueError: If the data has no rows, if a time index lies outside the time range present in
      the data, or if the two time indices select a different number of rows.
    """
    # The time index lives in the second column; cast once so both masks compare integers
    time_column = data[:, 1].astype(int)

    if time_column.size == 0:
        raise ValueError("Data must contain at least one row.")

    # Time indices are zero-based, so the number of time steps is the largest index plus one
    num_time_steps = int(np.max(time_column)) + 1

    if target_time_index >= num_time_steps:
        raise ValueError("Target time index must be less than the maximum time index.")

    if feature_time_index >= num_time_steps:
        raise ValueError("Feature time index must be less than the maximum time index.")

    # Resolve negative indices the same way for both arguments (-1 is the last time index)
    if target_time_index < 0:
        target_time_index += num_time_steps
    if feature_time_index < 0:
        feature_time_index += num_time_steps
    if target_time_index < 0 or feature_time_index < 0:
        raise ValueError("Negative time indices must not reach back past the first time index.")

    # Select rows by their time index. Selecting on the target *value* instead (e.g. "non-zero")
    # would silently drop every row whose true target is 0.
    target_mask = time_column == target_time_index
    feature_mask = time_column == feature_time_index

    # The target is the last column. Promote to at least float64 so the result dtype does not
    # depend on the dtype of the input array.
    target_tt = data[target_mask, -1].astype(np.result_type(data.dtype, np.float64), copy=False)

    # Features start at the fourth column (after index, time index and group id) and stop before the target column
    features_ft = data[feature_mask, 3:-1]

    if target_tt.shape[0] != features_ft.shape[0]:
        raise ValueError(
            f"Time index {feature_time_index} has {features_ft.shape[0]} rows but "
            f"time index {target_time_index} has {target_tt.shape[0]} rows; they cannot be paired."
        )

    # Keep the bookkeeping columns of the feature rows: original index, time index and group id
    index_ft = data[feature_mask, 0]
    time_column_ft = data[feature_mask, 1]
    group_id_ft = data[feature_mask, 2]

    # Same column order as the input: index, time index, group id, features, target
    new_data = np.column_stack((index_ft, time_column_ft, group_id_ft, features_ft, target_tt))

    return new_data

In [None]:
# numpy version

def lead_column_within_groups(data, steps_to_lead=1, column_to_lead=-1, group_column=2):
    """
    Lead (shift down) a specified column within each group in a 2D numpy array.

    Within every group the value found in the k-th row of the group is moved to the
    (k + steps_to_lead)-th row of that group; the first `steps_to_lead` rows of each group
    are filled with 0. The shift is positional, so the rows of a group are assumed to
    already be in time order. The input array is not modified.

    Parameters:
    - data (numpy.ndarray): The original 2D numpy array. The second column is read as the time index when validating `steps_to_lead`.
    - steps_to_lead (int, optional): The number of steps to lead (default is 1). 0 returns an unchanged copy.
    - column_to_lead (int, optional): The index of the column to be led within each group (default is the last column).
    - group_column (int, optional): The index of the column that represents the groups (default is 2).

    Returns:
    - new_data (numpy.ndarray): A copy of the data with the specified column led within each group.

    Raises:
    - ValueError: If `steps_to_lead` is negative or exceeds the maximum time index in the data.
    """
    # Work on a copy so the caller's array is left untouched
    new_data = np.copy(data)

    if steps_to_lead < 0:
        raise ValueError("Steps to lead must be non-negative.")

    # Nothing to shift in an empty array (np.max below would fail on it)
    if data.shape[0] == 0:
        return new_data

    if steps_to_lead > np.max(data[:, 1]):
        raise ValueError("Steps to lead must not exceed the maximum time index.")

    # A zero-step lead is the identity. It needs its own exit because the slice
    # `[:-0]` used below is empty rather than "everything".
    if steps_to_lead == 0:
        return new_data

    group_values = data[:, group_column]

    for group in np.unique(group_values):
        # Positions of the rows that belong to the current group, in their original order
        group_indices = np.where(group_values == group)[0]

        # Move each value `steps_to_lead` rows further down within the group.
        # Values are read from the untouched `data`, so earlier writes cannot leak in.
        new_data[group_indices[steps_to_lead:], column_to_lead] = data[group_indices[:-steps_to_lead], column_to_lead]

        # The leading rows of the group have no earlier value to receive - fill them with 0
        new_data[group_indices[:steps_to_lead], column_to_lead] = 0

    return new_data
