In [1]:
import numpy as np
import torch
import time

# Record the library versions this notebook was executed with, so the timings
# and outputs below can be reproduced against the same stack.
print("NumPy version: ", np.__version__)
print("PyTorch version: ", torch.__version__)
print("CUDA version: ", torch.version.cuda)  # CUDA version reported by the PyTorch build

NumPy version:  1.24.3
PyTorch version:  2.0.1
CUDA version:  11.7


# Utils:

In [2]:
def timing_decorator(func):
    """
    Decorator that prints how long each call to ``func`` took.

    Args:
        func (callable): The function to time.

    Returns:
        callable: A wrapper that forwards all positional and keyword arguments
        to ``func``, prints ``"<name> took <seconds> seconds to run."`` and
        returns whatever ``func`` returned. If ``func`` raises, the exception
        propagates and nothing is printed.
    """
    import functools  # local import: the notebook's import cell does not bring in functools

    @functools.wraps(func)  # keep the wrapped function's __name__ / __doc__ intact
    def wrapper(*args, **kwargs):
        # perf_counter is monotonic and high resolution, unlike time.time(),
        # which can jump when the system clock is adjusted.
        start_time = time.perf_counter()
        result = func(*args, **kwargs)
        execution_time = time.perf_counter() - start_time
        print(f"{func.__name__} took {execution_time:.6f} seconds to run.")
        return result

    return wrapper

In [3]:
@timing_decorator
def sort_tensor_by_indices(tensor, primary_index, secondary_index):
    """
    Sort the rows of a 2-D tensor by a primary column, breaking ties with a secondary column.

    The shifter relies on its input being ordered by group first and by time
    within each group, which is what this function produces when called with
    the group column as ``primary_index`` and the time column as ``secondary_index``.

    Args:
        tensor (torch.Tensor or numpy.ndarray): The 2-D input to be sorted.
        primary_index (int): Column used as the main sort key. Should be the group column.
        secondary_index (int): Column used to order rows sharing the same primary key. Should be the time column.

    Returns:
        torch.Tensor or numpy.ndarray: The rows of ``tensor`` in sorted order.
        The result has the same type as the input (and, for tensors, lives on
        the same device).

    Raises:
        ValueError: If ``tensor`` is neither a PyTorch tensor nor a numpy array,
            or if it is not 2-dimensional.
    """
    is_torch = isinstance(tensor, torch.Tensor)

    if is_torch:
        # np.lexsort needs host memory and cannot consume CUDA or grad-tracking
        # tensors directly. For a plain CPU tensor this shares memory (no copy).
        keys = tensor.detach().cpu().numpy()
    elif isinstance(tensor, np.ndarray):
        keys = tensor
    else:
        raise ValueError("Input 'tensor' must be a PyTorch tensor or numpy array!")

    if keys.ndim != 2:
        raise ValueError("Input 'tensor' must be 2-dimensional (rows x columns)!")

    # torch does not have lexsort, so use NumPy's. Note that np.lexsort treats
    # the LAST key in the tuple as the primary sort key.
    sorted_indices = np.lexsort((keys[:, secondary_index], keys[:, primary_index]))

    if is_torch:
        # Index with a tensor on the same device as the data being reordered.
        sorted_indices = torch.from_numpy(sorted_indices).to(tensor.device)

    # Apply the sorting order to the original input
    return tensor[sorted_indices]


In [4]:
@timing_decorator
def shuffle_tensor_rows(tensor):
    """
    Return the rows of a PyTorch tensor in a random order.

    Only used to scramble the data before exercising the sorting function.

    Args:
        tensor (torch.Tensor): The tensor whose rows (dimension 0) are permuted.

    Returns:
        torch.Tensor: A new tensor holding the same rows in shuffled order.

    """
    row_count = tensor.size(0)

    # One random permutation of the row positions drives the reordering.
    permutation = torch.randperm(row_count)

    return tensor[permutation]

In [5]:
@timing_decorator
def check_if_subset(data, new_data, group_column_idx=2, target_column_idx=-1):
    """
    Heuristically check that the rows of new_data stem from the original data.

    For both tensors the group column and the target column are added together
    row by row. If the shifting and censoring is done correctly, every such sum
    in new_data should also occur among the sums of the original data.

    This is a necessary but not a sufficient condition: different
    (group, target) pairs can produce the same sum, so a True result does not
    prove that every row of new_data exists in data.

    Args:
        data (torch.Tensor): Original data tensor.
        new_data (torch.Tensor): Data to be checked for subset.
        group_column_idx (int, optional): Index of the group column. Default is 2.
        target_column_idx (int, optional): Index of the target column. Default is -1 (last column).

    Returns:
        bool: True if every sum from new_data occurs among the sums from data; False otherwise.
    """

    # Calculate the sum of the group column and the target column
    sum_col = data[:, group_column_idx] + data[:, target_column_idx]
    new_sum_col = new_data[:, group_column_idx] + new_data[:, target_column_idx]

    # Membership test is done on the CPU; convert the 0-dim tensor result to a
    # plain Python bool so the return value matches the documented type.
    is_subset = bool(torch.all(torch.isin(new_sum_col.cpu(), sum_col.cpu())))

    if is_subset:
        print("The new column is a subset of the original column.")
    else:
        print("The new column is NOT a subset of the original column.")

    return is_subset


In [6]:
@timing_decorator
def check_if_length_correct(data, new_data, steps):
    """
    Verify that new_data lost exactly the expected number of rows relative to data.

    The expected loss is one row per group (distinct value in column 2 of
    data) for every step shifted, i.e. ``number_of_groups * abs(steps)``.

    Args:
        data (torch.Tensor): Original data tensor.
        new_data (torch.Tensor): Data processed by the shift_and_mask_column function.
        steps (int): Number of steps the column was shifted; only its magnitude matters here.

    Returns:
        bool: True if the row-count difference equals the expected number of censored rows; False otherwise.
    """
    group_count = torch.unique(data[:, 2]).shape[0]
    expected_dropped = group_count * abs(steps)
    actual_dropped = data.shape[0] - new_data.shape[0]

    counts_match = actual_dropped == expected_dropped

    # Report the mismatch first, then fall through to the success message.
    if not counts_match:
        print("The number of rows censured is incorrect.")
        return counts_match

    print("The number of rows censured is correct.")
    return counts_match

In [7]:
@timing_decorator
def generate_synthetic_temporal_dataset(num_groups=3, num_time_steps=5, num_features=1, seed=None):
    """
    Generate a synthetic, balanced panel dataset with random features and a random target.

    Rows are laid out group by group, with the time index running from 0 to
    num_time_steps - 1 inside each group, so the result is already sorted by
    group and then by time.

    Parameters:
    - num_groups (int): Number of groups.
    - num_time_steps (int): Number of time steps per group.
    - num_features (int): Number of feature columns.
    - seed (int, optional): If given, the random columns are drawn from a dedicated
      generator seeded with this value, making the output reproducible. If None
      (default), NumPy's global random state is used.

    Returns:
    - synthetic_dataset (numpy.ndarray): Array of shape
      (num_groups * num_time_steps, 4 + num_features) with columns for
      index, time index, group id, features, and target. Features and target
      are independent uniform draws from [0, 1).
    """

    n_rows = num_groups * num_time_steps

    # The module-level np.random functions and a RandomState instance expose
    # the same .rand API, so one code path serves both cases.
    rng = np.random if seed is None else np.random.RandomState(seed)

    # Group id: each id repeated for all of its time steps (0,0,0,1,1,1,...)
    group_array = np.repeat(np.arange(num_groups), num_time_steps)

    # Time index: the full range repeated once per group (0,1,2,0,1,2,...)
    time_array = np.tile(np.arange(num_time_steps), num_groups)

    # Random feature columns (drawn before the target to keep the draw order stable)
    feature_array = rng.rand(n_rows, num_features)

    # Running row index
    indx_array = np.arange(n_rows)

    # Random target column
    target_array = rng.rand(n_rows, 1)

    # Combine all columns into a single numpy array
    synthetic_dataset = np.column_stack((indx_array, time_array, group_array, feature_array, target_array))

    return synthetic_dataset

# Example usage:
# synthetic_data = generate_synthetic_temporal_dataset(num_groups=3, num_time_steps=5, num_features=1)


# Shifter

In [8]:
import torch

@timing_decorator
def shift_and_mask_column(data_org, column_to_shift_idx=-1, time_column_idx=1, steps= 1, force_cpu=False, return_full=False, retain_unshifted_col = False):
    """
    Shift one column along the row axis by `steps` and NaN out the values that would leak across groups.

    The column is rolled with ``torch.roll(..., shifts=-steps)``. For a positive
    `steps` every row therefore receives the value found `steps` rows further
    down (the value `steps` time steps later, given the expected ordering);
    for a negative `steps` it receives the value `|steps|` rows further up.
    Before rolling, the values belonging to the `|steps|` smallest (positive
    `steps`) or largest (negative `steps`) distinct time values are replaced by
    NaN, so that the rows which would otherwise receive a value from a
    neighbouring group end up as NaN instead.

    NOTE: this only yields correct per-group shifts if the rows are sorted by
    group and then by time, and every group covers the same set of time values
    (see sort_tensor_by_indices). The function does not verify this.

    The input is never modified; all work happens on a copy. The column to
    shift must have a floating point dtype, since NaN is written into it.

    Args:
        data_org (torch.Tensor or numpy.ndarray): Input data tensor. If a numpy array is provided, it will be converted to a torch.Tensor.
        column_to_shift_idx (int, optional): Index of the column to be shifted. Default is -1. I.e., the last column which is usually the target column.
        time_column_idx (int, optional): Index of the time column. Default is 1.
        steps (int, optional): Number of rows to shift the specified column by. Default is 1. 0 leaves the column unchanged.
        force_cpu (bool, optional): If True, forces computation on CPU. Default is False, in which case CUDA is used when available and the CPU otherwise.
        return_full (bool, optional): Whether to return the full data with NaN rows included (default is False). Nice for testing and debugging.
        retain_unshifted_col (bool, optional): If True, the original column is kept and the shifted column is appended as a new last column. If False (default), the shifted column replaces the original one.

    Returns:
        torch.Tensor: Processed data tensor on the device used for the computation.
        Unless 'return_full' is True, rows whose shifted value is NaN are dropped.

    """

    # Convert the input NumPy array to a PyTorch tensor if it is not already
    if not isinstance(data_org, torch.Tensor):
        data_org = torch.from_numpy(data_org)

    # Use the GPU if available and allowed, otherwise fall back to the CPU.
    # copy=True guarantees a fresh tensor even when the input already lives on
    # the target device, so the caller's data is never mutated in place.
    device = 'cuda' if (torch.cuda.is_available() and not force_cpu) else 'cpu'
    data = data_org.to(device, copy=True)

    time_column = data[:, time_column_idx]
    sorted_time_values = torch.unique(time_column)  # torch.unique returns ascending order by default

    # Pick the time values whose entries must not survive the roll: the first
    # |steps| time values when pulling values up, the last |steps| when pushing
    # them down. steps == 0 masks nothing.
    if steps > 0:
        time_values_to_mask = sorted_time_values[:steps]
    elif steps < 0:
        time_values_to_mask = sorted_time_values[steps:]
    else:
        time_values_to_mask = sorted_time_values[:0]

    time_values_mask = torch.isin(time_column, time_values_to_mask)

    column_to_shift = data[:, column_to_shift_idx]
    masked_column_to_shift = torch.where(time_values_mask, torch.full_like(column_to_shift, float('nan')), column_to_shift)

    # A negative shift moves every value `steps` rows towards the top.
    masked_column_shifted = torch.roll(masked_column_to_shift, shifts=-steps, dims=0)

    if retain_unshifted_col:
        # Keep the original column and append the shifted one as the last column
        data = torch.column_stack([data, masked_column_shifted])
    else:
        # Replace the original column with the shifted column
        data[:, column_to_shift_idx] = masked_column_shifted

    # Returning the full data is mostly for testing and debugging purposes.
    # The filter is based on the shifted column itself, so it also works when
    # that column was appended rather than written back.
    if not return_full:
        data = data[~torch.isnan(masked_column_shifted)]

    return data


# TESTING!

In [9]:
def run_tests(steps_to_shift = 1, num_groups=15000, num_time_steps=400, num_features=100):
    """
    End-to-end check of the shifter on a synthetic dataset.

    Generates a synthetic panel, shuffles it, sorts it back by group and time,
    shifts the target column and then runs check_if_subset and
    check_if_length_correct on the result. Prints a summary line stating
    whether both checks passed.

    Args:
        steps_to_shift (int, optional): Number of steps passed to shift_and_mask_column. Default is 1.
        num_groups (int, optional): Number of groups in the synthetic data (would be the number of unique pg_id). Default is 15000.
        num_time_steps (int, optional): Number of time steps per group (would be month_id). Default is 400.
        num_features (int, optional): Number of feature columns. Default is 100.

    Note:
        With the defaults the dataset has 6,000,000 rows and 104 float64
        columns (roughly 5 GB), and several copies are made along the way.
        Pass smaller sizes for a quick smoke test.
    """

    data_org = generate_synthetic_temporal_dataset(num_groups=num_groups, num_time_steps=num_time_steps, num_features=num_features)

    # The generator returns a NumPy array; the helpers below expect a PyTorch
    # tensor. Ideally this conversion should not live here.
    if not isinstance(data_org, torch.Tensor):
        data_org = torch.from_numpy(data_org)

    data_org = shuffle_tensor_rows(data_org) # to test the sorting function

    primary_index = 2 # group column
    secondary_index = 1 # time column
    data_org = sort_tensor_by_indices(data_org, primary_index, secondary_index) # this step is paramount for the rolling done in shift_and_mask_column to be accurate

    new_data = shift_and_mask_column(data_org, steps=steps_to_shift, return_full=False, force_cpu=True) # force CPU now since GPU runs out of mem. Fix later.

    print()
    test1  = check_if_subset(data_org, new_data)
    print()
    test2 = check_if_length_correct(data_org, new_data, steps_to_shift)

    if test1 and test2:
        print("\nAll tests passed!")
    else:
        # Make a failure as visible as a success instead of ending silently.
        print("\nSome tests FAILED!")

In [10]:
# Full-size check of the shifter with a shift of 1 step.
run_tests(1)

generate_synthetic_temporal_dataset took 3.836895 seconds to run.
shuffle_tensor_rows took 0.447205 seconds to run.
sort_tensor_by_indices took 1.864262 seconds to run.
shift_and_mask_column took 1.272718 seconds to run.

The new column is a subset of the original column.
check_if_subset took 5.635707 seconds to run.

The number of rows censured is correct.
check_if_length_correct took 0.055111 seconds to run.

All tests passed!


In [11]:
# Full-size check of the shifter with a shift of 2 steps.
run_tests(2)

generate_synthetic_temporal_dataset took 3.941547 seconds to run.
shuffle_tensor_rows took 0.412981 seconds to run.
sort_tensor_by_indices took 1.831906 seconds to run.
shift_and_mask_column took 0.860739 seconds to run.

The new column is a subset of the original column.
check_if_subset took 5.596295 seconds to run.

The number of rows censured is correct.
check_if_length_correct took 0.052355 seconds to run.

All tests passed!


In [12]:
# Full-size check of the shifter with a shift of 3 steps.
run_tests(3)

generate_synthetic_temporal_dataset took 3.894655 seconds to run.
shuffle_tensor_rows took 0.475444 seconds to run.
sort_tensor_by_indices took 1.857387 seconds to run.
shift_and_mask_column took 0.869786 seconds to run.

The new column is a subset of the original column.
check_if_subset took 5.534508 seconds to run.

The number of rows censured is correct.
check_if_length_correct took 0.053815 seconds to run.

All tests passed!


In [13]:
# Full-size check of the shifter with a shift of 4 steps.
run_tests(4)

generate_synthetic_temporal_dataset took 3.881997 seconds to run.
shuffle_tensor_rows took 0.419707 seconds to run.
sort_tensor_by_indices took 1.851285 seconds to run.
shift_and_mask_column took 0.860537 seconds to run.

The new column is a subset of the original column.
check_if_subset took 5.571023 seconds to run.

The number of rows censured is correct.
check_if_length_correct took 0.048917 seconds to run.

All tests passed!


In [14]:
def viz_test(steps_to_shift = 1):
    """
    Print the shifter's full output on a tiny dataset for visual inspection.

    Builds a 5-group x 4-time-step dataset with one feature, shuffles it, sorts
    it back by group and time, and prints the result of shift_and_mask_column
    with the NaN rows kept and the unshifted column retained.

    Args:
        steps_to_shift (int, optional): Number of steps passed to shift_and_mask_column. Default is 1.
    """
    group_col = 2
    time_col = 1

    generated = generate_synthetic_temporal_dataset(num_groups=5, num_time_steps=4, num_features=1)

    # The generator hands back a NumPy array; convert it unless it is a tensor
    # already. Ideally this conversion should not live here.
    as_tensor = generated if isinstance(generated, torch.Tensor) else torch.from_numpy(generated)

    # Scramble the rows so that the sort below actually has work to do.
    scrambled = shuffle_tensor_rows(as_tensor)

    # Sorting by group and then time is paramount for the rolling done in
    # shift_and_mask_column to be accurate.
    ordered = sort_tensor_by_indices(scrambled, group_col, time_col)

    # Force CPU for now since the GPU runs out of memory. Fix later.
    shifted = shift_and_mask_column(
        ordered,
        steps=steps_to_shift,
        return_full=True,
        force_cpu=True,
        retain_unshifted_col=True,
    )

    print()
    print(shifted)


In [15]:
# Shift by 1 step. Column 0 is the index, column 1 the time and column 2 the group.
# In the printed output the appended last column holds, for each row, the
# second-to-last column's value from the next row; the last row of each group is NaN.
viz_test(1)

generate_synthetic_temporal_dataset took 0.000117 seconds to run.
shuffle_tensor_rows took 0.000113 seconds to run.
sort_tensor_by_indices took 0.000403 seconds to run.
shift_and_mask_column took 0.002195 seconds to run.

tensor([[ 0.0000,  0.0000,  0.0000,  0.8756,  0.8129,  0.7878],
        [ 1.0000,  1.0000,  0.0000,  0.7367,  0.7878,  0.6556],
        [ 2.0000,  2.0000,  0.0000,  0.5285,  0.6556,  0.2438],
        [ 3.0000,  3.0000,  0.0000,  0.4179,  0.2438,     nan],
        [ 4.0000,  0.0000,  1.0000,  0.1165,  0.2763,  0.5425],
        [ 5.0000,  1.0000,  1.0000,  0.9187,  0.5425,  0.0457],
        [ 6.0000,  2.0000,  1.0000,  0.1132,  0.0457,  0.2634],
        [ 7.0000,  3.0000,  1.0000,  0.6321,  0.2634,     nan],
        [ 8.0000,  0.0000,  2.0000,  0.7566,  0.2106,  0.4109],
        [ 9.0000,  1.0000,  2.0000,  0.7207,  0.4109,  0.5155],
        [10.0000,  2.0000,  2.0000,  0.9400,  0.5155,  0.4699],
        [11.0000,  3.0000,  2.0000,  0.1085,  0.4699,     nan],
        [1

In [16]:
# Shift by 2 steps: in the printed output the last two rows of each group are NaN in the appended column.
viz_test(2)

generate_synthetic_temporal_dataset took 0.000229 seconds to run.
shuffle_tensor_rows took 0.000238 seconds to run.
sort_tensor_by_indices took 0.000248 seconds to run.
shift_and_mask_column took 0.000277 seconds to run.

tensor([[ 0.0000,  0.0000,  0.0000,  0.7917,  0.7443,  0.0229],
        [ 1.0000,  1.0000,  0.0000,  0.0465,  0.1550,  0.1394],
        [ 2.0000,  2.0000,  0.0000,  0.4313,  0.0229,     nan],
        [ 3.0000,  3.0000,  0.0000,  0.2962,  0.1394,     nan],
        [ 4.0000,  0.0000,  1.0000,  0.0968,  0.4952,  0.3835],
        [ 5.0000,  1.0000,  1.0000,  0.5813,  0.9874,  0.9790],
        [ 6.0000,  2.0000,  1.0000,  0.8565,  0.3835,     nan],
        [ 7.0000,  3.0000,  1.0000,  0.2171,  0.9790,     nan],
        [ 8.0000,  0.0000,  2.0000,  0.7208,  0.2359,  0.0660],
        [ 9.0000,  1.0000,  2.0000,  0.7603,  0.2580,  0.3006],
        [10.0000,  2.0000,  2.0000,  0.8957,  0.0660,     nan],
        [11.0000,  3.0000,  2.0000,  0.4017,  0.3006,     nan],
        [1

In [17]:
# Shift by 3 steps: in the printed output the last three rows of each group are NaN in the appended column.
viz_test(3)

generate_synthetic_temporal_dataset took 0.000373 seconds to run.
shuffle_tensor_rows took 0.000132 seconds to run.
sort_tensor_by_indices took 0.000084 seconds to run.
shift_and_mask_column took 0.000159 seconds to run.

tensor([[ 0.0000,  0.0000,  0.0000,  0.4698,  0.5509,  0.5347],
        [ 1.0000,  1.0000,  0.0000,  0.6937,  0.9644,     nan],
        [ 2.0000,  2.0000,  0.0000,  0.7465,  0.3936,     nan],
        [ 3.0000,  3.0000,  0.0000,  0.1871,  0.5347,     nan],
        [ 4.0000,  0.0000,  1.0000,  0.0921,  0.4029,  0.2876],
        [ 5.0000,  1.0000,  1.0000,  0.0369,  0.0663,     nan],
        [ 6.0000,  2.0000,  1.0000,  0.5445,  0.1087,     nan],
        [ 7.0000,  3.0000,  1.0000,  0.4566,  0.2876,     nan],
        [ 8.0000,  0.0000,  2.0000,  0.8035,  0.9868,  0.0342],
        [ 9.0000,  1.0000,  2.0000,  0.5276,  0.7462,     nan],
        [10.0000,  2.0000,  2.0000,  0.2500,  0.2717,     nan],
        [11.0000,  3.0000,  2.0000,  0.3406,  0.0342,     nan],
        [1