In [23]:
import torch
import numpy as np
import random
import time
import os
import sys
import pandas as pd
import matplotlib.pyplot as plt

In [24]:
def load_csv_to_numpy(file_path, delimiter=',', dtype=np.float32, skip_header=1, usecols=None):
    """
    Load a CSV file into a NumPy array, skipping headers and selecting specific columns.

    Args:
        file_path (str): Path to the CSV file.
        delimiter (str): Delimiter used in the CSV file. Default is ','.
        dtype (data-type): Data type of the resulting array. Default is np.float32.
        skip_header (int): Number of header rows to skip. Default is 1.
        usecols (list or int): Columns to load. Default is None (load all columns).

    Returns:
        np.ndarray: NumPy array containing the data from the CSV file.
    """
    # Load the CSV file into a NumPy array, skipping headers and selecting specific columns
    data = np.loadtxt(file_path, delimiter=delimiter, dtype=dtype, skiprows=skip_header, usecols=usecols)
    return data

def load_csv_to_tensor(file_path, delimiter=',', dtype=torch.float32, skip_header=1, usecols=None):
    """
    Load a CSV file into a PyTorch tensor, skipping headers and selecting specific columns.

    Args:
        file_path (str): Path to the CSV file.
        delimiter (str): Delimiter used in the CSV file. Default is ','.
        dtype (torch.dtype): Data type of the resulting tensor. Default is torch.float32.
        skip_header (int): Number of header rows to skip. Default is 1.
        usecols (list or int): Columns to load. Default is None (load all columns).

    Returns:
        torch.Tensor: PyTorch tensor containing the data from the CSV file.
    """
    # Load the CSV file into a NumPy array
    data = load_csv_to_numpy(file_path, delimiter=delimiter, dtype=np.float32, skip_header=skip_header, usecols=usecols)
    
    # Convert the NumPy array to a PyTorch tensor
    tensor = torch.tensor(data, dtype=dtype)
    
    return tensor

def load_csv_with_pandas(file_path, delimiter=','):
    """
    Load a CSV file into a pandas DataFrame for more flexible handling.

    Args:
        file_path (str): Path to the CSV file.
        delimiter (str): Delimiter used in the CSV file. Default is ','.

    Returns:
        pd.DataFrame: DataFrame containing the data from the CSV file.
    """
    return pd.read_csv(file_path, delimiter=delimiter)

def pandas_to_numpy(df, columns=None, dtype=np.float32):
    """
    Convert specific columns of a pandas DataFrame to a NumPy array.

    Args:
        df (pd.DataFrame): Input DataFrame.
        columns (list or str): Columns to convert. Default is None (all numeric columns).
        dtype (data-type): Data type of the resulting array. Default is np.float32.

    Returns:
        np.ndarray: NumPy array containing the selected columns.
    """
    if columns is not None:
        df = df[columns]
    return df.to_numpy(dtype=dtype)

def pandas_to_tensor(df, columns=None, dtype=torch.float32):
    """
    Convert specific columns of a pandas DataFrame to a PyTorch tensor.

    Args:
        df (pd.DataFrame): Input DataFrame.
        columns (list or str): Columns to convert. Default is None (all numeric columns).
        dtype (torch.dtype): Data type of the resulting tensor. Default is torch.float32.

    Returns:
        torch.Tensor: PyTorch tensor containing the selected columns.
    """
    if columns is not None:
        df = df[columns]
    return torch.tensor(df.to_numpy(), dtype=dtype)

# Example usage:
file_path = '/home/eduino/Synthetic_TS_Data_Gen/data/Real_GDP.csv'

# Using pandas to load the CSV file
df = load_csv_with_pandas(file_path)

# Convert specific columns to NumPy arrays or PyTorch tensors
dates = df['observation_date'].values  # Convert dates to a NumPy array
gdp_values_numpy = pandas_to_numpy(df, columns=['GDPC1'])  # Convert GDP values to a NumPy array
gdp_values_tensor = pandas_to_tensor(df, columns=['GDPC1'])  # Convert GDP values to a PyTorch tensor

print("DataFrame:")
print(df)

print("\nDates (NumPy Array):")
print(dates)

print("\nGDP Values (NumPy Array):")
print(gdp_values_numpy)

print("\nGDP Values (PyTorch Tensor):")
print(gdp_values_tensor)


DataFrame:
    observation_date      GDPC1
0         1947-01-01   2182.681
1         1947-04-01   2176.892
2         1947-07-01   2172.432
3         1947-10-01   2206.452
4         1948-01-01   2239.682
..               ...        ...
306       2023-07-01  22780.933
307       2023-10-01  22960.600
308       2024-01-01  23053.545
309       2024-04-01  23223.906
310       2024-07-01  23400.294

[311 rows x 2 columns]

Dates (NumPy Array):
['1947-01-01' '1947-04-01' '1947-07-01' '1947-10-01' '1948-01-01'
 '1948-04-01' '1948-07-01' '1948-10-01' '1949-01-01' '1949-04-01'
 '1949-07-01' '1949-10-01' '1950-01-01' '1950-04-01' '1950-07-01'
 '1950-10-01' '1951-01-01' '1951-04-01' '1951-07-01' '1951-10-01'
 '1952-01-01' '1952-04-01' '1952-07-01' '1952-10-01' '1953-01-01'
 '1953-04-01' '1953-07-01' '1953-10-01' '1954-01-01' '1954-04-01'
 '1954-07-01' '1954-10-01' '1955-01-01' '1955-04-01' '1955-07-01'
 '1955-10-01' '1956-01-01' '1956-04-01' '1956-07-01' '1956-10-01'
 '1957-01-01' '1957-04-01' '195

In [25]:
import os
import pandas as pd

# Define the directory containing the CSV files
data_directory = '/home/eduino/Synthetic_TS_Data_Gen/data'

# Initialize an empty list to store DataFrames
dataframes = []

# Iterate through all files in the directory
for file in os.listdir(data_directory):
    if file.endswith(".csv"):
        print(f"Loading file: {file}")
        # Load the CSV file into a DataFrame
        file_path = os.path.join(data_directory, file)
        df = pd.read_csv(file_path)  # Assuming `load_csv_with_pandas` is a placeholder for `pd.read_csv`
        
        # Convert the 'observation_date' column to datetime format
        df['observation_date'] = pd.to_datetime(df['observation_date'])
        
        # Append the DataFrame to the list
        dataframes.append(df)

# Merge all DataFrames on the 'observation_date' column
merged_df = dataframes[0]  # Start with the first DataFrame
for df in dataframes[1:]:
    merged_df = pd.merge(merged_df, df, on='observation_date', how='outer')

# Sort the merged DataFrame by 'observation_date'
merged_df = merged_df.sort_values(by='observation_date')

# Drop rows with NaN values
merged_df = merged_df.dropna()

# Create a NumPy array of shape (3, 64)
# Rows: Inflation, GDP, Unemployment
# Columns: Observations aligned by 'observation_date'
time_series_numpy = merged_df[['FPCPITOTLZGUSA', 'GDPC1', 'UNRATE']].T.to_numpy()

# Display the reshaped NumPy array
print(time_series_numpy)
print(time_series_numpy.shape)

Loading file: Real_GDP.csv
Loading file: inflation.csv
Loading file: unemployment.csv
[[ 1.45797599e+00  1.07072415e+00  1.19877335e+00  1.23966942e+00
   1.27891156e+00  1.58516926e+00  3.01507538e+00  2.77278562e+00
   4.27179615e+00  5.46238620e+00  5.83825534e+00  4.29276669e+00
   3.27227825e+00  6.17776006e+00  1.10548048e+01  9.14314686e+00
   5.74481264e+00  6.50168399e+00  7.63096384e+00  1.12544711e+01
   1.35492020e+01  1.03347153e+01  6.13142700e+00  3.21243523e+00
   4.30053548e+00  3.54564415e+00  1.89804772e+00  3.66456322e+00
   4.07774111e+00  4.82700303e+00  5.39795644e+00  4.23496396e+00
   3.02881968e+00  2.95165697e+00  2.60744159e+00  2.80541969e+00
   2.93120420e+00  2.33768994e+00  1.55227910e+00  2.18802720e+00
   3.37685727e+00  2.82617112e+00  1.58603163e+00  2.27009497e+00
   2.67723669e+00  3.39274685e+00  3.22594410e+00  2.85267248e+00
   3.83910030e+00 -3.55546266e-01  1.64004344e+00  3.15684157e+00
   2.06933727e+00  1.46483266e+00  1.62222298e+00  1.186

In [26]:
### Convert to time_VAE formatting ###
# time_VAE formatting: [samples, timestamps, features_list]
# TODO: How do I get more samples? 

# Reshape the array into the format [samples, timestamps, features_list]
# Since there is only 1 sample, we add an extra dimension at the beginning
num_timestamps = time_series_numpy.shape[1]  # Number of timestamps
num_features = time_series_numpy.shape[0]    # Number of features

# Reshape to [1, timestamps, features]
time_VAE_format = time_series_numpy.reshape(num_features, num_timestamps, 1)

# Print the shape of the resulting array
print("Shape of time_VAE_format:", time_VAE_format.shape)
print(time_VAE_format)


Shape of time_VAE_format: (3, 64, 1)
[[[ 1.45797599e+00]
  [ 1.07072415e+00]
  [ 1.19877335e+00]
  [ 1.23966942e+00]
  [ 1.27891156e+00]
  [ 1.58516926e+00]
  [ 3.01507538e+00]
  [ 2.77278562e+00]
  [ 4.27179615e+00]
  [ 5.46238620e+00]
  [ 5.83825534e+00]
  [ 4.29276669e+00]
  [ 3.27227825e+00]
  [ 6.17776006e+00]
  [ 1.10548048e+01]
  [ 9.14314686e+00]
  [ 5.74481264e+00]
  [ 6.50168399e+00]
  [ 7.63096384e+00]
  [ 1.12544711e+01]
  [ 1.35492020e+01]
  [ 1.03347153e+01]
  [ 6.13142700e+00]
  [ 3.21243523e+00]
  [ 4.30053548e+00]
  [ 3.54564415e+00]
  [ 1.89804772e+00]
  [ 3.66456322e+00]
  [ 4.07774111e+00]
  [ 4.82700303e+00]
  [ 5.39795644e+00]
  [ 4.23496396e+00]
  [ 3.02881968e+00]
  [ 2.95165697e+00]
  [ 2.60744159e+00]
  [ 2.80541969e+00]
  [ 2.93120420e+00]
  [ 2.33768994e+00]
  [ 1.55227910e+00]
  [ 2.18802720e+00]
  [ 3.37685727e+00]
  [ 2.82617112e+00]
  [ 1.58603163e+00]
  [ 2.27009497e+00]
  [ 2.67723669e+00]
  [ 3.39274685e+00]
  [ 3.22594410e+00]
  [ 2.85267248e+00]
  [

In [27]:
###Data Augmentation for more Samples###
# Original time series data (1 sample, timestamps, features)
# original_data = time_VAE_format

# # Number of augmented samples to create
# num_augmented_samples = 1000

# # Initialize a list to store augmented samples
# augmented_data = []

# for _ in range(num_augmented_samples):
#     # Add random noise to the original data
#     noise = np.random.normal(0, 0.05, original_data.shape)  # Adjust noise level as needed
#     augmented_sample = original_data + noise
    
#     # Append the augmented sample to the list
#     augmented_data.append(augmented_sample)

# # Convert the list to a numpy array
# augmented_data = np.concatenate(augmented_data, axis=0)

# print("Augmented data shape:", augmented_data.shape)


In [28]:
###Save the data to a .npz file###
def save_numpy_to_npz(data, npz_file_path):
    """
    Save a NumPy array directly into a compressed .npz file.

    Args:
        data (np.ndarray): The NumPy array to save.
        npz_file_path (str): Path to save the compressed .npz file.

    Returns:
        None
    """
    # Save the NumPy array as a compressed .npz file
    np.savez_compressed(npz_file_path, data=data)
    
    print(f"Data successfully saved to {npz_file_path}")

data_directory = '/home/eduino/Synthetic_TS_Data_Gen/data/'
npz_file_name = "macroeconomic.npz"
npz_file_path = os.path.join(data_directory, npz_file_name)

save_numpy_to_npz(time_VAE_format, npz_file_path)


Data successfully saved to /home/eduino/Synthetic_TS_Data_Gen/data/macroeconomic.npz
