In [1]:
import torch
import numpy as np
import random
import time
import os
import sys
import pandas as pd
import matplotlib.pyplot as plt
from datetime import datetime

In [2]:
def load_csv_to_numpy(file_path, delimiter=',', dtype=np.float32, skip_header=1, usecols=None):
    """
    Load a CSV file into a NumPy array, skipping headers and selecting specific columns.

    Args:
        file_path (str): Path to the CSV file.
        delimiter (str): Delimiter used in the CSV file. Default is ','.
        dtype (data-type): Data type of the resulting array. Default is np.float32.
        skip_header (int): Number of header rows to skip. Default is 1.
        usecols (list or int): Columns to load. Default is None (load all columns).

    Returns:
        np.ndarray: NumPy array containing the data from the CSV file.
    """
    # Load the CSV file into a NumPy array, skipping headers and selecting specific columns
    data = np.loadtxt(file_path, delimiter=delimiter, dtype=dtype, skiprows=skip_header, usecols=usecols)
    return data

def load_csv_to_tensor(file_path, delimiter=',', dtype=torch.float32, skip_header=1, usecols=None):
    """
    Load a CSV file into a PyTorch tensor, skipping headers and selecting specific columns.

    Args:
        file_path (str): Path to the CSV file.
        delimiter (str): Delimiter used in the CSV file. Default is ','.
        dtype (torch.dtype): Data type of the resulting tensor. Default is torch.float32.
        skip_header (int): Number of header rows to skip. Default is 1.
        usecols (list or int): Columns to load. Default is None (load all columns).

    Returns:
        torch.Tensor: PyTorch tensor containing the data from the CSV file.
    """
    # Load the CSV file into a NumPy array
    data = load_csv_to_numpy(file_path, delimiter=delimiter, dtype=np.float32, skip_header=skip_header, usecols=usecols)
    
    # Convert the NumPy array to a PyTorch tensor
    tensor = torch.tensor(data, dtype=dtype)
    
    return tensor

def load_csv_with_pandas(file_path, delimiter=','):
    """
    Load a CSV file into a pandas DataFrame for more flexible handling.

    Args:
        file_path (str): Path to the CSV file.
        delimiter (str): Delimiter used in the CSV file. Default is ','.

    Returns:
        pd.DataFrame: DataFrame containing the data from the CSV file.
    """
    return pd.read_csv(file_path, delimiter=delimiter)

def pandas_to_numpy(df, columns=None, dtype=np.float32):
    """
    Convert specific columns of a pandas DataFrame to a NumPy array.

    Args:
        df (pd.DataFrame): Input DataFrame.
        columns (list or str): Columns to convert. Default is None (all numeric columns).
        dtype (data-type): Data type of the resulting array. Default is np.float32.

    Returns:
        np.ndarray: NumPy array containing the selected columns.
    """
    if columns is not None:
        df = df[columns]
    return df.to_numpy(dtype=dtype)

def pandas_to_tensor(df, columns=None, dtype=torch.float32):
    """
    Convert specific columns of a pandas DataFrame to a PyTorch tensor.

    Args:
        df (pd.DataFrame): Input DataFrame.
        columns (list or str): Columns to convert. Default is None (all numeric columns).
        dtype (torch.dtype): Data type of the resulting tensor. Default is torch.float32.

    Returns:
        torch.Tensor: PyTorch tensor containing the selected columns.
    """
    if columns is not None:
        df = df[columns]
    return torch.tensor(df.to_numpy(), dtype=dtype)

def monthly_inflation_rate(annual_rate):
    """
    Calculate the monthly inflation rate given the annual inflation rate.

    Parameters:
    annual_rate (float): The annual inflation rate in decimal form (e.g., 0.12 for 12%).

    Returns:
    float: The monthly inflation rate in decimal form.
    """
    monthly_rate = (1 + annual_rate) ** (1 / 12) - 1
    return monthly_rate

# # Example usage:
# file_path = '/home/vincentwork/Synthetic_TS_Data_Gen/data/Real_GDP.csv'

# # Using pandas to load the CSV file
# df = load_csv_with_pandas(file_path)

# # Convert specific columns to NumPy arrays or PyTorch tensors
# dates = df['observation_date'].values  # Convert dates to a NumPy array
# gdp_values_numpy = pandas_to_numpy(df, columns=['GDPC1'])  # Convert GDP values to a NumPy array
# gdp_values_tensor = pandas_to_tensor(df, columns=['GDPC1'])  # Convert GDP values to a PyTorch tensor

# print("DataFrame:")
# print(df)

# print("\nDates (NumPy Array):")
# print(dates)

# print("\nGDP Values (NumPy Array):")
# print(gdp_values_numpy)

# print("\nGDP Values (PyTorch Tensor):")
# print(gdp_values_tensor)


In [13]:
# Loop through the data directory and load all the csv files

# Define the directory containing the CSV files
data_directory = '/home/vincentwork/Synthetic_TS_Data_Gen/data/'

# Initialize an empty list to store DataFrames
dataframes = []

# Iterate through all files in the directory
for file in os.listdir(data_directory):
    if file.endswith(".csv"):
        print(f"Loading file: {file}")
        # Load the CSV file into a DataFrame
        file_path = os.path.join(data_directory, file)
        df = load_csv_with_pandas(file_path)
        
        # Convert the 'observation_date' column to datetime format
        df['observation_date'] = pd.to_datetime(df['observation_date'])
        
        feature = df.columns[-1]
        
        if feature == 'FPCPITOTLZGUSA':
            expanded_data = []
            for index, row in df.iterrows():
                year = row['observation_date'].year
                value = row[feature]
                montly_value = monthly_inflation_rate(value)
                for month in range(1, 13):
                    expanded_data.append({
                        'observation_date': datetime(year, month, 1),  
                        feature: montly_value
                    })
            expanded_df = pd.DataFrame(expanded_data)
            df = expanded_df.sort_values(by='observation_date').reset_index(drop=True)
        
        if feature == 'GDPC1':
            df.set_index('observation_date', inplace=True)
            monthly_df = df.resample('MS').interpolate(method='linear')
            monthly_df.reset_index(inplace=True)
            df = monthly_df
            
        # Append the DataFrame to the list
        dataframes.append(df)

        # Merge all DataFrames on the 'observation_date' column
        merged_df = dataframes[0]  # Start with the first DataFrame


for df in dataframes[1:]:
    merged_df = pd.merge(merged_df, df, on='observation_date', how='outer')

# Sort the merged DataFrame by 'observation_date'
merged_df = merged_df.sort_values(by='observation_date')

# Fill in NaN data with the previous value
# merged_df = merged_df.fillna(method='ffill')

merged_df.drop(columns=['observation_date'], inplace=True)

merged_df.dropna(inplace=True)

merged_df.to_csv('/home/vincentwork/Synthetic_TS_Data_Gen/data/diffusion3.csv', index=False)


headers = [col for col in merged_df.columns if col != 'observation_date']

# Drop nan values (rows)
# merged_df = merged_df.dropna()

# Convert to Numpy array
# Select only the columns containing the time series data
# time_series_numpy = pandas_to_numpy(merged_df, columns=headers)
# print(time_series_numpy)
    

Loading file: unemployment.csv
Loading file: inflation.csv
Loading file: Real_GDP.csv


In [10]:
### Convert to time_VAE formatting ###
# time_VAE formatting: [samples, timestamps, features_list]
# TODO: How do I get more samples? 

# Reshape the array into the format [samples, timestamps, features_list]
# Since there is only 1 sample, we add an extra dimension at the beginning
# num_timestamps = time_series_numpy.shape[0]  # Number of timestamps
# num_features = time_series_numpy.shape[1]    # Number of features

# Reshape to [1, timestamps, features]
# time_VAE_format = time_series_numpy.reshape(1, num_timestamps, num_features)

# Reshape to [num years (samples), timestamps (monthly), features (unemployment, inflation, GDP)]
# Extract year from observation_date
merged_df['year'] = merged_df['observation_date'].dt.year

# Split the DataFrame by year into a dictionary of DataFrames
yearly_dict = {year: group.drop(columns=['year']) for year, group in merged_df.groupby('year')}

# Convert each yearly DataFrame to a 2D numpy array and store in a list
yearly_arrays = [pandas_to_numpy(year_df, headers) for year, year_df in yearly_dict.items()]

# Combine the 2D arrays into a 3D numpy array
time_series_numpy = np.stack(yearly_arrays)
    

# Print the shape of the resulting array
print("Shape of time_VAE_format:", time_series_numpy.shape)
print(time_series_numpy)


Shape of time_VAE_format: (64, 12, 3)
[[[5.1999998e+00 7.7824712e-02 3.5171809e+03]
  [4.8000002e+00 7.7824712e-02 3.5108694e+03]
  [5.4000001e+00 7.7824712e-02 3.5045576e+03]
  ...
  [6.0999999e+00 7.7824712e-02 3.4702781e+03]
  [6.0999999e+00 7.7824712e-02 3.4780864e+03]
  [6.5999999e+00 7.7824712e-02 3.4858948e+03]]

 [[6.5999999e+00 6.2535673e-02 3.4937029e+03]
  [6.9000001e+00 6.2535673e-02 3.5134756e+03]
  [6.9000001e+00 6.2535673e-02 3.5332483e+03]
  ...
  [6.5000000e+00 6.2535673e-02 3.6922891e+03]
  [6.0999999e+00 6.2535673e-02 3.7142417e+03]
  [6.0000000e+00 6.2535673e-02 3.7361943e+03]]

 [[5.8000002e+00 6.7861773e-02 3.7581470e+03]
  [5.5000000e+00 6.7861773e-02 3.7694810e+03]
  [5.5999999e+00 6.7861773e-02 3.7808149e+03]
  ...
  [5.4000001e+00 6.7861773e-02 3.8514209e+03]
  [5.6999998e+00 6.7861773e-02 3.8654414e+03]
  [5.5000000e+00 6.7861773e-02 3.8794617e+03]]

 ...

 [[6.4000001e+00 1.5604828e-01 2.1058379e+04]
  [6.1999998e+00 1.5604828e-01 2.1168588e+04]
  [6.0999999

In [35]:
# ###Data Augmentation for more Samples###
# # Original time series data (1 sample, timestamps, features)
# original_data = time_series_numpy

# # Number of augmented samples to create
# num_augmented_samples = 1000

# # Initialize a list to store augmented samples
# augmented_data = []

# for _ in range(num_augmented_samples):
#     # Add random noise to the original data
#     noise = np.random.normal(0, 0.05, original_data.shape)  # Adjust noise level as needed
#     augmented_sample = original_data + noise
    
#     # Append the augmented sample to the list
#     augmented_data.append(augmented_sample)

# # Convert the list to a numpy array
# augmented_data = np.concatenate(augmented_data, axis=0)

# print("Augmented data shape:", augmented_data.shape)


In [11]:
###Save the data to a .npz file###
def save_numpy_to_npz(data, npz_file_path):
    """
    Save a NumPy array directly into a compressed .npz file.

    Args:
        data (np.ndarray): The NumPy array to save.
        npz_file_path (str): Path to save the compressed .npz file.

    Returns:
        None
    """
    # Save the NumPy array as a compressed .npz file
    np.savez_compressed(npz_file_path, data=data)
    
    print(f"Data successfully saved to {npz_file_path}")

data_directory = '/home/vincentwork/Synthetic_TS_Data_Gen/data/'
npz_file_name = "macroeconomic.npz"
npz_file_path = os.path.join(data_directory, npz_file_name)

save_numpy_to_npz(time_series_numpy, npz_file_path)


Data successfully saved to /home/vincentwork/Synthetic_TS_Data_Gen/data/macroeconomic.npz
