### Construct the Full Training and Testing Set

### One-Hot Encode the states Column

In [None]:
# Import the necessary libraries
from general_used_functions import *
from sklearn.preprocessing import OneHotEncoder

# Load the project configuration once; the two entries below are read from it.
config_data = load_config_file()
# Looked up per stock further down: a stock missing from this mapping is skipped,
# and the value stored for a stock is passed to load_states_data as the model name.
selected_unsupervised_model = config_data['selected_unsupervised_model']
# Iterated further down to obtain the stock symbols.
# NOTE(review): the config key is 'stock_dict', so this is presumably a dict keyed
# by symbol rather than a list — confirm against the config file.
stock_list = config_data['stock_dict']

def load_states_data(stock, selected_model):
    """Read the training and testing regime-state workbooks for one stock.

    The workbooks are looked up under
    ``<cwd>/data/<selected_model>_states/<stock>/``.

    Args:
        stock: Stock symbol used in the directory and file names.
        selected_model: Name of the unsupervised model used in the directory
            and file names.

    Returns:
        A ``(training_states, testing_states)`` tuple of DataFrames as
        returned by ``pd.read_excel``.
    """
    states_dir = os.getcwd() + f'/data/{selected_model}_states/{stock}'

    # Build both workbook paths up front, then read training before testing.
    train_path = states_dir + f'/{stock}_{selected_model}_states(Training).xlsx'
    test_path = states_dir + f'/{stock}_{selected_model}_states(Testing).xlsx'

    return pd.read_excel(train_path), pd.read_excel(test_path)

# Define a function for one-hot encoding the 'states' column
def one_hot_encode_states(df, categories=None):
    """Replace the ``states`` column of ``df`` with one-hot indicator columns.

    The indicator columns are named ``state_0``, ``state_1``, ... by their
    position in the encoder's (sorted) category list, not by the state value
    itself. With the default ``categories=None`` the category list is learned
    from ``df`` alone, so two frames containing different sets of states get
    columns whose meaning differs. Pass the same ``categories`` for every
    frame to obtain an identical column layout.

    Args:
        df: DataFrame containing a ``states`` column.
        categories: Optional iterable of all state values to encode. When
            given, exactly one column per listed value is produced (in sorted
            order) even if the value does not occur in ``df``. scikit-learn
            rejects values in ``df`` that are not listed.

    Returns:
        A new DataFrame with ``states`` dropped and the dense indicator
        columns appended, keeping the index of ``df``.

    Raises:
        KeyError: If ``df`` has no ``states`` column.
    """
    encoder_kwargs = {}
    if categories is not None:
        # scikit-learn expects one category list per encoded column, and numeric
        # categories must be sorted.
        encoder_kwargs['categories'] = [sorted(categories)]

    # Request a dense array. The keyword was renamed from ``sparse`` to
    # ``sparse_output`` in scikit-learn 1.2 and the old name was removed in 1.4,
    # so try the current spelling first and fall back for older installations.
    try:
        ohe = OneHotEncoder(sparse_output=False, **encoder_kwargs)
    except TypeError:
        ohe = OneHotEncoder(sparse=False, **encoder_kwargs)

    # Fit and transform the 'states' column (the encoder requires 2D input)
    state_array = ohe.fit_transform(df[['states']])
    # Create column names for each state, e.g., state_0, state_1, etc.
    state_feature_names = [f"state_{i}" for i in range(state_array.shape[1])]
    # Create a DataFrame for the encoded states, retaining the original index
    state_df = pd.DataFrame(state_array, columns=state_feature_names, index=df.index)
    # Concatenate the encoded columns with the original frame minus 'states'
    return pd.concat([df.drop(columns=['states']), state_df], axis=1)

# Process each stock and encode the regime states
states_encoded_dict = {}
for stock in stock_list:
    if stock not in selected_unsupervised_model:
        continue

    # Load training and test states data
    training_states_data, test_states_data = load_states_data(stock, selected_unsupervised_model[stock])

    # The two splits are encoded independently and the encoded columns are named
    # by position, so state_i only refers to the same regime in both outputs when
    # both splits contain the same set of states. Flag the cases where they do not.
    train_state_values = set(training_states_data['states'].dropna().unique())
    test_state_values = set(test_states_data['states'].dropna().unique())
    if train_state_values != test_state_values:
        print(
            f"Warning: {stock} training and testing splits contain different states "
            f"({sorted(train_state_values, key=str)} vs {sorted(test_state_values, key=str)}); "
            "the state_i columns of the two splits may not correspond."
        )

    # Apply one-hot encoding on both training and test data
    training_states_encoded = one_hot_encode_states(training_states_data)
    test_states_encoded = one_hot_encode_states(test_states_data)

    # Save the encoded data in a dictionary for later use
    states_encoded_dict[stock] = (training_states_encoded, test_states_encoded)

# Example: print the first few rows of the one-hot encoded training data.
# AMZN is shown when it was processed; otherwise fall back to the first processed
# stock so the preview cannot fail with a KeyError.
preview_stock = 'AMZN' if 'AMZN' in states_encoded_dict else next(iter(states_encoded_dict), None)
if preview_stock is not None:
    print(states_encoded_dict[preview_stock][0].head())

In [None]:
# Load feature data
# Both objects are indexed by stock symbol further down (e.g. training_feature_df[stock]).
# NOTE(review): load_training_data / load_testing_data are not defined in this
# notebook; presumably they come from the general_used_functions star import — confirm.
training_feature_df = load_training_data()
testing_feature_df = load_testing_data()

def merge_encoded_states_with_features(stock, training_feature_df, testing_feature_df, states_encoded_dict):
    """Combine a stock's selected features with its one-hot encoded states.

    The feature columns to keep are read from the module-level
    ``config_data['selected_features_dict'][stock]``.

    Args:
        stock: Stock symbol used to index every input.
        training_feature_df: Mapping from stock symbol to the training
            feature DataFrame.
        testing_feature_df: Mapping from stock symbol to the testing
            feature DataFrame.
        states_encoded_dict: Mapping from stock symbol to a
            ``(training_states_encoded, testing_states_encoded)`` pair.

    Returns:
        A ``(merged_training_data, merged_testing_data)`` tuple. Column names
        are lower-cased, only the first of any duplicated name is kept, and a
        ``date`` column is removed when present.
    """
    feature_names = config_data['selected_features_dict'][stock]

    # Pull out this stock's features and encoded states for both splits.
    train_features = training_feature_df[stock][feature_names]
    test_features = testing_feature_df[stock][feature_names]
    encoded_train, encoded_test = states_encoded_dict[stock]

    def _combine(features, encoded_states):
        # Column-wise concatenation; rows are aligned on the index.
        combined = pd.concat([features, encoded_states], axis=1)
        # Lower-case first so names differing only by case count as duplicates.
        combined.columns = combined.columns.str.lower()
        first_occurrence = ~combined.columns.duplicated()
        combined = combined.loc[:, first_occurrence]
        # The date column is dropped from the output when present.
        return combined.drop(columns=['date'], errors='ignore')

    return _combine(train_features, encoded_train), _combine(test_features, encoded_test)

def drop_states(df):
    """Return a copy of ``df`` without its one-hot encoded state columns.

    A column counts as a state column when its name starts with ``state_``.
    The input frame is left unchanged.
    """
    state_columns = []
    for name in df.columns:
        if name.startswith('state_'):
            state_columns.append(name)
    return df.drop(columns=state_columns, errors='ignore')

for stock in stock_list:
    # Only stocks that have a selected unsupervised model are exported.
    if stock not in selected_unsupervised_model:
        continue

    # Combine the selected features with the encoded regime states.
    merged_training_data, merged_testing_data = merge_encoded_states_with_features(
        stock, training_feature_df, testing_feature_df, states_encoded_dict
    )

    # Make sure the per-stock output directories exist.
    training_save_dir = os.getcwd() + f'/ARMD/Data/diffusion_training_data/{stock}'
    testing_save_dir = os.getcwd() + f'/ARMD/Data/diffusion_testing_data/{stock}'
    for save_dir in (training_save_dir, testing_save_dir):
        os.makedirs(save_dir, exist_ok=True)

    # Write the full data sets (features plus regime states) as CSV.
    training_csv_path = training_save_dir + f'/{stock}_training_data.csv'
    testing_csv_path = testing_save_dir + f'/{stock}_testing_data.csv'
    merged_training_data.to_csv(training_csv_path, index=False)
    merged_testing_data.to_csv(testing_csv_path, index=False)

    # Write a second copy of each data set with the regime-state columns removed.
    merged_training_data_no_states = drop_states(merged_training_data)
    merged_testing_data_no_states = drop_states(merged_testing_data)

    training_no_states_path = training_save_dir + f'/{stock}_training_data_no_states.csv'
    testing_no_states_path = testing_save_dir + f'/{stock}_testing_data_no_states.csv'
    merged_training_data_no_states.to_csv(training_no_states_path, index=False)
    merged_testing_data_no_states.to_csv(testing_no_states_path, index=False)

### Configuration File Generation

In [1]:
# import the necessary libraries
import os
import yaml
import copy
from general_used_functions import load_config_file

# Template configuration that every generated per-stock config starts from.
base_config_path = "./ARMD/Config/base_config.yaml"

# Parse the template once; the loop below deep-copies it for each stock.
with open(base_config_path, "r") as f:
    base_config = yaml.safe_load(f)

# Per-stock overrides are read from the 'diffusion_stock_info' entry of the project
# configuration; the loop below iterates its (stock, params) items.
config_data = load_config_file()
diffusion_stock_info = config_data['diffusion_stock_info']


# Generated config files are written to the same directory as the base config.
config_output_dir = "./ARMD/Config"

# Write two config files per stock: one for the data set that includes the
# regime states and one for the data set without them.
for stock, params in diffusion_stock_info.items():
    # Start from an independent copy so the base configuration is never modified.
    stock_config = copy.deepcopy(base_config)

    # Shortcuts to the sections that receive stock-specific values.
    model_params = stock_config["model"]["params"]
    train_params = stock_config["dataloader"]["train_dataset"]["params"]
    test_params = stock_config["dataloader"]["test_dataset"]["params"]
    solver_section = stock_config["solver"]

    # --- Variant 1: data set that includes the regime states ---
    model_params["feature_size"] = params["feature_size"]

    train_params["name"] = params["name"]
    train_params["data_root"] = params["data_root_train"]

    test_params["data_root"] = params["data_root_test"]
    test_params["name"] = params["name"]

    solver_section["results_folder"] = params["results_folder"]

    config_filename = f"{stock}_armd_config.yaml"
    config_filepath = os.path.join(config_output_dir, config_filename)

    with open(config_filepath, "w") as f:
        yaml.dump(stock_config, f, default_flow_style=False)

    print(f"Generated config file for {stock}: {config_filepath}")

    # --- Variant 2: data set without the regime states ---
    # The same config object is updated in place; every stock-specific field set
    # above is overwritten below before the second file is written.
    model_params["feature_size"] = params["feature_size_without_states"]

    # Insert '_no_states' before the file extension of each data path.
    train_base, train_ext = os.path.splitext(params["data_root_train"])
    train_params["name"] = f'{stock}' + '_no_states'
    train_params["data_root"] = train_base + '_no_states' + train_ext

    test_base, test_ext = os.path.splitext(params["data_root_test"])
    test_params["name"] = f'{stock}' + '_no_states'
    test_params["data_root"] = test_base + '_no_states' + test_ext

    solver_section["results_folder"] = params["results_folder"] + '_no_states'

    config_filename_no_states = f"{stock}_armd_config_no_states.yaml"
    config_filepath_no_states = os.path.join(config_output_dir, config_filename_no_states)
    with open(config_filepath_no_states, "w") as f:
        yaml.dump(stock_config, f, default_flow_style=False)

    print(f"Generated config file for {stock} without states: {config_filepath_no_states}")

Generated config file for AMZN: ./ARMD/Config/AMZN_armd_config.yaml
Generated config file for AMZN without states: ./ARMD/Config/AMZN_armd_config_no_states.yaml
Generated config file for GOOGL: ./ARMD/Config/GOOGL_armd_config.yaml
Generated config file for GOOGL without states: ./ARMD/Config/GOOGL_armd_config_no_states.yaml
Generated config file for MSFT: ./ARMD/Config/MSFT_armd_config.yaml
Generated config file for MSFT without states: ./ARMD/Config/MSFT_armd_config_no_states.yaml
Generated config file for NVDA: ./ARMD/Config/NVDA_armd_config.yaml
Generated config file for NVDA without states: ./ARMD/Config/NVDA_armd_config_no_states.yaml
Generated config file for NFLX: ./ARMD/Config/NFLX_armd_config.yaml
Generated config file for NFLX without states: ./ARMD/Config/NFLX_armd_config_no_states.yaml
Generated config file for TSLA: ./ARMD/Config/TSLA_armd_config.yaml
Generated config file for TSLA without states: ./ARMD/Config/TSLA_armd_config_no_states.yaml
Generated config file for META