### Construct the Full Training and Testing Set

### One-Hot Encode the states Column

In [1]:
# Import the necessary libraries
from general_used_functions import *
from sklearn.preprocessing import OneHotEncoder

# Read the project configuration once; the two entries pulled out below drive
# every per-stock loop in this notebook.
config_data = load_config_file()
# Looked up as selected_unsupervised_model[stock]; stocks without an entry are
# skipped by the loops below.
selected_unsupervised_model = config_data['selected_unsupervised_model']
# NOTE(review): taken from the 'stock_dict' entry despite the "_list" name —
# presumably iterating it yields stock tickers; confirm against the config file.
stock_list = config_data['stock_dict']

def load_states_data(stock, selected_model):
    """Read the training and testing regime-state workbooks for one stock.

    Both files are looked up under
    ``<cwd>/data/<selected_model>_states/<stock>/``.

    Args:
        stock: Identifier used in the folder and file names.
        selected_model: Model name used in the folder and file names.

    Returns:
        A ``(training_states_data, test_states_data)`` tuple holding the
        result of ``pd.read_excel`` for each workbook.
    """
    states_folder = os.getcwd() + f'/data/{selected_model}_states/{stock}'

    # The two workbooks differ only in the "(Training)" / "(Testing)" suffix
    training_path = states_folder + f'/{stock}_{selected_model}_states(Training).xlsx'
    testing_path = states_folder + f'/{stock}_{selected_model}_states(Testing).xlsx'

    return pd.read_excel(training_path), pd.read_excel(testing_path)

# Define a function for one-hot encoding the 'states' column
def one_hot_encode_states(df, categories=None):
    """One-hot encode the ``states`` column of ``df``.

    Each indicator column is named after the state value it represents
    (``state_<value>``), not after its position in the encoder output. When
    the observed states are exactly ``0..k-1`` the names are the same as
    positional numbering; when a state is absent from ``df`` the remaining
    columns keep their true labels instead of being silently renumbered.

    Args:
        df: DataFrame containing a ``states`` column.
        categories: Optional iterable with the complete set of state values
            to encode. Pass the same set for the training and testing
            frames to obtain identical ``state_*`` columns in both. When
            omitted, the categories are inferred from ``df`` alone.

    Returns:
        A new DataFrame with ``states`` removed and one float indicator
        column per category appended; the index of ``df`` is preserved.

    Raises:
        ValueError: Raised by the encoder when ``categories`` is given and
            ``df`` contains a state outside of it.
    """
    # sparse_output=False makes the encoder return a dense array
    if categories is None:
        ohe = OneHotEncoder(sparse_output=False)
    else:
        # scikit-learn expects one sorted list of categories per encoded column
        ohe = OneHotEncoder(sparse_output=False, categories=[sorted(categories)])

    # Fit and transform the 'states' column (must be 2D)
    state_array = ohe.fit_transform(df[['states']])

    def _label(value):
        # Render whole-number floats (e.g. 3.0) as 3 so the names match integer states
        if isinstance(value, float) and value.is_integer():
            return int(value)
        return value

    # Name every column after the category it encodes, e.g. state_0, state_3
    state_feature_names = [f"state_{_label(value)}" for value in ohe.categories_[0]]

    # Keep the original index so the indicators line up row-by-row with df
    state_df = pd.DataFrame(state_array, columns=state_feature_names, index=df.index)

    # Replace the raw 'states' column with its indicator columns
    df_encoded = pd.concat([df.drop(columns=['states']), state_df], axis=1)
    return df_encoded

# Build the one-hot encoded regime states for every configured stock
states_encoded_dict = {}
for stock in stock_list:
    # Only stocks with a selected unsupervised model are processed
    if stock in selected_unsupervised_model:
        # Read the training and test state tables produced by that model
        training_states_data, test_states_data = load_states_data(
            stock, selected_unsupervised_model[stock]
        )

        # Each split is encoded independently of the other.
        # NOTE(review): the two frames can therefore end up with different
        # state columns if a state is absent from one split.
        training_states_encoded, test_states_encoded = map(
            one_hot_encode_states, (training_states_data, test_states_data)
        )

        # Keep the (training, test) pair for the merge step that follows
        states_encoded_dict[stock] = (training_states_encoded, test_states_encoded)

# Example: print the first few rows of the one-hot encoded training data for AMZN
print(states_encoded_dict['AMZN'][0].head())

        Date       AMZN  state_0  state_1  state_2  state_3  state_4
0 2013-12-02  19.615000      0.0      0.0      0.0      1.0      0.0
1 2013-12-03  19.233000      0.0      0.0      0.0      1.0      0.0
2 2013-12-04  19.298000      0.0      0.0      0.0      1.0      0.0
3 2013-12-05  19.224501      0.0      0.0      0.0      1.0      0.0
4 2013-12-06  19.347500      0.0      0.0      0.0      1.0      0.0


In [2]:
# Load feature data
# Both objects are later indexed by stock (training_feature_df[stock]) to get
# that stock's feature table.
# NOTE(review): the container type returned by these loaders is not visible
# here — presumably a mapping of ticker -> DataFrame; confirm.
training_feature_df = load_training_data()
testing_feature_df = load_testing_data()

def merge_encoded_states_with_features(stock, training_feature_df, testing_feature_df, states_encoded_dict):
    """Join one stock's selected feature columns with its encoded regime states.

    The columns to keep come from
    ``config_data['selected_features_dict'][stock]``. Features and encoded
    states are concatenated side by side, all column names are lower-cased,
    and when two columns share a name after lower-casing only the first one
    is kept.

    Args:
        stock: Key used to look up the stock in every input.
        training_feature_df: Per-stock training feature tables.
        testing_feature_df: Per-stock testing feature tables.
        states_encoded_dict: Maps each stock to its
            ``(training_encoded, testing_encoded)`` pair.

    Returns:
        A ``(merged_training_data, merged_testing_data)`` tuple.
    """
    wanted_columns = config_data['selected_features_dict'][stock]

    # Slice out this stock's feature columns for each split
    training_features = training_feature_df[stock][wanted_columns]
    testing_features = testing_feature_df[stock][wanted_columns]

    # One-hot encoded regime states for the same stock
    encoded_training, encoded_testing = states_encoded_dict[stock]

    def _combine(features, encoded_states):
        # Column-wise concat; pandas aligns the two frames on their index
        combined = pd.concat([features, encoded_states], axis=1)
        combined.columns = combined.columns.str.lower()
        # duplicated() flags every repeat after the first occurrence
        first_occurrence = ~combined.columns.duplicated()
        return combined.loc[:, first_occurrence]

    merged_training = _combine(training_features, encoded_training)
    merged_testing = _combine(testing_features, encoded_testing)
    return merged_training, merged_testing

# Merge and export the data for every stock that has a selected unsupervised model
for stock in (name for name in stock_list if name in selected_unsupervised_model):
    # Combine this stock's selected features with its encoded regime states
    merged_training_data, merged_testing_data = merge_encoded_states_with_features(
        stock, training_feature_df, testing_feature_df, states_encoded_dict
    )

    # One output folder per split and stock; create whichever is missing
    training_save_dir = os.getcwd() + f'/data/diffusion_training_data/{stock}'
    testing_save_dir = os.getcwd() + f'/data/diffusion_testing_data/{stock}'
    for save_dir in (training_save_dir, testing_save_dir):
        os.makedirs(save_dir, exist_ok=True)

    # Write each split to its own workbook without the index column
    training_path = training_save_dir + f'/{stock}_training_data.xlsx'
    testing_path = testing_save_dir + f'/{stock}_testing_data.xlsx'
    merged_training_data.to_excel(training_path, index=False)
    merged_testing_data.to_excel(testing_path, index=False)