## Imports

In [1]:
import pandas as pd
from pathlib import Path
from simpful.gp_fuzzy_system.gp_evolution import genetic_algorithm_loop
from simpful.gp_fuzzy_system.auto_lvs import FuzzyLinguisticVariableProcessor
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import MinMaxScaler




## Load Data

In [2]:
import os
import pandas as pd
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import MinMaxScaler

# Load the prepared train/test splits, impute and min-max scale them, and
# write the scaled copies to ./daily_data/scaled/ for the later cells.

# Paths to the prepped data files
base_path = './daily_data/'
scaled_path = os.path.join(base_path, 'scaled')

# Create the scaled directory if it doesn't exist
os.makedirs(scaled_path, exist_ok=True)

X_train_path = os.path.join(base_path, 'X_train.csv')
X_test_path = os.path.join(base_path, 'X_test.csv')
y_train_path = os.path.join(base_path, 'y_train.csv')
y_test_path = os.path.join(base_path, 'y_test.csv')

# Loading the datasets
X_train = pd.read_csv(X_train_path)
X_test = pd.read_csv(X_test_path)
y_train = pd.read_csv(y_train_path)
y_test = pd.read_csv(y_test_path)

# Rename the 'close' column to 'target' in both target frames.
# Reassignment (instead of inplace=True) keeps the cell safe to re-run.
y_train = y_train.rename(columns={'close': 'target'})
y_test = y_test.rename(columns={'close': 'target'})

# Impute missing values in 'value' and 'classification_numerical' with the
# column means learned from the training split.
columns_to_impute = ['value', 'classification_numerical']
imputer = SimpleImputer(strategy='mean')
X_train[columns_to_impute] = imputer.fit_transform(X_train[columns_to_impute])
# Apply the SAME fitted imputer to the test split. Previously X_test was never
# imputed, so any NaN in these columns flowed through scaling into the saved
# X_test_scaled.csv.
X_test[columns_to_impute] = imputer.transform(X_test[columns_to_impute])

# Verify that there are no NaN values left
print("NaN values in X_train after imputation:\n", X_train.isna().sum().sum())

# Count the cells whose raw value exceeds 1e9 (diagnostic only; nothing is clipped)
print("Extreme values in X_train:", (X_train > 1e9).sum().sum())

# Every column of X_train is scaled; no column is excluded here.
# assumes all columns are numerical -- TODO confirm against the prepared CSVs
columns_to_scale = X_train.columns

# Initialize the scaler for the features
scaler_X = MinMaxScaler()

# Fit on the training split only, then reuse the fitted ranges for the test
# split so no test-set statistics leak into the scaling.
x_train_scaled = scaler_X.fit_transform(X_train[columns_to_scale])
x_test_scaled = scaler_X.transform(X_test[columns_to_scale])

# Initialize a separate scaler for the target variable
scaler_y = MinMaxScaler()

# Scale 'y' (the target variable). reshape(-1, 1) provides the 2-D
# single-column input the scaler is given; flatten() returns a 1-D array.
# assumes the y CSVs hold exactly one column -- TODO confirm
y_train_scaled = scaler_y.fit_transform(y_train.values.reshape(-1, 1)).flatten()
y_test_scaled = scaler_y.transform(y_test.values.reshape(-1, 1)).flatten()

# Paths to save the scaled data
scaled_X_train_path = os.path.join(scaled_path, 'X_train_scaled.csv')
scaled_X_test_path = os.path.join(scaled_path, 'X_test_scaled.csv')
scaled_y_train_path = os.path.join(scaled_path, 'y_train_scaled.csv')
scaled_y_test_path = os.path.join(scaled_path, 'y_test_scaled.csv')

# Save the preprocessed and scaled data to the 'scaled' directory
pd.DataFrame(x_train_scaled, columns=columns_to_scale).to_csv(scaled_X_train_path, index=False)
pd.DataFrame(x_test_scaled, columns=columns_to_scale).to_csv(scaled_X_test_path, index=False)
pd.DataFrame(y_train_scaled, columns=['target']).to_csv(scaled_y_train_path, index=False)
pd.DataFrame(y_test_scaled, columns=['target']).to_csv(scaled_y_test_path, index=False)


NaN values in X_train after imputation:
 0
Extreme values in X_train: 2367


## Further Processing: Build Fuzzy Linguistic Variables

In [3]:
# Configure and run the FuzzyLinguisticVariableProcessor on the scaled
# training features written to disk by the data-loading cell.
terms_dict_path = './terms/terms_dict.py'
exclude_columns = 'month,day,hour'  # comma-separated; split into a list below
verbose = True

# Collect the keyword arguments first so the settings read as one table.
processor_options = dict(
    file_path=scaled_X_train_path,  # scaled CSV, not the raw one
    terms_dict_path=terms_dict_path,
    verbose=verbose,
    exclude_columns=exclude_columns.split(','),
    mf_type='sigmoid',
)
processor = FuzzyLinguisticVariableProcessor(**processor_options)

# The returned store is passed to the genetic algorithm in the next cell.
variable_store = processor.process_dataset()

Defined sigmoid fuzzy set for term 'VERY_LOW' with mean: 0.01897952633422215, slope: 0.06209609300520463
Defined sigmoid fuzzy set for term 'LOW' with mean: 0.12419218601040927, slope: 0.14019963555809287
Defined sigmoid fuzzy set for term 'MEDIUM' with mean: 0.2993787974504079, slope: 0.23164142437871277
Defined sigmoid fuzzy set for term 'HIGH' with mean: 0.5874750347678348, slope: 0.26648731966127504
Defined sigmoid fuzzy set for term 'VERY_HIGH' with mean: 0.8323534367729579, slope: 0.2062624826160826
Created linguistic variable for column 'open' with 5 terms
Defined sigmoid fuzzy set for term 'VERY_LOW' with mean: 0.04314046190211634, slope: 0.05267054070660693
Defined sigmoid fuzzy set for term 'LOW' with mean: 0.10534108141321386, slope: 0.07052469727328028
Defined sigmoid fuzzy set for term 'MEDIUM' with mean: 0.18418985644867691, slope: 0.11443813441386974
Defined sigmoid fuzzy set for term 'HIGH' with mean: 0.33421735024095334, slope: 0.24128127397758642
Defined sigmoid fuzzy

In [None]:
# Genetic-algorithm settings.
# NOTE(review): no random seed is set in this notebook, so runs are
# presumably not reproducible -- confirm how the library draws randomness.
population_size = 10
max_generations = 4

selection_method = 'hybrid'
tournament_size = 3

crossover_rate = 0.8
mutation_rate = 0.5
elitism_rate = 0.15

max_rules = 7
min_rules = 3

# Rebinds the `verbose` flag that the previous cell set to True.
verbose = False

# Run the genetic algorithm. Arguments are passed positionally, so the order
# below must stay exactly as it is.
ga_results = genetic_algorithm_loop(
    population_size,
    max_generations,
    x_train_scaled,
    y_train_scaled,
    variable_store,
    selection_method,
    tournament_size,
    crossover_rate,
    mutation_rate,
    elitism_rate,
    max_rules,
    min_rules,
    verbose,
)
best_system, best_fitness_per_generation, average_fitness_per_generation = ga_results

Generations:   0%|                                                                                                                                                              | 0/4 [00:00<?, ?gen/s]


--- Error Log ---
Error adding rule: IF (dia_close IS LOW) AND (foreign_direct_investment_value IS LOW) OR (durables_monthly IS LOW) OR (dia_volume IS MEDIUM) OR (trade_in_services_value IS HIGH) THEN (PricePrediction IS PricePrediction)
Exception: ERROR: badly formatted rule, please check capitalization and syntax.
 ---- PROBLEMATIC RULE:
(dia_close IS LOW) AND (foreign_direct_investment_value IS LOW) OR (durables_monthly IS LOW) OR (dia_volume IS MEDIUM) OR (trade_in_services_value IS HIGH)
--- End of Error Log ---



Generations:  25%|█████████████████████████████████████▎                                                                                                               | 1/4 [03:09<09:29, 189.84s/gen]