In [3]:
import pandas as pd
from sklearn.preprocessing import MinMaxScaler

# Paths to the prepped data files
base_path = './data/prepped_data/'  # Adjusted base path to where your files are now
X_train_path = base_path + 'X_train.csv'
X_test_path = base_path + 'X_test.csv'
y_train_path = base_path + 'y_train.csv'
y_test_path = base_path + 'y_test.csv'

# Load the train/test features and targets
X_train = pd.read_csv(X_train_path)
X_test = pd.read_csv(X_test_path)
y_train = pd.read_csv(y_train_path)
y_test = pd.read_csv(y_test_path)

# Rebuild a timestamp from the year/month/day/hour columns of each feature frame
# and use it as the index of the matching target frame.
# NOTE(review): this assumes the rows of X_* and y_* are in the same order — confirm.
X_train['datetime'] = pd.to_datetime(X_train[['year', 'month', 'day', 'hour']])
X_test['datetime'] = pd.to_datetime(X_test[['year', 'month', 'day', 'hour']])
y_train.index = pd.to_datetime(X_train['datetime'])
y_test.index = pd.to_datetime(X_test['datetime'])

# Columns to scale: everything except the date parts and the rebuilt timestamp.
# This must be bound to `columns_to_scale` because the scaling below and the
# later cells read that name (it was previously assigned to `b`, which raised
# NameError on a fresh kernel). Index.difference returns the names sorted.
columns_to_scale = X_train.columns.difference(['year', 'month', 'day', 'hour', 'datetime'])

# Initialize the scaler for the features
scaler_X = MinMaxScaler()

# Fit on the training features only, then apply the same transform to the test features
X_train_scaled = scaler_X.fit_transform(X_train[columns_to_scale])
X_test_scaled = scaler_X.transform(X_test[columns_to_scale])

# Initialize a separate scaler for the target variable
scaler_y = MinMaxScaler()

# Scale 'y' (the target variable), again fitting on the training split only.
# NOTE(review): reshape(-1, 1) treats all values as one column, so this presumes
# the y CSVs hold a single target column — confirm.
y_train_scaled = scaler_y.fit_transform(y_train.values.reshape(-1, 1)).flatten()
y_test_scaled = scaler_y.transform(y_test.values.reshape(-1, 1)).flatten()


In [22]:
import pandas as pd

# Labels for the table: every scaled X column followed by one placeholder entry
# standing in for the scaled target
feature_names = [*columns_to_scale, "Scaled Target Variable (y)"]

# One record per label
table_data = []
for label in feature_names:
    table_data.append({"Feature Name": label})

# Only the target placeholder carries a description (it has no column name of its own)
target_record = table_data[-1]
target_record["Description"] = "Represents the scaled target variable"

# Render the records as a table
table_df = pd.DataFrame(table_data)
print(table_df)


                                         Feature Name  \
0                      central_bank_policy_rate_prcnt   
1                            classification_numerical   
2                                         cpi_monthly   
3                                           dia_close   
4                                          dia_volume   
5                                 domestic_credit_gdp   
6                                    durables_monthly   
7                            federal_funds_rate_daily   
8                     foreign_direct_investment_value   
9                             gdp_growth_annual_prcnt   
10                                          gld_close   
11                                         gld_volume   
12  individuals_using_the_internet_prcnt_of_popula...   
13                               inflation_rate_value   
14                                               macd   
15       mobile_cellular_subscriptions_per_100_people   
16                            n

In [23]:
import numpy as np
import random

def select_random_features(X_scaled, feature_names, num_features, seed=None):
    """
    Select a random subset of feature columns from the scaled data.

    Args:
    - X_scaled: 2-D numpy array of scaled features; column i is taken to be the
      feature named feature_names[i].
    - feature_names: sequence of all feature names (list, tuple, pandas Index, ...).
    - num_features: number of features to select. Values larger than
      len(feature_names) are clamped to len(feature_names).
    - seed: optional seed for a private random generator, making the draw
      reproducible. When None (the default) the module-level `random` state is
      used, so existing callers behave exactly as before.

    Returns:
    - X_subset: the selected columns of X_scaled, in the order they were drawn.
    - selected_feature_names: list of the selected feature names, in the same order.

    Raises:
    - ValueError: if num_features is negative (raised by random.sample).
    """
    total_features = len(feature_names)

    # Ensure num_features does not exceed the number of available features
    num_features = min(num_features, total_features)

    # A private generator keeps a seeded draw from disturbing the global RNG state
    sampler = random if seed is None else random.Random(seed)

    # Draw distinct column positions (sampling without replacement)
    selected_indices = sampler.sample(range(total_features), num_features)

    # Select the columns and their names using the same positions
    X_subset = X_scaled[:, selected_indices]
    selected_feature_names = [feature_names[i] for i in selected_indices]

    return X_subset, selected_feature_names

# Example usage:
# select_random_features draws with the module-level `random` generator, so fix
# its seed first; otherwise every run picks a different feature subset and the
# cells that build on selected_feature_names cannot be reproduced.
RANDOM_SEED = 42
random.seed(RANDOM_SEED)

num_features_to_select = 4
X_subset, selected_feature_names = select_random_features(X_train_scaled, columns_to_scale, num_features_to_select)
print("Selected features:", selected_feature_names)


Selected features: ['real_gdp_per_capita_quarterly', 'spy_close', 'classification_numerical', 'federal_funds_rate_daily']


In [34]:
import numpy as np
from simpful import *
from sklearn.cluster import KMeans

# Column positions, within the scaled matrices, of the features the FIS is built on.
# They are derived from selected_feature_names so that column i of the cluster
# centers always belongs to selected_feature_names[i] in the loop below.
_scaled_column_names = list(columns_to_scale)
feature_indices = [_scaled_column_names.index(name) for name in selected_feature_names]

# Perform KMeans clustering to find cluster centers for Gaussian membership functions
# NOTE(review): num_clusters is not assigned in any cell visible here; it has to be
# defined before this cell runs (the rules below use up to two cluster terms) — confirm.
kmeans = KMeans(n_clusters=num_clusters, random_state=0).fit(X_train_scaled[:, feature_indices])
centers = kmeans.cluster_centers_

# Initialize the Fuzzy Inference System
FS = FuzzySystem()

# One linguistic variable per selected feature, with one Gaussian fuzzy set per
# cluster, centred on that cluster's coordinate along the feature
for i, feature_name in enumerate(selected_feature_names):
    fuzzy_sets = [
        GaussianFuzzySet(mu=center, sigma=0.1, term=f"cluster_{cluster_index+1}")
        for cluster_index, center in enumerate(centers[:, i])
    ]
    LV = LinguisticVariable(fuzzy_sets, concept=feature_name)
    FS.add_linguistic_variable(feature_name, LV)

# Define an output variable with a dummy fuzzy set
dummy_fs = GaussianFuzzySet(mu=0, sigma=1, term="dummy")
dummy_lv = LinguisticVariable([dummy_fs], concept="dummy_output")
FS.add_linguistic_variable("PricePrediction", dummy_lv)

# Define the output function for the system (ensure it matches your model's logic):
# a weighted sum of the selected inputs with a fixed weight of 0.25 each
FS.set_output_function("PricePrediction", " + ".join([f"0.25*{name}" for name in selected_feature_names]))

# Define fuzzy rules based on your model's logic.
# Example rules - modify according to your system's logic. They are generated from
# the selected features (first feature -> cluster_1, second -> cluster_2) instead of
# hard-coded names, so they always refer to linguistic variables and terms that
# exist in FS. zip stops at whichever is shorter: two rules, the clusters, or the features.
rules = [
    f"IF ({name} IS cluster_{cluster_number}) THEN (PricePrediction IS PricePrediction)"
    for cluster_number, name in zip(range(1, len(centers) + 1), selected_feature_names[:2])
    # Add more rules as needed
]
FS.add_rules(rules)

# Function to make predictions with the FIS
def predict_with_fis(FS, input_features, feature_names=None):
    """
    Run one inference on the fuzzy system and return its "PricePrediction" output.

    Args:
    - FS: fuzzy system exposing set_variable(name, value) and inference(terms).
    - input_features: one value per feature, in the same order as the names.
    - feature_names: names of the variables to set on FS. Defaults to the
      notebook-level selected_feature_names.

    Returns:
    - the value stored under "PricePrediction" in the inference result.

    Raises:
    - ValueError: if the number of values differs from the number of names.
    """
    if feature_names is None:
        feature_names = selected_feature_names

    names = list(feature_names)
    values = list(input_features)

    # zip() would silently drop the surplus and leave FS holding whatever values
    # were set on an earlier call, so a length mismatch is rejected up front
    if len(names) != len(values):
        raise ValueError(
            f"Expected {len(names)} input values (one per feature), got {len(values)}"
        )

    for feature_name, value in zip(names, values):
        FS.set_variable(feature_name, value)
    result = FS.inference(["PricePrediction"])
    return result["PricePrediction"]

# Example usage: take the first row of the scaled test matrix, keep only the
# columns listed in feature_indices, and run it through the fuzzy system
input_features = X_test_scaled[0][feature_indices]  # Adjust indices accordingly
prediction = predict_with_fis(FS=FS, input_features=input_features)
print("Fuzzy prediction:", prediction)

  ____  __  _  _  ____  ____  _  _  __   
 / ___)(  )( \/ )(  _ \(  __)/ )( \(  ) v2.12.0 
 \___ \ )( / \/ \ ) __/ ) _) ) \/ (/ (_/\ 
 (____/(__)\_)(_/(__)  (__)  \____/\____/

 https://github.com/aresio/simpful

 * Detected Sugeno model type
Fuzzy prediction: 0.5753061156183121
