# E) DJI ML - Feature selection - for different types of models 

In [1]:
#imports
import yfinance as yf
import numpy as np
import pandas as pd
import os
import exchange_calendars as xcals
import matplotlib.pyplot as plt
import seaborn as sns
from statsmodels.tsa.seasonal import seasonal_decompose
from sklearn.preprocessing import MinMaxScaler, StandardScaler
from sklearn.decomposition import PCA
from sklearn.model_selection import train_test_split, TimeSeriesSplit
from sklearn.linear_model import LinearRegression, Ridge
from sklearn.metrics import mean_squared_error, mean_absolute_error, r2_score
from scipy.stats import zscore
from scipy.stats.mstats import winsorize
from statsmodels.stats.outliers_influence import variance_inflation_factor
from sklearn.ensemble import RandomForestRegressor
from sklearn.feature_selection import RFE
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import LSTM, Dense, Dropout
from tensorflow.keras.optimizers import Adam
from itertools import product
from bayes_opt import BayesianOptimization

## Feature selection:

## Functions for feature selection

### 1. Feature selection: Pearson correlation function: 

In [2]:
def remove_multicollinear_columns(X_train, X_test, threshold=0.85):
    corr_matrix = X_train.corr().abs()  # Compute absolute correlation
    upper = corr_matrix.where(np.triu(np.ones(corr_matrix.shape), k=1).astype(bool))  # Keep upper triangle

    # Find columns to drop
    to_drop = [column for column in upper.columns if any(upper[column] > threshold)]

    # Drop columns
    df_train = X_train.drop(columns=to_drop)
    df_test =X_test[df_train.columns]
    
    return df_train,df_test

### 2. Feature selection: Variance inflation factor function: 

In [3]:
def compute_vif_batches(X, X_test, threshold=5, batch_size=50):
    from statsmodels.stats.outliers_influence import variance_inflation_factor
    # Stop if X_train is empty
    if X.shape[1] == 0:
        print(" Warning: X_train is empty after removing multicollinear features. Returning empty DataFrame.")
        return pd.DataFrame(), pd.DataFrame()

    X = X.copy()  # Avoid modifying the original dataset
    features = list(X.columns)  # Get feature names
    dropped_features = set()  # Track removed features

    while True:
        # Stop if no features remain
        if len(features) == 0:
            print("Warning: All features were removed due to high VIF. Returning empty DataFrame.")
            return pd.DataFrame(), pd.DataFrame()

        vif_data = pd.DataFrame()
        vif_data["Feature"] = features
        vif_values = []

        # Process VIF in batches
        for i in range(0, len(features), batch_size):
            batch = features[i : i + batch_size]

            # Skip empty batches
            if len(batch) == 0:
                continue

            # Ensure the batch has more than one feature (VIF requires at least 2)
            if len(batch) < 2:
                print(f"Warning: Only one feature left in batch {batch}. Skipping VIF computation.")
                continue

            # Compute VIF for current batch
            try:
                vif_batch = [variance_inflation_factor(X[batch].values, j) for j in range(len(batch))]
            except np.linalg.LinAlgError:
                print(" Singular matrix encountered, skipping this batch.")
                continue

            vif_values.extend(vif_batch)

        # Ensure VIF data is non-empty
        if len(vif_values) == 0:
            break

        # Ensure vif_values matches the number of features
        if len(vif_values) != len(features):
            print(f"Warning: Mismatch detected! Features: {len(features)}, VIF values: {len(vif_values)}. Adjusting data...")
            vif_data = vif_data.iloc[:len(vif_values)]  # Trim extra features

        vif_data["VIF"] = vif_values


        # Drop features with VIF above threshold
        high_vif_features = vif_data[vif_data["VIF"] > threshold]["Feature"].tolist()

        if not high_vif_features:
            break  # Stop if all VIF values are below threshold

        # Keep only low-VIF features
        features = [f for f in features if f not in high_vif_features]
        dropped_features.update(high_vif_features)

        print(f"Removed {len(high_vif_features)} features, {len(features)} remaining...")

    # Ensure test data has matching columns
    X_train_filtered = X[features] if features else pd.DataFrame()
    X_test_filtered = X_test[features] if features else pd.DataFrame()

    return X_train_filtered, X_test_filtered



### 3. Feature selection: PCA function:

In [26]:
def apply_optimal_pca(X_train, X_test, variance_threshold=0.95, n_components=None):
    from sklearn.decomposition import PCA
    import numpy as np
    import pandas as pd

    if n_components is not None:
        print(f"Using fixed number of components: {n_components}")
        pca = PCA(n_components=n_components)
    else:
        # Step 1: Fit PCA to Training Data to find optimal number of components
        pca = PCA().fit(X_train)
        optimal_components = np.argmax(np.cumsum(pca.explained_variance_ratio_) > variance_threshold) + 1
        print(f"Optimal PCA Components: {optimal_components}")
        print(f"Removed {len(X_train.columns) - optimal_components} features, {optimal_components} remaining")
        pca = PCA(n_components=optimal_components)

    # Step 2: Transform Training and Testing Data
    X_train_pc = pca.fit_transform(X_train)
    X_test_pc = pca.transform(X_test)

    # Step 3: Convert Transformed Data into DataFrame
    component_count = pca.n_components_
    X_train_pca = pd.DataFrame(X_train_pc, columns=[f'PC{i+1}' for i in range(component_count)])
    X_test_pca = pd.DataFrame(X_test_pc, columns=[f'PC{i+1}' for i in range(component_count)])

    return X_train_pca, X_test_pca


### 4. Feature selection: Lasso regression analysis:

In [5]:
def lasso_reg_features(X_train,y_train, X_test):
    from sklearn.linear_model import LassoCV

    lasso = LassoCV(cv=5).fit(X_train, y_train)
    selected_features = X_train.columns[lasso.coef_ != 0]

    X_train = X_train[selected_features]
    X_test = X_test[selected_features]
    return X_train, X_test

### 5. Feature selection: Recursive feature elimination: 

In [6]:
def recursive_f_features(column_len, features_to_select,X_train, y_train,X_test):
    from sklearn.feature_selection import RFE
    from sklearn.ensemble import RandomForestRegressor

    rfe = RFE(RandomForestRegressor(n_estimators=column_len, random_state=42), n_features_to_select=features_to_select)
    X_train_rfe = rfe.fit_transform(X_train, y_train)
    X_test_rfe = rfe.transform(X_test)
    return X_train, X_test

### 6. Feature selection: Variance thresholding 

In [7]:
def var_thresh(X_train, X_test, threshold=0.01):
    from sklearn.feature_selection import VarianceThreshold
    import pandas as pd

    selector = VarianceThreshold(threshold=threshold)
    X_train_reduced = selector.fit_transform(X_train)
    X_test_reduced = selector.transform(X_test)
    
    # Get selected feature names
    selected_columns = X_train.columns[selector.get_support()]
    removed_columns = list(X_train.columns[~selector.get_support()])
    
    # Rebuild DataFrames with retained columns
    X_train_reduced = pd.DataFrame(X_train_reduced, columns=selected_columns, index=X_train.index)
    X_test_reduced = pd.DataFrame(X_test_reduced, columns=selected_columns, index=X_test.index)
    
    print(f"Variance thresholding complete. Removed {len(removed_columns)} features.")
    
    return X_train_reduced, X_test_reduced


### 7. Feature selection: Random forest filtering

In [8]:
def rand_forest_feat(X_train, y_train, X_test=None, top_n=None, threshold=None, random_state=42):
    from sklearn.ensemble import RandomForestRegressor
    rf = RandomForestRegressor(n_estimators=100, random_state=random_state, n_jobs=-1)
    rf.fit(X_train, y_train)

    importances = rf.feature_importances_
    feature_names = np.array(X_train.columns)

    if top_n is not None:
        # Get top N features
        indices = np.argsort(importances)[::-1][:top_n]
    elif threshold is not None:
        # Keep features with importance > threshold
        indices = np.where(importances > threshold)[0]
    else:
        raise ValueError("Specify either top_n or threshold.")

    selected_features = feature_names[indices]
    X_train_reduced = X_train[selected_features]

    if X_test is not None:
        X_test_reduced = X_test[selected_features]
        return X_train_reduced, X_test_reduced



### 8. Feature selection: Select best classifiers

In [31]:
def select_best(X_train,y_train,X_test):
    from sklearn.feature_selection import SelectKBest, f_classif

    # Fit selector
    selector = SelectKBest(f_classif, k=50)
    selector.fit(X_train, y_train)

    # Get the selected column names
    selected_cols = X_train.columns[selector.get_support()]

    # Use these to slice original DataFrames
    X_train_selected = X_train[selected_cols]
    X_test_selected = X_test[selected_cols]
    return X_train_selected, X_test_selected

### 9. Feature selection: Best classifiers  XGboost

In [43]:
def xg_feat(X_train, y_train, X_test, num_features=50):
    import xgboost as xgb
    model = xgb.XGBClassifier()
    model.fit(X_train, y_train)

    importances = model.get_booster().get_score(importance_type='gain')
    importance_df = pd.DataFrame.from_dict(importances, orient='index', columns=['gain'])
    importance_df = importance_df.sort_values(by='gain', ascending=False)

    top_features = importance_df.head(num_features).index.tolist()
    X_train = X_train[top_features]
    X_test = X_test[top_features]
    return X_train, X_test

## Loading datasets

In [10]:
base_path = "C:\\Users\\tgsog\\OneDrive\\Desktop\\DowJones_ML_project\\DataFiles\\D) feature engineered data\\"
ols_path =base_path+"1. OLS data\\"
lstm_path = base_path+"2. LSTM\\"
gru_path= base_path + "3. Gradiant data\\"
nb_path = base_path+"3. Naive bayes data\\"
xg_path = base_path+"3. xgboost data\\"
cat_path = base_path+"3. catboost data\\" 

In [11]:
#MAking bulkloader function
def bulk_load_files(path, name_prefix=None):
    import pandas as pd
    import glob
    import os

    csv_files = glob.glob(os.path.join(path, "*.csv"))

    dataframes = {}
    for file in csv_files:
        key = os.path.basename(file).replace(".csv", "")
        if name_prefix:
            key = f"{name_prefix}_{key}"
        dataframes[key] = pd.read_csv(file)

    return dataframes


In [12]:
#Bulk loading all the data
ols_dfs = bulk_load_files(ols_path, name_prefix=None)
lstm_dfs = bulk_load_files(lstm_path, name_prefix=None)
gru_dfs = bulk_load_files(gru_path, name_prefix=None)
nb_dfs = bulk_load_files(nb_path, name_prefix=None)
xg_dfs = bulk_load_files(xg_path, name_prefix=None)
cat_dfs = bulk_load_files(cat_path, name_prefix=None)

dfs_list = [ols_dfs,lstm_dfs,gru_dfs,nb_dfs,xg_dfs,cat_dfs,]


In [13]:
# Store all your dataset dictionaries in a master dict with model tags
all_dfs = {
    "ols": ols_dfs,
    "lstm": lstm_dfs,
    "gru": gru_dfs,
    "nb": nb_dfs,
    "xg": xg_dfs,
    "cat": cat_dfs
}

# Create a list to keep track of created variable names
all_loaded_variables = []

# Loop through each dictionary and assign variables dynamically
for model_name, model_dfs in all_dfs.items():
    for key, df in model_dfs.items():
        # This assumes keys are already named like "X_train_ols", etc.
        globals()[key] = df
        all_loaded_variables.append(key)


## OLS selection

### OLS model feature selection 1

In [14]:
### OLS process: VIF->PCA->lasso
#Step 1: Pearson
X_train_ols1,X_test_ols1 =remove_multicollinear_columns(X_train_ols, X_test_ols, threshold=0.85)

#Step 2: Lasso
X_train_ols1,X_test_ols1 =lasso_reg_features(X_train_ols1,y_train_ols, X_test_ols1)

#Step 3: RFE
X_train_ols1, X_test_ols1=recursive_f_features(column_len=len(X_train_ols1.columns), features_to_select=30,X_train=X_train_ols1, y_train=y_train_ols,X_test=X_test_ols1)


  y = column_or_1d(y, warn=True)
  return fit_method(estimator, *args, **kwargs)
  return fit_method(estimator, *args, **kwargs)
  return fit_method(estimator, *args, **kwargs)
  return fit_method(estimator, *args, **kwargs)
  return fit_method(estimator, *args, **kwargs)
  return fit_method(estimator, *args, **kwargs)
  return fit_method(estimator, *args, **kwargs)
  return fit_method(estimator, *args, **kwargs)
  return fit_method(estimator, *args, **kwargs)
  return fit_method(estimator, *args, **kwargs)
  return fit_method(estimator, *args, **kwargs)
  return fit_method(estimator, *args, **kwargs)
  return fit_method(estimator, *args, **kwargs)
  return fit_method(estimator, *args, **kwargs)
  return fit_method(estimator, *args, **kwargs)
  return fit_method(estimator, *args, **kwargs)
  return fit_method(estimator, *args, **kwargs)
  return fit_method(estimator, *args, **kwargs)
  return fit_method(estimator, *args, **kwargs)
  return fit_method(estimator, *args, **kwargs)
  retur

In [15]:
len(X_train_ols1.columns)

76

### OLS Model feature 2

In [16]:
### OLS process: VIF->PCA->lasso
#Step 1: VIF
X_train_ols2,X_test_ols2 =compute_vif_batches(X_train_ols,X_test_ols, threshold=5, batch_size=50)
#Step 2: PCA
X_train_ols2,X_test_ols2 =apply_optimal_pca(X_train_ols2, X_test_ols2, variance_threshold=0.70)
#Step 3: Lasso
X_train_ols2,X_test_ols2 =lasso_reg_features(X_train_ols2,y_train_ols, X_test_ols2)

#Step 4: RFE
X_train_ols2, X_test_ols2=recursive_f_features(column_len=len(X_train_ols2.columns), features_to_select=30,X_train=X_train_ols2, y_train=y_train_ols,X_test=X_test_ols2)


  vif = 1. / (1. - r_squared_i)
  vif = 1. / (1. - r_squared_i)
  vif = 1. / (1. - r_squared_i)
  vif = 1. / (1. - r_squared_i)
  vif = 1. / (1. - r_squared_i)
  vif = 1. / (1. - r_squared_i)
  vif = 1. / (1. - r_squared_i)
  vif = 1. / (1. - r_squared_i)
  vif = 1. / (1. - r_squared_i)
  vif = 1. / (1. - r_squared_i)
  vif = 1. / (1. - r_squared_i)
  vif = 1. / (1. - r_squared_i)
  vif = 1. / (1. - r_squared_i)
  vif = 1. / (1. - r_squared_i)
  vif = 1. / (1. - r_squared_i)
  vif = 1. / (1. - r_squared_i)
  vif = 1. / (1. - r_squared_i)
  vif = 1. / (1. - r_squared_i)
  vif = 1. / (1. - r_squared_i)
  vif = 1. / (1. - r_squared_i)
  vif = 1. / (1. - r_squared_i)
  vif = 1. / (1. - r_squared_i)
  vif = 1. / (1. - r_squared_i)
  vif = 1. / (1. - r_squared_i)
  vif = 1. / (1. - r_squared_i)
  vif = 1. / (1. - r_squared_i)
  vif = 1. / (1. - r_squared_i)
  vif = 1. / (1. - r_squared_i)
  vif = 1. / (1. - r_squared_i)
  vif = 1. / (1. - r_squared_i)
  vif = 1. / (1. - r_squared_i)
  vif = 

Removed 950 features, 271 remaining...
Removed 40 features, 231 remaining...
Removed 6 features, 225 remaining...
Optimal PCA Components: 55
Removed 170 features, 55 remaining


  y = column_or_1d(y, warn=True)
  return fit_method(estimator, *args, **kwargs)
  return fit_method(estimator, *args, **kwargs)
  return fit_method(estimator, *args, **kwargs)
  return fit_method(estimator, *args, **kwargs)
  return fit_method(estimator, *args, **kwargs)
  return fit_method(estimator, *args, **kwargs)
  return fit_method(estimator, *args, **kwargs)
  return fit_method(estimator, *args, **kwargs)


In [17]:
len(X_train_ols2.columns)

37

### OLS feature selection 3

In [18]:
#Step 1: Pearson
X_train_ols3, X_test_ols3= remove_multicollinear_columns(X_train_ols,X_test_ols, threshold=0.70)

#Step 2: VIF
X_train_ols3,X_test_ols3 =compute_vif_batches(X_train_ols3,X_test_ols3, threshold=5, batch_size=50)

#Step 3: PCA
X_train_ols3,X_test_ols3 =apply_optimal_pca(X_train_ols3, X_test_ols3, variance_threshold=0.70)

#Step 4: Lasso
X_train_ols3,X_test_ols3 =lasso_reg_features(X_train_ols3,y_train_ols, X_test_ols3)

X_train_ols3,X_test_ols3 = var_thresh(X_train_ols3,X_test_ols3,threshold=0.3)


  vif = 1. / (1. - r_squared_i)
  vif = 1. / (1. - r_squared_i)
  vif = 1. / (1. - r_squared_i)
  vif = 1. / (1. - r_squared_i)
  vif = 1. / (1. - r_squared_i)
  vif = 1. / (1. - r_squared_i)
  vif = 1. / (1. - r_squared_i)
  vif = 1. / (1. - r_squared_i)
  vif = 1. / (1. - r_squared_i)
  vif = 1. / (1. - r_squared_i)
  vif = 1. / (1. - r_squared_i)
  vif = 1. / (1. - r_squared_i)
  vif = 1. / (1. - r_squared_i)
  vif = 1. / (1. - r_squared_i)
  vif = 1. / (1. - r_squared_i)


Removed 154 features, 197 remaining...
Optimal PCA Components: 57
Removed 140 features, 57 remaining
Variance thresholding complete. Removed 0 features.


  y = column_or_1d(y, warn=True)


## LSTM feature selection 1

In [33]:
#LSTM process: Variance thresholding
X_train_lstm1, X_test_lstm1 = var_thresh(X_train_lstm, X_test_lstm, threshold=0.01)
# LSTM process: Pearson correlation
X_train_lstm1, X_test_lstm1= remove_multicollinear_columns(X_train_lstm1,X_test_lstm1, threshold=0.70)

#LSTM process: PCA
X_train_lstm1, X_test_lstm1 = apply_optimal_pca(X_train = X_train_lstm1, X_test=X_test_lstm1, variance_threshold=0.95, n_components=None)

# LSTM process: RFE
X_train_lstm1, X_test_lstm1=recursive_f_features(len(X_train_lstm1), features_to_select=20,X_train =X_train_lstm1, y_train =y_train_lstm,X_test =X_test_lstm1)

Variance thresholding complete. Removed 485 features.
Optimal PCA Components: 123
Removed 340 features, 123 remaining


  return fit_method(estimator, *args, **kwargs)
  return fit_method(estimator, *args, **kwargs)
  return fit_method(estimator, *args, **kwargs)
  return fit_method(estimator, *args, **kwargs)
  return fit_method(estimator, *args, **kwargs)
  return fit_method(estimator, *args, **kwargs)
  return fit_method(estimator, *args, **kwargs)
  return fit_method(estimator, *args, **kwargs)
  return fit_method(estimator, *args, **kwargs)
  return fit_method(estimator, *args, **kwargs)
  return fit_method(estimator, *args, **kwargs)
  return fit_method(estimator, *args, **kwargs)
  return fit_method(estimator, *args, **kwargs)
  return fit_method(estimator, *args, **kwargs)
  return fit_method(estimator, *args, **kwargs)
  return fit_method(estimator, *args, **kwargs)
  return fit_method(estimator, *args, **kwargs)
  return fit_method(estimator, *args, **kwargs)
  return fit_method(estimator, *args, **kwargs)
  return fit_method(estimator, *args, **kwargs)
  return fit_method(estimator, *args, **

In [34]:
len(X_train_lstm1.columns)

123

len(X_train_lstm1.columns)

## LSTM feature selection 2

In [None]:
#LSTM process: Var thresh
X_train_lstm2, X_test_lstm2 = var_thresh(X_train_lstm, X_test_lstm, threshold=0.01)

#LSTM process: Pearson
X_train_lstm2, X_test_lstm2 = remove_multicollinear_columns(X_train_lstm2, X_test_lstm2, threshold=0.70)


# LSTM process: Random forest featurizing
X_train_lstm2, X_test_lstm2 = rand_forest_feat(X_train=X_train_lstm2, y_train =y_train_lstm, X_test=X_test_lstm2, top_n=50, threshold=None, random_state=42)

# LSTM process: RFE
X_train_lstm2, X_test_lstm2=recursive_f_features(column_len =len(X_train_lstm2), features_to_select=30,X_train =X_train_lstm2, y_train =y_train_lstm,X_test =X_test_lstm2)


Variance thresholding complete. Removed 485 features.


  return fit_method(estimator, *args, **kwargs)
  return fit_method(estimator, *args, **kwargs)
  return fit_method(estimator, *args, **kwargs)
  return fit_method(estimator, *args, **kwargs)
  return fit_method(estimator, *args, **kwargs)
  return fit_method(estimator, *args, **kwargs)
  return fit_method(estimator, *args, **kwargs)
  return fit_method(estimator, *args, **kwargs)
  return fit_method(estimator, *args, **kwargs)
  return fit_method(estimator, *args, **kwargs)
  return fit_method(estimator, *args, **kwargs)
  return fit_method(estimator, *args, **kwargs)
  return fit_method(estimator, *args, **kwargs)
  return fit_method(estimator, *args, **kwargs)
  return fit_method(estimator, *args, **kwargs)
  return fit_method(estimator, *args, **kwargs)
  return fit_method(estimator, *args, **kwargs)
  return fit_method(estimator, *args, **kwargs)
  return fit_method(estimator, *args, **kwargs)
  return fit_method(estimator, *args, **kwargs)
  return fit_method(estimator, *args, **

In [20]:
len(X_train_lstm2.columns)

50

## Categorical models feature selection process

### naive bayes  feature  process:

In [35]:
#Variance threshold
X_train_nb, X_test_nb = var_thresh(X_train =X_train_nb , X_test =X_test_nb, threshold=0.01)

#Pearson correlation
X_train_nb, X_test_nb = remove_multicollinear_columns(X_train_nb, X_test_nb, threshold=0.80)

#Best selector
X_train_nb,X_test_nb =select_best(X_train_nb,y_train_nb,X_test_nb)


Variance thresholding complete. Removed 0 features.


  y = column_or_1d(y, warn=True)


In [36]:
len(X_train_nb.columns)

1

### XG boost feature selection process

In [44]:
#Variance threshold
X_train_xg,X_test_xg =var_thresh(X_train =X_train_xg , X_test =X_test_xg, threshold=0.01)

#Pearson correlation:
X_train_xg,X_test_xg =remove_multicollinear_columns(X_train_xg, X_test_xg, threshold=0.80)

#Best XG features
X_train_xg,X_test_xg =xg_feat(X_train=X_train_xg, y_train=y_train_xg, X_test = X_test_xg, num_features=50)


Variance thresholding complete. Removed 0 features.


In [45]:
len(X_train_xg.columns)

50

### Cat boost feature selection

Made into catboost function 

In [46]:
def cat_boost_filtering(X_train_cat,y_train_cat,X_test_cat, num_features, itterations = 500, depth = 6, learning_rate = 0.1, verbose =0):
    from catboost import CatBoostClassifier, Pool
    import pandas as pd
    import numpy as np

    # Initialize CatBoost model (no need to train multiple times)
    cat_model = CatBoostClassifier(iterations=itterations, depth=depth, learning_rate=learning_rate, verbose=verbose)

    # Convert dataset into CatBoost format
    train_pool = Pool(X_train_cat, label=y_train_cat)

    # Train the model
    cat_model.fit(train_pool)

    # Get feature importance
    feature_importance = cat_model.get_feature_importance(train_pool)
    feature_names = X_train_cat.columns

    # Convert to DataFrame
    importance_df = pd.DataFrame({'Feature': feature_names, 'Importance': feature_importance})

    # Sort by importance
    importance_df = importance_df.sort_values(by="Importance", ascending=False)

    # Select top 100 most important features
    selected_features = importance_df.head(num_features)['Feature'].tolist()
    X_train_cat_reduced = X_train_cat[selected_features]
    X_test_cat_reduced = X_test_cat[selected_features]

    print(f"Reduced from {X_train_cat.shape[1]} to {X_train_cat_reduced.shape[1]} features")
    return X_train_cat_reduced, X_test_cat_reduced


In [47]:
X_train_cat, X_test_cat =cat_boost_filtering(X_train_cat=X_train_cat,y_train_cat=y_train_cat,X_test_cat=X_test_cat, num_features=50, itterations = 500, depth = 6, learning_rate = 0.1, verbose =0)

Reduced from 2950 to 50 features


In [48]:
len(X_train_cat.columns)

50

### GRU feature selection

In [49]:
#1. Variance thresholding
X_train_gru, X_test_gru= var_thresh(X_train =X_train_gru , X_test=X_test_gru, threshold=0.01)

#2. Pearson correlations
X_train_gru, X_test_gru = remove_multicollinear_columns(X_train= X_train_gru, X_test=X_test_gru, threshold=0.85)

#3. Model based selection
X_train_gru, X_test_gru =xg_feat(X_train=X_train_gru, y_train=y_train_gru, X_test = X_test_gru, num_features=50)

#4. Univariate selection
X_train_gru, X_test_gru = select_best(X_train_gru,y_train_gru,X_test_gru)


Variance thresholding complete. Removed 485 features.


  y = column_or_1d(y, warn=True)


In [50]:
len(X_train_gru.columns)

50

##  Removing ^DJI  - can be used to 

def remove_columns_with_keyword(df, keyword="^DJI"):
    return df.loc[:, ~df.columns.str.contains(keyword, case=False, regex=True)]
df_cleaned = remove_columns_with_keyword(df)
print("Remaining columns:", df_cleaned.columns)

###  Data loading

In [61]:
import pandas as pd
import os

# 1. Get all dataframes from memory
dataframes = {k: v for k, v in globals().items() if isinstance(v, pd.DataFrame)}
df_names = list(dataframes.keys())

# 2. Filter out:
# - Any name containing 'X' and ending with 'lstm' or 'ols'
# - Any name ending with 'df'
filtered_list = [
    name for name in df_names
    if not (
        (name.endswith("lstm") and "X" in name)
        or (name.endswith("ols") and "X" in name)
        or name.endswith("df")
    )
]

# 3. Define base upload directory
upload_base = "C:\\Users\\tgsog\\OneDrive\\Desktop\\DowJones_ML_project\\DataFiles\\E) Feature selected data\\"

# 4. Mapping: keyword ➜ folder name
keyword_folder_map = {
    "cat": "catboost data",
    "gru": "Gradient recurrent unit",
    "lstm": "LSTM data",
    "ols": "OLS data",
    "nb": "Naive bayes data",
    "xg": "xgboost data"
}

# 5. Save each DataFrame based on keyword in its name
for df_name in filtered_list:
    df = dataframes[df_name]
    
    for keyword, folder in keyword_folder_map.items():
        if keyword in df_name.lower():
            dst_folder = os.path.join(upload_base, folder)
            os.makedirs(dst_folder, exist_ok=True)  # Make sure the folder exists

            file_path = os.path.join(dst_folder, f"{df_name}.csv")
            df.to_csv(file_path, index=False)
            print(f"Saved {df_name} to {folder}")
            break  # Stop after first match


Saved y_test_ols to OLS data
Saved y_train_ols to OLS data
Saved y_test_lstm to LSTM data
Saved y_train_lstm to LSTM data
Saved X_test_gru to Gradient recurrent unit
Saved X_train_gru to Gradient recurrent unit
Saved y_test_gru to Gradient recurrent unit
Saved y_train_gru to Gradient recurrent unit
Saved X_test_nb to Naive bayes data
Saved X_train_nb to Naive bayes data
Saved y_test_nb to Naive bayes data
Saved y_train_nb to Naive bayes data
Saved X_test_xg to xgboost data
Saved X_train_xg to xgboost data
Saved y_test_xg to xgboost data
Saved y_train_xg to xgboost data
Saved X_test_cat to catboost data
Saved X_train_cat to catboost data
Saved y_test_cat to catboost data
Saved y_train_cat to catboost data
Saved X_train_ols1 to OLS data
Saved X_test_ols1 to OLS data
Saved X_train_ols2 to OLS data
Saved X_test_ols2 to OLS data
Saved X_train_ols3 to OLS data
Saved X_test_ols3 to OLS data
Saved X_train_lstm2 to LSTM data
Saved X_test_lstm2 to LSTM data
Saved X_train_lstm1 to LSTM data
Saved