---
# **SWEFI: Applications to Macroeconomics**

---

In [3]:
from stability_weighted_ensemble_feature_importance import *
from synthetic_dataset_generation import *

In [None]:
import numpy as np
import pandas as pd
import yfinance as yf
import pandas_datareader.data as web
import plotly.express as px
import plotly.graph_objects as go
from datetime import datetime

# Define two time periods: pre-inflation spike (1991-2020) and elevated inflation (2021-2023)
start1 = '1991-01-01'
end1   = '2020-12-31'
start2 = '2021-01-01'
end2   = '2023-12-31'  # adjust as needed


# ------------------ Data Acquisition ------------------
def _fetch_fred(series_id, start, end):
    """Download a single FRED series between `start` and `end`."""
    return web.DataReader(series_id, "fred", start, end)


def _cpi_to_inflation(cpi):
    """Convert a CPI frame into percentage changes under the column 'Inflation'.

    The first row has no previous observation inside the window, so
    `pct_change` leaves it as NaN.
    """
    return (cpi.pct_change() * 100).rename(columns={'CPIAUCSL': 'Inflation'})


def _fetch_sp500(start, end):
    """Download monthly ^GSPC data and return `(raw_frame, price_series)`.

    'Adj Close' is preferred when the download provides it; otherwise 'Close'
    is used.
    """
    raw = yf.download("^GSPC", start=start, end=end, interval="1mo")
    column = 'Adj Close' if 'Adj Close' in raw.columns else 'Close'
    prices = raw[column]
    # NOTE(review): yfinance can return multi-level (price, ticker) columns, in
    # which case the selection above is a one-column DataFrame rather than a
    # Series. Collapse it so downstream code always receives a Series.
    if isinstance(prices, pd.DataFrame):
        prices = prices.iloc[:, 0]
    return raw, prices


# Download CPI data to compute inflation.
cpi1 = _fetch_fred("CPIAUCSL", start1, end1)
cpi2 = _fetch_fred("CPIAUCSL", start2, end2)
inflation1 = _cpi_to_inflation(cpi1)
inflation2 = _cpi_to_inflation(cpi2)

# Federal Funds Rate.
fedfunds1 = _fetch_fred("FEDFUNDS", start1, end1)
fedfunds2 = _fetch_fred("FEDFUNDS", start2, end2)
# 10-Year Treasury Constant Maturity Rate.
gs10_1 = _fetch_fred("GS10", start1, end1)
gs10_2 = _fetch_fred("GS10", start2, end2)
# Unemployment Rate.
unrate1 = _fetch_fred("UNRATE", start1, end1)
unrate2 = _fetch_fred("UNRATE", start2, end2)

# Download S&P500 data from yfinance (monthly frequency).
sp500_data1, sp500_1 = _fetch_sp500(start1, end1)
sp500_data2, sp500_2 = _fetch_sp500(start2, end2)

# ------------------ Data Preparation ------------------
def _prepare_period(inflation, fedfunds, gs10, unrate, sp500):
    """Build the modelling frame, feature matrix and target for one period.

    Steps, in order:
      1. join the macro series onto the inflation frame and drop incomplete rows;
      2. attach the S&P500 price aligned to that index and drop unmatched rows;
      3. compute the S&P500 percentage change and drop the leading NaN row;
      4. derive 'Direction' (1 when the return is positive, otherwise 0).

    Returns:
        (frame, features, direction) where `features` is `frame` without the
        'SP500' and 'SP500_Return' columns.
    """
    frame = inflation.join([fedfunds, gs10, unrate]).dropna()
    frame['SP500'] = sp500.reindex(frame.index)
    frame = frame.dropna()
    frame['SP500_Return'] = frame['SP500'].pct_change()
    frame = frame.dropna()

    is_up = frame['SP500_Return'] > 0
    direction = pd.Series(np.where(is_up, 1, 0), name='Direction', index=frame.index)
    features = frame.drop(columns=['SP500', 'SP500_Return'])
    return frame, features, direction


# For period 1: 1991-2020.
df1, X1, y1 = _prepare_period(inflation1, fedfunds1, gs10_1, unrate1, sp500_1)

# For period 2: 2021-2023.
df2, X2, y2 = _prepare_period(inflation2, fedfunds2, gs10_2, unrate2, sp500_2)

# ------------------ SWEFI Setup and Execution ------------------
# Define common SWEFI parameters (shared by both periods).
select_n_model = 5
bootstrap_method = SWEFI.stationary_bootstrap
hpo_n_fold = 4
hpo_n_iter = 25
hpo_metric = 'AUC'
hpo_search_library = 'scikit-optimize'
hpo_search_algorithm = 'bayesian'
n_iteration = 5
percentage = 0.6


def _run_swefi(features, target):
    """Run the full SWEFI pipeline on one dataset and return the SWEFI object.

    The steps run in this fixed order, all driven by the module-level
    parameters above: model selection, hyper-parameter tuning, choice of
    univariate measurements, feature-importance computation, and SWEFI
    score computation.
    """
    runner = SWEFI(features, target, n_fold=10)
    runner.select_models(select_n_model=select_n_model)
    runner.fine_tune_selected_models(
        hpo_n_fold=hpo_n_fold,
        hpo_n_iter=hpo_n_iter,
        hpo_metric=hpo_metric,
        hpo_search_algorithm=hpo_search_algorithm,
        hpo_search_library=hpo_search_library
    )
    runner.select_univariate_analysis_measurements(measurements=[
        UAMeasure.MUTUAL_INFORMATION.value,
        UAMeasure.ANOVA_F.value,
    ])
    runner.compute_feature_importance_data(
        bootstrap_method=bootstrap_method,
        n_iteration=n_iteration,
        n_repeats=10
    )
    runner.compute_swefi_scores(percentage=percentage)
    return runner


# For period 1, use the standard setup.
swefi1 = _run_swefi(X1, y1)
swefi_scores1 = swefi1.get_swefi_scores()

# For period 2, if the number of samples is too small, tile the rows until
# there are at least 101 samples.
# NOTE(review): tiling creates exact duplicate rows and resets the date index
# (ignore_index=True); duplicates may land in both train and validation folds
# and inflate scores -- confirm this workaround is acceptable for the analysis.
n_rows_period2 = X2.shape[0]
if n_rows_period2 >= 101:
    X2_mod, y2_mod = X2.copy(), y2.copy()
else:
    repeat_factor = int(np.ceil(101 / n_rows_period2))
    X2_mod = pd.concat([X2] * repeat_factor, ignore_index=True)
    y2_mod = pd.concat([y2] * repeat_factor, ignore_index=True)

# Run SWEFI for period 2 on the expanded data.
# (Per the original author's note: the SWEFI constructor uses its default
# train_size=0.99 here, which with the expanded data gives a test set of at
# least 2 samples -- TODO confirm against the SWEFI implementation.)
swefi2 = _run_swefi(X2_mod, y2_mod)
swefi_scores2 = swefi2.get_swefi_scores()

# Save the SWEFI results to CSV files.
swefi_scores1.to_csv('swefi_results_1991_2020.csv')
swefi_scores2.to_csv('swefi_results_2021_2023.csv')



[*********************100%***********************]  1 of 1 completed
[*********************100%***********************]  1 of 1 completed


Unnamed: 0,Description,Value
0,Session id,123
1,Target,Direction
2,Target type,Binary
3,Original data shape,"(358, 5)"
4,Transformed data shape,"(358, 5)"
5,Transformed train set shape,"(354, 5)"
6,Transformed test set shape,"(4, 5)"
7,Numeric features,4
8,Preprocess,True
9,Imputation type,simple


Unnamed: 0,Model,Accuracy,AUC,Recall,Prec.,F1,Kappa,MCC,TT (Sec)
1,Logistic Regression,0.6441,0.4595,1.0,0.6441,0.7835,0.0,0.0,0.003
3,Ridge Classifier,0.6441,0.46,1.0,0.6441,0.7835,0.0,0.0,0.003
5,Linear Discriminant Analysis,0.6441,0.4618,1.0,0.6441,0.7835,0.0,0.0,0.004
9,SVM - Radial Kernel,0.6441,0.5224,1.0,0.6441,0.7835,0.0,0.0,0.005
8,Random Forest Classifier,0.6329,0.5684,0.797,0.6849,0.7346,0.139,0.1436,0.024
6,MLP Classifier,0.6272,0.5458,0.8988,0.6528,0.7554,0.0381,0.0477,0.035
4,Quadratic Discriminant Analysis,0.6271,0.5008,0.9298,0.6465,0.7621,0.0126,0.0276,0.004
0,Gradient Boosting Classifier,0.616,0.5319,0.8067,0.6695,0.7301,0.0777,0.0792,0.013
2,Extra Trees Classifier,0.5902,0.5485,0.7399,0.661,0.697,0.0604,0.0622,0.019
7,Decision Tree Classifier,0.5422,0.5063,0.6267,0.6473,0.6348,0.0152,0.0155,0.005


--------------------------------------------------------------------------------
LogisticRegression
Custom config ...


Unnamed: 0_level_0,Accuracy,AUC,Recall,Prec.,F1,Kappa,MCC
Fold,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
0,0.6404,0.4068,1.0,0.6404,0.7808,0.0,0.0
1,0.6517,0.4583,1.0,0.6477,0.7862,0.0397,0.1423
2,0.6477,0.5025,1.0,0.6477,0.7862,0.0,0.0
3,0.6477,0.4867,1.0,0.6477,0.7862,0.0,0.0
Mean,0.6469,0.4636,1.0,0.6459,0.7849,0.0099,0.0356
Std,0.0041,0.0364,0.0,0.0032,0.0023,0.0172,0.0616


Fitting 4 folds for each of 1 candidates, totalling 4 fits
--------------------------------------------------------------------------------
RidgeClassifier
Custom config ...


Unnamed: 0_level_0,Accuracy,AUC,Recall,Prec.,F1,Kappa,MCC
Fold,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
0,0.6404,0.4052,1.0,0.6404,0.7808,0.0,0.0
1,0.6517,0.4589,1.0,0.6477,0.7862,0.0397,0.1423
2,0.6477,0.5031,1.0,0.6477,0.7862,0.0,0.0
3,0.6477,0.4839,1.0,0.6477,0.7862,0.0,0.0
Mean,0.6469,0.4628,1.0,0.6459,0.7849,0.0099,0.0356
Std,0.0041,0.0368,0.0,0.0032,0.0023,0.0172,0.0616


Fitting 4 folds for each of 1 candidates, totalling 4 fits
Original model was better than the tuned model, hence it will be returned. NOTE: The display metrics are for the tuned model (not the original one).
--------------------------------------------------------------------------------
LinearDiscriminantAnalysis
Custom config ...


Unnamed: 0_level_0,Accuracy,AUC,Recall,Prec.,F1,Kappa,MCC
Fold,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
0,0.6404,0.4052,1.0,0.6404,0.7808,0.0,0.0
1,0.6517,0.4622,1.0,0.6477,0.7862,0.0397,0.1423
2,0.6477,0.5014,1.0,0.6477,0.7862,0.0,0.0
3,0.6477,0.4844,1.0,0.6477,0.7862,0.0,0.0
Mean,0.6469,0.4633,1.0,0.6459,0.7849,0.0099,0.0356
Std,0.0041,0.0363,0.0,0.0032,0.0023,0.0172,0.0616


Fitting 4 folds for each of 1 candidates, totalling 4 fits
Original model was better than the tuned model, hence it will be returned. NOTE: The display metrics are for the tuned model (not the original one).
--------------------------------------------------------------------------------
SVC
Custom config ...


Unnamed: 0_level_0,Accuracy,AUC,Recall,Prec.,F1,Kappa,MCC
Fold,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
0,0.6404,0.3558,1.0,0.6404,0.7808,0.0,0.0
1,0.6404,0.4479,1.0,0.6404,0.7808,0.0,0.0
2,0.6477,0.511,1.0,0.6477,0.7862,0.0,0.0
3,0.6477,0.5597,1.0,0.6477,0.7862,0.0,0.0
Mean,0.6441,0.4686,1.0,0.6441,0.7835,0.0,0.0
Std,0.0036,0.0762,0.0,0.0036,0.0027,0.0,0.0


Fitting 4 folds for each of 1 candidates, totalling 4 fits
Original model was better than the tuned model, hence it will be returned. NOTE: The display metrics are for the tuned model (not the original one).
--------------------------------------------------------------------------------
RandomForestClassifier


Processing:   0%|          | 0/7 [00:00<?, ?it/s]

Fitting 4 folds for each of 1 candidates, totalling 4 fits
Fitting 4 folds for each of 1 candidates, totalling 4 fits
Fitting 4 folds for each of 1 candidates, totalling 4 fits
Fitting 4 folds for each of 1 candidates, totalling 4 fits
Fitting 4 folds for each of 1 candidates, totalling 4 fits
Fitting 4 folds for each of 1 candidates, totalling 4 fits
Fitting 4 folds for each of 1 candidates, totalling 4 fits
Fitting 4 folds for each of 1 candidates, totalling 4 fits
Fitting 4 folds for each of 1 candidates, totalling 4 fits
Fitting 4 folds for each of 1 candidates, totalling 4 fits
Fitting 4 folds for each of 1 candidates, totalling 4 fits
Fitting 4 folds for each of 1 candidates, totalling 4 fits
Fitting 4 folds for each of 1 candidates, totalling 4 fits
Fitting 4 folds for each of 1 candidates, totalling 4 fits


In [None]:
# ------------------ Data Visualization ------------------
# This cell reads df1, df2, swefi_scores1 and swefi_scores2, so the data
# preparation and SWEFI cell must have run to completion first; otherwise
# Figure 3 fails with a NameError.


def _price_inflation_figure(frame, period):
    """Plot S&P500 price (left axis) and inflation (right axis) for one period.

    Args:
        frame: DataFrame with 'SP500' and 'Inflation' columns; its index is
            used as the x axis.
        period: Label such as '1991-2020', inserted into the trace names and
            the figure title.
    """
    fig = go.Figure()
    fig.add_trace(go.Scatter(
        x=frame.index, y=frame['SP500'], mode='lines',
        name=f'S&P500 ({period})',
    ))
    fig.add_trace(go.Scatter(
        x=frame.index, y=frame['Inflation'], mode='lines',
        name=f'Inflation ({period})', yaxis="y2",
    ))
    fig.update_layout(
        title=f"S&P500 Price and Inflation ({period})",
        xaxis_title="Date",
        yaxis=dict(title="S&P500 Price"),
        # Plain '%': the former "\%" was an invalid escape sequence that left
        # a literal backslash in the axis title.
        yaxis2=dict(title="Inflation (%)", overlaying="y", side="right"),
    )
    return fig


def _swefi_bar(scores, period):
    """Build one bar trace of mean SWEFI scores with std error bars."""
    return go.Bar(
        name=period,
        x=scores.index,
        y=scores['mean(SWEFI)'],
        error_y=dict(type='data', array=scores['std(SWEFI)']),
    )


# Figure 1: S&P500 Price and Inflation (1991-2020)
fig1 = _price_inflation_figure(df1, "1991-2020")
fig1.show()

# Figure 2: S&P500 Price and Inflation (2021-2023)
fig2 = _price_inflation_figure(df2, "2021-2023")
fig2.show()

# Figure 3: Comparison of Feature Importance Scores Across Periods.
fig3 = go.Figure(data=[
    _swefi_bar(swefi_scores1, "1991-2020"),
    _swefi_bar(swefi_scores2, "2021-2023"),
])
fig3.update_layout(
    title="Comparison of SWEFI Feature Importance Across Periods",
    xaxis_title="Feature",
    yaxis_title="Mean SWEFI Score",
    barmode='group'
)
fig3.show()


NameError: name 'swefi_scores2' is not defined