In [None]:
import sys
import os
import pandas as pd
# Add the parent directory (where 'src' folder is located) to sys.path
sys.path.append(os.path.abspath(os.path.join(os.getcwd(), '..')))
from src.stock_features import build_stock_features_orchestrator, apply_kalman_filter_with_lag
import warnings
warnings.filterwarnings('ignore')

# Define the parameters for your data pipeline
tickers_list = ['PG', 'KO', 'PEP', 'WMT', 'COST', '^GSPC']
start_date_str = '2010-01-01'
end_date_str = '2023-01-01'
output_filename = "stock_and_macro.csv"

# Define the new parameters for the stock feature pipeline
target_ticker = 'WMT'
supplier_tickers = ['KO', 'PEP']
benchmark_ticker = '^GSPC'

# 1. Build the stock feature pipeline
print("\n--- Step 1: Building Stock Feature Pipeline ---")
consumer_stocks_df = build_stock_features_orchestrator(
    tickers=tickers_list,
    target_ticker=target_ticker,
    supplier_tickers=supplier_tickers,
    benchmark_ticker=benchmark_ticker,
    #start_date=start_date_str,
    #end_date=end_date_str,
)

# 2. Apply Kalman filter with lag on the output
print("\n--- Step 2: Applying Kalman Filter ---")
tickers_to_filter = tickers_list
lags_to_use = [1, 5, 10]
consumer_stocks_df = apply_kalman_filter_with_lag(consumer_stocks_df, tickers_to_filter, lags_to_use)




--- Step 1: Building Stock Feature Pipeline ---


TypeError: build_stock_features_orchestrator() missing 1 required positional argument: 'end_date'

In [2]:
from src.macro_features import macro_data_orchestrator

FRED_series_ids = {
        'CPI': 'CPIAUCSL',
        'FEDERAL_FUNDS_RATE': 'DFF',
        'TREASURY_YIELD': 'DGS10',
        'UNEMPLOYMENT': 'UNRATE',
        'REAL_GDP': 'GDPC1',
        'RETAIL_SALES': 'RSAFS',
        'PAYEMS': 'PAYEMS' 
    }

macro_funcs = { 'CPI', 'FEDERAL_FUNDS_RATE', 'TREASURY_YIELD', 
                'UNEMPLOYMENT', 'REAL_GDP', 'RETAIL_SALES', 'PAYEMS' }


# Now you pass the dictionary explicitly as an argument.
combined_macro_df = macro_data_orchestrator(
    macro_funcs_to_fetch=macro_funcs,
    fred_series_ids_dict=FRED_series_ids,
)

if not combined_macro_df.empty:
    print("\n--- Final Merged and Cleaned DataFrame ---")
    print(combined_macro_df.head())
    print("\nFinal DataFrame info:")
    print(combined_macro_df.info())
    print("\nFinal DataFrame NaN count:")
    print(combined_macro_df.isna().sum())
else:
    print("Orchestrator returned an empty DataFrame.")

Starting FRED data orchestration pipeline...
Fetching and processing data for: UNEMPLOYMENT (UNRATE)
Fetching and processing data for: REAL_GDP (GDPC1)
Fetching and processing data for: TREASURY_YIELD (DGS10)
Fetching and processing data for: CPI (CPIAUCSL)
Fetching and processing data for: PAYEMS (PAYEMS)
Fetching and processing data for: FEDERAL_FUNDS_RATE (DFF)
Fetching and processing data for: RETAIL_SALES (RSAFS)
Data orchestration complete.

--- Final Merged and Cleaned DataFrame ---
            UNRATE      GDPC1  DGS10  CPIAUCSL    PAYEMS   DFF     RSAFS
date                                                                    
1992-01-01     7.3  10236.435   6.71     138.3  108365.0  4.09  159177.0
1992-01-02     7.3  10236.435   6.78     138.3  108365.0  4.61  159177.0
1992-01-03     7.3  10236.435   6.85     138.3  108365.0  4.06  159177.0
1992-01-04     7.3  10236.435   6.85     138.3  108365.0  4.06  159177.0
1992-01-05     7.3  10236.435   6.85     138.3  108365.0  4.06  159

In [3]:
# 3. Merge the two DataFrames on their date index
# The how='left' argument keeps all rows from the stock DataFrame.
merged_df = pd.merge(
    consumer_stocks_df,
    combined_macro_df,
    left_index=True,
    right_index=True,
    how='left'
)



In [4]:
# print merged dataframe coloumns and info
print(merged_df.columns.tolist())
print(merged_df.info())


['Close_COST', 'Close_KO', 'Close_PEP', 'Close_PG', 'Close_WMT', 'Close_^GSPC', 'High_COST', 'High_KO', 'High_PEP', 'High_PG', 'High_WMT', 'High_^GSPC', 'Low_COST', 'Low_KO', 'Low_PEP', 'Low_PG', 'Low_WMT', 'Low_^GSPC', 'Open_COST', 'Open_KO', 'Open_PEP', 'Open_PG', 'Open_WMT', 'Open_^GSPC', 'Volume_COST', 'Volume_KO', 'Volume_PEP', 'Volume_PG', 'Volume_WMT', 'Volume_^GSPC', 'COST_HighLow_Range', 'COST_OpenClose_Range', 'COST_Close_to_Range_Ratio', 'COST_True_Range', 'COST_ATR14', 'COST_Volume_Daily_Change', 'COST_Volume_MA_20D', 'COST_Volume_MA_Ratio', 'COST_OBV', 'COST_RSI14', 'COST_MACD_Line', 'COST_MACD_Signal', 'COST_MACD_Hist', 'COST_SMA_10', 'COST_SMA_20', 'COST_SMA_50', 'COST_EMA_12', 'COST_EMA_26', 'COST_BB_Middle20', 'COST_BB_Upper20', 'COST_BB_Lower20', 'COST_BB_Bandwidth20', 'COST_BB_PctB20', 'COST_Stoch_K_14', 'COST_Stoch_D_14_3', 'COST_PlusDI_14', 'COST_MinusDI_14', 'COST_DX_14', 'COST_ADX_14', 'COST_RollingMean_Convergence_50', 'COST_ROC_12', 'COST_MFI_14', 'COST_CMF_21'

In [None]:
#save the merged DataFrame to a CSV file
output_file = r'C:\Users\epoch_bpjmdqk\Documents\Code\data\processed\stock_and_macro.csv'
merged_df.to_csv(output_file, index=True)