In [4]:
import sys
import os
import pandas as pd
# Add the parent directory (where 'src' folder is located) to sys.path
sys.path.append(os.path.abspath(os.path.join(os.getcwd(), '..')))
from src.stock_features import build_stock_features_orchestrator, apply_kalman_filter_with_lag
import warnings
warnings.filterwarnings('ignore')

# ---------------------------------------------------------------------------
# Pipeline configuration
# ---------------------------------------------------------------------------
# Symbols to download; the benchmark index is fetched alongside the stocks.
tickers_list = ['PG', 'KO', 'PEP', 'WMT', 'COST', '^GSPC']

# Roles of individual symbols inside the stock feature pipeline.
target_ticker, benchmark_ticker = 'WMT', '^GSPC'
supplier_tickers = ['KO', 'PEP']

# NOTE: the date bounds and output filename below are defined but not used in
# this cell. The date bounds are intentionally NOT forwarded to the
# orchestrator, so the full available history is fetched.
# NOTE(review): presumably they would be passed as start_date= / end_date=
# to restrict the range -- confirm against the orchestrator's signature.
start_date_str, end_date_str = '2010-01-01', '2023-01-01'
output_filename = "stock_and_macro.csv"

# ---------------------------------------------------------------------------
# 1. Build the stock feature pipeline
# ---------------------------------------------------------------------------
print("\n--- Step 1: Building Stock Feature Pipeline ---")
stock_pipeline_kwargs = dict(
    tickers=tickers_list,
    target_ticker=target_ticker,
    supplier_tickers=supplier_tickers,
    benchmark_ticker=benchmark_ticker,
)
consumer_stocks_df = build_stock_features_orchestrator(**stock_pipeline_kwargs)

# ---------------------------------------------------------------------------
# 2. Apply the Kalman filter (with lags) to the pipeline output
# ---------------------------------------------------------------------------
print("\n--- Step 2: Applying Kalman Filter ---")
lags_to_use = [1, 5, 10]
tickers_to_filter = tickers_list  # same list object: every fetched symbol is filtered
consumer_stocks_df = apply_kalman_filter_with_lag(
    consumer_stocks_df,
    tickers_to_filter,
    lags_to_use,
)



--- Step 1: Building Stock Feature Pipeline ---

--- Starting Stock Feature Pipeline ---
Fetching data for 6 tickers...
Date range: All available history to Current date
Fetching full history for PG...
Fetching full history for KO...
Fetching full history for PEP...
Fetching full history for WMT...
Fetching full history for COST...
Fetching full history for ^GSPC...

Final merged DataFrame has 9858 common entries.

Discovered stock prefixes: ['COST', 'KO', 'PEP', 'PG', 'WMT', '^GSPC']

Processing features for stock prefix: COST

Processing features for stock prefix: KO

Processing features for stock prefix: PEP

Processing features for stock prefix: PG

Processing features for stock prefix: WMT

Processing features for stock prefix: ^GSPC

Applying general features...

Dropped 49 rows due to NaN values after feature engineering.

--- Data Preparation Complete ---

--- Step 2: Applying Kalman Filter ---


In [11]:
from src.macro_features import macro_data_orchestrator

# Human-readable indicator label -> FRED series ID to download for it.
FRED_series_ids = {
    'CPI': 'CPIAUCSL',
    'FEDERAL_FUNDS_RATE': 'DFF',
    'TREASURY_YIELD': 'DGS10',
    'UNEMPLOYMENT': 'UNRATE',
    'REAL_GDP': 'GDPC1',
    'RETAIL_SALES': 'RSAFS',
    'PAYEMS': 'PAYEMS',
}

# Fetch every indicator declared above. Deriving the set from the dict keeps
# the two in sync; to fetch only a subset, replace this with an explicit set
# of keys from FRED_series_ids.
# NOTE(review): a set has no guaranteed iteration order for strings across
# interpreter runs, so the fetch order (and possibly the resulting column
# order) may differ between kernel restarts -- confirm whether the
# orchestrator accepts an ordered sequence if a stable order matters.
macro_funcs = set(FRED_series_ids)

# The series-ID dictionary is passed explicitly rather than read from a global.
combined_macro_df = macro_data_orchestrator(
    macro_funcs_to_fetch=macro_funcs,
    fred_series_ids_dict=FRED_series_ids,
)

if not combined_macro_df.empty:
    print("\n--- Final Merged and Cleaned DataFrame ---")
    print(combined_macro_df.head())
    print("\nFinal DataFrame info:")
    # DataFrame.info() writes its report itself and returns None; wrapping it
    # in print() would emit a stray "None" line after the report.
    combined_macro_df.info()
    print("\nFinal DataFrame NaN count:")
    print(combined_macro_df.isna().sum())
else:
    print("Orchestrator returned an empty DataFrame.")

Starting FRED data orchestration pipeline...
Fetching and processing data for: UNEMPLOYMENT (UNRATE)
Fetching and processing data for: CPI (CPIAUCSL)
Fetching and processing data for: PAYEMS (PAYEMS)
Fetching and processing data for: RETAIL_SALES (RSAFS)
Fetching and processing data for: TREASURY_YIELD (DGS10)
Fetching and processing data for: REAL_GDP (GDPC1)
Fetching and processing data for: FEDERAL_FUNDS_RATE (DFF)
Data orchestration complete.

--- Final Merged and Cleaned DataFrame ---
            UNRATE  CPIAUCSL    PAYEMS     RSAFS  DGS10      GDPC1   DFF
date                                                                    
1992-01-01     7.3     138.3  108365.0  159177.0   6.71  10236.435  4.09
1992-01-02     7.3     138.3  108365.0  159177.0   6.78  10236.435  4.61
1992-01-03     7.3     138.3  108365.0  159177.0   6.85  10236.435  4.06
1992-01-04     7.3     138.3  108365.0  159177.0   6.85  10236.435  4.06
1992-01-05     7.3     138.3  108365.0  159177.0   6.85  10236.435 

In [None]:
# 3. Merge the stock features with the macro indicators on their date index.

# Drop the timezone from the stock index so it can be aligned with the macro
# index (presumably timezone-naive -- verify against macro_data_orchestrator).
# tz_localize(None) removes the tz while keeping the local wall-clock
# timestamps, so the calendar dates are unchanged.
consumer_stocks_df.index = consumer_stocks_df.index.tz_localize(None)

# how='left' keeps every row of the stock DataFrame; stock dates that have no
# matching macro row get NaN in the macro columns.
# NOTE(review): in the recorded run the macro data starts at 1992-01-01 while
# the stock rows start at 1986-09-17, so the earliest stock rows carry no
# macro values -- confirm that downstream code handles those NaNs.
merged_df = pd.merge(
    consumer_stocks_df,
    combined_macro_df,
    left_index=True,
    right_index=True,
    how='left'
)

# Inspect the merged DataFrame. DataFrame.info() prints its report directly
# and returns None, so it must not be wrapped in print() (that would add a
# stray "None" line to the output).
merged_df.info()

<class 'pandas.core.frame.DataFrame'>
DatetimeIndex: 9809 entries, 1986-09-17 to 2025-08-22
Columns: 292 entries, Open_PG to DFF
dtypes: float64(291), int64(1)
memory usage: 21.9 MB
None


In [10]:
# Save the merged DataFrame to a CSV file.
# The target directory can be overridden with the PROCESSED_DATA_DIR
# environment variable so the notebook is not tied to one user's machine;
# when the variable is unset, the original location is used unchanged.
processed_dir = os.environ.get(
    'PROCESSED_DATA_DIR',
    r'C:\Users\epoch_bpjmdqk\Documents\Code\data\processed',
)
# Create the directory if needed so to_csv does not fail on a fresh checkout.
os.makedirs(processed_dir, exist_ok=True)
output_file = os.path.join(processed_dir, 'consumer_staples_data.csv')
# index=True keeps the date index as the first CSV column.
merged_df.to_csv(output_file, index=True)