In [3]:
import os
import sys
import warnings

# Make the project root (the parent of the notebook's working directory, where
# the 'src' folder is located) importable. The membership check keeps sys.path
# from gaining a duplicate entry every time this cell is re-run.
project_root = os.path.abspath(os.path.join(os.getcwd(), '..'))
if project_root not in sys.path:
    sys.path.append(project_root)

from src.stock_features import prepare_data_for_ml, apply_kalman_filter_with_lag

# Silence every warning raised from this point on. The filter is installed
# after the import above, so warnings emitted while importing are still shown.
# NOTE(review): a blanket 'ignore' also hides deprecation and data-quality
# warnings from the pipeline; consider narrowing it to specific categories.
warnings.filterwarnings('ignore')

# --- Pipeline parameters ------------------------------------------------------
tickers_list = ["PG", "KO", "PEP", "WMT", "COST", "^GSPC"]
start_date_str, end_date_str = "2010-01-01", "2023-01-01"
output_filename = "consumer_stocks.csv"  # only referenced by the disabled CSV export noted below

# --- Step 1: run the data-preparation pipeline for all tickers ---------------
# The optional CSV export is currently disabled; to re-enable it, also pass
#   output_engineered_csv=f'data/processed/{output_filename}'
consumer_stocks_df = prepare_data_for_ml(tickers=tickers_list, start_date=start_date_str, end_date=end_date_str)

# --- Step 2: apply the Kalman filter with lags to the same tickers -----------
lags_to_use = [1, 5, 10]
tickers_to_filter = tickers_list
consumer_stocks_df = apply_kalman_filter_with_lag(consumer_stocks_df, tickers_to_filter, lags_to_use)


--- Starting Data Preparation Pipeline ---
Fetching data for 6 tickers from 2010-01-01 to 2023-01-01...

Discovered stock prefixes: ['COST', 'KO', 'PEP', 'PG', 'WMT', '^GSPC']

Processing features for stock prefix: COST
  - Calculating ATR for COST (window=14)...
  - Calculating RSI for COST (window=14)...
  - Calculating MACD for COST (fast=12, slow=26, signal=9)...

Processing features for stock prefix: KO
  - Calculating ATR for KO (window=14)...
  - Calculating RSI for KO (window=14)...
  - Calculating MACD for KO (fast=12, slow=26, signal=9)...

Processing features for stock prefix: PEP
  - Calculating ATR for PEP (window=14)...
  - Calculating RSI for PEP (window=14)...
  - Calculating MACD for PEP (fast=12, slow=26, signal=9)...

Processing features for stock prefix: PG
  - Calculating ATR for PG (window=14)...
  - Calculating RSI for PG (window=14)...
  - Calculating MACD for PG (fast=12, slow=26, signal=9)...

Processing features for stock prefix: WMT
  - Calculating ATR for 

KeyboardInterrupt: 

In [None]:
# info() prints its summary to stdout, so call it first. Leaving head(3) as the
# cell's final expression lets the notebook render the preview; in the previous
# order the head(3) result was computed and silently discarded.
consumer_stocks_df.info()
consumer_stocks_df.head(3)

<class 'pandas.core.frame.DataFrame'>
DatetimeIndex: 3223 entries, 2010-03-16 to 2022-12-30
Columns: 308 entries, Close_COST to Kalman_Filtered_Close_^GSPC_lag_10
dtypes: float64(302), int64(6)
memory usage: 7.6 MB


In [None]:
import os
import sys

import pandas as pd

# Make the project root (the parent of the notebook's working directory)
# importable so the 'src' package resolves. The membership check avoids
# appending the same path again when this cell is re-run or when the path is
# already on sys.path.
project_root = os.path.abspath(os.path.join(os.getcwd(), '..'))
if project_root not in sys.path:
    sys.path.append(project_root)

from src.macro_features import FRED_fetch_macro_data, macro_data_orchestrator

# Mapping of indicator name -> FRED series ID.
FRED_series_ids = dict(
    CPI='CPIAUCSL',
    FEDERAL_FUNDS_RATE='DFF',
    TREASURY_YIELD='DGS10',
    UNEMPLOYMENT='UNRATE',
    REAL_GDP='GDPC1',
    RETAIL_SALES='RSAFS',
    PAYEMS='PAYEMS',
)

# Indicators to fetch: every name in the mapping above, as an (unordered) set.
macro_funcs = set(FRED_series_ids)

# Optional start-date override, currently disabled:
# START_DATE = '2010-01-01'

# Run the orchestrator, handing it the name -> series-ID mapping and the set of
# indicator names explicitly as keyword arguments.
combined_macro_df = macro_data_orchestrator(fred_series_ids_dict=FRED_series_ids, macro_funcs_to_fetch=macro_funcs)

# Summarise the orchestrator result: a preview, the frame's info summary and
# the per-column NaN counts, or a notice when nothing came back.
if combined_macro_df.empty:
    print("Orchestrator returned an empty DataFrame.")
else:
    print("\n--- Final Merged and Cleaned DataFrame ---")
    print(combined_macro_df.head())
    print("\nFinal DataFrame info:")
    # DataFrame.info() writes its summary to stdout itself and returns None,
    # so wrapping it in print() emitted a stray "None" line after the summary.
    combined_macro_df.info()
    print("\nFinal DataFrame NaN count:")
    print(combined_macro_df.isna().sum())

Starting FRED data orchestration pipeline...
Fetching and processing data for: CPI (CPIAUCSL)
Fetching and processing data for: RETAIL_SALES (RSAFS)
Fetching and processing data for: PAYEMS (PAYEMS)
Fetching and processing data for: REAL_GDP (GDPC1)
Fetching and processing data for: UNEMPLOYMENT (UNRATE)
Fetching and processing data for: TREASURY_YIELD (DGS10)
Fetching and processing data for: FEDERAL_FUNDS_RATE (DFF)
Data orchestration complete.

--- Final Merged and Cleaned DataFrame ---
            CPIAUCSL     RSAFS    PAYEMS      GDPC1  UNRATE  DGS10   DFF
date                                                                    
1992-01-01     138.3  159177.0  108365.0  10236.435     7.3   6.71  4.09
1992-01-02     138.3  159177.0  108365.0  10236.435     7.3   6.78  4.61
1992-01-03     138.3  159177.0  108365.0  10236.435     7.3   6.85  4.06
1992-01-04     138.3  159177.0  108365.0  10236.435     7.3   6.85  4.06
1992-01-05     138.3  159177.0  108365.0  10236.435     7.3   6.85 

In [None]:
# Step 3: attach the macro columns to the stock features, aligning both frames
# on their date index. A left join keeps every row of consumer_stocks_df; index
# values with no match in combined_macro_df get NaN in the macro columns.
merged_df = consumer_stocks_df.merge(combined_macro_df, how='left', left_index=True, right_index=True)



In [None]:
# Save the merged DataFrame to a CSV file, writing the index as the first column.
# The target is resolved against the project root (the parent of the notebook's
# working directory, the same root used to locate 'src') instead of a hardcoded
# absolute, user-specific Windows path, so the cell runs on any machine.
# NOTE(review): assumes data/processed sits directly under that root — confirm
# this matches the previous C:\...\Code\data\processed location.
import os

processed_dir = os.path.abspath(os.path.join(os.getcwd(), '..', 'data', 'processed'))
os.makedirs(processed_dir, exist_ok=True)  # to_csv does not create missing directories
output_file = os.path.join(processed_dir, 'stock_and_macro.csv')
merged_df.to_csv(output_file, index=True)

In [None]:
# Parameters for a more granular news sentiment search
# sentiment_symbol = 'WMT'
# sentiment_topics = ['retail_and_wholesale'] 
# # NOTE: this window (2023-01-01 to 2023-08-01) does not match the stock data range used above (2010-01-01 to 2023-01-01); align the two before backtesting
# sentiment_start_date = '20230101T0000' # YYYYMMDDTHHMM
# sentiment_end_date = '20230801T0000'

# print("\n--- Fetching More Granular News Sentiment Data ---")
# news_df = fetch_news_sentiment(
#     symbol=sentiment_symbol,
#     topics=sentiment_topics,
#     sort_by='RELEVANCE',
#     time_from=sentiment_start_date,
#     time_to=sentiment_end_date
# )

# news_df.info()
# print(news_df.head())