In [None]:
import sys
import os
from pathlib import Path

# Locate the project root by walking up from the current working directory
# until a directory containing 'src' is found. The working directory is used
# (rather than '__file__') because '__file__' is not defined in interactive
# environments like notebooks.
def find_project_root(start=None, marker='src'):
    """Return the nearest ancestor of ``start`` that contains ``marker``.

    Args:
        start: Directory to begin searching from. Defaults to the current
            working directory.
        marker: Name of a file or directory whose presence identifies the
            project root.

    Returns:
        pathlib.Path of the first directory (``start`` itself, then each of
        its parents in turn) in which ``marker`` exists.

    Raises:
        FileNotFoundError: If the root of the file system is reached without
            finding ``marker``.
    """
    origin = Path.cwd() if start is None else Path(start).resolve()
    # origin.parents runs from the immediate parent up to the file-system root.
    for candidate in (origin, *origin.parents):
        if (candidate / marker).exists():
            return candidate
    raise FileNotFoundError(
        f"Could not find project root containing a '{marker}' directory."
    )


try:
    project_root = find_project_root()
except FileNotFoundError as exc:
    # The search is the only thing that can fail here, so this is the
    # exception the fallback has to handle. Warn instead of aborting: 'src'
    # may still be importable (for example if the project is installed).
    project_root = None
    print(f"Warning: {exc} Imports from 'src' may fail until the project root is on sys.path.")
    # You could also use a hard-coded path as a last resort
    # sys.path.insert(0, '/path/to/your/project/root')
else:
    root_entry = str(project_root)
    # Re-running this cell must not keep prepending duplicate entries.
    if not sys.path or sys.path[0] != root_entry:
        sys.path.insert(0, root_entry)
    print(f"Project root added to path: {project_root}")

from src.stock_features import prepare_data_for_ml
from src.macro_data import fetch_macro_data_orchestrator, fetch_news_sentiment


import warnings
# Install a global "ignore" filter with no category/module restriction, so
# every warning raised after this point is suppressed in the cell output.
# NOTE(review): this also hides deprecation and numerical warnings from the
# data pipeline — consider narrowing the filter by category or module.
warnings.filterwarnings('ignore')

Project root added to path: c:\Users\sparrott\Documents\Code\development\Quant_Strategy_Research


ModuleNotFoundError: No module named 'numpy'

In [None]:
# Inputs for the data pipeline call below: the tickers to process, the
# start/end dates of the requested window, and the CSV file name.
tickers_list = [
    'PG',
    'KO',
    'PEP',
    'WMT',
    'COST',
    '^GSPC',
]
start_date_str, end_date_str = '2010-01-01', '2023-01-01'
output_filename = "consumer_stocks.csv"

# Make the single function call to run the entire pipeline.
# prepare_data_for_ml is imported from src.stock_features; its result is bound
# to consumer_stocks_df and inspected as a pandas DataFrame in the next cell.
# NOTE(review): 'data/processed/...' is a relative path, so where it lands
# depends on the kernel's working directory; presumably the engineered frame
# is written there as CSV — confirm in src.stock_features.
consumer_stocks_df = prepare_data_for_ml(
    tickers=tickers_list,
    start_date=start_date_str,
    end_date=end_date_str,
    output_engineered_csv=f'data/processed/{output_filename}'
)


--- Starting Data Preparation Pipeline ---
Fetching data for 6 tickers from 2010-01-01 to 2023-01-01...

Discovered stock prefixes: ['COST', 'KO', 'PEP', 'PG', 'WMT', '^GSPC']

Processing features for stock prefix: COST
  - Calculating ATR for COST (window=14)...
  - Calculating RSI for COST (window=14)...
  - Calculating MACD for COST (fast=12, slow=26, signal=9)...

Processing features for stock prefix: KO
  - Calculating ATR for KO (window=14)...
  - Calculating RSI for KO (window=14)...
  - Calculating MACD for KO (fast=12, slow=26, signal=9)...

Processing features for stock prefix: PEP
  - Calculating ATR for PEP (window=14)...
  - Calculating RSI for PEP (window=14)...
  - Calculating MACD for PEP (fast=12, slow=26, signal=9)...

Processing features for stock prefix: PG
  - Calculating ATR for PG (window=14)...
  - Calculating RSI for PG (window=14)...
  - Calculating MACD for PG (fast=12, slow=26, signal=9)...

Processing features for stock prefix: WMT
  - Calculating ATR for 

Unnamed: 0_level_0,Close_COST,Close_KO,Close_PEP,Close_PG,Close_WMT,Close_^GSPC,High_COST,High_KO,High_PEP,High_PG,...,Close_^GSPC_lag5,Close_^GSPC_daily_return_lag1,Close_^GSPC_daily_return_lag3,Close_^GSPC_daily_return_lag5,^GSPC_RSI14_lag1,^GSPC_RSI14_lag3,^GSPC_RSI14_lag5,^GSPC_Volume_MA_Ratio_lag1,^GSPC_Volume_MA_Ratio_lag3,^GSPC_Volume_MA_Ratio_lag5
Date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
2010-03-16,43.420578,16.792852,42.058552,40.499805,13.515336,1159.459961,43.456221,16.846014,42.204966,40.735679,...,1140.449951,0.000452,0.004042,0.001713,84.225092,84.551764,80.135434,0.975939,1.101132,1.234186
2010-03-17,43.719986,16.83663,42.376842,40.805805,13.498439,1166.209961,43.812656,16.883538,42.38321,40.818553,...,1145.609985,0.007779,-0.000217,0.004525,88.120987,83.960195,82.581951,1.020683,1.155056,1.296198
2010-03-18,43.677216,16.871031,42.345013,40.627289,13.50326,1165.829956,43.791274,16.899175,42.478698,40.882284,...,1150.23999,0.005822,0.000452,0.004042,90.222369,84.225092,84.551764,1.149842,0.975939,1.101132


In [None]:
# Print the structural summary first (info() writes to stdout), then leave
# head(3) as the final expression of the cell so the notebook renders it.
# A bare expression that is not the last statement of a cell is evaluated
# and its value silently discarded, so the preview never appeared before.
consumer_stocks_df.info()
consumer_stocks_df.head(3)

<class 'pandas.core.frame.DataFrame'>
DatetimeIndex: 3223 entries, 2010-03-16 to 2022-12-30
Columns: 290 entries, Close_COST to ^GSPC_Volume_MA_Ratio_lag5
dtypes: float64(284), int64(6)
memory usage: 7.2 MB


In [None]:
# Define the lists of functions and tickers you want to fetch.
# One entry per line with a trailing comma: adjacent string literals without
# a comma are silently concatenated by Python, which previously merged
# 'TREASURY_YIELD' and 'UNEMPLOYMENT' into one bogus entry.
macro_funcs_to_fetch = [
    'CPI',
    'FEDERAL_FUNDS_RATE',
    'TREASURY_YIELD',
    'UNEMPLOYMENT',
    'REAL_GDP',
    'RETAIL_SALES',
]
fundamental_funcs_to_fetch = ['INCOME_STATEMENT', 'BALANCE_SHEET']
fred_series_ids_to_fetch = ['PAYEMS', 'CEU0500000003', 'UMCSENT', 'PCE', 'DFF', 'GDPC1', 'PCEPI']
google_trends_keywords = ['retail', 'consumer staples']
stock_tickers = ['WMT']

# Run the orchestrator.
# fetch_macro_data_orchestrator is imported from src.macro_data; its result is
# bound to macro_consumer_stocks_df and inspected in the next cell.
# NOTE(review): `target_ticker` (singular) receives the list `stock_tickers`
# — confirm the orchestrator accepts a list rather than a single symbol.
# NOTE(review): output_directory is a relative path, so it depends on the
# kernel's working directory; presumably the result is saved there under
# output_filename — confirm in src.macro_data.
print("Starting data fetching orchestration...")
macro_consumer_stocks_df = fetch_macro_data_orchestrator(
        general_macro_funcs=macro_funcs_to_fetch,
        fundamental_funcs=fundamental_funcs_to_fetch,
        fred_series_ids=fred_series_ids_to_fetch,
        target_ticker=stock_tickers,
        monthly_interval='monthly',
        google_trends_keywords=google_trends_keywords,
        output_filename='macro_consumer_stocks.csv',
        output_directory='data/processed'
    )

In [None]:
# Print the structural summary first (info() writes to stdout), then leave
# head(3) as the final expression of the cell so the notebook renders it.
# A bare expression that is not the last statement of a cell is evaluated
# and its value silently discarded.
macro_consumer_stocks_df.info()
macro_consumer_stocks_df.head(3)

In [None]:
# Inputs for the news sentiment request made below: the symbol, the list of
# topics, and the sort order handed to fetch_news_sentiment.
sentiment_symbol, sentiment_sort_by = 'WMT', 'LATEST'
sentiment_topics = ['retail', 'consumer staples']

# Announce the step; the leading "\n" puts a blank line before the banner.
print("\n--- Fetching News Sentiment Data ---")
# fetch_news_sentiment is imported from src.macro_data; its result is bound to
# news_df. NOTE(review): news_df is not inspected in the visible part of the
# notebook — its type and columns cannot be confirmed from here.
news_df = fetch_news_sentiment(
    symbol=sentiment_symbol,
    topics=sentiment_topics,
    sort_by=sentiment_sort_by
)