In [1]:
import pandas as pd
import numpy as np
from ta.momentum import RSIIndicator
from ta.trend import MACD
from scipy.signal import savgol_filter
import requests
from time import sleep

In [2]:
# Load raw data
df = pd.read_csv('data/raw_data.csv', index_col=0, parse_dates=True)

In [3]:
# Create return features
df['Return'] = df['GSPC'].pct_change()
df['Return_5d'] = df['GSPC'].pct_change(5)
df['Return_20d'] = df['GSPC'].pct_change(20)

# Optional: Smoothed return using Savitzky-Golay filter
# Handle NaNs by filling with 0 temporarily for smoothing
df['Return_Smooth'] = savgol_filter(
    df['Return'].fillna(0), 
    window_length=21, 
    polyorder=3, 
    mode='interp'
)

In [4]:
# Calculate technical indicators
rsi_indicator = RSIIndicator(close=df['GSPC'], window=14)
df['RSI_14'] = rsi_indicator.rsi()

# ========================================
# Note: MACD can produce NaN values for the initial periods, which is expected. We will handle this in the modeling phase.
# ========================================
macd_indicator = MACD(
    close=df['GSPC'],
    window_fast=12,
    window_slow=26,
    window_sign=9
)
df['MACD_Line'] = macd_indicator.macd()
df['MACD_Signal'] = macd_indicator.macd_signal()
df['MACD_Hist'] = macd_indicator.macd_diff()

In [5]:
# Create macroeconomic features (YoY changes)
# GDP: quarterly YoY (4 quarters back)
df['GDP_YoY'] = df['GDP'].pct_change(4 * 91)  # approx 4 quarters in days

# Core Inflation: monthly YoY (12 months back)
df['Core_Inflation_YoY'] = df['Core_Inflation'].pct_change(12 * 30)  # approx 12 months

# M2 Money Stock: monthly YoY
df['M2_YoY'] = df['M2'].pct_change(12 * 30)

# Note: Unemployment Rate is typically used as-is (level, not YoY)

In [6]:
# Drawdown from peak
df['Peak'] = df['GSPC'].cummax()
df['Drawdown'] = (df['GSPC'] - df['Peak']) / df['Peak']

# VIX change
df['VIX_Change'] = df['VIX'].diff()
df['VIX_Change_5d'] = df['VIX'].diff(5)

In [7]:
# Drop initial rows with too many NaNs from indicators
df_clean = df.dropna()
df_clean['Regime'] = pd.NA # Placeholder for regime labels to be added in the next phase

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_clean['Regime'] = pd.NA # Placeholder for regime labels to be added in the next phase


In [8]:
# Save final dataset
df_clean.to_csv('data/master_data.csv')

print("=" * 70)
print("FINAL DATASET")
print("=" * 70)
print(f"Shape: {df_clean.shape}")
print(f"Date range: {df_clean.index.min().date()} to {df_clean.index.max().date()}")
print(f"Trading days: {len(df_clean)}")
print(f"Years covered: {(df_clean.index.max() - df_clean.index.min()).days / 365.25:.1f}")
print(f"\nMissing values per column:")
print(df_clean.isnull().sum()[df_clean.isnull().sum() > 0])
print("\n✓ Dataset ready for regime labeling and modeling!")

FINAL DATASET
Shape: (7954, 23)
Date range: 1994-07-11 to 2026-02-13
Trading days: 7954
Years covered: 31.6

Missing values per column:
Regime    7954
dtype: int64

✓ Dataset ready for regime labeling and modeling!


| Column             | Type     | Description                         | Source      |
| ------------------ | -------- | ----------------------------------- | ----------- |
| Date               | datetime | Trading date (index)                | -           |
| GSPC               | float    | S&P 500 closing price               | Yahoo       |
| VIX                | float    | VIX closing level                   | Yahoo       |
| SPY Volume         | int      | SPY trading volume                  | Yahoo       |
| GDP                | float    | Real GDP (billions, chained 2017 $) | FRED        |
| Core Inflation     | float    | Core PCE index                      | FRED        |
| Unemployment Rate  | float    | Unemployment rate (%)               | FRED        |
| M2 Money Stock     | float    | M2 money supply (billions $)        | FRED        |
| Return             | float    | Daily S&P 500 return                | Computed    |
| Return_5d          | float    | 5-day return                        | Computed    |
| Return_20d         | float    | 20-day return                       | Computed    |
| Return_Smooth      | float    | Smoothed daily return               | Computed    |
| RSI_14             | float    | 14-day RSI                          | Computed    |
| MACD_Line          | float    | MACD line                           | Computed    |
| MACD_Signal        | float    | MACD signal line                    | Computed    |
| MACD_Hist          | float    | MACD histogram                      | Computed    |
| GDP_YoY            | float    | YoY GDP growth rate                 | Computed    |
| Core_Inflation_YoY | float    | YoY inflation rate                  | Computed    |
| M2_YoY             | float    | YoY M2 growth rate                  | Computed    |
| Peak               | float    | Running max of S&P 500              | Computed    |
| Drawdown           | float    | % decline from peak                 | Computed    |
| VIX_Change         | float    | Daily VIX change                    | Computed    |
| VIX_Change_5d      | float    | 5-day VIX change                    | Computed    |
| Regime             | object   | Bull/Bear label (to be filled)      | Placeholder |

In [9]:

# # ========================================
# # The following is to test the sentiment data access from Refinitiv, could be skipped for now
# # ========================================

# # Check if you have Refinitiv access
# try:
#     import refinitiv.data as rd
    
#     # Try to open a session (requires Workspace Desktop running)
#     rd.open_session()
#     print("✓ Refinitiv Data Library accessible")
    
#     # Test if you can get news headlines
#     headlines = rd.get_news_headlines(
#         query="S&P 500",
#         count=5
#     )
#     print("✓ News headlines accessible")
#     print(headlines)
    
#     rd.close_session()
# except Exception as e:
#     print(f"✗ Refinitiv access issue: {e}")


In [10]:
# Fetch news sentiment from AlphaVantage
def get_alphavantage_sentiment(api_key, tickers=['SPY'], limit=1000):
    """
    Get news sentiment from AlphaVantage
    """
    url = "https://www.alphavantage.co/query"
    
    all_sentiment = []
    
    for ticker in tickers:
        params = {
            'function': 'NEWS_SENTIMENT',
            'tickers': ticker,
            'apikey': api_key,
            'limit': limit
        }
        
        response = requests.get(url, params=params)
        data = response.json()
        
        if 'feed' in data:
            for article in data['feed']:
                date = pd.to_datetime(article['time_published'][:8])
                
                # Extract sentiment for this ticker
                for sentiment_item in article.get('ticker_sentiment', []):
                    if sentiment_item['ticker'] == ticker:
                        all_sentiment.append({
                            'date': date,
                            'ticker': ticker,
                            'sentiment_score': float(sentiment_item.get('ticker_sentiment_score', 0)),
                            'sentiment_label': sentiment_item.get('ticker_sentiment_label', 'Neutral')
                        })
                        break
        
        sleep(12)  # AlphaVantage rate limit: 5 calls/min for free, premium allows more
    
    sentiment_df = pd.DataFrame(all_sentiment)
    
    # Aggregate to daily
    daily_sentiment = sentiment_df.groupby('date').agg({
        'sentiment_score': 'mean'
    }).rename(columns={'sentiment_score': 'AlphaVantage_Sentiment'})
    
    return daily_sentiment

# Use your premium key
API_KEY = "VJJRWG9F79UTFIJT"
sentiment = get_alphavantage_sentiment(API_KEY, tickers=['SPY'])

df = df_clean.join(sentiment, how='left')
df['AlphaVantage_Sentiment'] = df['AlphaVantage_Sentiment'].ffill()

df.to_csv('data/master_data_with_sentiment.csv')
print(f"Added sentiment for {df['AlphaVantage_Sentiment'].notna().sum()} days")


Added sentiment for 2982 days
