In [1]:
# In: notebooks/02_training_data_preparation.ipynb

import sys
import os
import pandas as pd
import numpy as np
from datetime import datetime, timedelta
from dotenv import load_dotenv

# Make the parent directory importable so the `data_ingestion` package
# imported below can be resolved from inside the notebooks folder.
PROJECT_ROOT = '..'
if PROJECT_ROOT not in sys.path:
    sys.path.append(PROJECT_ROOT)

# Import your existing data fetcher functions
from data_ingestion.yahoo_finance_fetcher import fetch_ticker_data
from data_ingestion.news_fetcher import fetch_news_headlines
from data_ingestion.fred_fetcher import fetch_macro_data

# For sentiment analysis
import nltk
from vaderSentiment.vaderSentiment import SentimentIntensityAnalyzer
# Fetches NLTK's copy of the VADER lexicon into the local nltk_data directory
# (a no-op when the package is already up to date).
# NOTE(review): the analyzer imported above comes from the standalone
# `vaderSentiment` package rather than `nltk.sentiment`; that package
# presumably ships its own lexicon, so this download may be unnecessary —
# confirm before removing.
nltk.download('vader_lexicon')

# Marker so the reader can see the setup cell ran to completion.
print("✅ Setup complete.")

✅ Setup complete.


[nltk_data] Downloading package vader_lexicon to
[nltk_data]     C:\Users\Ayush\AppData\Roaming\nltk_data...
[nltk_data]   Package vader_lexicon is already up-to-date!


In [2]:
# In Cell 2 of your notebook

# --- CONFIGURATION ---
# Symbols for which training rows are assembled.
TICKERS = ['AAPL', 'MSFT', 'GOOGL', 'TSLA', 'NVDA']

# Trailing window: from LOOKBACK_DAYS ago up to the moment this cell runs.
LOOKBACK_DAYS = 365
END_DATE = datetime.now()
START_DATE = END_DATE - timedelta(days=LOOKBACK_DAYS)

# --- 1. FETCH RAW DATA ---
# Prices: one wide frame covering every ticker.
print("Fetching 1 year of stock data...")
stock_df = fetch_ticker_data(TICKERS, period="1y")
if isinstance(stock_df.columns, pd.MultiIndex):
    # Collapse each column tuple into a single underscore-joined label.
    flat_labels = []
    for levels in stock_df.columns.tolist():
        flat_labels.append('_'.join(levels).strip())
    stock_df.columns = flat_labels
# Drop any timezone so the index compares cleanly with the naive news dates.
naive_index = pd.to_datetime(stock_df.index).tz_localize(None)
stock_df.index = naive_index
print("✅ Stock data fetched.")

# Headlines: one request per ticker, tagged with the symbol, then stacked.
print("Fetching 1 year of news data...")
all_news = []
for ticker in TICKERS:
    query = f"{ticker} company"
    news = fetch_news_headlines(query=query, page_size=100)
    news['ticker'] = ticker
    all_news.append(news)
news_df = pd.concat(all_news, ignore_index=True)
published = pd.to_datetime(news_df['publishedAt'])
news_df['publishedAt'] = published.dt.tz_localize(None)
print("✅ News data fetched.")

# Macro series: kept as returned by the fetcher; consumed when the master
# frame is assembled.
print("Fetching FRED data...")
macro_data_dict = fetch_macro_data()
print("✅ FRED data fetched.")

# --- 2. ENGINEER SENTIMENT & EVENT FEATURES ---
print("Performing sentiment and event analysis...")

# A single shared scorer, reused for every headline.
analyzer = SentimentIntensityAnalyzer()

# Lower-case trigger words; a headline containing any of them is tagged as a
# negative event.
NEGATIVE_KEYWORDS = [
    'layoffs', 'debt', 'downgrade',
    'lawsuit', 'investigation', 'recall',
    'outage', 'cuts', 'fine',
]
# Lower-case trigger words for positive events (checked only when no negative
# keyword matched).
POSITIVE_KEYWORDS = [
    'expansion', 'profit', 'upgrade',
    'hiring', 'record', 'partnership',
    'launch', 'beats', 'growth',
]

def analyze_headline(headline, scorer=None, negative_keywords=None, positive_keywords=None):
    """Score one headline and tag it with a keyword-based event type.

    Args:
        headline: Headline text. Anything that is not a ``str`` (e.g. ``None``
            or NaN from a missing title) is treated as neutral.
        scorer: Object exposing ``polarity_scores(text) -> {'compound': ...}``.
            Defaults to the notebook-level ``analyzer``.
        negative_keywords: Trigger words for a negative event. Defaults to
            ``NEGATIVE_KEYWORDS``.
        positive_keywords: Trigger words for a positive event. Defaults to
            ``POSITIVE_KEYWORDS``.

    Returns:
        ``(sentiment, event_type)`` where ``sentiment`` is the scorer's
        compound score and ``event_type`` is ``'Negative Event'``,
        ``'Positive Event'`` or ``'Neutral'``. Negative keywords take
        precedence when both kinds are present.
    """
    import re  # function-scope import keeps this cell self-contained

    # Guard against missing titles so one bad row cannot abort the whole apply.
    if not isinstance(headline, str):
        return 0.0, 'Neutral'

    # Resolve defaults at call time so the notebook globals stay the single
    # source of truth when no overrides are passed.
    scorer = analyzer if scorer is None else scorer
    negative_keywords = NEGATIVE_KEYWORDS if negative_keywords is None else negative_keywords
    positive_keywords = POSITIVE_KEYWORDS if positive_keywords is None else positive_keywords

    sentiment = scorer.polarity_scores(headline)['compound']
    text = headline.lower()  # lower-case once instead of once per keyword

    def mentions(keywords):
        # Match only at the start of a word: 'fine' hits "fine"/"fined"/"fines"
        # but no longer hits "refines" or "defined", and 'profit' no longer
        # hits "nonprofit". Inflected forms such as "launches" still match.
        return any(re.search(r'\b' + re.escape(kw.lower()), text) for kw in keywords)

    if mentions(negative_keywords):
        event_type = 'Negative Event'
    elif mentions(positive_keywords):
        event_type = 'Positive Event'
    else:
        event_type = 'Neutral'
    return sentiment, event_type

# Score every headline once. Each result is a (sentiment, event_type) tuple;
# expand them into two columns with a single DataFrame construction instead
# of `.apply(pd.Series)`, which builds one Series object per headline.
headline_scores = news_df['title'].map(analyze_headline)
news_df[['sentiment', 'event_type']] = pd.DataFrame(
    headline_scores.tolist(),
    index=news_df.index,
    columns=['sentiment', 'event_type'],
)

# Calendar day of publication, used as the daily grouping key.
news_df['date'] = news_df['publishedAt'].dt.date

# One row per (ticker, day): mean sentiment plus counts of tagged events.
daily_features = (
    news_df.groupby(['ticker', 'date'])
    .agg(
        sentiment=('sentiment', 'mean'),
        positive_events=('event_type', lambda x: (x == 'Positive Event').sum()),
        negative_events=('event_type', lambda x: (x == 'Negative Event').sum()),
    )
    .reset_index()
)
# Back to datetime64 so the key has the same dtype as the price dates it is
# merged against.
daily_features['date'] = pd.to_datetime(daily_features['date'])
print("✅ Sentiment and event analysis complete.")


# --- 3. COMBINE INTO MASTER DATAFRAME ---
print("Combining all data into a master DataFrame...")
master_df_list = []
for ticker in TICKERS:
    # Flattened price columns are named "<Field>_<TICKER>". Match the exact
    # suffix: a plain substring test would also pull in another symbol's
    # columns whenever one ticker is contained in another (e.g. GOOG / GOOGL).
    suffix = f"_{ticker}"
    ticker_cols = [col for col in stock_df.columns if col.endswith(suffix)]
    ticker_stock_df = stock_df[ticker_cols].copy()
    # Strip only the trailing suffix, leaving the bare field name.
    ticker_stock_df.columns = [col[:-len(suffix)] for col in ticker_cols]
    ticker_stock_df['ticker'] = ticker
    # Move the date index into a regular column; it may surface as either
    # 'index' or 'Date' depending on whether the index is named.
    ticker_stock_df = ticker_stock_df.reset_index().rename(
        columns={'index': 'date', 'Date': 'date'}
    )

    # --- TREND INDICATOR ---
    # Ratio of the 30-row to the 90-row moving average of Close; above 1 means
    # the recent average is higher than the longer one. min_periods=1 lets the
    # averages start from the first row instead of producing leading NaNs.
    ma_short = ticker_stock_df['Close'].rolling(window=30, min_periods=1).mean()
    ma_long = ticker_stock_df['Close'].rolling(window=90, min_periods=1).mean()
    ticker_stock_df['trend_indicator'] = ma_short / ma_long

    ticker_daily_features = daily_features[daily_features['ticker'] == ticker]

    # Left join keeps every trading day, including days without any headline.
    merged_df = pd.merge(ticker_stock_df, ticker_daily_features, on=['date', 'ticker'], how='left')
    master_df_list.append(merged_df)

final_df = pd.concat(master_df_list, ignore_index=True)

if macro_data_dict:
    # NOTE(review): each value is assigned to the whole column. If the fetcher
    # returns one latest reading per series (presumably — confirm), every
    # historical row carries today's macro value, which leaks future
    # information into training rows; a date-aligned join would avoid that.
    for series_name, value in macro_data_dict.items():
        final_df[series_name] = value

# Fill missing values: days without news count as neutral with zero events,
# and an undefined trend ratio is treated as flat (1).
news_feature_cols = ['sentiment', 'positive_events', 'negative_events']
final_df[news_feature_cols] = final_df[news_feature_cols].fillna(0)
final_df['trend_indicator'] = final_df['trend_indicator'].fillna(1)
print("✅ Master DataFrame created.")
final_df.head()

  data = yf.download(tickers, period=period, interval=interval)


Fetching 1 year of stock data...
Fetching data for tickers: ['AAPL', 'MSFT', 'GOOGL', 'TSLA', 'NVDA']...


[*********************100%***********************]  5 of 5 completed


Data fetched successfully.
✅ Stock data fetched.
Fetching 1 year of news data...
Fetching news for query: 'AAPL company'...
Successfully fetched 100 articles.
Fetching news for query: 'MSFT company'...
Successfully fetched 100 articles.
Fetching news for query: 'GOOGL company'...
Successfully fetched 100 articles.
Fetching news for query: 'TSLA company'...
Successfully fetched 100 articles.
Fetching news for query: 'NVDA company'...
Successfully fetched 100 articles.
✅ News data fetched.
Fetching FRED data...
Successfully fetched macroeconomic data from FRED.
✅ FRED data fetched.
Performing sentiment and event analysis...
✅ Sentiment and event analysis complete.
Combining all data into a master DataFrame...
✅ Master DataFrame created.


Unnamed: 0,date,Close,High,Low,Open,Volume,ticker,trend_indicator,sentiment,positive_events,negative_events,GDP,CPI,FEDFUNDS,UNRATE,BAMLH0A0HYM2
0,2024-08-21,225.351227,226.923909,224.00749,225.470681,34765500,AAPL,1.0,0.0,0.0,0.0,30331.117,322.132,4.33,4.2,2.9
1,2024-08-22,223.489868,227.282216,222.862782,226.734761,43695300,AAPL,1.0,0.0,0.0,0.0,30331.117,322.132,4.33,4.2,2.9
2,2024-08-23,225.789169,227.162781,223.290802,224.614643,38677300,AAPL,1.0,0.0,0.0,0.0,30331.117,322.132,4.33,4.2,2.9
3,2024-08-26,226.127594,226.227137,222.852841,225.709541,30602200,AAPL,1.0,0.0,0.0,0.0,30331.117,322.132,4.33,4.2,2.9
4,2024-08-27,226.973663,227.789872,223.84821,224.953068,35934600,AAPL,1.0,0.0,0.0,0.0,30331.117,322.132,4.33,4.2,2.9


In [3]:
# Persist the assembled training table as CSV in the directory above this
# notebook. index=False keeps the row index out of the file.
output_path = '../training_dataset.csv'
final_df.to_csv(path_or_buf=output_path, index=False)

print(f"✅ Final training dataset with {len(final_df)} rows saved to '{output_path}'")

✅ Final training dataset with 1250 rows saved to '../training_dataset.csv'


In [4]:
# Audit the keyword tagger: list every headline whose event type is not
# 'Neutral'.
event_mask = news_df['event_type'] != 'Neutral'
detected_events_df = news_df[event_mask]

if detected_events_df.empty:
    print("No significant positive or negative events found in the fetched headlines.")
    print("This is normal if there was no major news for the selected tickers in the last month.")
else:
    print(f"Found {len(detected_events_df)} headlines classified as significant events:")
    # Show which symbol, which label, and the headline that triggered it.
    print(detected_events_df[['ticker', 'event_type', 'title']])

Found 21 headlines classified as significant events:
    ticker      event_type                                              title
9     AAPL  Negative Event  Sterling Manor Financial LLC Cuts Position in ...
28    AAPL  Positive Event  Direxion Daily AAPL Bull 2X Shares (NASDAQ:AAP...
36    AAPL  Positive Event  Foxconn’s Apple era fades as AI servers drive ...
76    AAPL  Negative Event  AAPL INVESTOR ALERT: Bronstein, Gewirtz & Gros...
91    AAPL  Negative Event  Pacific Wealth Strategies Group Inc. Cuts Posi...
111   MSFT  Positive Event  OpenAI Launches ChatGPT Go in India at $4.6, I...
157   MSFT  Negative Event  Coastline Trust Co Cuts Stock Position in Micr...
203  GOOGL  Positive Event  Alphabet (GOOGL) Boosts Capex With $9B AI and ...
231  GOOGL  Positive Event  Why This 1 Growth Stock Could Be a Great Addit...
259  GOOGL  Negative Event  Maryland State Retirement & Pension System Cut...
286  GOOGL  Positive Event  Big Tech is driving the stock market to new re...
306   TSLA 