<a href="https://colab.research.google.com/github/Prithwi13/6302_stock/blob/main/6302_final_project.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

The first cell imports all the libraries the project needs. Most importantly, it securely loads your API keys from the Colab "Secrets" (🔑) tab.

In [None]:
import os
import requests
import pandas as pd
from datetime import datetime, timedelta
from typing import List, Dict, Optional
import time
import sys
from google.colab import userdata

# === SECURELY Load API keys from Colab Secrets ===
# Both secrets are read inside one try block, so a failure on either name
# stops the cell before any later code can run with only one key defined.
# NOTE(review): later cells read 'MARKETAUX_KEY' rather than 'NEWSAPI_KEY';
# confirm which news provider this cell is meant to configure.
try:
    ALPHAVANTAGE_KEY = userdata.get('ALPHAVANTAGE_KEY')
    NEWSAPI_KEY = userdata.get('NEWSAPI_KEY')
    print("âœ“ API keys loaded successfully.")
except userdata.SecretNotFoundError:
    # The secret name is not registered in the Colab Secrets panel:
    # tell the user exactly how to add it, then stop.
    print("âœ— FATAL ERROR: API Key not found in Colab Secrets.")
    print("  Please click the 'Key' icon (ðŸ”‘) on the left sidebar,")
    print("  add 'ALPHAVANTAGE_KEY' and 'NEWSAPI_KEY',")
    print("  and make sure 'Notebook access' is toggled ON for both.")
    sys.exit(1) # Exit with a non-zero status so the cell is reported as failed
except Exception as e:
    # Any other failure while reading secrets is also treated as fatal.
    print(f"âœ— FATAL ERROR: Could not load secrets. {e}")
    sys.exit(1)

âœ“ API keys loaded successfully.


This code runs the import statements and then immediately tries to fetch your keys from the Colab Secrets manager. If it fails, it prints a helpful error message and stops the script.

Next, we define the classes responsible for fetching data from the two external APIs. These are your "worker" classes.

In [None]:
class StockPriceAPI:
    """Fetch OHLCV price bars for a ticker from the Alpha Vantage REST API.

    Every public getter returns a DataFrame with the columns ``timestamp``
    (timezone-aware, UTC), ``ticker``, ``open``, ``high``, ``low``, ``close``
    and ``volume``, sorted by timestamp. When the request fails, the payload
    cannot be parsed, or the API answers with an error/notice instead of
    data, the problem is printed and an empty DataFrame is returned.
    """

    # Alpha Vantage field names -> column names used by the rest of the pipeline.
    _COLUMN_MAP = {
        '1. open': 'open', '2. high': 'high', '3. low': 'low',
        '4. close': 'close', '5. volume': 'volume',
    }
    _OUTPUT_COLUMNS = ['timestamp', 'ticker', 'open', 'high', 'low', 'close', 'volume']

    def __init__(self, api_key: str, rate_limit_pause: float = 13):
        """Store the credentials and the pause applied before every request.

        Args:
            api_key: Alpha Vantage API key; only its last four characters are
                echoed to the console.
            rate_limit_pause: Seconds to sleep before each request. The
                default of 13 keeps the client under 5 calls per minute.
        """
        self.api_key = api_key
        self.base_url = "https://www.alphavantage.co/query"
        self.rate_limit_pause = rate_limit_pause
        print(f"âœ“ Alpha Vantage initialized (API key: ...{api_key[-4:]})")

    @staticmethod
    def _to_av_interval(interval: str) -> str:
        """Translate a short interval such as '5m' into Alpha Vantage's '5min'.

        Values already in the 'Nmin' form are returned untouched; a blanket
        ``str.replace('m', 'min')`` would turn '5min' into '5minin'.
        """
        interval = str(interval).strip()
        if interval.endswith('min'):
            return interval
        if interval.endswith('m'):
            return interval[:-1] + 'min'
        return interval

    def _parse_time_series(self, data: Dict, ticker: str,
                           source_tz: str = 'America/New_York') -> pd.DataFrame:
        """Convert a decoded Alpha Vantage JSON payload into a price DataFrame.

        Args:
            data: Decoded JSON body of the response.
            ticker: Symbol written into the ``ticker`` column.
            source_tz: Time zone the payload's naive timestamps are expressed
                in; they are localized to it and then converted to UTC.

        Returns:
            The normalized price frame, or an empty DataFrame when the
            payload carries an error/notice or no time-series section.
        """
        if 'Error Message' in data:
            print(f"âœ— [Alpha Vantage] API Error for {ticker}: {data['Error Message']}")
            return pd.DataFrame()
        if 'Note' in data:
            print(f"âœ— [Alpha Vantage] API Note for {ticker}: {data['Note']}")
            return pd.DataFrame()

        data_key = next((key for key in data if 'Time Series' in key), None)
        if data_key is None:
            if 'Information' in data:
                # Throttling / plan notices arrive under a top-level 'Information' key.
                print(f"âœ— [Alpha Vantage] API Information for {ticker}: {data['Information']}")
            else:
                print(f"âœ— [Alpha Vantage] Could not find 'Time Series' data key for {ticker}.")
            return pd.DataFrame()

        df = pd.DataFrame.from_dict(data[data_key], orient='index')
        if df.empty:
            return pd.DataFrame()

        df = df.reset_index().rename(columns={'index': 'timestamp', **self._COLUMN_MAP})
        df['timestamp'] = pd.to_datetime(df['timestamp'])
        for col in ['open', 'high', 'low', 'close', 'volume']:
            df[col] = pd.to_numeric(df[col])
        df['ticker'] = ticker
        try:
            df['timestamp'] = df['timestamp'].dt.tz_localize(source_tz).dt.tz_convert('UTC')
        except Exception as e:
            # Best-effort fallback kept from the original behaviour, but no
            # longer silent: the resulting timestamps may be shifted.
            print(f"âœ— [Alpha Vantage] Could not localize timestamps for {ticker} "
                  f"to {source_tz} ({e}); treating them as UTC.")
            df['timestamp'] = df['timestamp'].dt.tz_localize('UTC')
        df = df[self._OUTPUT_COLUMNS]
        return df.sort_values('timestamp').reset_index(drop=True)

    def _fetch_time_series(self, query: Dict, ticker: str, source_tz: str) -> pd.DataFrame:
        """Pause for the rate limit, issue one request and parse the answer."""
        print(f"  ... [Alpha Vantage] Waiting {self.rate_limit_pause:g}s to respect 5 calls/min rate limit...")
        time.sleep(self.rate_limit_pause)
        params = {**query, 'symbol': ticker, 'apikey': self.api_key, 'datatype': 'json'}
        try:
            response = requests.get(self.base_url, params=params, timeout=20)
            response.raise_for_status()
            return self._parse_time_series(response.json(), ticker, source_tz)
        except Exception as e:
            # Boundary handler: callers rely on an empty frame instead of an exception.
            print(f"âœ— Error fetching Alpha Vantage data for {ticker}: {e}")
            return pd.DataFrame()

    def get_intraday_quotes(self, ticker: str, interval='5m', period='7d') -> pd.DataFrame:
        """Return intraday bars for ``ticker``.

        Args:
            ticker: Symbol to query.
            interval: Bar size, either short ('5m') or Alpha Vantage style ('5min').
            period: '1d' requests the 'compact' output size; any other value
                requests 'full'.
        """
        query = {
            'function': 'TIME_SERIES_INTRADAY',
            'interval': self._to_av_interval(interval),
            'outputsize': 'compact' if period == '1d' else 'full',
        }
        return self._fetch_time_series(query, ticker, source_tz='America/New_York')

    def get_daily_quotes_backfill(self, ticker: str) -> pd.DataFrame:
        """Return the full daily history for ``ticker`` (one bar per trading day).

        Daily timestamps are date labels, so they are stamped as midnight UTC
        rather than shifted from New York time.
        """
        query = {'function': 'TIME_SERIES_DAILY', 'outputsize': 'full'}
        return self._fetch_time_series(query, ticker, source_tz='UTC')

    def get_daily_quotes_incremental(self, ticker: str) -> pd.DataFrame:
        """Return the most recent daily bars for ``ticker`` ('compact' output size)."""
        query = {'function': 'TIME_SERIES_DAILY', 'outputsize': 'compact'}
        return self._fetch_time_series(query, ticker, source_tz='UTC')

class MarketAuxAPI:
    """Fetch financial news for a ticker from the MarketAux ``/v1/news/all`` endpoint.

    Both public getters return a DataFrame with the columns ``timestamp``
    (publication time truncated to midnight UTC), ``headline``,
    ``description``, ``source``, ``url`` and ``ticker``. Request failures are
    printed; they never propagate to the caller.
    """

    # Seconds to wait after each backfill request to stay under ~5 requests/minute.
    REQUEST_PAUSE_SECONDS = 13

    def __init__(self, api_key: str):
        """Store the API token; only its last four characters are echoed."""
        self.api_key = api_key
        self.base_url = "https://api.marketaux.com/v1/news/all"
        print(f"âœ“ MarketAuxAPI initialized (API key: ...{api_key[-4:]})")

    def _parse_articles(self, articles: List[Dict], ticker: str) -> pd.DataFrame:
        """Turn raw article dicts into a DataFrame, skipping malformed entries.

        An article without ``published_at``, ``title`` or ``url`` (or with an
        unparseable date) is reported and dropped, so a single bad record can
        no longer discard an entire batch.
        """
        records = []
        for article in articles:
            try:
                published = pd.to_datetime(article['published_at'], utc=True)
                if pd.isna(published):
                    raise ValueError("missing publication time")
                records.append({
                    'timestamp': published,
                    'headline': article['title'],
                    # Fall back to the snippet, and to '' when both are absent or null.
                    'description': article.get('description') or article.get('snippet') or '',
                    'source': article.get('source', 'unknown'),
                    'url': article['url'],
                    'ticker': ticker
                })
            except (KeyError, TypeError, ValueError) as e:
                print(f"âœ— [MarketAux] Skipping malformed article for {ticker}: {e!r}")
        df = pd.DataFrame(records)
        if not df.empty:
            # News is aligned with prices by calendar day, so drop the time of day.
            df['timestamp'] = df['timestamp'].dt.normalize()
        return df

    @staticmethod
    def _month_windows(start_date: str, end_date: str):
        """Yield ``(first_day, last_day)`` strings covering the range month by month.

        The first window starts on ``start_date`` itself (not on day 01) and
        the last one ends on ``end_date``; nothing is yielded when the start
        lies after the end.
        """
        current = pd.to_datetime(start_date).normalize()
        end = pd.to_datetime(end_date).normalize()
        while current <= end:
            month_end = current.replace(day=current.days_in_month)
            window_end = min(month_end, end)
            yield current.strftime('%Y-%m-%d'), window_end.strftime('%Y-%m-%d')
            current = window_end + pd.Timedelta(days=1)

    def get_news_incremental(self, ticker: str) -> pd.DataFrame:
        """Return articles published during the last 3 days (for daily updates)."""
        print(f"  ... [MarketAux] Fetching recent news for {ticker}...")
        # Use an explicit UTC clock so the window does not depend on the host's time zone.
        date_from = pd.Timestamp.now(tz='UTC') - pd.Timedelta(days=3)

        params = {
            'api_token': self.api_key,
            'symbols': ticker,
            'language': 'en',
            'published_after': date_from.strftime('%Y-%m-%dT%H:%M:%S'),
            # Ask for up to 50 articles; the API may return fewer depending on the plan.
            'limit': 50
        }
        try:
            response = requests.get(self.base_url, params=params, timeout=10)
            response.raise_for_status()
            data = response.json()
            articles = data.get('data', [])
            return self._parse_articles(articles, ticker)
        except Exception as e:
            print(f"âœ— Error fetching MarketAux incremental news for {ticker}: {e}")
            return pd.DataFrame()

    def get_news_backfill(self, ticker: str, start_date: str, end_date: str) -> pd.DataFrame:
        """Return historical articles between ``start_date`` and ``end_date``.

        One request is issued per calendar month in the range. A failed month
        is reported and skipped; the remaining months are still fetched.
        """
        print(f"  ... [MarketAux] Backfilling news for {ticker} from {start_date} to {end_date}...")
        all_articles = []

        for month_start, month_end in self._month_windows(start_date, end_date):
            print(f"    ... Fetching {ticker} news for {month_start} to {month_end}")
            params = {
                'api_token': self.api_key,
                'symbols': ticker,
                'language': 'en',
                'published_after': f"{month_start}T00:00:00",
                'published_before': f"{month_end}T23:59:59",
                # Ask for up to 100 articles per window; no pagination is attempted.
                'limit': 100
            }
            try:
                response = requests.get(self.base_url, params=params, timeout=20)
                response.raise_for_status()
                data = response.json()
                all_articles.extend(data.get('data', []))
            except Exception as e:
                print(f"âœ— Error fetching MarketAux backfill for {ticker} ({month_start}): {e}")
            # Pause after failures too, so a throttled request is not retried immediately.
            time.sleep(self.REQUEST_PAUSE_SECONDS)

        return self._parse_articles(all_articles, ticker)

This cell defines the classes. It doesn't run them yet; it just makes them available for later use. Note the critical time.sleep(13) in StockPriceAPI — this is essential for not getting blocked by the free API.

These classes are the "brains" of the operation. IncrementalDataStorage is the most important part — it handles saving data and preventing duplicates. DataCollector manages the entire process.

In [None]:
import os
import pandas as pd
from datetime import datetime, timedelta
from typing import List, Dict, Optional
import time

# --- NOTE ---
# This code fragment assumes you have already defined the
# 'StockPriceAPI' and 'MarketAuxAPI' classes in a cell above this one.
# (You can get them from the 'stock_pipeline_classes.py' file)


# --- THIS IS THE CORRECTED STORAGE CLASS (with backfill fix) ---
class IncrementalDataStorage:
    """Persist price and news records in two master CSV files with deduplication.

    Prices are deduplicated on ``(timestamp, ticker)`` keeping the newest
    row; news on ``(headline, ticker, timestamp)`` keeping the oldest row.
    All timestamps are normalized to timezone-aware UTC before merging.
    """

    def __init__(self, data_dir='data/raw'):
        """Create ``data_dir`` if needed and derive the master file paths."""
        self.data_dir = data_dir
        os.makedirs(data_dir, exist_ok=True)
        self.prices_master_file = os.path.join(data_dir, 'prices_master.csv')
        self.news_master_file = os.path.join(data_dir, 'news_master.csv')
        print(f"âœ“ IncrementalDataStorage initialized.")
        print(f"  ... Price file: {self.prices_master_file}")
        print(f"  ... News file: {self.news_master_file}")

    @staticmethod
    def _read_master(path: str, label: str) -> pd.DataFrame:
        """Load one master CSV, or return an empty frame when it is absent or blank."""
        if os.path.exists(path):
            try:
                frame = pd.read_csv(path)
            except pd.errors.EmptyDataError:
                frame = None  # zero-byte file: treat it like a missing one
            if frame is not None:
                frame['timestamp'] = pd.to_datetime(frame['timestamp'], utc=True)
                print(f"âœ“ Loaded {len(frame):,} existing {label} records")
                return frame
        print(f"â„¹ No existing {label} data found (starting fresh)")
        return pd.DataFrame()

    def load_master_data(self) -> Dict[str, pd.DataFrame]:
        """Return ``{'prices': ..., 'news': ...}`` read from the master files."""
        prices = self._read_master(self.prices_master_file, 'price')
        news = self._read_master(self.news_master_file, 'news')
        return {'prices': prices, 'news': news}

    def _merge_and_save(self, existing: pd.DataFrame, incoming: pd.DataFrame, path: str,
                        dedupe_keys: List[str], keep: str, label: str, is_backfill: bool) -> None:
        """Merge ``incoming`` into ``existing``, deduplicate, sort and write ``path``."""
        incoming = incoming.copy()
        incoming['timestamp'] = pd.to_datetime(incoming['timestamp'], utc=True)

        if is_backfill:
            # Replace history only for the tickers that were actually fetched,
            # so a partially failed backfill cannot erase the other tickers.
            refreshed = incoming['ticker'].unique()
            print(f"... Backfill detected. Replacing {label} data for {len(refreshed)} ticker(s).")
            if not existing.empty:
                existing = existing[~existing['ticker'].isin(refreshed)]

        # Empty frames are left out of the concat: they add nothing and can
        # change the resulting column dtypes.
        frames = [frame for frame in (existing, incoming) if not frame.empty]
        combined = pd.concat(frames, ignore_index=True)
        combined = combined.drop_duplicates(subset=dedupe_keys, keep=keep)
        combined = combined.sort_values(['ticker', 'timestamp']).reset_index(drop=True)

        # Write to a sibling file first so an interrupted run cannot leave a
        # truncated master file behind.
        tmp_path = f"{path}.tmp"
        combined.to_csv(tmp_path, index=False)
        os.replace(tmp_path, path)

        added_count = len(combined) - len(existing)
        print(f"âœ“ Saved {len(combined):,} total {label} records (added {added_count} new)")

    def append_new_data(self, new_data: Dict[str, pd.DataFrame], is_backfill: bool = False):
        """Merge freshly collected data into the master files.

        Args:
            new_data: Mapping that may contain 'prices' and/or 'news' frames;
                missing or empty entries are ignored.
            is_backfill: When True, the stored rows of every ticker present
                in the new frame are replaced instead of merged. Rows of
                tickers absent from the new frame are kept.
        """
        existing = self.load_master_data()

        if 'prices' in new_data and not new_data['prices'].empty:
            self._merge_and_save(
                existing['prices'], new_data['prices'], self.prices_master_file,
                dedupe_keys=['timestamp', 'ticker'], keep='last',
                label='price', is_backfill=is_backfill,
            )

        if 'news' in new_data and not new_data['news'].empty:
            self._merge_and_save(
                existing['news'], new_data['news'], self.news_master_file,
                dedupe_keys=['headline', 'ticker', 'timestamp'], keep='first',
                label='news', is_backfill=is_backfill,
            )

    def get_statistics(self) -> dict:
        """Summarize the stored data.

        Returns:
            A dict with 'total_price_records', 'total_news_articles',
            'tickers', 'date_range' (None when no prices are stored) and
            'records_per_ticker' (empty when no prices are stored).
        """
        data = self.load_master_data()
        stats = {
            'total_price_records': len(data['prices']),
            'total_news_articles': len(data['news']),
            'tickers': [],
            'date_range': None,
            'records_per_ticker': {},
        }
        if not data['prices'].empty:
            stats['tickers'] = data['prices']['ticker'].unique().tolist()
            stats['date_range'] = {'start': data['prices']['timestamp'].min(), 'end': data['prices']['timestamp'].max()}
            stats['records_per_ticker'] = data['prices'].groupby('ticker').size().to_dict()
        return stats
# --- END OF CORRECTED STORAGE CLASS ---


# --- THIS IS THE CORRECTED DATA COLLECTOR CLASS ---
class DataCollector:
    """Orchestrate one collection run: fetch prices, fetch news, then store both."""

    # All three collaborators are annotated with string names so the class can
    # be defined regardless of the order in which notebook cells were run.
    def __init__(self, price_api: 'StockPriceAPI', news_api: 'MarketAuxAPI', storage: 'IncrementalDataStorage'):
        """Keep references to the price client, the news client and the storage."""
        self.price_api = price_api
        self.news_api = news_api
        self.storage = storage
        print("âœ“ DataCollector initialized.")

    def _fetch_prices(self, ticker: str, is_backfill: bool) -> pd.DataFrame:
        """Fetch prices for one ticker with the method matching the run type.

        Uses ``get_daily_quotes_backfill`` / ``get_daily_quotes_incremental``
        when the price client provides them. A client that only offers
        ``get_intraday_quotes`` is still supported: the full history is
        requested for a backfill and the compact one otherwise.
        """
        method_name = 'get_daily_quotes_backfill' if is_backfill else 'get_daily_quotes_incremental'
        fetch = getattr(self.price_api, method_name, None)
        if fetch is not None:
            return fetch(ticker)
        print(f"  ... Price API has no '{method_name}'; falling back to intraday quotes.")
        return self.price_api.get_intraday_quotes(ticker, period='7d' if is_backfill else '1d')

    def run_collection(self, tickers: List[str], collection_type: str, backfill_start_date: str):
        """Collect prices and news for ``tickers`` and append them to storage.

        Args:
            tickers: Symbols to collect.
            collection_type: 'backfill' for a historical load; any other
                value is treated as an incremental update.
            backfill_start_date: Earliest news date ('YYYY-MM-DD') wanted for
                a backfill. Ignored for incremental runs.

        Returns:
            ``{'prices': DataFrame, 'news': DataFrame}`` with what was
            fetched, or None when a backfill obtained no prices (the news
            window is derived from the price dates, so nothing is stored).
        """
        all_prices, all_news = [], []
        is_backfill = (collection_type == 'backfill')

        # --- 1. Price collection ---
        for ticker in tickers:
            print(f"\nðŸ“Š Collecting prices for {ticker}...")
            prices = self._fetch_prices(ticker, is_backfill)

            if not prices.empty:
                all_prices.append(prices)
                print(f"  âœ“ Got {len(prices)} price records")

        all_prices_df = pd.concat(all_prices, ignore_index=True) if all_prices else pd.DataFrame()

        # --- 2. News collection ---
        final_backfill_start_date = max_price_date_str = None
        if is_backfill:
            if all_prices_df.empty:
                print("No price data fetched. Skipping news collection.")
                return
            min_price_date_str = all_prices_df['timestamp'].min().strftime('%Y-%m-%d')
            max_price_date_str = all_prices_df['timestamp'].max().strftime('%Y-%m-%d')
            # Honour the requested start date, but never ask for news older
            # than the oldest price. ISO dates compare correctly as strings.
            final_backfill_start_date = max(min_price_date_str, backfill_start_date)
        elif all_prices_df.empty:
            # Recent news does not depend on the price dates, so an
            # incremental run still collects it when prices are unavailable.
            print("No price data fetched. Collecting news only.")

        for ticker in tickers:
            print(f"\nðŸ“° Collecting news for {ticker}...")
            if is_backfill:
                print(f"  ... Backfilling news from {final_backfill_start_date} to {max_price_date_str}")
                news = self.news_api.get_news_backfill(ticker, final_backfill_start_date, max_price_date_str)
            else:
                news = self.news_api.get_news_incremental(ticker)

            if not news.empty:
                all_news.append(news)
                print(f"  âœ“ Got {len(news)} news articles")

        # --- 3. Storage ---
        new_data = {
            'prices': all_prices_df,
            'news': pd.concat(all_news, ignore_index=True) if all_news else pd.DataFrame()
        }
        print("\nðŸ’¾ Saving data to master files...")
        self.storage.append_new_data(new_data, is_backfill=is_backfill)
        return new_data

print("âœ“ Data storage and collector classes defined.")
# --- END OF CORRECTED DATA COLLECTOR ---

âœ“ Data storage and collector classes defined.


This cell defines the classes that handle saving, loading, deduplicating, and orchestrating the entire collection process. Again, no data is fetched or saved yet.

Now we create "instances" of our classes. We pass the API keys to the fetching classes and create the storage and collector objects.

In [None]:
import sys
from google.colab import userdata

# Configuration: the symbols every collection run in this notebook covers.
TICKERS = ['AAPL', 'MSFT', 'GOOGL', 'TSLA', 'AMZN']

# Load the two API keys this pipeline uses from Colab Secrets:
# ALPHAVANTAGE_KEY for prices and MARKETAUX_KEY for news.
try:
    ALPHAVANTAGE_KEY = userdata.get('ALPHAVANTAGE_KEY')
    MARKETAUX_KEY = userdata.get('MARKETAUX_KEY') # news key passed to MarketAuxAPI below
    print("âœ“ API keys loaded successfully.")
except Exception as e:
    print(f"âœ— FATAL: Could not load keys. Make sure ALPHAVANTAGE_KEY and MARKETAUX_KEY are in Colab Secrets. Error: {e}")
    # The error is re-raised instead of calling sys.exit(1), so the notebook
    # shows the traceback in the cell output.
    raise e # propagate the original exception to the notebook

# Build the pipeline objects. StockPriceAPI, MarketAuxAPI,
# IncrementalDataStorage and DataCollector must already be defined by
# earlier cells.

price_api = StockPriceAPI(ALPHAVANTAGE_KEY)

# The news client is MarketAuxAPI and therefore receives MARKETAUX_KEY.
news_api = MarketAuxAPI(MARKETAUX_KEY)
storage = IncrementalDataStorage('data/raw')  # master CSV files live under data/raw
collector = DataCollector(price_api, news_api, storage)

print("\nâœ“ All components initialized and ready.")

âœ“ API keys loaded successfully.
âœ“ Alpha Vantage initialized (API key: ...Z82S)
âœ“ MarketAuxAPI initialized (API key: ...EtV7)
âœ“ IncrementalDataStorage initialized.
  ... Price file: data/raw/prices_master.csv
  ... News file: data/raw/news_master.csv
âœ“ DataCollector initialized.

âœ“ All components initialized and ready.


This is the final setup step. We've defined our list of TICKERS and created the objects we'll use in the next steps.

Let's check the status of our data files before we run the collection.

In [None]:
# Report how much data is already stored before a collection run.
# Relies on the global `storage` object created by the initialization cell.
print("\n" + "="*70)
print("  STATUS BEFORE COLLECTION")
print("="*70)

# get_statistics() re-reads both master files, so it also prints their load messages.
stats_before = storage.get_statistics()
print(f"  Existing price records: {stats_before['total_price_records']:,}")
print(f"  Existing news articles: {stats_before['total_news_articles']:,}")


  STATUS BEFORE COLLECTION
âœ“ Loaded 16,996 existing price records
âœ“ Loaded 669 existing news records
  Existing price records: 16,996
  Existing news articles: 669


We call the get_statistics() method on our storage object. The data/raw directory was already created when the storage object was constructed; on the very first run, get_statistics() simply reports that no data exists yet.

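This is the main event. The cell below defines main_data_collection(), which builds the components and calls collector.run_collection(). Note that the cell only defines the function: call main_data_collection('incremental') or main_data_collection('backfill') to actually start a run. A run takes over a minute because of the 13-second pause for each of the 5 tickers (5 * 13 = 65 seconds); a backfill takes longer because the news client also pauses 13 seconds after every monthly request.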

In [None]:
def main_data_collection(collection_type: str):
    """Build the pipeline components and run one collection.

    Args:
        collection_type: 'backfill' for the one-time historical load; any
            other value runs the daily incremental update.

    Returns:
        Whatever ``DataCollector.run_collection`` returns, or None when the
        components could not be initialized.
    """

    TICKERS = ['AAPL', 'MSFT', 'GOOGL', 'TSLA', 'AMZN']

    # Calendar-aware offset: `timedelta(days=5*365)` ignores leap days and
    # lands a day or two short of five years.
    FIVE_YEARS_AGO = (pd.Timestamp.now() - pd.DateOffset(years=5)).strftime('%Y-%m-%d')

    # Initialize components
    try:
        # Load keys from Colab Secrets
        ALPHAVANTAGE_KEY = userdata.get('ALPHAVANTAGE_KEY')
        MARKETAUX_KEY = userdata.get('MARKETAUX_KEY')

        price_api = StockPriceAPI(ALPHAVANTAGE_KEY)
        news_api = MarketAuxAPI(MARKETAUX_KEY)
        storage = IncrementalDataStorage('data/raw')
        collector = DataCollector(price_api, news_api, storage)
        print("\nâœ“ All components initialized and ready.")
    except Exception as e:
        print(f"âœ— FATAL: Could not initialize components. Check API keys. Error: {e}")
        return

    # Both run types share one code path; only the banners and the type
    # forwarded to the collector differ.
    if collection_type == 'backfill':
        run_type = 'backfill'
        opening_lines = [
            f"  RUNNING ONE-TIME DATA BACKFILL (Targeting start date: {FIVE_YEARS_AGO})",
            "  This will take several minutes...",
        ]
        closing_line = "  âœ… BACKFILL COMPLETE!"
    else:
        run_type = 'incremental'
        opening_lines = ["  RUNNING DAILY INCREMENTAL UPDATE..."]
        closing_line = "  âœ… DAILY UPDATE COMPLETE!"

    print("\n" + "="*70)
    for line in opening_lines:
        print(line)
    print("="*70)

    result = collector.run_collection(
        tickers=TICKERS,
        collection_type=run_type,
        backfill_start_date=FIVE_YEARS_AGO
    )

    print("\n" + "="*70)
    print(closing_line)
    print("="*70)
    return result



The collector works in three phases:

For each ticker, call price_api.get_daily_quotes_backfill() or price_api.get_daily_quotes_incremental(), depending on the collection type.

For each ticker, call news_api.get_news_backfill() or news_api.get_news_incremental().

After both loops, it will call storage.append_new_data() to save the results.

Now that Cell 6 has finished, let's check the stats again. The numbers should now reflect the data we just downloaded.

In [None]:
# Report what is stored after a collection run.
# Relies on the global `storage` object created by the initialization cell.
print("\n" + "="*70)
print("  STATUS AFTER COLLECTION")
print("="*70)

stats_after = storage.get_statistics()
print(f"  Total price records: {stats_after['total_price_records']:,}")
print(f"  Total news articles: {stats_after['total_news_articles']:,}")
# 'date_range' is None and 'records_per_ticker' may be absent when no prices
# are stored, hence the .get() lookups.
if stats_after.get('date_range'):
    print(f"  Date range: {stats_after['date_range']['start']} to {stats_after['date_range']['end']}")
if stats_after.get('records_per_ticker'):
    print("\n  Records per ticker:")
    for ticker, count in stats_after['records_per_ticker'].items():
        print(f"    â€¢ {ticker}: {count:,} records")


  STATUS AFTER COLLECTION
âœ“ Loaded 16,996 existing price records
âœ“ Loaded 669 existing news records
  Total price records: 16,996
  Total news articles: 669
  Date range: 2025-10-13 08:00:00+00:00 to 2025-11-12 00:55:00+00:00

  Records per ticker:
    â€¢ AAPL: 4,224 records
    â€¢ AMZN: 4,224 records
    â€¢ GOOGL: 4,224 records
    â€¢ MSFT: 100 records
    â€¢ TSLA: 4,224 records


Running get_statistics() now will find the prices_master.csv and news_master.csv files, load them, and report the new counts.

The stats are great, but let's look at the actual data we saved to confirm it's correct.

In [None]:
def view_sample_data():
    """Print the last five stored price rows and the last five stored headlines.

    The master files are sorted by ticker and then timestamp, so "last" means
    the newest rows of the alphabetically last ticker.
    """
    print("\n" + "="*70)
    print("  ACCUMULATED DATA VIEWER")
    print("="*70)
    storage = IncrementalDataStorage('data/raw')
    data = storage.load_master_data()

    if not data['prices'].empty:
        print("\n  ðŸ“Š Sample Price Data (latest 5 records):")
        print(data['prices'].tail(5)[['timestamp', 'ticker', 'close', 'volume']])
    else:
        print("\n  â„¹ No price data to display.")

    if not data['news'].empty:
        print("\n  ðŸ“° Sample News Data (latest 5 articles):")
        for _, row in data['news'].tail(5).iterrows():
            headline = row['headline']
            # A blank headline comes back from the CSV as NaN (a float), which
            # cannot be sliced; show it as an empty string instead of crashing.
            if not isinstance(headline, str):
                headline = ''
            print(f"    â€¢ [{row['ticker']}] {headline[:70]}...")
    else:
        print("\n  â„¹ No news data to display.")
    print("\n" + "="*70 + "\n")

# Run the view function
view_sample_data()


  ACCUMULATED DATA VIEWER
âœ“ IncrementalDataStorage initialized.
  ... Price file: data/raw/prices_master.csv
  ... News file: data/raw/news_master.csv
âœ“ Loaded 16,996 existing price records
âœ“ Loaded 669 existing news records

  ðŸ“Š Sample Price Data (latest 5 records):
                      timestamp ticker    close  volume
16991 2025-11-12 00:35:00+00:00   TSLA  439.110    6334
16992 2025-11-12 00:40:00+00:00   TSLA  439.190    2198
16993 2025-11-12 00:45:00+00:00   TSLA  439.190    9551
16994 2025-11-12 00:50:00+00:00   TSLA  439.245   13443
16995 2025-11-12 00:55:00+00:00   TSLA  439.290   15912

  ðŸ“° Sample News Data (latest 5 articles):
    â€¢ [TSLA] Ford â€˜canâ€™t walk away from EVsâ€™ or it risks falling even further behin...
    â€¢ [TSLA] Did Trump say Kamala Harris would cause 'bread lines'?...
    â€¢ [TSLA] Tesla's First Semi Vehicle Customer After Full Production Launch Isâ€¦.T...
    â€¢ [TSLA] Trump ally takes charge at controversial Israeli spy technology co

This cell defines and runs a simple function to load the master files and print the last 5 rows of each DataFrame. This is the best way to verify that the data looks correct.

In [None]:
import os
import requests
import pandas as pd
from datetime import datetime, timedelta
from typing import List, Dict, Optional
import time
import sys

# Load the API keys from Colab Secrets when running in Colab; otherwise fall
# back to the ALPHAVANTAGE_KEY / NEWSAPI_KEY environment variables.
try:
    from google.colab import userdata
    ALPHAVANTAGE_KEY = userdata.get('ALPHAVANTAGE_KEY')
    NEWSAPI_KEY = userdata.get('NEWSAPI_KEY')
    print("âœ“ Colab userdata keys loaded.")
except (ImportError, ModuleNotFoundError):
    # google.colab is not importable, so this is not a Colab runtime.
    print("Not in Colab. Trying environment variables...")
    ALPHAVANTAGE_KEY = os.environ.get('ALPHAVANTAGE_KEY')
    NEWSAPI_KEY = os.environ.get('NEWSAPI_KEY')
    # os.environ.get returns None for an unset variable; an empty value is
    # rejected as well.
    if not ALPHAVANTAGE_KEY or not NEWSAPI_KEY:
        print("âœ— FATAL ERROR: API keys not found in environment variables.")
        print("  Please set ALPHAVANTAGE_KEY and NEWSAPI_KEY.")
        sys.exit(1)
    print("âœ“ Environment variable keys loaded.")
except Exception as e:
    # Any other failure (for example while reading a Colab secret) is fatal;
    # the environment-variable fallback is not attempted in that case.
    print(f"âœ— FATAL ERROR: Could not load secrets. {e}")
    sys.exit(1)

# === DATA FETCHING CLASSES (Copied from your notebook) ===

class StockPriceAPI:
    """Fetch intraday OHLCV bars for a ticker from the Alpha Vantage REST API."""

    def __init__(self, api_key: str):
        """Store the API key; only its last four characters are echoed."""
        self.api_key = api_key
        self.base_url = "https://www.alphavantage.co/query"
        print(f"âœ“ Alpha Vantage initialized (API key: ...{api_key[-4:]})")

    @staticmethod
    def _to_av_interval(interval: str) -> str:
        """Translate a short interval such as '5m' into Alpha Vantage's '5min'.

        Values already in the 'Nmin' form are returned untouched; a blanket
        ``str.replace('m', 'min')`` would turn '5min' into '5minin'.
        """
        interval = str(interval).strip()
        if interval.endswith('min'):
            return interval
        if interval.endswith('m'):
            return interval[:-1] + 'min'
        return interval

    def get_intraday_quotes(self, ticker: str, interval='5m', period='7d') -> pd.DataFrame:
        """Return intraday bars for ``ticker`` as a DataFrame.

        Args:
            ticker: Symbol to query.
            interval: Bar size, either short ('5m') or Alpha Vantage style ('5min').
            period: '1d' requests the 'compact' output size; any other value
                requests 'full'.

        Returns:
            A frame with ``timestamp`` (UTC), ``ticker``, ``open``, ``high``,
            ``low``, ``close`` and ``volume`` sorted by timestamp, or an empty
            DataFrame when the request fails or the API returns an
            error/notice instead of data. Failures are printed, never raised.
        """
        print(f"  ... [Alpha Vantage] Waiting 13s to respect 5 calls/min rate limit...")
        time.sleep(13)
        av_interval = self._to_av_interval(interval)

        # Any period other than '1d' asks for the full history.
        av_outputsize = 'compact' if period == '1d' else 'full'
        print(f"  ... Requesting 'outputsize={av_outputsize}'")

        params = {
            'function': 'TIME_SERIES_INTRADAY', 'symbol': ticker, 'interval': av_interval,
            'outputsize': av_outputsize, 'apikey': self.api_key, 'datatype': 'json'
        }
        try:
            response = requests.get(self.base_url, params=params, timeout=20)
            response.raise_for_status()
            data = response.json()
            if 'Error Message' in data:
                print(f"âœ— [Alpha Vantage] API Error for {ticker}: {data['Error Message']}")
                return pd.DataFrame()
            if 'Note' in data:
                print(f"âœ— [Alpha Vantage] API Note for {ticker}: {data['Note']}")
                return pd.DataFrame()
            data_key = next((key for key in data if 'Time Series' in key), None)
            if data_key is None:
                if 'Information' in data:
                    # Throttling / plan notices arrive under a top-level 'Information' key.
                    print(f"âœ— [Alpha Vantage] API Information for {ticker}: {data['Information']}")
                else:
                    print(f"âœ— [Alpha Vantage] Could not find 'Time Series' data key for {ticker}.")
                return pd.DataFrame()

            df = pd.DataFrame.from_dict(data[data_key], orient='index')
            if df.empty:
                return pd.DataFrame()

            df = df.reset_index().rename(columns={
                'index': 'timestamp', '1. open': 'open', '2. high': 'high',
                '3. low': 'low', '4. close': 'close', '5. volume': 'volume'
            })
            df['timestamp'] = pd.to_datetime(df['timestamp'])
            for col in ['open', 'high', 'low', 'close', 'volume']:
                df[col] = pd.to_numeric(df[col])
            df['ticker'] = ticker
            try:
                # The payload's timestamps are naive; they are interpreted as
                # New York time and converted to UTC.
                df['timestamp'] = df['timestamp'].dt.tz_localize('America/New_York').dt.tz_convert('UTC')
            except Exception as e:
                # Best-effort fallback kept from the original behaviour, but no
                # longer silent: the resulting timestamps may be shifted.
                print(f"âœ— [Alpha Vantage] Could not localize timestamps for {ticker} ({e}); treating them as UTC.")
                df['timestamp'] = df['timestamp'].dt.tz_localize('UTC')
            df = df[['timestamp', 'ticker', 'open', 'high', 'low', 'close', 'volume']]
            return df.sort_values('timestamp').reset_index(drop=True)
        except Exception as e:
            # Boundary handler: callers rely on an empty frame instead of an exception.
            print(f"âœ— Error fetching Alpha Vantage data for {ticker}: {e}")
            return pd.DataFrame()

class NewsAPI:
    """Client for the NewsAPI ``/v2/everything`` search endpoint."""

    def __init__(self, api_key: str):
        """Remember the key and echo its last four characters."""
        self.api_key = api_key
        print(f"âœ“ NewsAPI initialized (API key: ...{api_key[-4:]})")

    def get_news(self, ticker: str, company_name: str = None, days_back: int = 7) -> pd.DataFrame:
        """Search recent English articles for a company.

        The search term is ``company_name`` when given, otherwise ``ticker``.
        Articles without a title are dropped. The result is sorted by
        publication time (UTC); any failure is printed and yields an empty
        DataFrame.
        """
        url = 'https://newsapi.org/v2/everything'
        search_term = company_name or ticker
        earliest = datetime.now() - timedelta(days=days_back)
        params = {
            'q': search_term,
            'from': earliest.strftime('%Y-%m-%d'),
            'sortBy': 'publishedAt',
            'language': 'en',
            'apiKey': self.api_key,
            'pageSize': 100,
        }
        try:
            response = requests.get(url, params=params, timeout=10)
            response.raise_for_status()
            data = response.json()
            if data['status'] != 'ok':
                raise ValueError(f"API Error: {data.get('message', 'Unknown error')}")

            records = []
            for article in data.get('articles', []):
                if not article.get('title'):
                    continue
                records.append({
                    'timestamp': pd.to_datetime(article['publishedAt']).tz_convert('UTC'),
                    'headline': article['title'],
                    'description': article.get('description', ''),
                    'source': article['source']['name'],
                    'url': article['url'],
                    'ticker': ticker,
                })

            if not records:
                return pd.DataFrame()
            frame = pd.DataFrame(records)
            return frame.sort_values('timestamp').reset_index(drop=True)
        except Exception as e:
            print(f"âœ— Error fetching news for {ticker}: {e}")
            return pd.DataFrame()

class IncrementalDataStorage:
    """Keep price and news records in master CSV files, merging without duplicates."""

    def __init__(self, data_dir='data/raw'):
        """Create the data directory and its 'daily_backups' sub-directory."""
        self.data_dir = data_dir
        os.makedirs(data_dir, exist_ok=True)
        self.prices_master_file = os.path.join(data_dir, 'prices_master.csv')
        self.news_master_file = os.path.join(data_dir, 'news_master.csv')
        self.backup_dir = os.path.join(data_dir, 'daily_backups')
        os.makedirs(self.backup_dir, exist_ok=True)

    def load_master_data(self) -> Dict[str, pd.DataFrame]:
        """Read both master files; a missing file yields an empty DataFrame."""
        sources = (
            ('prices', 'price', self.prices_master_file),
            ('news', 'news', self.news_master_file),
        )
        loaded = {}
        for key, label, path in sources:
            if not os.path.exists(path):
                print(f"â„¹ No existing {label} data found (starting fresh)")
                loaded[key] = pd.DataFrame()
                continue
            table = pd.read_csv(path)
            table['timestamp'] = pd.to_datetime(table['timestamp'])
            print(f"âœ“ Loaded {len(table):,} existing {label} records")
            loaded[key] = table
        return loaded

    def append_new_data(self, new_data: Dict[str, pd.DataFrame]):
        """Merge new prices and news into the master files.

        Prices are deduplicated on (timestamp, ticker) keeping the last row;
        news on (headline, ticker) keeping the first row. Empty inputs leave
        the corresponding file untouched.
        """
        stored = self.load_master_data()
        merge_plan = (
            ('prices', self.prices_master_file, ['timestamp', 'ticker'], 'last', 'price'),
            ('news', self.news_master_file, ['headline', 'ticker'], 'first', 'news'),
        )
        for key, path, unique_on, keep, label in merge_plan:
            incoming = new_data[key]
            if incoming.empty:
                continue
            merged = pd.concat([stored[key], incoming], ignore_index=True)
            merged = merged.drop_duplicates(subset=unique_on, keep=keep)
            merged = merged.sort_values(['ticker', 'timestamp']).reset_index(drop=True)
            merged.to_csv(path, index=False)
            print(f"âœ“ Saved {len(merged):,} total {label} records (added {len(incoming)} new)")

    def get_statistics(self) -> dict:
        """Return record counts, plus tickers and date range when prices exist."""
        data = self.load_master_data()
        prices = data['prices']
        stats = {
            'total_price_records': len(prices),
            'total_news_articles': len(data['news']),
            'tickers': [],
            'date_range': None,
        }
        if prices.empty:
            return stats
        stats['tickers'] = prices['ticker'].unique().tolist()
        stats['date_range'] = {'start': prices['timestamp'].min(), 'end': prices['timestamp'].max()}
        stats['records_per_ticker'] = prices.groupby('ticker').size().to_dict()
        return stats

class DataCollector:
    """Coordinates one collection pass: fetch prices and news per ticker, then persist them."""

    def __init__(self, price_api: StockPriceAPI, news_api: NewsAPI, storage: IncrementalDataStorage):
        # Collaborators are injected; this class only orchestrates calls to them.
        self.price_api = price_api
        self.news_api = news_api
        self.storage = storage

    def collect_and_store(self, tickers: List[str], interval='5m', period='1d', news_days_back=1):
        """Fetch prices and news for each ticker and hand the batch to storage.

        Args:
            tickers: Symbols to collect, processed in the given order.
            interval: Forwarded to ``price_api.get_intraday_quotes``.
            period: Forwarded to ``price_api.get_intraday_quotes``.
            news_days_back: Forwarded to ``news_api.get_news``.

        Returns:
            ``{'prices': DataFrame, 'news': DataFrame}`` holding everything
            fetched in this call (an empty DataFrame where nothing came back).
            The same dict is passed to ``storage.append_new_data``.
        """
        # Company name used as the news search term; unknown tickers fall back
        # to the ticker symbol itself.
        display_names = {'AAPL': 'Apple', 'MSFT': 'Microsoft', 'GOOGL': 'Google', 'AMZN': 'Amazon', 'TSLA': 'Tesla'}
        price_frames = []
        news_frames = []

        for symbol in tickers:
            print(f"\nðŸ“Š Collecting data for {symbol}...")

            quotes = self.price_api.get_intraday_quotes(symbol, interval, period)
            if not quotes.empty:
                price_frames.append(quotes)
                print(f"  âœ“ {len(quotes)} price records")

            search_name = display_names.get(symbol, symbol)
            articles = self.news_api.get_news(symbol, search_name, news_days_back)
            if not articles.empty:
                news_frames.append(articles)
                print(f"  âœ“ {len(articles)} news articles")

        if price_frames:
            merged_prices = pd.concat(price_frames, ignore_index=True)
        else:
            merged_prices = pd.DataFrame()

        if news_frames:
            merged_news = pd.concat(news_frames, ignore_index=True)
        else:
            merged_news = pd.DataFrame()

        batch = {'prices': merged_prices, 'news': merged_news}
        self.storage.append_new_data(batch)
        return batch

# === MAIN EXECUTION ===
def run_backfill(tickers: Optional[List[str]] = None, period: str = '30d', news_days_back: int = 30):
    """Run a one-time historical backfill into the master data files.

    Args:
        tickers: Symbols to backfill. Defaults to AAPL, MSFT, GOOGL, TSLA
            and AMZN when omitted.
        period: Price history window passed to the collector. The default
            '30d' is deliberately not '1d'; per the original author's note
            that makes StockPriceAPI request 'outputsize=full'.
        news_days_back: How many days of news to request, chosen to match
            the price history window.

    Prints record counts before and after the run, how many rows this run
    fetched, and a warning when no price rows were fetched at all (the
    failed requests are easy to miss among the per-ticker log lines).
    """
    if tickers is None:
        tickers = ['AAPL', 'MSFT', 'GOOGL', 'TSLA', 'AMZN']

    # Initialize components
    price_api = StockPriceAPI(ALPHAVANTAGE_KEY)
    news_api = NewsAPI(NEWSAPI_KEY)
    storage = IncrementalDataStorage('data/raw')
    collector = DataCollector(price_api, news_api, storage)

    print("\n" + "="*70)
    print("  RUNNING ONE-TIME DATA BACKFILL")
    print("  This will take > 1 minute...")
    print("="*70)

    stats_before = storage.get_statistics()
    print(f"  Existing price records: {stats_before['total_price_records']:,}")
    print(f"  Existing news articles: {stats_before['total_news_articles']:,}")

    new_data = collector.collect_and_store(
        tickers=tickers,
        interval='5m',
        period=period,
        news_days_back=news_days_back,
    )

    print("\n" + "="*70)
    print("  âœ… BACKFILL COMPLETE!")
    print("="*70)

    # Make it obvious when the run "completed" without actually fetching
    # anything, e.g. because every price request was rejected.
    print(f"  Fetched this run: {len(new_data['prices']):,} price rows, {len(new_data['news']):,} news rows")
    if new_data['prices'].empty:
        print("  WARNING: no price data was fetched; check the Alpha Vantage messages above.")

    stats_after = storage.get_statistics()
    print(f"  Total price records: {stats_after['total_price_records']:,}")
    print(f"  Total news articles: {stats_after['total_news_articles']:,}")
    if stats_after.get('date_range'):
        print(f"  New date range: {stats_after['date_range']['start']} to {stats_after['date_range']['end']}")

# Entry point: start the backfill only when this code is executed directly
# (e.g. as a script or notebook cell), not when it is imported as a module.
if __name__ == "__main__":
    run_backfill()

✓ Colab userdata keys loaded.
✓ Alpha Vantage initialized (API key: ...Z82S)
✓ NewsAPI initialized (API key: ...d0ea)

  RUNNING ONE-TIME DATA BACKFILL
  This will take > 1 minute...
✓ Loaded 16,996 existing price records
✓ Loaded 669 existing news records
  Existing price records: 16,996
  Existing news articles: 669

📊 Collecting data for AAPL...
  ... [Alpha Vantage] Waiting 13s to respect 5 calls/min rate limit...
  ... Requesting 'outputsize=full'
✗ [Alpha Vantage] Could not find 'Time Series' data key for AAPL.
  ✓ 96 news articles

📊 Collecting data for MSFT...
  ... [Alpha Vantage] Waiting 13s to respect 5 calls/min rate limit...
  ... Requesting 'outputsize=full'
✗ [Alpha Vantage] Could not find 'Time Series' data key for MSFT.
  ✓ 100 news articles

📊 Collecting data for GOOGL...
  ... [Alpha Vantage] Waiting 13s to respect 5 calls/min rate limit...
  ... Requesting 'outputsize=full'
✗ [Alpha Vantage] Could not find 'Time Series' data key for GOOG

In [None]:
import pandas as pd
import time
import json
import requests
import torch
import sys
from transformers import AutoTokenizer, AutoModelForSequenceClassification
from torch.nn.functional import softmax

# --- Configuration ---
# Input CSV of collected news; main() requires it to have a 'headline' column.
# NOTE(review): presumably the news master file written into data/raw by the
# data-collection cell -- confirm the filename matches the storage class.
NEWS_FILE = 'data/raw/news_master.csv'
# Output CSV: the input rows plus a 'sentiment' column. The path is relative,
# so the file lands in the current working directory.
OUTPUT_FILE = 'news_with_finbert_sentiment.csv'

# Hugging Face model id handed to from_pretrained() in main().
MODEL_NAME = "ProsusAI/finbert"

# --- Setup (run this in a Colab cell first) ---
# !pip install transformers torch

@torch.no_grad()  # Inference only: skip gradient bookkeeping for speed
def get_finbert_sentiment(headlines: list, tokenizer, model) -> list:
    """Score a batch of headlines with a FinBERT-style classifier.

    Args:
        headlines: List of headline strings. An empty list returns ``[]``
            without calling the tokenizer or the model.
        tokenizer: Callable tokenizer; invoked with padding, truncation to
            512 tokens and PyTorch tensor output.
        model: Sequence-classification model whose output exposes ``logits``
            with one row per headline.

    Returns:
        One float per headline: P(positive) - P(negative), ranging from
        -1.0 (very negative) to 1.0 (very positive). The neutral probability
        is not used.
    """
    # Tokenizers reject an empty batch, so answer it directly.
    if len(headlines) == 0:
        return []

    # 1. Tokenize the headlines
    inputs = tokenizer(headlines, padding=True, truncation=True, return_tensors='pt', max_length=512)

    # 2. Run headlines through the model
    outputs = model(**inputs)

    # 3. Turn logits into per-class probabilities
    probabilities = softmax(outputs.logits, dim=1)

    # 4. Locate the positive/negative columns. Prefer the model's own label
    # map so a checkpoint with a different class order is scored correctly;
    # fall back to the [positive, negative, neutral] order this code was
    # written for when no usable map is available.
    label2id = getattr(getattr(model, 'config', None), 'label2id', None) or {}
    label_index = {str(label).lower(): index for label, index in label2id.items()}
    positive_idx = label_index.get('positive', 0)
    negative_idx = label_index.get('negative', 1)

    # 5. Collapse to a single compound score per headline
    compound_scores = (probabilities[:, positive_idx] - probabilities[:, negative_idx]).tolist()

    return compound_scores

def main():
    """Score every headline in NEWS_FILE with FinBERT and save the result.

    Reads NEWS_FILE, scores each distinct non-null headline once (in batches
    of 32), left-merges the scores back onto the original rows as a
    ``sentiment`` column and writes the result to OUTPUT_FILE. Rows whose
    headline is missing end up with a NaN sentiment. Prints a message and
    returns early if the input file or its ``headline`` column is missing.
    """
    print(f"Loading headlines from {NEWS_FILE}...")
    try:
        df = pd.read_csv(NEWS_FILE)
    except FileNotFoundError:
        print(f"FATAL: {NEWS_FILE} not found. Did you upload it?")
        print("Make sure the file exists at exactly that path.")
        return

    if 'headline' not in df.columns:
        print(f"FATAL: {NEWS_FILE} has no 'headline' column.")
        return

    # To save time, score each distinct headline once. Missing headlines are
    # dropped here because the tokenizer cannot handle NaN values.
    unique_headlines = df[['headline']].dropna().drop_duplicates().reset_index(drop=True)
    print(f"Found {len(unique_headlines)} unique headlines to process.")

    # --- Load FinBERT Model ---
    print(f"Loading FinBERT model ({MODEL_NAME})... (This may take a moment)")
    tokenizer = AutoTokenizer.from_pretrained(MODEL_NAME)
    model = AutoModelForSequenceClassification.from_pretrained(MODEL_NAME)
    print("âœ“ Model loaded.")

    # Process in batches for efficiency
    batch_size = 32
    headlines = unique_headlines['headline'].tolist()
    # Ceiling division: an exact multiple of batch_size must not report an
    # extra, non-existent batch.
    total_batches = (len(headlines) + batch_size - 1) // batch_size
    sentiment_scores = []

    start_time = time.time()
    for batch_number, start in enumerate(range(0, len(headlines), batch_size), start=1):
        batch_headlines = headlines[start:start + batch_size]

        print(f"  Processing batch {batch_number}/{total_batches}...")
        scores = get_finbert_sentiment(batch_headlines, tokenizer, model)
        sentiment_scores.extend(scores)

    end_time = time.time()
    print(f"\nProcessing complete. Took {end_time - start_time:.2f} seconds.")

    # Create a DataFrame from the results (one row per distinct headline)
    sentiment_df = pd.DataFrame({
        'headline': unique_headlines['headline'],
        'sentiment': sentiment_scores
    })

    # Merge the new sentiment data back into the original news rows; the left
    # join keeps every input row, including ones without a score.
    print("Merging sentiment data back into main news file...")
    final_df = pd.merge(df, sentiment_df, on='headline', how='left')

    # Save the new, enriched file
    final_df.to_csv(OUTPUT_FILE, index=False)
    print(f"âœ“ Success! New file saved to {OUTPUT_FILE}")

# Entry point: run the scoring pipeline only when this code is executed
# directly (e.g. as a script or notebook cell), not when it is imported.
if __name__ == "__main__":
    main()

Loading headlines from data/raw/news_master.csv...
Found 659 unique headlines to process.
Loading FinBERT model (ProsusAI/finbert)... (This may take a moment)
✓ Model loaded.
  Processing batch 1/21...
  Processing batch 2/21...
  Processing batch 3/21...
  Processing batch 4/21...
  Processing batch 5/21...
  Processing batch 6/21...
  Processing batch 7/21...
  Processing batch 8/21...
  Processing batch 9/21...
  Processing batch 10/21...
  Processing batch 11/21...
  Processing batch 12/21...
  Processing batch 13/21...
  Processing batch 14/21...
  Processing batch 15/21...
  Processing batch 16/21...
  Processing batch 17/21...
  Processing batch 18/21...
  Processing batch 19/21...
  Processing batch 20/21...
  Processing batch 21/21...

Processing complete. Took 82.08 seconds.
Merging sentiment data back into main news file...
✓ Success! New file saved to news_with_finbert_sentiment.csv
