<a href="https://colab.research.google.com/github/Prithwi13/6302_stock/blob/main/6302_final_project.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
"""
Financial Data Collection with INCREMENTAL STORAGE
Properly accumulates data over 7 days without overwriting
"""

import os
import requests
import pandas as pd
import yfinance as yf
from datetime import datetime, timedelta
from typing import List, Dict, Optional
import time
import glob

class StockPriceAPI:
    """Fetch OHLCV price bars through yfinance (no API key required).

    Non-empty frames returned by this class always have the columns
    ``timestamp, ticker, open, high, low, close, volume`` with
    ``timestamp`` expressed in UTC.
    """

    # Column layout (and order) of every non-empty frame this class returns.
    _COLUMNS = ['timestamp', 'ticker', 'open', 'high', 'low', 'close', 'volume']

    def __init__(self):
        print("âœ“ yfinance initialized (no API key required)")

    @staticmethod
    def _normalize_history(history: pd.DataFrame, ticker: str) -> pd.DataFrame:
        """Convert a yfinance-style history frame into the project schema.

        Args:
            history: Frame indexed by bar time with ``Open``/``High``/``Low``/
                ``Close``/``Volume`` columns (extra columns are dropped).
            ticker: Symbol written into the ``ticker`` column of every row.

        Returns:
            A new frame with the ``_COLUMNS`` layout and UTC timestamps.

        Raises:
            KeyError: If an expected column is missing from ``history``.
        """
        frame = history.reset_index().rename(columns={
            # yfinance names the index 'Datetime' for intraday bars and
            # 'Date' for daily-or-longer bars; accept both so that e.g.
            # interval='1d' no longer fails on a missing 'timestamp' column.
            'Datetime': 'timestamp',
            'Date': 'timestamp',
            'Open': 'open',
            'High': 'high',
            'Low': 'low',
            'Close': 'close',
            'Volume': 'volume',
        })
        frame['ticker'] = ticker

        # Copy so the timestamp assignment below never writes into a view.
        frame = frame[StockPriceAPI._COLUMNS].copy()

        # Naive timestamps are labelled as UTC; aware ones are converted.
        stamps = pd.to_datetime(frame['timestamp'])
        if stamps.dt.tz is None:
            stamps = stamps.dt.tz_localize('UTC')
        else:
            stamps = stamps.dt.tz_convert('UTC')
        frame['timestamp'] = stamps
        return frame

    def get_intraday_quotes(self, ticker: str, interval='5m', period='7d') -> pd.DataFrame:
        """Fetch price bars for ``ticker`` using yfinance.

        Args:
            ticker: Symbol to query.
            interval: Bar size passed to ``Ticker.history`` (default ``'5m'``).
            period: Look-back window passed to ``Ticker.history``.

        Returns:
            A frame in the project schema, or an empty ``DataFrame`` when
            yfinance returns no rows or any error occurs (the error is
            printed, not raised, so one bad ticker cannot abort a run).
        """
        try:
            history = yf.Ticker(ticker).history(period=period, interval=interval)

            if history.empty:
                print(f"âš  No data returned for {ticker}")
                return pd.DataFrame()

            return self._normalize_history(history, ticker)

        except Exception as e:
            # Deliberate best-effort boundary: report and carry on.
            print(f"âœ— Error fetching yfinance data for {ticker}: {e}")
            return pd.DataFrame()


class NewsAPI:
    """Fetch financial news articles from the NewsAPI ``/v2/everything`` endpoint."""

    def __init__(self, api_key: str):
        self.api_key = api_key

    @staticmethod
    def _articles_to_frame(articles: List[dict], ticker: str) -> pd.DataFrame:
        """Turn raw article dicts into a frame sorted by publication time.

        Articles without a title or without a parseable ``publishedAt`` are
        skipped individually, so one malformed entry no longer discards the
        whole batch. Missing ``description`` becomes ``''``; missing
        ``source``/``url`` become ``None``.

        Args:
            articles: Items of the ``articles`` list from the API response.
            ticker: Symbol written into the ``ticker`` column of every row.

        Returns:
            Frame with columns ``timestamp, headline, description, source,
            url, ticker`` (timestamps in UTC), or an empty ``DataFrame``
            when nothing usable was supplied.
        """
        records = []
        for article in articles or []:
            title = article.get('title')
            if not title:
                continue

            # utc=True converts aware values and labels naive ones as UTC,
            # where the previous tz_convert call raised on naive input.
            published = pd.to_datetime(article.get('publishedAt'), utc=True, errors='coerce')
            if pd.isna(published):
                continue

            source = article.get('source') or {}
            records.append({
                'timestamp': published,
                'headline': title,
                # .get(key, '') still yields None when the key holds null.
                'description': article.get('description') or '',
                'source': source.get('name'),
                'url': article.get('url'),
                'ticker': ticker,
            })

        if not records:
            return pd.DataFrame()
        return pd.DataFrame(records).sort_values('timestamp').reset_index(drop=True)

    def get_news(self, ticker: str, company_name: Optional[str] = None, days_back: int = 7) -> pd.DataFrame:
        """Fetch recent English-language articles for a ticker.

        Args:
            ticker: Symbol stored with each article; also the search query
                when ``company_name`` is not given.
            company_name: Optional search phrase used instead of the ticker.
            days_back: How many days before now the ``from`` date is set to.

        Returns:
            Frame of articles sorted by timestamp, or an empty ``DataFrame``
            when there are no articles or the request fails (the error is
            printed, not raised).
        """
        url = 'https://newsapi.org/v2/everything'
        query = company_name if company_name else ticker
        from_date = (datetime.now() - timedelta(days=days_back)).strftime('%Y-%m-%d')

        params = {
            'q': query,
            'from': from_date,
            'sortBy': 'publishedAt',
            'language': 'en',
            'apiKey': self.api_key,
            'pageSize': 100
        }

        try:
            response = requests.get(url, params=params, timeout=10)
            response.raise_for_status()
            data = response.json()

            if data.get('status') != 'ok':
                raise ValueError(f"API Error: {data.get('message', 'Unknown error')}")

            return self._articles_to_frame(data.get('articles', []), ticker)

        except Exception as e:
            # Deliberate best-effort boundary: report and carry on.
            print(f"âœ— Error fetching news for {ticker}: {e}")
            return pd.DataFrame()


class IncrementalDataStorage:
    """Accumulate price and news data in CSV master files across runs.

    Each call to :meth:`append_new_data` merges freshly collected rows into
    ``prices_master.csv`` / ``news_master.csv`` inside ``data_dir``,
    de-duplicating so repeated runs never overwrite or double-count history.
    """

    def __init__(self, data_dir='data/raw'):
        self.data_dir = data_dir
        os.makedirs(data_dir, exist_ok=True)

        # Master files that accumulate ALL data
        self.prices_master_file = os.path.join(data_dir, 'prices_master.csv')
        self.news_master_file = os.path.join(data_dir, 'news_master.csv')

        # Daily backup folder
        self.backup_dir = os.path.join(data_dir, 'daily_backups')
        os.makedirs(self.backup_dir, exist_ok=True)

    @staticmethod
    def _read_master(path: str, label: str) -> pd.DataFrame:
        """Read one master CSV, or return an empty frame if it is absent.

        ``label`` ("price" or "news") only affects the progress message.
        """
        if not os.path.exists(path):
            print(f"â„¹ No existing {label} data found (starting fresh)")
            return pd.DataFrame()

        frame = pd.read_csv(path)
        # utc=True keeps reloaded timestamps tz-aware so they compare and
        # de-duplicate correctly against freshly fetched (UTC) rows.
        frame['timestamp'] = pd.to_datetime(frame['timestamp'], utc=True)
        print(f"âœ“ Loaded {len(frame):,} existing {label} records")
        return frame

    @staticmethod
    def _merge(existing: pd.DataFrame, incoming: pd.DataFrame,
               subset: List[str], keep: str) -> pd.DataFrame:
        """Concatenate, drop duplicates on ``subset``, sort by ticker/time."""
        if existing.empty:
            combined = incoming
        else:
            combined = pd.concat([existing, incoming], ignore_index=True)
        combined = combined.drop_duplicates(subset=subset, keep=keep)
        return combined.sort_values(['ticker', 'timestamp']).reset_index(drop=True)

    def load_master_data(self) -> Dict[str, pd.DataFrame]:
        """Load the accumulated data.

        Returns:
            ``{'prices': DataFrame, 'news': DataFrame}``; a frame is empty
            when its master file does not exist yet.
        """
        return {
            'prices': self._read_master(self.prices_master_file, 'price'),
            'news': self._read_master(self.news_master_file, 'news'),
        }

    def append_new_data(self, new_data: Dict[str, pd.DataFrame]):
        """Merge new rows into the master files, de-duplicating as it goes.

        Prices are unique per ``(timestamp, ticker)`` and the newest row
        wins; news is unique per ``(headline, ticker)`` and the first-seen
        row wins. A master file is rewritten only when new rows for it were
        supplied.

        Args:
            new_data: Mapping with optional ``'prices'`` and ``'news'``
                frames; missing or empty entries are skipped.

        Returns:
            ``{'prices': int, 'news': int}`` - how many rows each master
            file actually grew by (duplicates of stored rows count as 0).
        """
        existing = self.load_master_data()
        added = {'prices': 0, 'news': 0}

        # === PRICES ===
        new_prices = new_data.get('prices')
        if new_prices is not None and not new_prices.empty:
            combined_prices = self._merge(
                existing['prices'], new_prices,
                subset=['timestamp', 'ticker'],
                keep='last',  # Keep newest data
            )
            combined_prices.to_csv(self.prices_master_file, index=False)
            # Report net growth, not the size of the fetched batch: re-running
            # on the same day fetches many rows that are already stored.
            added['prices'] = max(len(combined_prices) - len(existing['prices']), 0)
            print(f"âœ“ Saved {len(combined_prices):,} total price records (added {added['prices']} new)")

        # === NEWS ===
        new_news = new_data.get('news')
        if new_news is not None and not new_news.empty:
            combined_news = self._merge(
                existing['news'], new_news,
                subset=['headline', 'ticker'],
                keep='first',  # Keep oldest (first seen)
            )
            combined_news.to_csv(self.news_master_file, index=False)
            added['news'] = max(len(combined_news) - len(existing['news']), 0)
            print(f"âœ“ Saved {len(combined_news):,} total news records (added {added['news']} new)")

        return added

    def create_daily_backup(self):
        """Write a date-stamped copy of each existing master file to ``backup_dir``.

        The file name carries only the date (``YYYYMMDD``), so a second
        backup on the same day replaces the first.
        """
        timestamp = datetime.now().strftime('%Y%m%d')

        for master_file, prefix in (
            (self.prices_master_file, 'prices'),
            (self.news_master_file, 'news'),
        ):
            if os.path.exists(master_file):
                backup_file = os.path.join(self.backup_dir, f'{prefix}_backup_{timestamp}.csv')
                pd.read_csv(master_file).to_csv(backup_file, index=False)
                print(f"âœ“ Created backup: {backup_file}")

    def get_statistics(self) -> dict:
        """Summarise the accumulated data.

        Returns:
            Dict with ``total_price_records``, ``total_news_articles``,
            ``tickers`` (list), ``date_range`` (``None`` or a dict with
            ``start``/``end``), ``trading_days`` (distinct calendar dates
            among price timestamps) and ``records_per_ticker``. The last
            four are derived from the price data only.
        """
        data = self.load_master_data()
        prices = data['prices']

        stats = {
            'total_price_records': len(prices),
            'total_news_articles': len(data['news']),
            'tickers': [],
            'date_range': None,
            'trading_days': 0,
            'records_per_ticker': {}
        }

        if not prices.empty:
            stats['tickers'] = prices['ticker'].unique().tolist()
            stats['date_range'] = {
                'start': prices['timestamp'].min(),
                'end': prices['timestamp'].max()
            }
            stats['trading_days'] = prices['timestamp'].dt.date.nunique()
            stats['records_per_ticker'] = prices.groupby('ticker').size().to_dict()

        return stats


class DataCollector:
    """Coordinates the price API, the news API and the incremental storage."""

    def __init__(self, price_api: StockPriceAPI, news_api: NewsAPI, storage: IncrementalDataStorage):
        self.price_api, self.news_api, self.storage = price_api, news_api, storage

    def collect_and_store(self, tickers: List[str], interval='5m', period='1d', news_days_back=1):
        """
        Fetch prices and news for each ticker, then append them to storage.

        Tickers are processed in order with a half-second pause after each.
        Returns a dict with the combined 'prices' and 'news' frames that
        were fetched in this run (either may be empty).

        NOTE: Use period='1d' for daily runs to get only new data!
        """
        # Search phrase used for news; unknown tickers fall back to the symbol.
        search_names = dict(
            AAPL='Apple',
            MSFT='Microsoft',
            GOOGL='Google',
            AMZN='Amazon',
            TSLA='Tesla',
            NVDA='NVIDIA',
            META='Meta',
        )

        price_frames, news_frames = [], []

        for symbol in tickers:
            print(f"\nðŸ“Š Collecting data for {symbol}...")

            # Price bars for the requested period/interval.
            quotes = self.price_api.get_intraday_quotes(symbol, interval, period)
            if not quotes.empty:
                price_frames.append(quotes)
                print(f"  âœ“ {len(quotes)} price records")

            # News, queried by company name where one is known.
            articles = self.news_api.get_news(
                symbol, search_names.get(symbol, symbol), news_days_back
            )
            if not articles.empty:
                news_frames.append(articles)
                print(f"  âœ“ {len(articles)} news articles")

            time.sleep(0.5)

        # Stack the per-ticker frames; an empty list becomes an empty frame.
        if price_frames:
            fetched_prices = pd.concat(price_frames, ignore_index=True)
        else:
            fetched_prices = pd.DataFrame()

        if news_frames:
            fetched_news = pd.concat(news_frames, ignore_index=True)
        else:
            fetched_news = pd.DataFrame()

        new_data = {'prices': fetched_prices, 'news': fetched_news}

        # Merge into the master files.
        self.storage.append_new_data(new_data)

        return new_data


# ============================================================================
# MAIN EXECUTION
# ============================================================================

def main():
    """Run one incremental collection pass and print before/after totals.

    The NewsAPI key is read from the ``NEWSAPI_KEY`` environment variable
    when it is set; otherwise the legacy in-source key is used so existing
    setups keep working.
    """
    rule = "=" * 70

    def banner(title):
        # Blank line, rule, title, rule - the header used for every section.
        print("\n" + rule)
        print(title)
        print(rule)

    banner("  FINANCIAL SENTIMENT ANALYSIS - INCREMENTAL DATA COLLECTION")

    # Configuration
    TICKERS = ['AAPL', 'MSFT', 'GOOGL', 'TSLA', 'AMZN']
    # SECURITY: a credential committed to source control should be treated as
    # leaked. Prefer the environment variable; rotate this key and delete the
    # fallback literal once every runner sets NEWSAPI_KEY.
    NEWSAPI_KEY = os.environ.get('NEWSAPI_KEY') or 'ad3d858be2a34d4e82ac459ada5bd0ea'

    # Initialize components
    price_api = StockPriceAPI()
    news_api = NewsAPI(NEWSAPI_KEY)
    storage = IncrementalDataStorage('data/raw')
    collector = DataCollector(price_api, news_api, storage)

    # Show current statistics BEFORE collection
    banner("  BEFORE COLLECTION")
    stats_before = storage.get_statistics()
    print(f"  Existing price records: {stats_before['total_price_records']:,}")
    print(f"  Existing news articles: {stats_before['total_news_articles']:,}")
    if stats_before['date_range']:
        print(f"  Date range: {stats_before['date_range']['start']} to {stats_before['date_range']['end']}")
        print(f"  Trading days collected: {stats_before['trading_days']}")

    # Collect TODAY's data and append
    banner("  COLLECTING TODAY'S DATA")
    collector.collect_and_store(
        tickers=TICKERS,
        interval='5m',
        period='1d',  # IMPORTANT: Only get today's data!
        news_days_back=1  # Only today's news
    )

    # Show statistics AFTER collection
    banner("  AFTER COLLECTION")
    stats_after = storage.get_statistics()
    print(f"  Total price records: {stats_after['total_price_records']:,}")
    print(f"  Total news articles: {stats_after['total_news_articles']:,}")
    if stats_after['date_range']:
        print(f"  Date range: {stats_after['date_range']['start']} to {stats_after['date_range']['end']}")
        print(f"  Trading days collected: {stats_after['trading_days']}")

    print("\n  Records per ticker:")
    for ticker, count in stats_after['records_per_ticker'].items():
        print(f"    â€¢ {ticker}: {count:,} records")

    # Create daily backup
    storage.create_daily_backup()

    banner("  âœ… COLLECTION COMPLETE!")
    print("\n  Master files:")
    print("    â€¢ data/raw/prices_master.csv")
    print("    â€¢ data/raw/news_master.csv")
    print("\n  Daily backups:")
    print("    â€¢ data/raw/daily_backups/")
    print("\n  ðŸ’¡ Run this script daily for 5-7 days to accumulate data!")
    print(rule + "\n")


def view_data():
    """Print a summary and a few sample rows of the accumulated data."""
    store = IncrementalDataStorage('data/raw')
    frames = store.load_master_data()
    divider = "=" * 70

    print("\n" + divider)
    print("  ACCUMULATED DATA VIEWER")
    print(divider)

    summary = store.get_statistics()
    ticker_list = ', '.join(summary['tickers'])
    print(f"\n  Total Records:")
    print(f"    â€¢ Prices: {summary['total_price_records']:,}")
    print(f"    â€¢ News: {summary['total_news_articles']:,}")
    print(f"    â€¢ Tickers: {ticker_list}")
    print(f"    â€¢ Trading days: {summary['trading_days']}")

    span = summary['date_range']
    if span:
        print(f"\n  Date Range:")
        print(f"    â€¢ Start: {span['start']}")
        print(f"    â€¢ End: {span['end']}")

    print(f"\n  Records per ticker:")
    for symbol, total in summary['records_per_ticker'].items():
        print(f"    â€¢ {symbol}: {total:,}")

    # Last five price rows, restricted to the most informative columns.
    price_frame = frames['prices']
    if not price_frame.empty:
        print("\n  ðŸ“Š Sample Price Data (latest 5 records):")
        recent_prices = price_frame.tail(5)
        print(recent_prices[['timestamp', 'ticker', 'close', 'volume']])

    # Last five headlines, each cut to 60 characters.
    news_frame = frames['news']
    if not news_frame.empty:
        print("\n  ðŸ“° Sample News Data (latest 5 articles):")
        for _, article in news_frame.tail(5).iterrows():
            symbol = article['ticker']
            headline = article['headline'][:60]
            print(f"    â€¢ [{symbol}] {headline}...")

    print("\n" + divider + "\n")


if __name__ == '__main__':
    import sys

    # `python script.py view` shows the accumulated data; any other
    # invocation (including no arguments) runs a collection pass.
    entry_point = view_data if sys.argv[1:2] == ['view'] else main
    entry_point()


  FINANCIAL SENTIMENT ANALYSIS - INCREMENTAL DATA COLLECTION
âœ“ yfinance initialized (no API key required)

  BEFORE COLLECTION
âœ“ Loaded 390 existing price records
âœ“ Loaded 461 existing news records
  Existing price records: 390
  Existing news articles: 461
  Date range: 2025-11-07 14:30:00+00:00 to 2025-11-07 20:55:00+00:00
  Trading days collected: 1

  COLLECTING TODAY'S DATA

ðŸ“Š Collecting data for AAPL...
  âœ“ 78 price records
  âœ“ 97 news articles

ðŸ“Š Collecting data for MSFT...
  âœ“ 78 price records
  âœ“ 99 news articles

ðŸ“Š Collecting data for GOOGL...
  âœ“ 78 price records
  âœ“ 100 news articles

ðŸ“Š Collecting data for TSLA...
  âœ“ 78 price records
  âœ“ 97 news articles

ðŸ“Š Collecting data for AMZN...
  âœ“ 78 price records
  âœ“ 98 news articles
âœ“ Loaded 390 existing price records
âœ“ Loaded 461 existing news records
âœ“ Saved 390 total price records (added 390 new)
âœ“ Saved 461 total news records (added 491 new)

  AFTER COLLECTION
âœ“ Loaded 390 