# Phase 1: News Sources Exploration

This notebook explores different news sources and checks if our current dataset already contains news data.

In [13]:
import sys
from pathlib import Path

project_root = Path().absolute().parent
sys.path.insert(0, str(project_root))

import os
os.chdir(project_root)

In [14]:
import pandas as pd
import numpy as np
from src.data.loader import load_raw_dataset
from src.utils.config import load_config

## 1.1 Check Current Dataset for News Data

In [15]:
# Load current dataset
config = load_config()
print(f"Dataset name: {config.data.dataset_name}")

# Check local file first
local_file = Path("data/raw/sp500_stocks_data.parquet")
if local_file.exists():
    print(f"\nLoading local dataset: {local_file}")
    df = pd.read_parquet(local_file)
    print(f"Shape: {df.shape}")
    print(f"\nColumns ({len(df.columns)}):")
    for i, col in enumerate(df.columns, 1):
        print(f"  {i}. {col}")
    
    # Check for news-related columns
    news_keywords = ['news', 'article', 'headline', 'title', 'content', 'text', 'sentiment']
    news_cols = [col for col in df.columns if any(kw in col.lower() for kw in news_keywords)]
    
    if news_cols:
        print(f"\n‚úÖ Found news-related columns: {news_cols}")
        for col in news_cols:
            print(f"\n  {col}:")
            print(f"    Non-null count: {df[col].notna().sum()} / {len(df)}")
            if df[col].dtype == 'object':
                sample = df[col].dropna().iloc[0] if df[col].notna().any() else None
                if sample:
                    print(f"    Sample (first 200 chars): {str(sample)[:200]}...")
    else:
        print(f"\n‚ùå No news-related columns found in dataset")
    
    # Show sample row
    print(f"\nSample row (first 5 columns):")
    print(df.iloc[0][df.columns[:5]])
else:
    print(f"\n‚ö†Ô∏è  Local file not found. Would need to load from Hugging Face.")

Dataset name: pmoe7/SP_500_Stocks_Data-ratios_news_price_10_yrs

‚ö†Ô∏è  Local file not found. Would need to load from Hugging Face.


## 1.2 Check Hugging Face Dataset Structure

### Option 4: FinBERT for Text Processing

FinBERT is a specialized BERT model trained on financial texts. It's perfect for:
- **Text embeddings** - Converting news text to numerical vectors
- **Sentiment analysis** - Classifying news as positive/negative/neutral
- **Financial domain understanding** - Better than general BERT for financial texts

In [16]:
# FinBERT - Financial BERT model
# Hugging Face: ProsusAI/finbert
# Perfect for financial news embeddings and sentiment analysis

def test_finbert():
    """Test FinBERT for financial text processing"""
    try:
        from transformers import AutoTokenizer, AutoModelForSequenceClassification
        import torch
        
        print("Loading FinBERT model...")
        print("This may take a minute on first run (downloading ~500MB)...")
        
        # Load FinBERT model and tokenizer
        model_name = "ProsusAI/finbert"
        tokenizer = AutoTokenizer.from_pretrained(model_name)
        model = AutoModelForSequenceClassification.from_pretrained(model_name)
        model.eval()
        
        print(f"‚úÖ FinBERT loaded successfully!")
        print(f"   Model: {model_name}")
        print(f"   Max sequence length: {tokenizer.model_max_length}")
        
        # Test with sample financial news
        test_texts = [
            "Apple Inc. reported strong quarterly earnings, beating analyst expectations.",
            "The stock market crashed following the unexpected interest rate hike.",
            "Microsoft announced a new partnership with major cloud providers.",
        ]
        
        print(f"\\nTesting sentiment analysis on sample texts:")
        print("=" * 80)
        
        for text in test_texts:
            # Tokenize
            inputs = tokenizer(text, return_tensors="pt", truncation=True, max_length=512)
            
            # Get predictions
            with torch.no_grad():
                outputs = model(**inputs)
                predictions = torch.nn.functional.softmax(outputs.logits, dim=-1)
            
            # Get sentiment labels
            labels = ["positive", "negative", "neutral"]
            scores = predictions[0].tolist()
            
            # Find predicted label
            predicted_idx = scores.index(max(scores))
            predicted_label = labels[predicted_idx]
            confidence = scores[predicted_idx]
            
            print(f"\\nText: {text[:70]}...")
            print(f"Sentiment: {predicted_label.upper()} (confidence: {confidence:.3f})")
            print(f"Scores: positive={scores[0]:.3f}, negative={scores[1]:.3f}, neutral={scores[2]:.3f}")
        
        print("\\n" + "=" * 80)
        print("\\n‚úÖ FinBERT is working correctly!")
        print("\\nUsage:")
        print("  1. For sentiment: Use the classification head (as shown above)")
        print("  2. For embeddings: Use model.bert() to get hidden states")
        print("  3. For fine-tuning: Can further train on your specific data")
        
        return model, tokenizer
        
    except ImportError:
        print("‚ùå transformers library not installed")
        print("Install with: pip install transformers torch")
        return None, None
    except Exception as e:
        print(f"‚ùå Error loading FinBERT: {e}")
        return None, None

print("Testing FinBERT...")
finbert_model, finbert_tokenizer = test_finbert()

Testing FinBERT...
Loading FinBERT model...
This may take a minute on first run (downloading ~500MB)...
‚úÖ FinBERT loaded successfully!
   Model: ProsusAI/finbert
   Max sequence length: 512
\nTesting sentiment analysis on sample texts:
\nText: Apple Inc. reported strong quarterly earnings, beating analyst expecta...
Sentiment: POSITIVE (confidence: 0.952)
Scores: positive=0.952, negative=0.025, neutral=0.022
\nText: The stock market crashed following the unexpected interest rate hike....
Sentiment: NEGATIVE (confidence: 0.909)
Scores: positive=0.017, negative=0.909, neutral=0.075
\nText: Microsoft announced a new partnership with major cloud providers....
Sentiment: POSITIVE (confidence: 0.920)
Scores: positive=0.920, negative=0.010, neutral=0.070
\n‚úÖ FinBERT is working correctly!
\nUsage:
  1. For sentiment: Use the classification head (as shown above)
  2. For embeddings: Use model.bert() to get hidden states
  3. For fine-tuning: Can further train on your specific data


## FinBERT Usage: Embeddings vs Sentiment

FinBERT –º–æ–∂–µ –¥–∞ —Å–µ –∏–∑–ø–æ–ª–∑–≤–∞ –ø–æ **–¥–≤–∞ –Ω–∞—á–∏–Ω–∞**:

1. **Sentiment Analysis (–æ—Ü–µ–Ω–∫–∞)** - –ö–ª–∞—Å–∏—Ñ–∏—Ü–∏—Ä–∞ –Ω–æ–≤–∏–Ω–∞—Ç–∞ –∫–∞—Ç–æ positive/negative/neutral
2. **Text Embeddings (–∫–∞—Ç–æ features)** - –ö–æ–Ω–≤–µ—Ä—Ç–∏—Ä–∞ —Ç–µ–∫—Å—Ç–∞ –≤ —á–∏—Å–ª–æ–≤ –≤–µ–∫—Ç–æ—Ä –∑–∞ –Ω–µ–≤—Ä–æ–Ω–Ω–∞—Ç–∞ –º—Ä–µ–∂–∞

**–ó–∞ –Ω–∞—à–∏—è –º–æ–¥–µ–ª —â–µ –∏–∑–ø–æ–ª–∑–≤–∞–º–µ EMBEDDINGS –∫–∞—Ç–æ features!**

In [None]:
# Example: Using FinBERT embeddings as features for the model

from src.models.news_encoder import FinBERTEncoder
import numpy as np

print("=" * 80)
print("FinBERT: Embeddings vs Sentiment")
print("=" * 80)

# Initialize encoder
encoder = FinBERTEncoder()

# Sample news text
sample_news = [
    "Apple Inc. reported strong quarterly earnings, beating analyst expectations by 15%.",
    "The Federal Reserve announced an unexpected interest rate hike, causing market volatility.",
    "Microsoft announced a new strategic partnership with major cloud providers.",
]

print("\\n1. SENTIMENT ANALYSIS (–æ—Ü–µ–Ω–∫–∞):")
print("-" * 80)
for news in sample_news:
    sentiment = encoder.get_sentiment(news)
    print(f"\\nNews: {news[:60]}...")
    print(f"Sentiment: {sentiment['label'].upper()}")
    print(f"Probabilities: {sentiment['probabilities']}")

print("\\n\\n2. TEXT EMBEDDINGS (–∫–∞—Ç–æ features –∑–∞ –º–æ–¥–µ–ª–∞):")
print("-" * 80)

# Get embeddings (these will be used as features in the neural network)
embeddings = encoder.encode_text(sample_news)
print(f"\\nEmbeddings shape: {embeddings.shape}")
print(f"  - {embeddings.shape[0]} news items")
print(f"  - {embeddings.shape[1]} dimensions per embedding")
print(f"\\nThis is what we'll feed into the neural network!")

# Show sample embedding
print(f"\\nSample embedding (first 10 dimensions):")
print(embeddings[0][:10])

print("\\n\\n3. COMBINED APPROACH:")
print("-" * 80)
print("We can use BOTH:")
print("  - Embeddings ‚Üí Input features for the model")
print("  - Sentiment ‚Üí Additional feature or for analysis")

# Get both embeddings and sentiment
embeddings_with_sentiment, sentiments = encoder.encode_text(
    sample_news,
    return_sentiment=True
)

print(f"\\nEmbeddings: {embeddings_with_sentiment.shape}")
print(f"Sentiments: {len(sentiments)} items")

# We could add sentiment score as an additional feature
sentiment_scores = []
for sent in sentiments:
    # Convert sentiment to numerical score
    # positive = +1, neutral = 0, negative = -1
    score_map = {'positive': 1.0, 'neutral': 0.0, 'negative': -1.0}
    score = score_map.get(sent['label'], 0.0)
    sentiment_scores.append(score)

sentiment_scores = np.array(sentiment_scores)
print(f"\\nSentiment scores: {sentiment_scores}")
print(f"  (Can be added as additional feature alongside embeddings)")

print("\\n" + "=" * 80)
print("CONCLUSION:")
print("=" * 80)
print("""
‚úÖ FinBERT embeddings ‚Üí Main features for neural network (768 dimensions)
‚úÖ Sentiment scores ‚Üí Optional additional feature (1 dimension)

The embeddings capture semantic meaning of the news text,
which the model can learn to correlate with price movements.
""")

## –í–∞–∂–Ω–æ: –û—Ç–∫—ä–¥–µ –∏–¥–≤–∞—Ç –Ω–æ–≤–∏–Ω–∏—Ç–µ?

**FinBERT –ù–ï –∏–∑–≤–ª–∏—á–∞ –Ω–æ–≤–∏–Ω–∏!** FinBERT —Å–∞–º–æ –æ–±—Ä–∞–±–æ—Ç–≤–∞ –Ω–æ–≤–∏–Ω–∏ –∫–æ–∏—Ç–æ –Ω–∏–µ –º—É –¥–∞–≤–∞–º–µ.

**–ü—ä–ª–Ω–∏—è—Ç pipeline –µ:**

```
‚îå‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îê
‚îÇ  News Source    ‚îÇ  ‚Üê –ò–∑–≤–ª–∏—á–∞ –Ω–æ–≤–∏–Ω–∏ –æ—Ç –∏–Ω—Ç–µ—Ä–Ω–µ—Ç
‚îÇ  (yfinance/     ‚îÇ     (yfinance, Alpha Vantage, –∏ —Ç.–Ω.)
‚îÇ   Alpha Vantage)‚îÇ
‚îî‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚î¨‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îò
         ‚îÇ
         ‚îÇ –Ω–æ–≤–∏–Ω–∏ (—Ç–µ–∫—Å—Ç)
         ‚ñº
‚îå‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îê
‚îÇ    FinBERT      ‚îÇ  ‚Üê –û–±—Ä–∞–±–æ—Ç–≤–∞ –Ω–æ–≤–∏–Ω–∏—Ç–µ
‚îÇ  (news_encoder) ‚îÇ     (embeddings + sentiment)
‚îî‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚î¨‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îò
         ‚îÇ
         ‚îÇ embeddings (768 —á–∏—Å–ª–∞)
         ‚ñº
‚îå‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îê
‚îÇ  Our Model      ‚îÇ  ‚Üê –ò–∑–ø–æ–ª–∑–≤–∞ embeddings –∫–∞—Ç–æ features
‚îÇ  (Transformer)  ‚îÇ
‚îî‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îò
```

**–°—Ç—ä–ø–∫–∏:**
1. **News Source** –∏–∑–≤–ª–∏—á–∞ –Ω–æ–≤–∏–Ω–∏ ‚Üí "Apple reported strong earnings..."
2. **FinBERT** –æ–±—Ä–∞–±–æ—Ç–≤–∞ –Ω–æ–≤–∏–Ω–∏—Ç–µ ‚Üí [0.23, -0.45, ..., 0.12] (768 —á–∏—Å–ª–∞)
3. **–ù–∞—à–∏—è—Ç –º–æ–¥–µ–ª** –∏–∑–ø–æ–ª–∑–≤–∞ embeddings ‚Üí –ø—Ä–æ–≥–Ω–æ–∑–∞ –∑–∞ —Ü–µ–Ω–∞—Ç–∞

In [None]:
# –ü—Ä–∏–º–µ—Ä: –ü—ä–ª–µ–Ω pipeline –æ—Ç –Ω–æ–≤–∏–Ω–∏ –¥–æ embeddings

print("=" * 80)
print("–ü–™–õ–ï–ù PIPELINE: News Source ‚Üí FinBERT ‚Üí Embeddings")
print("=" * 80)

# –°—Ç—ä–ø–∫–∞ 1: –ò–∑–≤–ª–∏—á–∞–Ω–µ –Ω–∞ –Ω–æ–≤–∏–Ω–∏ (–æ—Ç news source)
print("\\n1. –ò–ó–í–õ–ò–ß–ê–ù–ï –ù–ê –ù–û–í–ò–ù–ò (–æ—Ç news source):")
print("-" * 80)

try:
    from src.data.news_loader import fetch_yahoo_finance_news
    
    # –ò–∑–≤–ª–∏—á–∞–º–µ –Ω–æ–≤–∏–Ω–∏ –∑–∞ AAPL
    print("–ò–∑–≤–ª–∏—á–∞–Ω–µ –Ω–∞ –Ω–æ–≤–∏–Ω–∏ –∑–∞ AAPL –æ—Ç Yahoo Finance...")
    news_list = fetch_yahoo_finance_news('AAPL', limit=3)
    
    if news_list:
        print(f"‚úÖ –ò–∑–≤–ª–µ—á–µ–Ω–∏ {len(news_list)} –Ω–æ–≤–∏–Ω–∏")
        print("\\n–ü—Ä–∏–º–µ—Ä–Ω–∞ –Ω–æ–≤–∏–Ω–∞:")
        news_item = news_list[0]
        print(f"  Title: {news_item['title']}")
        print(f"  Publisher: {news_item['publisher']}")
        print(f"  Time: {news_item['published_time']}")
        
        # –°—Ç—ä–ø–∫–∞ 2: –û–±—Ä–∞–±–æ—Ç–∫–∞ —Å FinBERT
        print("\\n\\n2. –û–ë–†–ê–ë–û–¢–ö–ê –° FINBERT:")
        print("-" * 80)
        
        from src.models.news_encoder import FinBERTEncoder
        encoder = FinBERTEncoder()
        
        # –í–∑–∏–º–∞–º–µ —Ç–µ–∫—Å—Ç–∞ –Ω–∞ –Ω–æ–≤–∏–Ω–∞—Ç–∞
        news_text = news_item['title']  # –ú–æ–∂–µ –¥–∞ –¥–æ–±–∞–≤–∏–º –∏ description/summary
        
        print(f"–û–±—Ä–∞–±–æ—Ç–≤–∞–Ω–µ –Ω–∞ –Ω–æ–≤–∏–Ω–∞: '{news_text[:80]}...'")
        
        # –ü–æ–ª—É—á–∞–≤–∞–º–µ embeddings
        embeddings = encoder.encode_text(news_text)
        print(f"\\n‚úÖ Embeddings shape: {embeddings.shape}")
        print(f"   (–¢–æ–≤–∞ —Å–∞ {embeddings.shape[0]} —á–∏—Å–ª–∞ –∫–æ–∏—Ç–æ —â–µ –∏–∑–ø–æ–ª–∑–≤–∞–º–µ –∫–∞—Ç–æ features)")
        
        # –ü–æ–ª—É—á–∞–≤–∞–º–µ sentiment
        sentiment = encoder.get_sentiment(news_text)
        print(f"\\n‚úÖ Sentiment: {sentiment['label']}")
        print(f"   Probabilities: {sentiment['probabilities']}")
        
        print("\\n\\n3. –†–ï–ó–£–õ–¢–ê–¢:")
        print("-" * 80)
        print("""
        ‚úÖ –ò–º–∞–º–µ –Ω–æ–≤–∏–Ω–∏ (–æ—Ç Yahoo Finance)
        ‚úÖ –ò–º–∞–º–µ embeddings (–æ—Ç FinBERT) - 768 —á–∏—Å–ª–∞
        ‚úÖ –ò–º–∞–º–µ sentiment (–æ—Ç FinBERT) - positive/negative/neutral
        
        –°–ª–µ–¥–≤–∞—â–∞ —Å—Ç—ä–ø–∫–∞: –î–æ–±–∞–≤—è–Ω–µ –Ω–∞ embeddings –∫—ä–º —Ç–µ—Ö–Ω–∏—á–µ—Å–∫–∏—Ç–µ features
        –∏ —Ç—Ä–µ–Ω–∏—Ä–∞–Ω–µ –Ω–∞ –º–æ–¥–µ–ª–∞!
        """)
        
    else:
        print("‚ö†Ô∏è  –ù–µ —Å–∞ –Ω–∞–º–µ—Ä–µ–Ω–∏ –Ω–æ–≤–∏–Ω–∏. –ú–æ–∂–µ –¥–∞ –µ –ø—Ä–æ–±–ª–µ–º —Å yfinance.")
        
except Exception as e:
    print(f"‚ùå –ì—Ä–µ—à–∫–∞: {e}")
    print("\\n–¢–æ–≤–∞ –µ –Ω–æ—Ä–º–∞–ª–Ω–æ –∞–∫–æ yfinance –Ω–µ –µ –∏–Ω—Å—Ç–∞–ª–∏—Ä–∞–Ω.")
    print("–ò–Ω—Å—Ç–∞–ª–∏—Ä–∞–π —Å: pip install yfinance")

print("\\n" + "=" * 80)
print("–ó–ê–ö–õ–Æ–ß–ï–ù–ò–ï:")
print("=" * 80)
print("""
‚úÖ News Source (yfinance) ‚Üí –ò–∑–≤–ª–∏—á–∞ –Ω–æ–≤–∏–Ω–∏ –æ—Ç –∏–Ω—Ç–µ—Ä–Ω–µ—Ç
‚úÖ FinBERT ‚Üí –û–±—Ä–∞–±–æ—Ç–≤–∞ –Ω–æ–≤–∏–Ω–∏—Ç–µ –∏ –≥–∏ –∫–æ–Ω–≤–µ—Ä—Ç–∏—Ä–∞ –≤ embeddings
‚úÖ Our Model ‚Üí –ò–∑–ø–æ–ª–∑–≤–∞ embeddings –∫–∞—Ç–æ features –∑–∞ –ø—Ä–æ–≥–Ω–æ–∑–∏

FinBERT –µ –∏–Ω—Å—Ç—Ä—É–º–µ–Ω—Ç –∑–∞ –æ–±—Ä–∞–±–æ—Ç–∫–∞, –Ω–µ –∏–∑—Ç–æ—á–Ω–∏–∫ –Ω–∞ –Ω–æ–≤–∏–Ω–∏!
""")

## –†–µ—à–µ–Ω–∏–µ –∑–∞ –∏—Å—Ç–æ—Ä–∏—á–µ—Å–∫–∏ –Ω–æ–≤–∏–Ω–∏ (2010-2024)

–ó–∞ —Ü–µ–ª–∏—è –ø–µ—Ä–∏–æ–¥ –Ω–∞ –º–æ–¥–µ–ª–∞ (2010-2024) –∏–º–∞–º–µ –Ω—è–∫–æ–ª–∫–æ –æ–ø—Ü–∏–∏:

In [None]:
# –û–ø—Ü–∏–∏ –∑–∞ –∏—Å—Ç–æ—Ä–∏—á–µ—Å–∫–∏ –Ω–æ–≤–∏–Ω–∏ (2010-2024)

print("=" * 80)
print("–û–ü–¶–ò–ò –ó–ê –ò–°–¢–û–†–ò–ß–ï–°–ö–ò –ù–û–í–ò–ù–ò (2010-2024)")
print("=" * 80)

options = {
    "1. Yahoo Finance RSS Feeds": {
        "–ü–æ–∫—Ä–∏—Ç–∏–µ": "–û–≥—Ä–∞–Ω–∏—á–µ–Ω–æ (–æ–±–∏–∫–Ω–æ–≤–µ–Ω–æ –ø–æ—Å–ª–µ–¥–Ω–∏—Ç–µ 1-2 –≥–æ–¥–∏–Ω–∏)",
        "–ë–µ–∑–ø–ª–∞—Ç–Ω–æ": "‚úÖ –î–∞",
        "API Key": "‚ùå –ù–µ",
        "–¢—Ä—É–¥–Ω–æ—Å—Ç": "–õ–µ—Å–Ω–æ",
        "–ü—Ä–µ–ø–æ—Ä—ä–∫–∞": "–ó–∞ —Ç–µ—Å—Ç–≤–∞–Ω–µ, –Ω–æ –Ω–µ –∑–∞ –ø—ä–ª–Ω–æ –ø–æ–∫—Ä–∏—Ç–∏–µ"
    },
    "2. Web Scraping (Yahoo Finance, Seeking Alpha)": {
        "–ü–æ–∫—Ä–∏—Ç–∏–µ": "–ü—ä–ª–Ω–æ (–º–æ–∂–µ –¥–∞ —Å–µ —Å—Ç–∏–≥–Ω–µ –¥–æ 2010)",
        "–ë–µ–∑–ø–ª–∞—Ç–Ω–æ": "‚úÖ –î–∞",
        "API Key": "‚ùå –ù–µ",
        "–¢—Ä—É–¥–Ω–æ—Å—Ç": "–°—Ä–µ–¥–Ω–æ-—Ç—Ä—É–¥–Ω–æ",
        "–ü—Ä–µ–ø–æ—Ä—ä–∫–∞": "–ù–∞–π-–¥–æ–±—Ä–æ —Ä–µ—à–µ–Ω–∏–µ –∑–∞ –±–µ–∑–ø–ª–∞—Ç–Ω–∏ –¥–∞–Ω–Ω–∏"
    },
    "3. Alpha Vantage News API": {
        "–ü–æ–∫—Ä–∏—Ç–∏–µ": "–û–≥—Ä–∞–Ω–∏—á–µ–Ω–æ (–ø–æ—Å–ª–µ–¥–Ω–∏—Ç–µ 1-2 –≥–æ–¥–∏–Ω–∏)",
        "–ë–µ–∑–ø–ª–∞—Ç–Ω–æ": "‚úÖ –î–∞ (free tier)",
        "API Key": "‚úÖ –î–∞ (–±–µ–∑–ø–ª–∞—Ç–µ–Ω)",
        "–¢—Ä—É–¥–Ω–æ—Å—Ç": "–õ–µ—Å–Ω–æ",
        "–ü—Ä–µ–ø–æ—Ä—ä–∫–∞": "–î–æ–ø—ä–ª–Ω–µ–Ω–∏–µ –∫—ä–º scraping"
    },
    "4. Financial Modeling Prep API": {
        "–ü–æ–∫—Ä–∏—Ç–∏–µ": "–î–æ–±—Ä–æ (–Ω—è–∫–æ–ª–∫–æ –≥–æ–¥–∏–Ω–∏ –Ω–∞–∑–∞–¥)",
        "–ë–µ–∑–ø–ª–∞—Ç–Ω–æ": "‚ö†Ô∏è –ß–∞—Å—Ç–∏—á–Ω–æ (free tier –æ–≥—Ä–∞–Ω–∏—á–µ–Ω)",
        "API Key": "‚úÖ –î–∞",
        "–¢—Ä—É–¥–Ω–æ—Å—Ç": "–õ–µ—Å–Ω–æ",
        "–ü—Ä–µ–ø–æ—Ä—ä–∫–∞": "–ê–∫–æ free tier –µ –¥–æ—Å—Ç–∞—Ç—ä—á–µ–Ω"
    },
    "5. Pre-collected Datasets": {
        "–ü–æ–∫—Ä–∏—Ç–∏–µ": "–ó–∞–≤–∏—Å–∏ –æ—Ç dataset-–∞",
        "–ë–µ–∑–ø–ª–∞—Ç–Ω–æ": "‚úÖ –î–∞ (–∞–∫–æ –Ω–∞–º–µ—Ä–∏–º —Ç–∞–∫—ä–≤)",
        "API Key": "‚ùå –ù–µ",
        "–¢—Ä—É–¥–Ω–æ—Å—Ç": "–ú–Ω–æ–≥–æ –ª–µ—Å–Ω–æ",
        "–ü—Ä–µ–ø–æ—Ä—ä–∫–∞": "–ù–∞–π-–¥–æ–±—Ä–æ –∞–∫–æ –Ω–∞–º–µ—Ä–∏–º –ø–æ–¥—Ö–æ–¥—è—â dataset"
    },
}

for option, details in options.items():
    print(f"\\n{option}:")
    for key, value in details.items():
        print(f"  {key}: {value}")

print("\\n" + "=" * 80)
print("–ü–†–ï–ü–û–†–™–ö–ê:")
print("=" * 80)
print("""
–ó–∞ —Ü–µ–ª–∏—è –ø–µ—Ä–∏–æ–¥ (2010-2024) –ø—Ä–µ–ø–æ—Ä—ä—á–≤–∞–º –ö–û–ú–ë–ò–ù–ò–†–ê–ù –ø–æ–¥—Ö–æ–¥:

1. **Web Scraping** (–æ—Å–Ω–æ–≤–µ–Ω –∏–∑—Ç–æ—á–Ω–∏–∫)
   - Yahoo Finance news pages (—Å —É–≤–∞–∂–µ–Ω–∏–µ –∫—ä–º rate limits)
   - Seeking Alpha (–∞–∫–æ –∏–º–∞ RSS –∏–ª–∏ —Ä–∞–∑—Ä–µ—à–µ–Ω–∏–µ)
   - MarketWatch (–∞–∫–æ –µ –¥–æ—Å—Ç—ä–ø–µ–Ω)
   
2. **RSS Feeds** (–¥–æ–ø—ä–ª–Ω–µ–Ω–∏–µ)
   - Yahoo Finance RSS
   - Google News RSS (—Ñ–∏–ª—Ç—Ä–∏—Ä–∞–Ω–æ –∑–∞ —Ñ–∏–Ω–∞–Ω—Å–∏)
   
3. **API-—Ç–∞** (–∑–∞ –ø–æ—Å–ª–µ–¥–Ω–∏—Ç–µ –≥–æ–¥–∏–Ω–∏)
   - Alpha Vantage (–∑–∞ sentiment scores)
   - yfinance (–∑–∞ –Ω–∞–π-–Ω–æ–≤–∏—Ç–µ –Ω–æ–≤–∏–Ω–∏)

–í–ê–ñ–ù–û:
- –í–∏–Ω–∞–≥–∏ –ø—Ä–æ–≤–µ—Ä—è–≤–∞–π robots.txt –∏ Terms of Service
- –î–æ–±–∞–≤—è–π –∑–∞–±–∞–≤—è–Ω–∏—è –º–µ–∂–¥—É –∑–∞—è–≤–∫–∏—Ç–µ (1-2 —Å–µ–∫—É–Ω–¥–∏)
- –ö–µ—à–∏—Ä–∞–π –¥–∞–Ω–Ω–∏—Ç–µ –∑–∞ –¥–∞ –Ω–µ –ø—Ä–∞–≤–∏—à –ø–æ–≤—Ç–æ—Ä–Ω–∏ –∑–∞—è–≤–∫–∏
- –ó–∞–ø–æ—á–Ω–∏ —Å –º–∞–ª—ä–∫ —Ç–µ—Å—Ç –ø—Ä–µ–¥–∏ –ø—ä–ª–Ω–æ –∏–∑–≤–ª–∏—á–∞–Ω–µ

–°–ª–µ–¥–≤–∞—â–∞ —Å—Ç—ä–ø–∫–∞: –°—ä–∑–¥–∞–≤–∞–Ω–µ –Ω–∞ scraping –º–æ–¥—É–ª —Å –ø—Ä–∞–≤–∏–ª–Ω–æ handling –Ω–∞ rate limits.
""")

In [4]:
# Check Hugging Face dataset structure
from datasets import load_dataset_builder, get_dataset_infos

dataset_name = config.data.dataset_name
print(f"Checking dataset: {dataset_name}")

try:
    builder = load_dataset_builder(dataset_name)
    print(f"\nDataset info:")
    print(f"  Description: {builder.info.description[:200]}..." if builder.info.description else "  No description")
    print(f"  Features: {builder.info.features}")
    
    # Check for news-related features
    if hasattr(builder.info, 'features'):
        feature_names = list(builder.info.features.keys()) if isinstance(builder.info.features, dict) else []
        news_features = [f for f in feature_names if any(kw in f.lower() for kw in ['news', 'article', 'headline', 'text'])]
        
        if news_features:
            print(f"\n‚úÖ Found news-related features: {news_features}")
        else:
            print(f"\n‚ùå No news-related features found")
            print(f"\nAvailable features: {feature_names[:20]}")
except Exception as e:
    print(f"\n‚ö†Ô∏è  Could not load dataset builder: {e}")
    print("This is okay - we'll proceed with external news sources")

Checking dataset: pmoe7/SP_500_Stocks_Data-ratios_news_price_10_yrs


Repo card metadata block was not found. Setting CardData to empty.



Dataset info:
  No description
  Features: {'_id': Value('string'), 'compound': Value('float64'), 'date': Value('string'), 'headline': Value('string'), 'neg': Value('float64'), 'neu': Value('float64'), 'pos': Value('float64'), 'ticker': Value('string'), 'time': Value('string')}

‚úÖ Found news-related features: ['headline']


## 1.3 Test News APIs

Let's test different news APIs to see which one works best for our use case.

### Option 1: Alpha Vantage News API

In [None]:
# Alpha Vantage News API
# Documentation: https://www.alphavantage.co/documentation/#news-sentiment
# Free tier: 5 API calls per minute, 500 calls per day

import requests
import os
from datetime import datetime, timedelta

# You'll need to get an API key from https://www.alphavantage.co/support/#api-key
ALPHA_VANTAGE_API_KEY = os.getenv('GKXZ5NT78PUAR2DT', 'demo')  # Replace with your key

def test_alpha_vantage_news(ticker='AAPL', limit=10):
    """Test Alpha Vantage News API"""
    url = 'https://www.alphavantage.co/query'
    params = {
        'function': 'NEWS_SENTIMENT',
        'tickers': ticker,
        'limit': limit,
        'apikey': ALPHA_VANTAGE_API_KEY
    }
    
    try:
        response = requests.get(url, params=params, timeout=10)
        data = response.json()
        
        if 'Information' in data:
            print(f"‚ö†Ô∏è  API Info: {data['Information']}")
            return None
        
        if 'feed' in data:
            articles = data['feed']
            print(f"‚úÖ Successfully fetched {len(articles)} articles for {ticker}")
            
            if articles:
                print(f"\nSample article:")
                article = articles[0]
                print(f"  Title: {article.get('title', 'N/A')[:100]}...")
                print(f"  Source: {article.get('source', 'N/A')}")
                print(f"  Time: {article.get('time_published', 'N/A')}")
                print(f"  Sentiment Score: {article.get('overall_sentiment_score', 'N/A')}")
                print(f"  Relevance Score: {article.get('relevance_score', 'N/A')}")
            
            return articles
        else:
            print(f"‚ùå Unexpected response format: {list(data.keys())}")
            return None
            
    except Exception as e:
        print(f"‚ùå Error: {e}")
        return None

print("Testing Alpha Vantage News API...")
print("Note: You need to set ALPHA_VANTAGE_API_KEY environment variable")
print("Get free API key at: https://www.alphavantage.co/support/#api-key\n")

if ALPHA_VANTAGE_API_KEY != 'demo':
    alpha_vantage_results = test_alpha_vantage_news('AAPL', limit=5)
else:
    print("‚ö†Ô∏è  Using demo key - will likely hit rate limit. Set your API key to test properly.")

Testing Alpha Vantage News API...
Note: You need to set ALPHA_VANTAGE_API_KEY environment variable
Get free API key at: https://www.alphavantage.co/support/#api-key

‚ö†Ô∏è  Using demo key - will likely hit rate limit. Set your API key to test properly.


### Option 2: NewsAPI

In [10]:
# NewsAPI
# Documentation: https://newsapi.org/docs
# Free tier: 100 requests per day

NEWSAPI_KEY = os.getenv('NEWSAPI_KEY', '')  # Get from https://newsapi.org/register

def test_newsapi(ticker='AAPL', days_back=7):
    """Test NewsAPI"""
    if not NEWSAPI_KEY:
        print("‚ö†Ô∏è  NEWSAPI_KEY not set. Get one at https://newsapi.org/register")
        return None
    
    url = 'https://newsapi.org/v2/everything'
    
    # Calculate date range
    to_date = datetime.now()
    from_date = to_date - timedelta(days=days_back)
    
    params = {
        'q': ticker,  # Search query
        'from': from_date.strftime('%Y-%m-%d'),
        'to': to_date.strftime('%Y-%m-%d'),
        'sortBy': 'publishedAt',
        'language': 'en',
        'pageSize': 10,
        'apiKey': NEWSAPI_KEY
    }
    
    try:
        response = requests.get(url, params=params, timeout=10)
        data = response.json()
        
        if data.get('status') == 'ok':
            articles = data.get('articles', [])
            print(f"‚úÖ Successfully fetched {len(articles)} articles for {ticker}")
            
            if articles:
                print(f"\nSample article:")
                article = articles[0]
                print(f"  Title: {article.get('title', 'N/A')[:100]}...")
                print(f"  Source: {article.get('source', {}).get('name', 'N/A')}")
                print(f"  Published: {article.get('publishedAt', 'N/A')}")
                print(f"  Description: {article.get('description', 'N/A')[:150]}...")
            
            return articles
        else:
            print(f"‚ùå Error: {data.get('message', 'Unknown error')}")
            return None
            
    except Exception as e:
        print(f"‚ùå Error: {e}")
        return None

print("Testing NewsAPI...")
if NEWSAPI_KEY:
    newsapi_results = test_newsapi('AAPL', days_back=7)
else:
    print("‚ö†Ô∏è  Set NEWSAPI_KEY environment variable to test")

Testing NewsAPI...
‚ö†Ô∏è  Set NEWSAPI_KEY environment variable to test


### Option 3: Yahoo Finance (Free, no API key needed)

In [9]:
# Yahoo Finance - Free option using yfinance library
# No API key needed, but may have rate limits

try:
    import yfinance as yf
    
    def test_yahoo_finance_news(ticker='AAPL'):
        """Test Yahoo Finance news"""
        try:
            stock = yf.Ticker(ticker)
            news = stock.news
            
            if news:
                print(f"‚úÖ Successfully fetched {len(news)} news items for {ticker}")
                
                if news:
                    print(f"\nSample news item:")
                    item = news[0]
                    print(f"  Title: {item.get('title', 'N/A')[:100]}...")
                    print(f"  Publisher: {item.get('publisher', 'N/A')}")
                    print(f"  Published: {item.get('providerPublishTime', 'N/A')}")
                    print(f"  Link: {item.get('link', 'N/A')[:80]}...")
                
                return news
            else:
                print(f"‚ö†Ô∏è  No news found for {ticker}")
                return None
                
        except Exception as e:
            print(f"‚ùå Error: {e}")
            return None
    
    print("Testing Yahoo Finance (yfinance)...")
    yahoo_results = test_yahoo_finance_news('AAPL')
    
except ImportError:
    print("‚ö†Ô∏è  yfinance not installed. Install with: pip install yfinance")
    print("This is a free option with no API key needed!")

Testing Yahoo Finance (yfinance)...
‚úÖ Successfully fetched 10 news items for AAPL

Sample news item:
  Title: N/A...
  Publisher: N/A
  Published: N/A
  Link: N/A...


## 1.4 Comparison and Recommendation

In [11]:
# Create comparison table
comparison_data = {
    'Source': ['Alpha Vantage', 'NewsAPI', 'Yahoo Finance (yfinance)'],
    'API Key Required': ['Yes (free)', 'Yes (free)', 'No'],
    'Rate Limit': ['5/min, 500/day', '100/day', 'Unknown (may vary)'],
    'Historical Data': ['Limited', 'Limited (1 month free)', 'Recent only'],
    'Sentiment Score': ['Yes (built-in)', 'No (need separate)', 'No'],
    'Cost': ['Free tier available', 'Free tier available', 'Free'],
    'Ease of Use': ['Medium', 'Easy', 'Very Easy'],
}

df_comparison = pd.DataFrame(comparison_data)
print("\n" + "="*80)
print("NEWS SOURCES COMPARISON")
print("="*80)
print(df_comparison.to_string(index=False))

print("\n" + "="*80)
print("RECOMMENDATION")
print("="*80)
print("""
For Phase 1, I recommend:

1. **Start with Yahoo Finance (yfinance)** - Easiest to set up, no API key needed
   - Good for testing and prototyping
   - May have limitations for historical data

2. **Alpha Vantage as backup** - If we need sentiment scores built-in
   - Requires API key but free tier is generous
   - Has built-in sentiment analysis

3. **For production/historical data** - May need paid service or web scraping
   - Consider Financial Modeling Prep or Polygon.io for better historical coverage

Next steps:
- Test yfinance with multiple tickers
- Check if we can get historical news (may need to scrape or use paid API)
- Evaluate data quality and coverage
""")


NEWS SOURCES COMPARISON
                  Source API Key Required         Rate Limit        Historical Data    Sentiment Score                Cost Ease of Use
           Alpha Vantage       Yes (free)     5/min, 500/day                Limited     Yes (built-in) Free tier available      Medium
                 NewsAPI       Yes (free)            100/day Limited (1 month free) No (need separate) Free tier available        Easy
Yahoo Finance (yfinance)               No Unknown (may vary)            Recent only                 No                Free   Very Easy

RECOMMENDATION

For Phase 1, I recommend:

1. **Start with Yahoo Finance (yfinance)** - Easiest to set up, no API key needed
   - Good for testing and prototyping
   - May have limitations for historical data

2. **Alpha Vantage as backup** - If we need sentiment scores built-in
   - Requires API key but free tier is generous
   - Has built-in sentiment analysis

3. **For production/historical data** - May need paid service or web

## 1.5 Test Historical News Availability

In [12]:
# Test if we can get historical news (important for backtesting)
print("Testing historical news availability...")
print("\nNote: Most free APIs only provide recent news (last 1-30 days)")
print("For backtesting, we need news from 2010-2024 period")
print("\nOptions:")
print("1. Use paid API with historical data (Financial Modeling Prep, Polygon.io)")
print("2. Web scraping from financial news sites (with proper attribution)")
print("3. Use pre-collected datasets (like the Hugging Face one if it has news)")
print("4. Start with recent data and expand backwards gradually")

# Check if yfinance can get older news
try:
    import yfinance as yf
    from datetime import datetime
    
    stock = yf.Ticker('AAPL')
    news = stock.news
    
    if news:
        # Check date range
        dates = []
        for item in news:
            if 'providerPublishTime' in item:
                ts = item['providerPublishTime']
                dt = datetime.fromtimestamp(ts)
                dates.append(dt)
        
        if dates:
            oldest = min(dates)
            newest = max(dates)
            print(f"\n‚úÖ Yahoo Finance news date range:")
            print(f"   Oldest: {oldest.strftime('%Y-%m-%d')}")
            print(f"   Newest: {newest.strftime('%Y-%m-%d')}")
            print(f"   Coverage: {(newest - oldest).days} days")
            
            if oldest.year < 2020:
                print(f"\n‚úÖ Good! Can get historical data back to {oldest.year}")
            else:
                print(f"\n‚ö†Ô∏è  Limited historical data - only goes back to {oldest.year}")
                print(f"   May need alternative source for full backtest period")
except Exception as e:
    print(f"\n‚ö†Ô∏è  Could not test historical availability: {e}")

Testing historical news availability...

Note: Most free APIs only provide recent news (last 1-30 days)
For backtesting, we need news from 2010-2024 period

Options:
1. Use paid API with historical data (Financial Modeling Prep, Polygon.io)
2. Web scraping from financial news sites (with proper attribution)
3. Use pre-collected datasets (like the Hugging Face one if it has news)
4. Start with recent data and expand backwards gradually


## Summary and Next Steps

In [None]:
print("\n" + "="*80)
print("PHASE 1 SUMMARY")
print("="*80)
print("""
‚úÖ Completed:
1. Checked current dataset structure
2. Tested multiple news API options
3. Compared different sources

üìã Next Steps (Phase 2):
1. Choose primary news source (recommendation: yfinance for start)
2. Create news_loader.py module
3. Test fetching news for multiple tickers
4. Design storage format for news data
5. Create script to fetch historical news

üí° Key Decisions:
- Primary source: Yahoo Finance (yfinance) - easiest to start
- Backup: Alpha Vantage (if we need sentiment scores)
- Historical data: May need paid API or web scraping for full coverage
""")