In [6]:
# Article Selector Scoring Analysis Notebook
# Shows internal scores for every candidate article for the ticker and date range
# set in the Parameters section below (currently TSLA, 2025-10-01 to 2025-10-07)

import pandas as pd
from pathlib import Path
from dotenv import load_dotenv
import sys

# Add project root to path
sys.path.insert(0, str(Path.home() / "finreport"))

# Load environment
load_dotenv(Path.home() / "finreport" / ".env")

# Import the scoring functions directly
from core.selectors.select_finance_yahoo import (
    select_articles,
    calculate_article_score,
    _score_body_length,
    _score_content_relevance,
    FINANCIAL_KEYWORDS
)
from sqlalchemy import text
from core.data.db import engine

# === Parameters ===
# Inputs for this run. TICKER/START_DATE/END_DATE/MIN_BODY_CHARS feed both the raw
# SQL query and the select_articles() call further down; MAX_ARTICLES only feeds
# select_articles().
TICKER = "TSLA"
START_DATE = "2025-10-01"
END_DATE = "2025-10-07"
MAX_ARTICLES = 3
MIN_BODY_CHARS = 800

# Echo the configuration so the captured output records what was analyzed.
_banner_lines = (
    "Analyzing article scores for:",
    f"  Ticker: {TICKER}",
    f"  Period: {START_DATE} to {END_DATE}",
    f"  Max articles: {MAX_ARTICLES}",
    f"  Min body chars: {MIN_BODY_CHARS}",
)
print("\n".join(_banner_lines))
print("\n" + "=" * 100 + "\n")

# === Query ALL articles (before selection) ===
# Load the whole candidate pool from news_raw so every article can be scored,
# not only the ones the selector keeps. All values are passed as bound
# parameters (:ticker, :start, :end, :min_chars), never interpolated.
# NOTE(review): the upper date bound is exclusive (published_date_utc < :end),
# so END_DATE itself is not included — confirm this matches select_articles().
query = text("""
    SELECT
        id, published_utc, published_date_utc,
        title, url, source,
        tickers, description, summary, tags, 
        full_body, full_body_chars
    FROM news_raw
    WHERE source = 'finance.yahoo.com'
      AND :ticker = ANY(tickers)
      AND published_date_utc >= CAST(:start AS date)
      AND published_date_utc < CAST(:end AS date)
      AND fetch_status = 'ok'
      AND full_body IS NOT NULL
      AND full_body_chars >= :min_chars
    ORDER BY published_utc
""")

# Bound values for the placeholders above; the ticker is upper-cased before binding.
params = dict(
    ticker=TICKER.upper(),
    start=START_DATE,
    end=END_DATE,
    min_chars=MIN_BODY_CHARS,
)

# The connection is only needed for the read and is closed on leaving the block.
with engine.connect() as conn:
    df = pd.read_sql(sql=query, con=conn, params=params)

print(f"Found {len(df)} total articles matching criteria\n")

# Nothing to score: stop the cell here rather than failing further down.
if df.empty:
    print("⚠️  No articles found")
    sys.exit(0)

# === Calculate detailed scores for each article ===
def _keyword_hits(lowered_text):
    """Return the FINANCIAL_KEYWORDS entries found as substrings of `lowered_text`."""
    return [kw for kw in FINANCIAL_KEYWORDS if kw in lowered_text]


scores_data = []

for _, row in df.iterrows():
    article = row.to_dict()

    # Missing / NULL fields fall back to 0 or "" so scoring and matching never see None.
    body_length = article.get("full_body_chars") or 0
    title = article.get("title") or ""
    description = article.get("description") or ""
    summary = article.get("summary") or ""

    # Component scores from the selector's own helpers, plus its combined score.
    length_score = _score_body_length(body_length)
    content_score = _score_content_relevance(title, description, summary)
    total_score = calculate_article_score(article)

    # Case-insensitive substring matches per text field.
    title_keywords = _keyword_hits(title.lower())
    desc_keywords = _keyword_hits(description.lower())
    summ_keywords = _keyword_hits(summary.lower())

    # Titles longer than 80 characters are truncated for display.
    if len(title) > 80:
        shown_title = title[:80] + '...'
    else:
        shown_title = title

    record = {
        'id': article['id'],
        'published_utc': article['published_utc'],
        'title': shown_title,
        'body_chars': body_length,
        'length_score': round(length_score, 4),
        'content_score': round(content_score, 4),
        'total_score': round(total_score, 4),
        'title_kw_count': len(title_keywords),
        'desc_kw_count': len(set(desc_keywords)),
        'summ_kw_count': len(set(summ_keywords)),
        'title_keywords': ', '.join(title_keywords[:5]),
        'url': article['url'],
    }
    scores_data.append(record)

# One row per article, best total score first, re-indexed 0..n-1 so the index is the rank.
scores_df = (
    pd.DataFrame(scores_data)
    .sort_values('total_score', ascending=False)
    .reset_index(drop=True)
)

# === Display All Articles with Scores ===
_rule = "=" * 100
print("\n" + _rule)
print("📊 ALL ARTICLES RANKED BY SCORE")
print(_rule)
print("\nScore Calculation: 30% Length + 70% Content")
print("  - Length Score: Plateau curve favoring ~2500 chars")
print("  - Content Score: 40% title + 30% description + 30% summary keywords\n")

# Lift the pandas display limits so every row and column of the ranking is shown.
for _option, _value in (
    ('display.max_rows', None),
    ('display.max_columns', None),
    ('display.max_colwidth', 80),
    ('display.width', None),
    ('display.float_format', '{:.4f}'.format),
):
    pd.set_option(_option, _value)

# Columns of the ranking table, in display order.
_ranking_columns = [
    'title',
    'body_chars',
    'length_score',
    'content_score',
    'total_score',
    'title_kw_count',
    'desc_kw_count',
    'summ_kw_count',
]
print(scores_df[_ranking_columns].to_string(index=True))

# === Top Selected Articles ===
print("\n\n" + "=" * 100)
print(f"✅ TOP {MAX_ARTICLES} SELECTED ARTICLES (after MMR diversity)")
print("=" * 100)

# Run the real selector with the same parameters used for the raw query above.
_selector_kwargs = {
    "ticker": TICKER,
    "start_date": START_DATE,
    "end_date": END_DATE,
    "max_articles": MAX_ARTICLES,
    "min_body_chars": MIN_BODY_CHARS,
}
selected_articles = select_articles(**_selector_kwargs)

# Collect the ids the selector returned and pick the matching rows of the score table.
selected_ids = []
for _picked in selected_articles:
    selected_ids.append(_picked['id'])

_is_selected_row = scores_df['id'].isin(selected_ids)
selected_df = scores_df[_is_selected_row]

_selected_columns = ['title', 'body_chars', 'total_score', 'title_kw_count', 'desc_kw_count']
print(selected_df[_selected_columns].to_string(index=False))

# === Detailed Breakdown ===
def _describe_length_zone(chars, length_score):
    """Label the body-length zone that `chars` falls in.

    Args:
        chars: Article body length in characters.
        length_score: Length score already computed for the article; shown as a
            percentage in the rising and tapering zones.

    Returns:
        A human-readable zone label.

    The rising zone uses a strict ``< 2500`` so that exactly 2500 characters
    reaches the "Peak" label; with ``<= 2500`` that branch could never run.
    NOTE(review): these thresholds are display labels only — confirm they match
    the curve implemented in _score_body_length.
    """
    if chars <= 500:
        return "Too short (<500)"
    if chars < 2500:
        return f"Rising (500-2500) → {length_score:.2%}"
    if chars == 2500:
        return "Peak (2500) → 100%"
    if chars <= 6000:
        return f"Tapering (2500-6000) → {length_score:.2%}"
    return "Too long (>6000) → capped at 80%"


print("\n\n" + "="*100)
print("🔍 DETAILED SCORE BREAKDOWN FOR EACH ARTICLE")
print("="*100)

# scores_df is sorted by total score with a fresh 0..n-1 index, so idx + 1 is the rank.
for idx, score_row in scores_df.iterrows():
    article_id = score_row['id']
    is_selected = article_id in selected_ids
    
    print(f"\n{'='*100}")
    print(f"ARTICLE #{idx + 1} {'✅ SELECTED' if is_selected else ''}")
    print('='*100)
    print(f"ID: {article_id}")
    print(f"Published: {score_row['published_utc']}")
    print(f"Title: {score_row['title']}")
    print(f"URL: {score_row['url']}")
    
    print(f"\n📏 BODY LENGTH")
    print(f"  Characters: {score_row['body_chars']:,}")
    print(f"  Length Score: {score_row['length_score']:.4f}")
    
    chars = score_row['body_chars']
    length_zone = _describe_length_zone(chars, score_row['length_score'])
    print(f"  Zone: {length_zone}")
    
    print(f"\n🎯 CONTENT RELEVANCE")
    print(f"  Content Score: {score_row['content_score']:.4f}")
    print(f"  Title keywords ({score_row['title_kw_count']}): {score_row['title_keywords']}")
    print(f"  Description keywords: {score_row['desc_kw_count']} unique")
    print(f"  Summary keywords: {score_row['summ_kw_count']} unique")
    
    # Per-field contributions recomputed from the keyword counts: each count is
    # capped (6 for title, 10 for description/summary) and then weighted.
    # NOTE(review): caps and weights are hard-coded here — confirm they match
    # _score_content_relevance, otherwise the three lines may not sum to the
    # content score printed above.
    title_contrib = min(score_row['title_kw_count'] / 6.0, 1.0) * 0.4
    desc_contrib = min(score_row['desc_kw_count'] / 10.0, 1.0) * 0.3
    summ_contrib = min(score_row['summ_kw_count'] / 10.0, 1.0) * 0.3
    
    print(f"  ├─ Title contribution: {title_contrib:.4f} (40% weight)")
    print(f"  ├─ Description contribution: {desc_contrib:.4f} (30% weight)")
    print(f"  └─ Summary contribution: {summ_contrib:.4f} (30% weight)")
    
    print(f"\n🏆 FINAL SCORE")
    # Weighted parts use the 30% / 70% split stated in the ranking header.
    length_weighted = score_row['length_score'] * 0.30
    content_weighted = score_row['content_score'] * 0.70
    print(f"  Length (30%): {score_row['length_score']:.4f} × 0.30 = {length_weighted:.4f}")
    print(f"  Content (70%): {score_row['content_score']:.4f} × 0.70 = {content_weighted:.4f}")
    print(f"  TOTAL: {score_row['total_score']:.4f}")
    
    if is_selected:
        print(f"\n  ✅ Selected for final summary")

# === Statistics ===
# Summary statistics over all scored articles (not only the selected ones).
_total = scores_df['total_score']
_length = scores_df['length_score']
_content = scores_df['content_score']

print("\n\n" + "=" * 100)
print("📈 SCORING STATISTICS")
print("=" * 100)
print(f"\nTotal articles analyzed: {len(scores_df)}")
print(f"Articles selected: {len(selected_ids)}")

print("\nScore Distribution:")
print(f"  Mean total score: {_total.mean():.4f}")
print(f"  Median total score: {_total.median():.4f}")
print(f"  Min total score: {_total.min():.4f}")
print(f"  Max total score: {_total.max():.4f}")
print(f"  Std dev: {_total.std():.4f}")

print("\nLength Scores:")
print(f"  Mean: {_length.mean():.4f}")
print(f"  Median: {_length.median():.4f}")

print("\nContent Scores:")
print(f"  Mean: {_content.mean():.4f}")
print(f"  Median: {_content.median():.4f}")

print("\nKeyword Matches:")
print(f"  Avg title keywords: {scores_df['title_kw_count'].mean():.1f}")
print(f"  Avg description keywords: {scores_df['desc_kw_count'].mean():.1f}")
print(f"  Avg summary keywords: {scores_df['summ_kw_count'].mean():.1f}")

# === Export ===
# Write the full score table (all articles, sorted by total score) to a CSV named
# after the ticker and date range under ~/finreport/build.
print("\n\n" + "="*100)
print("💾 EXPORT")
print("="*100)
output_path = Path.home() / "finreport" / "build" / f"article_scores_{TICKER}_{START_DATE}_{END_DATE}.csv"
# to_csv does not create missing directories, so make sure build/ exists first.
output_path.parent.mkdir(parents=True, exist_ok=True)
scores_df.to_csv(output_path, index=False)
print(f"✅ Exported full scores to: {output_path}")

print("\n" + "="*100)
print("✅ Analysis complete!")
print("="*100)
# End of the analysis cell.
# The selected-articles table, per-article breakdown, statistics and CSV export
# are each produced once by the sections above. Do not repeat them here: a second
# copy re-prints every article and rewrites the same CSV file.

Analyzing article scores for:
  Ticker: TSLA
  Period: 2025-10-01 to 2025-10-07
  Max articles: 3
  Min body chars: 800


Found 83 total articles matching criteria


📊 ALL ARTICLES RANKED BY SCORE

Score Calculation: 30% Length + 70% Content
  - Length Score: Plateau curve favoring ~2500 chars
  - Content Score: 40% title + 30% description + 30% summary keywords

                                                                                  title  body_chars  length_score  content_score  total_score  title_kw_count  desc_kw_count  summ_kw_count
0   Tesla (TSLA) Valuation: Examining Q3’s Record Deliveries and Policy-Driven Deman...        2891        0.9777         0.7400       0.8113               3              8             11
1                      Tesla sales rise 7% in Q3 as buyers rush to use $7,500 EV credit        2718        0.9875         0.6033       0.7186               5              5              4
2   Tesla Stock Slipped Despite Record Q3 Deliveries: Can The Company 