In [None]:
"""
Fetch company news from Finnhub, visit each article URL, extract the
main text, and save everything to a CSV.

🔑  Prereqs
    pip install requests beautifulsoup4 lxml pandas tqdm

!!!  Some sites block scraping or require JS.  BeautifulSoup works
     fine for most plain‑HTML pages, but expect occasional failures.
"""

In [None]:
import csv
import os
import time
from datetime import date, timedelta, datetime

import pandas as pd
import requests
from bs4 import BeautifulSoup
from tqdm import tqdm

In [None]:
# ---------------- USER SETTINGS -----------------------------------
# The Finnhub key is read from the environment so that it never lands in the
# notebook file, its rendered outputs, or version control.
#   export FINNHUB_API_KEY="<your key>"   (before starting Jupyter)
API_TOKEN  = os.environ.get("FINNHUB_API_KEY", "")
SYMBOL     = "AAPL"           # Stock ticker
DAYS_BACK  = 50               # Look-back window, in days
MAX_ART    = 100              # How many articles to pull (keep low to start)
# ------------------------------------------------------------------

if not API_TOKEN:
    raise RuntimeError(
        "Finnhub API key not found: set the FINNHUB_API_KEY environment variable."
    )

to_date   = date.today()
from_date = to_date - timedelta(days=DAYS_BACK)

# ------- 1) Get news metadata from Finnhub ------------------------
url     = "https://finnhub.io/api/v1/company-news"
params  = {"symbol": SYMBOL,
           "from":   from_date.isoformat(),
           "to":     to_date.isoformat()}
# Send the key as a header instead of a query parameter: the request URL is
# echoed in the HTTPError message raised by raise_for_status(), and a key in
# the query string would be printed into the cell output with it.
finnhub_headers = {"X-Finnhub-Token": API_TOKEN}

resp = requests.get(url, params=params, headers=finnhub_headers, timeout=10)
resp.raise_for_status()

payload = resp.json()
if not isinstance(payload, list):
    # Slicing below assumes a JSON array; fail with a readable message if the
    # API answered with something else (e.g. an error object).
    raise ValueError(f"Unexpected response from Finnhub: {payload!r}")
news_items = payload[:MAX_ART]   # trim to desired count

print(f"Pulled {len(news_items)} news items for {SYMBOL}")

# ------- 2) Helper: extract readable text from an article ---------
headers = {"User-Agent": "Mozilla/5.0"}   # mimic browser


def _paragraph_texts(node):
    """Return the non-empty, whitespace-normalised text of every <p> under node."""
    texts = (p.get_text(" ", strip=True) for p in node.find_all("p"))
    return [t for t in texts if t]


def scrape_article(url, timeout=10, max_chars=4000):
    """Download a page and return a best-effort extract of its main text.

    Parameters
    ----------
    url : str
        Address of the article to fetch.
    timeout : float, optional
        Seconds to wait for the HTTP request (default 10).
    max_chars : int, optional
        Maximum length of the returned text (default 4000), which keeps the
        CSV manageable.

    Returns
    -------
    str
        The extracted text, truncated to ``max_chars``. On any failure the
        function does not raise; it returns ``"[scrape error: <reason>]"`` so
        one bad page cannot abort a long scraping run.
    """
    try:
        r = requests.get(url, headers=headers, timeout=timeout)
        r.raise_for_status()

        # Parse the raw bytes rather than r.text: when the Content-Type header
        # carries no charset, requests decodes text/* as ISO-8859-1, which
        # garbles UTF-8 pages. Honour an explicit header charset, otherwise
        # let BeautifulSoup sniff the encoding from the document itself.
        content_type = r.headers.get("content-type", "").lower()
        declared = r.encoding if "charset" in content_type else None
        soup = BeautifulSoup(r.content, "lxml", from_encoding=declared)

        # Crude heuristic: prefer the paragraphs inside <article> if present.
        paragraphs = []
        art_tag = soup.find("article")
        if art_tag:
            paragraphs = _paragraph_texts(art_tag)

        if not paragraphs:
            # Fallback: keep the 10 longest paragraphs of the page, but emit
            # them in document order so the text still reads coherently.
            candidates = _paragraph_texts(soup)
            longest = sorted(range(len(candidates)),
                             key=lambda i: len(candidates[i]),
                             reverse=True)[:10]
            paragraphs = [candidates[i] for i in sorted(longest)]

        return " ".join(paragraphs)[:max_chars]
    except Exception as e:
        # Deliberately broad: scraping is best-effort and the caller stores
        # the error marker in place of the article text.
        return f"[scrape error: {e}]"

# ------- 3) Loop through articles, scrape, collect rows ----------
# Column order of the output file; fixing it here means the CSV still gets a
# header row when no articles were returned.
csv_columns = ["datetime", "headline", "source", "url", "text"]

rows = []
for art in tqdm(news_items, desc="Scraping"):
    # Read fields defensively: a single incomplete item should not raise and
    # throw away every row scraped so far.
    stamp = art.get("datetime")
    if isinstance(stamp, (int, float)):
        # fromtimestamp() without a tz argument yields the machine's local time.
        published = datetime.fromtimestamp(stamp).strftime("%Y-%m-%d %H:%M")
    else:
        published = ""

    article_url = art.get("url") or ""
    if article_url:
        body = scrape_article(article_url)
        time.sleep(0.7)          # polite delay so we don't hammer sites
    else:
        # Nothing to fetch, so skip both the request and the delay.
        body = "[scrape error: missing url]"

    rows.append({
        "datetime": published,
        "headline": art.get("headline", ""),
        "source"  : art.get("source", ""),
        "url"     : article_url,
        "text"    : body,
    })

# ------- 4) Write to CSV -----------------------------------------
out_file = f"{SYMBOL}_news_{from_date}_{to_date}.csv"
pd.DataFrame(rows, columns=csv_columns).to_csv(
    out_file, index=False, quoting=csv.QUOTE_MINIMAL
)

print(f"\nSaved {len(rows)} articles → {out_file}")
