In [None]:
from alpaca.data.historical.news import NewsClient
from alpaca.data.requests import NewsRequest
import pandas as pd
from datetime import datetime, timedelta

# Load API credential from YAML configuration file
print('Loading API keys...')

with open("data/conf/keys.yaml", "r") as f:
    keys = yaml.safe_load(f)

API_KEY = keys['KEYS']['APCA-API-KEY-ID-Data']
SECRET_KEY = keys['KEYS']['APCA-API-SECRET-KEY-Data']

print("API keys loaded.")



In [None]:
from alpaca.data.historical import NewsClient
from alpaca.data.requests import NewsRequest

print('Downloading news data...')

news_client = NewsClient(
    api_key=API_KEY,
    secret_key=SECRET_KEY
)
start_date = datetime(2022, 1, 1)
end_date = datetime.now()


In [None]:
all_articles = []

current_start = start_date
block_size = timedelta(days=7)  # 7-Tage-Blöcke

while current_start < end_date:
    current_end = min(current_start + block_size, end_date)

    request = NewsRequest(
        symbols="APPL",
        start=current_start,
        end=current_end,
        limit=50  # max pro Request
    )

    articles = news_client.get_news(request)
    articles_list = list(articles)  # NewsSet → list

    if articles_list:
        all_articles.extend(articles_list)
        print(f"Fetched {len(articles_list)} articles from {current_start.date()} to {current_end.date()}")

    current_start = current_end  # nächster Block

print(f"Total articles fetched: {len(all_articles)}")


In [None]:
# Headlines, Publish Time und URL extrahieren
data = []
for item in all_articles:
    # Falls item ein Dict ist
    if isinstance(item, dict):
        data.append({
            "headline": item.get("headline", ""),
            "published_at": item.get("published_at", ""),
            "url": item.get("url", "")
        })
    # Falls es ein String ist (sehr selten)
    else:
        data.append({
            "headline": str(item),
            "published_at": "",
            "url": ""
        })

import pandas as pd
df = pd.DataFrame(data)

# CSV speichern
csv_path = 'congress_news.csv' #"apple_news_jan2022_to_now.csv"
df.to_csv(csv_path, index=False)

print(f"CSV saved as: {csv_path}")
df.head()

In [None]:
import pandas as pd
from datetime import datetime

# Debug: Let's see what's actually in all_articles
print(f"Type of all_articles: {type(all_articles)}")
print(f"Length of all_articles: {len(all_articles)}")

if all_articles:
    print(f"Type of first item: {type(all_articles[0])}")
    print(f"First item length: {len(all_articles[0])}")

# Process the tuples - they are in format ('data', {'news': [...]})
flattened_data = []

for item in all_articles:
    if isinstance(item, tuple) and len(item) == 2:
        if item[0] == 'data' and isinstance(item[1], dict) and 'news' in item[1]:
            # Extract news from the dictionary
            news_list = item[1]['news']
            for article in news_list:
                # Access attributes directly from the News object
                created_at = article.created_at
                if hasattr(created_at, 'strftime'):
                    created_at = created_at.strftime('%Y-%m-%d %H:%M:%S')

                flattened_data.append({
                    'id': article.id,
                    'headline': article.headline,
                    'created_at': created_at,
                    'url': article.url

                })

print(f"Processed {len(flattened_data)} articles")

# Create DataFrame
clean_df = pd.DataFrame(flattened_data)

print(f"Created DataFrame with {len(clean_df)} articles")

# Save to CSV
clean_df.to_csv("CLEAN_congress_news.csv", index=False, encoding='utf-8')
print("CSV saved successfully!")

# Display the first few rows
if not clean_df.empty:
    print("\nFirst 5 rows:")
    print(clean_df.head())
else:
    print("DataFrame is empty")

In [None]:
import kagglehub

# Download latest version
path = kagglehub.dataset_download("lukekerbs/us-senate-financial-disclosures-stocks-and-options")

print("Path to dataset files:", path)

In [None]:
import pandas as pd

# Datei einlesen
df = pd.read_csv(file_path)

# Zeige die Spaltennamen
print(df.columns)


In [None]:
ticker_counts = df.groupby('ticker').agg({
    'asset_name': lambda x: ', '.join(x.unique()),
    'ticker': 'size'
}).rename(columns={'ticker': 'Count'}).sort_values(by='Count', ascending=False)

print(ticker_counts.head(20))


In [93]:
""" Scrape stock transactions from Senator periodic filings (resumable + ETA logging) """

from bs4 import BeautifulSoup
import logging
import pandas as pd
import requests
import time
import os
from datetime import datetime, timedelta

ROOT = 'https://efdsearch.senate.gov'
LANDING_PAGE_URL = f'{ROOT}/search/home/'
SEARCH_PAGE_URL = f'{ROOT}/search/'
REPORTS_URL = f'{ROOT}/search/report/data/'

BATCH_SIZE = 100
RATE_LIMIT_SECS = 2
PDF_PREFIX = '/search/view/paper/'
OUTPUT_CSV = 'senator_transactions_all.csv'
MAX_RETRIES = 3
TIMEOUT = 10

REPORT_COL_NAMES = [
    'tx_date',
    'file_date',
    'last_name',
    'first_name',
    'order_type',
    'ticker',
    'asset_name',
    'tx_amount',
    'link'
]

LOGGER = logging.getLogger(__name__)

def add_rate_limit(f):
    def wrapper(*args, **kwargs):
        time.sleep(RATE_LIMIT_SECS)
        return f(*args, **kwargs)
    return wrapper

def _csrf(client: requests.Session) -> str:
    landing_page_response = client.get(LANDING_PAGE_URL)
    assert landing_page_response.url == LANDING_PAGE_URL, "Failed to fetch filings landing page"
    landing_page = BeautifulSoup(landing_page_response.text, "html.parser")
    form_csrf = landing_page.find(attrs={'name': 'csrfmiddlewaretoken'})['value']

    client.post(LANDING_PAGE_URL,
                data={'csrfmiddlewaretoken': form_csrf, 'prohibition_agreement': '1'},
                headers={'Referer': LANDING_PAGE_URL})

    return client.cookies.get('csrftoken') or client.cookies.get('csrf')

def reports_api(client: requests.Session, start_date: str, end_date: str, token: str):
    data = {
        'start': '0',  # immer vom Anfang
        'length': str(BATCH_SIZE),
        'report_types': '[11]',
        'filer_types': '[]',
        'submitted_start_date': start_date,
        'submitted_end_date': end_date,
        'candidate_state': '',
        'senator_state': '',
        'office_id': '',
        'first_name': '',
        'last_name': '',
        'csrfmiddlewaretoken': token
    }
    LOGGER.info(f'Getting reports from {start_date} to {end_date}')

    for attempt in range(MAX_RETRIES):
        try:
            resp = client.post(REPORTS_URL, data=data, headers={'Referer': SEARCH_PAGE_URL}, timeout=TIMEOUT)
            resp.raise_for_status()
            return resp.json()['data']
        except Exception as e:
            LOGGER.warning(f'Attempt {attempt+1} failed: {e}')
            time.sleep(2 ** attempt)
    raise RuntimeError(f'Failed to fetch reports from {start_date} to {end_date} after {MAX_RETRIES} attempts')

def _tbody_from_link(client: requests.Session, link: str):
    report_url = f'{ROOT}{link}'
    resp = client.get(report_url)
    if resp.url == LANDING_PAGE_URL:
        _csrf(client)
        resp = client.get(report_url)
    report = BeautifulSoup(resp.text, "html.parser")
    tbodies = report.find_all('tbody')
    return tbodies[0] if tbodies else None

def txs_for_report_all(client: requests.Session, row):
    first, last, _, link_html, date_received = row
    link_soup = BeautifulSoup(link_html, "html.parser")
    a_tag = link_soup.a
    link = a_tag.get('href') if a_tag else None

    if not link or link.startswith(PDF_PREFIX):
        return pd.DataFrame([{
            'tx_date': None,
            'file_date': date_received,
            'last_name': last,
            'first_name': first,
            'order_type': None,
            'ticker': None,
            'asset_name': None,
            'tx_amount': None,
            'link': f"{ROOT}{link}" if link else None
        }])

    tbody = _tbody_from_link(client, link)
    if not tbody:
        return pd.DataFrame([{
            'tx_date': None,
            'file_date': date_received,
            'last_name': last,
            'first_name': first,
            'order_type': None,
            'ticker': None,
            'asset_name': None,
            'tx_amount': None,
            'link': f"{ROOT}{link}"
        }])

    stocks = []
    for tr in tbody.find_all('tr'):
        cols = [c.get_text().strip() for c in tr.find_all('td')]
        if len(cols) < 8:
            continue
        tx_date, ticker, asset_name, asset_type, order_type, tx_amount = \
            cols[1], cols[3], cols[4], cols[5], cols[6], cols[7]
        if asset_type != 'Stock' and ticker.strip() in ('--', ''):
            continue
        stocks.append({
            'tx_date': tx_date,
            'file_date': date_received,
            'last_name': last,
            'first_name': first,
            'order_type': order_type,
            'ticker': ticker,
            'asset_name': asset_name,
            'tx_amount': tx_amount,
            'link': f"{ROOT}{link}"
        })
    return pd.DataFrame(stocks)

def main():
    LOGGER.info('Initializing client')
    client = requests.Session()
    client.get = add_rate_limit(client.get)
    client.post = add_rate_limit(client.post)

    token = _csrf(client)

    start = datetime(2012, 1, 1)
    end = datetime.today()

    # CSV vorbereiten
    if os.path.exists(OUTPUT_CSV):
        mode = 'a'
        header = False
    else:
        mode = 'w'
        header = True

    while start < end:
        month_end = (start.replace(day=28) + timedelta(days=4)).replace(day=1) - timedelta(days=1)
        start_str = start.strftime("%m/%d/%Y 00:00:00")
        end_str = month_end.strftime("%m/%d/%Y 23:59:59")

        batch = reports_api(client, start_str, end_str, token)

        for r in batch:
            df = txs_for_report_all(client, r)
            df.to_csv(OUTPUT_CSV, mode=mode, header=header, index=False)
            header = False
            mode = 'a'

        start = month_end + timedelta(days=1)

if __name__ == '__main__':
    logging.basicConfig(level=logging.INFO, format='[%(asctime)s %(levelname)s] %(message)s')
    main()


[2025-11-30 14:32:24,463 INFO] Initializing client
[2025-11-30 14:32:29,499 INFO] Getting reports from 01/01/2012 00:00:00 to 01/31/2012 23:59:59
[2025-11-30 14:32:31,797 INFO] Getting reports from 02/01/2012 00:00:00 to 02/29/2012 23:59:59
[2025-11-30 14:32:34,135 INFO] Getting reports from 03/01/2012 00:00:00 to 03/31/2012 23:59:59
[2025-11-30 14:32:36,484 INFO] Getting reports from 04/01/2012 00:00:00 to 04/30/2012 23:59:59
[2025-11-30 14:32:38,889 INFO] Getting reports from 05/01/2012 00:00:00 to 05/31/2012 23:59:59
[2025-11-30 14:32:41,227 INFO] Getting reports from 06/01/2012 00:00:00 to 06/30/2012 23:59:59
[2025-11-30 14:32:43,574 INFO] Getting reports from 07/01/2012 00:00:00 to 07/31/2012 23:59:59
[2025-11-30 14:32:45,998 INFO] Getting reports from 08/01/2012 00:00:00 to 08/31/2012 23:59:59
[2025-11-30 14:32:48,347 INFO] Getting reports from 09/01/2012 00:00:00 to 09/30/2012 23:59:59
[2025-11-30 14:32:50,687 INFO] Getting reports from 10/01/2012 00:00:00 to 10/31/2012 23:59:59

In [90]:
# --- Notebook Cell ---
""" Test scraping first N Senator reports (including PDFs) """

from bs4 import BeautifulSoup
import logging
import pandas as pd
import requests
import time

ROOT = 'https://efdsearch.senate.gov'
LANDING_PAGE_URL = f'{ROOT}/search/home/'
SEARCH_PAGE_URL = f'{ROOT}/search/'
REPORTS_URL = f'{ROOT}/search/report/data/'

BATCH_SIZE = 100
RATE_LIMIT_SECS = 2
PDF_PREFIX = '/search/view/paper/'

REPORT_COL_NAMES = [
    'tx_date',
    'file_date',
    'last_name',
    'first_name',
    'order_type',
    'ticker',
    'asset_name',
    'tx_amount',
    'link'
]

logging.basicConfig(level=logging.INFO, format='[%(asctime)s %(levelname)s] %(message)s')
LOGGER = logging.getLogger()

def add_rate_limit(f):
    def wrapper(*args, **kwargs):
        time.sleep(RATE_LIMIT_SECS)
        return f(*args, **kwargs)
    return wrapper

def _csrf(client: requests.Session) -> str:
    landing_page_response = client.get(LANDING_PAGE_URL)
    landing_page = BeautifulSoup(landing_page_response.text, "html.parser")
    form_csrf = landing_page.find(attrs={'name': 'csrfmiddlewaretoken'})['value']
    client.post(LANDING_PAGE_URL,
                data={'csrfmiddlewaretoken': form_csrf, 'prohibition_agreement': '1'},
                headers={'Referer': LANDING_PAGE_URL})
    return client.cookies.get('csrftoken') or client.cookies.get('csrf')

def reports_api(client: requests.Session, offset: int, token: str):
    data = {
        'start': str(offset),
        'length': str(BATCH_SIZE),
        'report_types': '[11]',
        'filer_types': '[]',
        'submitted_start_date': '01/01/2012 00:00:00',
        'submitted_end_date': '',
        'candidate_state': '',
        'senator_state': '',
        'office_id': '',
        'first_name': '',
        'last_name': '',
        'csrfmiddlewaretoken': token
    }
    LOGGER.info(f'Getting rows starting at {offset}')
    resp = client.post(REPORTS_URL, data=data, headers={'Referer': SEARCH_PAGE_URL})
    return resp.json()['data']

def senator_reports(client: requests.Session):
    token = _csrf(client)
    idx = 0
    all_reports = []
    while True:
        batch = reports_api(client, idx, token)
        if not batch:
            break
        all_reports.extend(batch)
        idx += BATCH_SIZE
    return all_reports

def _tbody_from_link(client: requests.Session, link: str):
    report_url = f'{ROOT}{link}'
    resp = client.get(report_url)
    report = BeautifulSoup(resp.text, "html.parser")
    tbodies = report.find_all('tbody')
    return tbodies[0] if tbodies else None

def txs_for_report_all(client: requests.Session, row):
    first, last, _, link_html, date_received = row
    link_soup = BeautifulSoup(link_html, "html.parser")
    a_tag = link_soup.a
    link = a_tag.get('href') if a_tag else None
    full_link = f"{ROOT}{link}" if link else None

    if not link or link.startswith(PDF_PREFIX):
        return pd.DataFrame([{
            'tx_date': None,
            'file_date': date_received,
            'last_name': last,
            'first_name': first,
            'order_type': None,
            'ticker': None,
            'asset_name': None,
            'tx_amount': None,
            'link': full_link
        }])

    tbody = _tbody_from_link(client, link)
    if not tbody:
        return pd.DataFrame([{
            'tx_date': None,
            'file_date': date_received,
            'last_name': last,
            'first_name': first,
            'order_type': None,
            'ticker': None,
            'asset_name': None,
            'tx_amount': None,
            'link': full_link
        }])

    stocks = []
    for tr in tbody.find_all('tr'):
        cols = [c.get_text().strip() for c in tr.find_all('td')]
        if len(cols) < 8:
            continue
        tx_date, ticker, asset_name, asset_type, order_type, tx_amount = \
            cols[1], cols[3], cols[4], cols[5], cols[6], cols[7]
        if asset_type != 'Stock' and ticker.strip() in ('--', ''):
            continue
        stocks.append({
            'tx_date': tx_date,
            'file_date': date_received,
            'last_name': last,
            'first_name': first,
            'order_type': order_type,
            'ticker': ticker,
            'asset_name': asset_name,
            'tx_amount': tx_amount,
            'link': full_link
        })
    return pd.DataFrame(stocks)

# --- Test cell: nur die ersten 5 Reports ---
client = requests.Session()
client.get = add_rate_limit(client.get)
client.post = add_rate_limit(client.post)

reports = senator_reports(client)[:5]  # nur erste 5 Reports
all_txs = pd.concat([txs_for_report_all(client, r) for r in reports], ignore_index=True)

# CSV Export
all_txs.to_csv('senator_transactions_test.csv', index=False)

# Vorschau
all_txs.head()


[2025-11-30 13:46:37,476 INFO] Getting rows starting at 0
[2025-11-30 13:46:39,858 INFO] Getting rows starting at 100
[2025-11-30 13:46:42,197 INFO] Getting rows starting at 200
[2025-11-30 13:46:44,559 INFO] Getting rows starting at 300
[2025-11-30 13:46:46,927 INFO] Getting rows starting at 400
[2025-11-30 13:46:49,293 INFO] Getting rows starting at 500
[2025-11-30 13:46:51,646 INFO] Getting rows starting at 600
[2025-11-30 13:46:53,978 INFO] Getting rows starting at 700
[2025-11-30 13:46:56,550 INFO] Getting rows starting at 800
[2025-11-30 13:46:58,906 INFO] Getting rows starting at 900
[2025-11-30 13:47:01,248 INFO] Getting rows starting at 1000
[2025-11-30 13:47:03,590 INFO] Getting rows starting at 1100
[2025-11-30 13:47:05,925 INFO] Getting rows starting at 1200
[2025-11-30 13:47:08,321 INFO] Getting rows starting at 1300
[2025-11-30 13:47:10,756 INFO] Getting rows starting at 1400
[2025-11-30 13:47:13,141 INFO] Getting rows starting at 1500
[2025-11-30 13:47:15,519 INFO] Getti

Unnamed: 0,tx_date,file_date,last_name,first_name,order_type,ticker,asset_name,tx_amount,link
0,11/25/2025,11/27/2025,McCormick,David H,Purchase,BITB,Bitwise Bitcoin ETF,"$15,001 - $50,000",https://efdsearch.senate.gov/search/view/ptr/5...
1,11/24/2025,11/27/2025,McCormick,David H,Purchase,BITB,Bitwise Bitcoin ETF,"$50,001 - $100,000",https://efdsearch.senate.gov/search/view/ptr/5...
2,11/21/2025,11/26/2025,Smith,Tina,Sale (Full),HBAN,Huntington Bancshares Incorporated - Common Stock,"$100,001 - $250,000",https://efdsearch.senate.gov/search/view/ptr/8...
3,07/10/2025,11/24/2025,Peters,Gary C,Sale (Full),OGN,Organon & Co.,"$1,001 - $15,000",https://efdsearch.senate.gov/search/view/ptr/1...
4,11/03/2025,11/21/2025,Mullin,Markwayne,Sale (Full),FI,Fiserv Inc,"$15,001 - $50,000",https://efdsearch.senate.gov/search/view/ptr/8...
