# Homework Starter — Stage 04: Data Acquisition and Ingestion
Name: 
Date: 

## Objectives
- API ingestion with secrets in `.env`
- Scrape a permitted public table
- Validate and save raw data to `data/raw/`

In [7]:
import os, pathlib, datetime as dt
import requests
import pandas as pd
from bs4 import BeautifulSoup
from dotenv import load_dotenv

RAW = pathlib.Path('../data/raw'); RAW.mkdir(parents=True, exist_ok=True)
load_dotenv(); print('ALPHAVANTAGE_API_KEY loaded?', bool(os.getenv('ALPHAVANTAGE_API_KEY')))

ALPHAVANTAGE_API_KEY loaded? True


## Helpers (use or modify)

In [8]:
def ts():
    return dt.datetime.now().strftime('%Y%m%d-%H%M%S')

def save_csv(df: pd.DataFrame, prefix: str, **meta):
    mid = '_'.join([f"{k}-{v}" for k,v in meta.items()])
    path = RAW / f"{prefix}_{mid}_{ts()}.csv"
    df.to_csv(path, index=False)
    print('Saved', path)
    return path

def validate(df: pd.DataFrame, required):
    missing = [c for c in required if c not in df.columns]
    return {'missing': missing, 'shape': df.shape, 'na_total': int(df.isna().sum().sum())}

## Part 1 — API Pull (Required)
Choose an endpoint (e.g., Alpha Vantage or use `yfinance` fallback).

In [9]:
SYMBOL = 'AAPL'
USE_ALPHA = bool(os.getenv('ALPHAVANTAGE_API_KEY'))
if USE_ALPHA:
    url = 'https://www.alphavantage.co/query'
    #adjusted doesn't seem to work, premium endpoint instead of free;
    # params = {'function':'TIME_SERIES_DAILY_ADJUSTED','symbol':SYMBOL,'outputsize':'compact','apikey':os.getenv('ALPHAVANTAGE_API_KEY')}
    params = {'function':'TIME_SERIES_DAILY','symbol':SYMBOL,'outputsize':'compact','apikey':os.getenv('ALPHAVANTAGE_API_KEY')}
    r = requests.get(url, params=params, timeout=30)
    r.raise_for_status()
    js = r.json()
    key = [k for k in js if 'Time Series' in k][0]
    df1 = pd.DataFrame(js[key]).T.reset_index()
    print('df1 columns:', df1.columns.tolist())
    #less columns in the free endpoint, had to remove some
    #df_api = pd.DataFrame(js[key]).T.reset_index().rename(columns={'index':'date','5. adjusted close':'adj_close'})[['date','adj_close']]
    df_api = pd.DataFrame(js[key]).T.reset_index().rename(columns={'index':'date',"4. close":"close"})[['date','close']]
    df_api['date'] = pd.to_datetime(df_api['date'])
    #df_api['adj_close'] = pd.to_numeric(df_api['adj_close'])
    #replace with free verison
    df_api['close'] = pd.to_numeric(df_api['close'])
else:
    import yfinance as yf
    df_api = yf.download(SYMBOL, period='3mo', interval='1d').reset_index()[['Date','Adj Close']]
    df_api.columns = ['date','adj_close']
#adjust for close
# v_api = validate(df_api, ['date','adj_close']); v_api
v_api = validate(df_api, ['date','close']); v_api


df1 columns: ['index', '1. open', '2. high', '3. low', '4. close', '5. volume']


{'missing': [], 'shape': (100, 2), 'na_total': 0}

In [10]:
_ = save_csv(df_api.sort_values('date'), prefix='api', source='alpha' if USE_ALPHA else 'yfinance', symbol=SYMBOL)

Saved ../data/raw/api_source-alpha_symbol-AAPL_20250820-183541.csv


## Part 2 — Scrape a Public Table (Required)
Replace `SCRAPE_URL` with a permitted page containing a simple table.

In [11]:
# SCRAPE_URL = 'https://example.com/markets-table'  # TODO: replace with permitted page
SCRAPE_URL = 'https://finance.yahoo.com/gainers'  # TODO: replace with permitted page
#good website for gaining companies by yahoo which is one of the originals
headers = {'User-Agent':'AFE-Homework/1.0'}
try:
    resp = requests.get(SCRAPE_URL, headers=headers, timeout=30); resp.raise_for_status()
    soup = BeautifulSoup(resp.text, 'html.parser')
    rows = [[c.get_text(strip=True) for c in tr.find_all(['th','td'])] for tr in soup.find_all('tr')]
    header, *data = [r for r in rows if r]
    df_scrape = pd.DataFrame(data, columns=header)
    print(data)
except Exception as e:
    print('Scrape failed, using inline demo table:', e)
    html = '<table><tr><th>Ticker</th><th>Price</th></tr><tr><td>AAA</td><td>101.2</td></tr></table>'
    soup = BeautifulSoup(html, 'html.parser')
    rows = [[c.get_text(strip=True) for c in tr.find_all(['th','td'])] for tr in soup.find_all('tr')]
    header, *data = [r for r in rows if r]
    df_scrape = pd.DataFrame(data, columns=header)

if 'Price' in df_scrape.columns:
    df_scrape['Price'] = pd.to_numeric(df_scrape['Price'], errors='coerce')
v_scrape = validate(df_scrape, list(df_scrape.columns)); v_scrape

[['NEGG', 'Newegg Commerce, Inc.', '', '110.64+21.24(+23.76%)', '+21.24', '+23.76%', '1.577M', '1.136M', '2.155B', '--', '+419.77%', ''], ['LNZA', 'LanzaTech Global, Inc.', '', '23.28+3.56(+18.05%)', '+3.56', '+18.05%', '126,383', '28,794', '54.001M', '--', '-88.26%', ''], ['SRRK', 'Scholar Rock Holding Corporation', '', '34.40+4.29(+14.26%)', '+4.29', '+14.26%', '5.374M', '1.417M', '3.307B', '--', '+225.27%', ''], ['STEM', 'Stem, Inc.', '', '16.47+1.93(+13.27%)', '+1.93', '+13.27%', '589,501', '521,463', '137.674M', '--', '+15.21%', ''], ['MCRB', 'Seres Therapeutics, Inc.', '', '17.94+1.69(+10.40%)', '+1.69', '+10.40%', '130,131', '119,888', '157.057M', '--', '-7.04%', ''], ['GDS', 'GDS Holdings Limited', '', '33.90+2.31(+7.31%)', '+2.31', '+7.31%', '3.427M', '2.151M', '6.756B', '308.18', '+114.02%', ''], ['CVI', 'CVR Energy, Inc.', '', '28.89+1.88(+6.96%)', '+1.88', '+6.96%', '2.319M', '1.173M', '2.904B', '--', '+9.75%', ''], ['ADI', 'Analog Devices, Inc.', '', '244.87+14.43(+6.26%)'

{'missing': [], 'shape': (25, 12), 'na_total': 25}

In [12]:
_ = save_csv(df_scrape, prefix='scrape', site='yahoo_finance', table='gainers')

Saved ../data/raw/scrape_site-yahoo_finance_table-gainers_20250820-183542.csv


## Documentation
- API Source: https://www.alphavantage.co/query with params TIME_SERIES_DAILY for funciotn type, AAPL for symbol, compact outputsize, and my free tier api key
- Scrape Source: https://finance.yahoo.com/gainers -> table of top gainers in the market
- Assumptions & risks: To the API there are definitely rate limits, but in terms of the scrape since its just extracting from the http request that can be done whenever. Selector fragility was seen when the endpoint user here was changed to a premium endpoint. Therefore the coed here can only be guaranteed to work for AlphaVantage for only some time. Schema changes can be more likely than selector fragility due to endpoint changes, but they are still unlikely considering I only use a handful of columns

