# Homework Starter — Stage 04: Data Acquisition and Ingestion
Name: 
Date: 

## Objectives
- API ingestion with secrets in `.env`
- Scrape a permitted public table
- Validate and save raw data to `data/raw/`

In [2]:
import os, pathlib, datetime as dt
import requests
import pandas as pd
from bs4 import BeautifulSoup
from dotenv import load_dotenv

RAW = pathlib.Path('data/raw'); RAW.mkdir(parents=True, exist_ok=True)
load_dotenv(); print('ALPHAVANTAGE_API_KEY loaded?', bool(os.getenv('ALPHAVANTAGE_API_KEY')))

ALPHAVANTAGE_API_KEY loaded? True


## Helpers (use or modify)

In [3]:
def ts():
    return dt.datetime.now().strftime('%Y%m%d-%H%M%S')

def save_csv(df: pd.DataFrame, prefix: str, **meta):
    mid = '_'.join([f"{k}-{v}" for k,v in meta.items()])
    path = RAW / f"{prefix}_{mid}_{ts()}.csv"
    df.to_csv(path, index=False)
    print('Saved', path)
    return path

def validate(df: pd.DataFrame, required):
    missing = [c for c in required if c not in df.columns]
    return {'missing': missing, 'shape': df.shape, 'na_total': int(df.isna().sum().sum())}

## Part 1 — API Pull (Required)
Choose an endpoint (e.g., Alpha Vantage or use `yfinance` fallback).

In [20]:
SYMBOL = 'AAPL'
USE_ALPHA = bool(os.getenv('ALPHAVANTAGE_API_KEY'))
if USE_ALPHA:
    url = 'https://www.alphavantage.co/query'
    #adjusted doesn't seem to work, premium endpoint instead of free;
    # params = {'function':'TIME_SERIES_DAILY_ADJUSTED','symbol':SYMBOL,'outputsize':'compact','apikey':os.getenv('ALPHAVANTAGE_API_KEY')}
    params = {'function':'TIME_SERIES_DAILY','symbol':SYMBOL,'outputsize':'compact','apikey':os.getenv('ALPHAVANTAGE_API_KEY')}
    r = requests.get(url, params=params, timeout=30)
    r.raise_for_status()
    js = r.json()
    key = [k for k in js if 'Time Series' in k][0]
    df1 = pd.DataFrame(js[key]).T.reset_index()
    print('df1 columns:', df1.columns.tolist())
    #less columns in the free endpoint, had to remove some
    #df_api = pd.DataFrame(js[key]).T.reset_index().rename(columns={'index':'date','5. adjusted close':'adj_close'})[['date','adj_close']]
    df_api = pd.DataFrame(js[key]).T.reset_index().rename(columns={'index':'date',"4. close":"close"})[['date','close']]
    df_api['date'] = pd.to_datetime(df_api['date'])
    #df_api['adj_close'] = pd.to_numeric(df_api['adj_close'])
    #replace with free verison
    df_api['close'] = pd.to_numeric(df_api['close'])
else:
    import yfinance as yf
    df_api = yf.download(SYMBOL, period='3mo', interval='1d').reset_index()[['Date','Adj Close']]
    df_api.columns = ['date','adj_close']
#adjust for close
# v_api = validate(df_api, ['date','adj_close']); v_api
v_api = validate(df_api, ['date','close']); v_api


df1 columns: ['index', '1. open', '2. high', '3. low', '4. close', '5. volume']


{'missing': [], 'shape': (100, 2), 'na_total': 0}

In [21]:
_ = save_csv(df_api.sort_values('date'), prefix='api', source='alpha' if USE_ALPHA else 'yfinance', symbol=SYMBOL)

Saved data/raw/api_source-alpha_symbol-AAPL_20250817-212427.csv


## Part 2 — Scrape a Public Table (Required)
Replace `SCRAPE_URL` with a permitted page containing a simple table.

In [None]:
# SCRAPE_URL = 'https://example.com/markets-table'  # TODO: replace with permitted page
SCRAPE_URL = 'https://finance.yahoo.com/gainers'  # TODO: replace with permitted page
#good website for gaining companies by yahoo which is one of the originals
headers = {'User-Agent':'AFE-Homework/1.0'}
try:
    resp = requests.get(SCRAPE_URL, headers=headers, timeout=30); resp.raise_for_status()
    soup = BeautifulSoup(resp.text, 'html.parser')
    rows = [[c.get_text(strip=True) for c in tr.find_all(['th','td'])] for tr in soup.find_all('tr')]
    header, *data = [r for r in rows if r]
    df_scrape = pd.DataFrame(data, columns=header)
    print(data)
except Exception as e:
    print('Scrape failed, using inline demo table:', e)
    html = '<table><tr><th>Ticker</th><th>Price</th></tr><tr><td>AAA</td><td>101.2</td></tr></table>'
    soup = BeautifulSoup(html, 'html.parser')
    rows = [[c.get_text(strip=True) for c in tr.find_all(['th','td'])] for tr in soup.find_all('tr')]
    header, *data = [r for r in rows if r]
    df_scrape = pd.DataFrame(data, columns=header)

if 'Price' in df_scrape.columns:
    df_scrape['Price'] = pd.to_numeric(df_scrape['Price'], errors='coerce')
v_scrape = validate(df_scrape, list(df_scrape.columns)); v_scrape

[['RUN', 'Sunrun Inc.', '', '13.92+3.44(+32.82%)', '+3.44', '+32.82%', '76.432M', '24.218M', '3.212B', '--', '-31.29%', ''], ['NXT', 'Nextracker Inc.', '', '60.58+6.59(+12.21%)', '+6.59', '+12.21%', '6.048M', '2.503M', '9.195B', '16.51', '+50.40%', ''], ['UNH', 'UnitedHealth Group Incorporated', '', '304.01+32.52(+11.98%)', '+32.52', '+11.98%', '67.496M', '17.987M', '275.334B', '13.16', '-47.47%', ''], ['FSLR', 'First Solar, Inc.', '', '199.95+19.90(+11.05%)', '+19.90', '+11.05%', '10.932M', '4.116M', '21.444B', '17.09', '-12.45%', ''], ['BHC', 'Bausch Health Companies Inc.', '', '8.34+0.83(+11.05%)', '+0.83', '+11.05%', '8.902M', '2.783M', '3.084B', '32.08', '+45.04%', ''], ['HSAI', 'Hesai Group', '', '25.59+2.27(+9.73%)', '+2.27', '+9.73%', '11.791M', '3.431M', '3.39B', '232.64', '+464.90%', ''], ['ARWR', 'Arrowhead Pharmaceuticals, Inc.', '', '20.51+1.73(+9.21%)', '+1.73', '+9.21%', '5.202M', '1.933M', '2.836B', '--', '-12.16%', ''], ['NU', 'Nu Holdings Ltd.', '', '13.10+1.09(+9.08%

{'missing': [], 'shape': (25, 12), 'na_total': 25}

In [11]:
_ = save_csv(df_scrape, prefix='scrape', site='example', table='markets')

Saved data/raw/scrape_site-example_table-markets_20250817-211839.csv


## Documentation
- API Source: https://www.alphavantage.co/query with params
TIME_SERIES_DAILY for funciotn type, AAPL for symbol, compact outputsize, and my free tier api key
- Scrape Source: (URL/table description)
- Assumptions & risks: (rate limits, selector fragility, schema changes)
- Confirm `.env` is not committed.
