# Homework Starter — Stage 04: Data Acquisition and Ingestion
Name: Wissal Barday
Date: 20/08/2025

## Objectives
- API ingestion with secrets in `.env`
- Scrape a permitted public table
- Validate and save raw data to `data/raw/`

In [15]:
import os, pathlib, datetime as dt
import requests
import pandas as pd
from bs4 import BeautifulSoup
from dotenv import load_dotenv

RAW = pathlib.Path('data/raw'); RAW.mkdir(parents=True, exist_ok=True)
load_dotenv(); print('ALPHAVANTAGE_API_KEY loaded?', bool(os.getenv('ALPHAVANTAGE_API_KEY')))

ALPHAVANTAGE_API_KEY loaded? False


## Part 1 — API Pull (Required)
Choose an endpoint (e.g., Alpha Vantage or use `yfinance` fallback).

In [None]:
SYMBOL = 'AAPL'
USE_ALPHA = bool(os.getenv('ALPHAVANTAGE_API_KEY'))
if USE_ALPHA:
    url = 'https://www.alphavantage.co/query'
    params = {'function':'TIME_SERIES_DAILY_ADJUSTED','symbol':SYMBOL,'outputsize':'compact','apikey':os.getenv('ALPHAVANTAGE_API_KEY')}
    r = requests.get(url, params=params, timeout=30)
    r.raise_for_status()
    js = r.json()
    key = [k for k in js if 'Time Series' in k][0]
    df_api = pd.DataFrame(js[key]).T.reset_index().rename(columns={'index':'date','5. adjusted close':'adj_close'})[['date','adj_close']]
    df_api['date'] = pd.to_datetime(df_api['date']); df_api['adj_close'] = pd.to_numeric(df_api['adj_close'])
else:
    import yfinance as yf
df_api = yf.download(SYMBOL, period='3mo', interval='1d') \
    .reset_index()[['Date','Close']]
df_api.columns = ['date','adj_close']   # treat Close as adjusted close

v_api = validate(df_api, ['date','adj_close']); v_api


  df_api = yf.download(SYMBOL, period='3mo', interval='1d') \
[*********************100%***********************]  1 of 1 completed

        date   adj_close
0 2025-05-20  206.625504
1 2025-05-21  201.860901
2 2025-05-22  201.131729
3 2025-05-23  195.048645
4 2025-05-27  199.983047





In [None]:
# preview
display(df_api.head())
print(df_api.dtypes)

# validate
v_api = validate(df_api, ['date','adj_close'])
print("Validation:", v_api)

src = 'alphavantage' if USE_ALPHA else 'yfinance'
path_api = save_csv(df_api, prefix='api', source=src, symbol=SYMBOL)
path_api


Unnamed: 0,date,adj_close
0,2025-05-20,206.625504
1,2025-05-21,201.860901
2,2025-05-22,201.131729
3,2025-05-23,195.048645
4,2025-05-27,199.983047


date         datetime64[ns]
adj_close           float64
dtype: object
Validation: {'missing': [], 'shape': (64, 2), 'na_total': 0}
Saved data\raw\api_source-yfinance_symbol-AAPL_20250820-215216.csv


WindowsPath('data/raw/api_source-yfinance_symbol-AAPL_20250820-215216.csv')

In [25]:
_ = save_csv(df_api.sort_values('date'), prefix='api', source='alpha' if USE_ALPHA else 'yfinance', symbol=SYMBOL)

Saved data\raw\api_source-yfinance_symbol-AAPL_20250820-215216.csv


## Part 2 — Scrape a Public Table (Required)
Replace `SCRAPE_URL` with a permitted page containing a simple table.

In [None]:
SCRAPE_URL = '"https://en.wikipedia.org/wiki/List_of_mergers_and_acquisitions_by_Apple'  # Web scrape a table (Wikipedia → Apple acquisitions)
headers = {'User-Agent':'AFE-Homework/1.0'}
try:
    resp = requests.get(SCRAPE_URL, headers=headers, timeout=30); resp.raise_for_status()
    soup = BeautifulSoup(resp.text, 'html.parser')
    
    # grab the first table on the page (most Wikipedia pages use .wikitable)
    table = soup.select_one('table')
    if table is None:
        raise ValueError("No <table> found on page")
    
    
    
    rows = [[c.get_text(strip=True) for c in tr.find_all(['th','td'])] for tr in soup.find_all('tr')]
    header, *data = [r for r in rows if r]
    df_scrape = pd.DataFrame(data, columns=header)
except Exception as e:
    print('Scrape failed, using inline demo table:', e)
    html = '<table><tr><th>Ticker</th><th>Price</th></tr><tr><td>AAA</td><td>101.2</td></tr></table>'
    soup = BeautifulSoup(html, 'html.parser')
    rows = [[c.get_text(strip=True) for c in tr.find_all(['th','td'])] for tr in soup.find_all('tr')]
    header, *data = [r for r in rows if r]
    df_scrape = pd.DataFrame(data, columns=header)

if 'Price' in df_scrape.columns:
    df_scrape['Price'] = pd.to_numeric(df_scrape['Price'], errors='coerce')
v_scrape = validate(df_scrape, list(df_scrape.columns)); v_scrape

Scrape failed, using inline demo table: No connection adapters were found for '"https://en.wikipedia.org/wiki/List_of_mergers_and_acquisitions_by_Apple'


{'missing': [], 'shape': (1, 2), 'na_total': 0}

In [29]:
_ = save_csv(df_scrape, prefix='scrape', site='example', table='markets')

Saved data\raw\scrape_site-example_table-markets_20250820-220917.csv


## Documentation
- API Source: (URL/endpoint/params)
- Scrape Source: (URL/table description)
- Assumptions & risks: (rate limits, selector fragility, schema changes)
- Confirm `.env` is not committed.