In [3]:
import os
import json
import time
import datetime as dt
import csv
import pathlib

from typing import Dict, List
import requests
import pandas as pd
from bs4 import BeautifulSoup
from dotenv import load_dotenv

In [23]:
#creating path
DATA_RAW = pathlib.Path("data/raw")
DATA_RAW.mkdir(parents=True, exist_ok=True)

# Load .env
load_dotenv()
ALPHA_API_KEY = os.getenv("ALPHA_API_KEY")
NINJA_API_KEY = os.getenv("NINJA_API_KEY")
print("Loaded API_ALPHA_API_KEY?", bool(ALPHA_API_KEY))
print("Loaded API_NINJAS_API_KEY?", bool(NINJA_API_KEY))

Loaded API_ALPHA_API_KEY? True
Loaded API_NINJAS_API_KEY? True


In [24]:
def safe_stamp():
    return dt.datetime.now().strftime("%Y%m%d-%H%M%S")


def safe_filename(prefix: str, meta: Dict[str, str]) -> str:
    mid = "_".join([f"{k}-{str(v).replace(' ', '-')[:20]}" for k, v in meta.items()])
    return f"{prefix}_{mid}_{safe_stamp()}.csv"


def validate_df(df: pd.DataFrame, required_cols: List[str], dtypes_map: Dict[str, str]) -> Dict[str, str]:
    msgs = {}
    missing = [c for c in required_cols if c not in df.columns]
    if missing:
        msgs['missing_cols'] = f"Missing columns: {missing}"
    
    for col, dtype in dtypes_map.items():
        if col in df.columns:
            try:
                if dtype == 'datetime64[ns]':
                    pd.to_datetime(df[col])
                    
                elif dtype == 'float':
                    pd.to_numeric(df[col])
            except Exception as e:
                msgs[f'dtype_{col}'] = f"Failed to coerce {col} to {dtype}: {e}"
  
    na_counts = df.isna().sum().sum()
    msgs['na_total'] = f"Total NA values: {na_counts}"
    return msgs

!pip install investpy

!pip install yfinance

In [25]:
#ALPHA VANTAGE - PREMIUM ENDPOINT

SYMBOL = "AAPL"
use_alpha = bool(ALPHA_API_KEY)

if not use_alpha:
    print("No Alpha Vantage API key found — currently falling back to Yahoo Finance")

if use_alpha: #Using API to pull out values
    url = "https://www.alphavantage.co/query"
    parameters = {
        "function": "TIME_SERIES_DAILY_ADJUSTED",      
        "symbol": SYMBOL,
        "outputsize": "compact",     # compact - last 100 value; full - all historical value
        "apikey": ALPHA_API_KEY,
        "datatype": "json"
    }
    
    r = requests.get(url, params=parameters, timeout=30)
    r.raise_for_status()
    data = r.json()
    if "Information" in data:
        raise RuntimeError({data['Information']})
    if "Error Message" in data:
        raise RuntimeError( {data['Error Message']})


    key = [k for k in data.keys() if "Time Series" in k]
    
    assert key, f"Unexpected response keys: {list(data.keys())}"
    series = data[key[0]]
    
    df = (
        pd.DataFrame(series).T
        .rename_axis("date")
        .reset_index()
    )

    df = df[['date', '1. open', '2. high', '3. low', '4. close','5. volume']].rename(
        columns={
        '1. open': 'Open',
        '2. high': 'High',
        '3. low': 'Low',
        '4. close': 'Close',
        '5. volume': 'Volume'
        })

    df['date'] = pd.to_datetime(df['date'])
    for col in ['open','high','low','close','volume']:
        df[col] = pd.to_numeric(df[col])
        
else: #use yahoofinance
    import yfinance as yf 
    df = (yf.download(SYMBOL, period="6mo", interval="1d").reset_index())[['Date','Open','High','Low','Close','Volume']]
    

df = df.sort_values('date').reset_index(drop=True)

msgs = validate_df(
    df,
    required_cols=['date','open','high','low','close','volume'],
    dtypes_map={
        'date':'datetime64[ns]',
        'open':'float',
        'high':'float',
        'low':'float',
        'close':'float',
        'volume':'float'
    })
print(msgs)

file_name = safe_filename(prefix="api", meta={"source": "alpha" 
                                              if use_alpha 
                                              else "YahooFinance", "symbol": 'AAPL'})

out_path = DATA_RAW/file_name #saving it somewhere
df.to_csv(out_path, index=False)
print("Saved:", out_path)

RuntimeError: {'Thank you for using Alpha Vantage! This is a premium endpoint. You may subscribe to any of the premium plans at https://www.alphavantage.co/premium/ to instantly unlock all premium endpoints'}

In [25]:
#ALPHA VANTAGE - PREMIUM ENDPOINT

SYMBOL = "AAPL"
use_alpha = bool(ALPHA_API_KEY)

if not use_alpha:
    print("No Alpha Vantage API key found — currently falling back to Yahoo Finance")

if use_alpha: #Using API to pull out values
    url = "https://www.alphavantage.co/query"
    parameters = {
        "function": "TIME_SERIES_DAILY_ADJUSTED",      
        "symbol": SYMBOL,
        "outputsize": "compact",     # compact - last 100 value; full - all historical value
        "apikey": ALPHA_API_KEY,
        "datatype": "json"
    }
    
    r = requests.get(url, params=parameters, timeout=30)
    r.raise_for_status()
    data = r.json()
    if "Information" in data:
        raise RuntimeError({data['Information']})
    if "Error Message" in data:
        raise RuntimeError( {data['Error Message']})


    key = [k for k in data.keys() if "Time Series" in k]
    
    assert key, f"Unexpected response keys: {list(data.keys())}"
    series = data[key[0]]
    
    df = (
        pd.DataFrame(series).T
        .rename_axis("date")
        .reset_index()
    )

    df = df[['date', '1. open', '2. high', '3. low', '4. close','5. volume']].rename(
        columns={
        '1. open': 'Open',
        '2. high': 'High',
        '3. low': 'Low',
        '4. close': 'Close',
        '5. volume': 'Volume'
        })

    df['date'] = pd.to_datetime(df['date'])
    for col in ['open','high','low','close','volume']:
        df[col] = pd.to_numeric(df[col])
        
else: #use yahoofinance
    import yfinance as yf 
    df = (yf.download(SYMBOL, period="6mo", interval="1d").reset_index())[['Date','Open','High','Low','Close','Volume']]
    

df = df.sort_values('date').reset_index(drop=True)

msgs = validate_df(
    df,
    required_cols=['date','open','high','low','close','volume'],
    dtypes_map={
        'date':'datetime64[ns]',
        'open':'float',
        'high':'float',
        'low':'float',
        'close':'float',
        'volume':'float'
    })
print(msgs)

file_name = safe_filename(prefix="api", meta={"source": "alpha" 
                                              if use_alpha 
                                              else "YahooFinance", "symbol": 'AAPL'})

out_path = DATA_RAW/file_name #saving it somewhere
df.to_csv(out_path, index=False)
print("Saved:", out_path)

RuntimeError: {'Thank you for using Alpha Vantage! This is a premium endpoint. You may subscribe to any of the premium plans at https://www.alphavantage.co/premium/ to instantly unlock all premium endpoints'}

In [25]:
#ALPHA VANTAGE - PREMIUM ENDPOINT

SYMBOL = "AAPL"
use_alpha = bool(ALPHA_API_KEY)

if not use_alpha:
    print("No Alpha Vantage API key found — currently falling back to Yahoo Finance")

if use_alpha: #Using API to pull out values
    url = "https://www.alphavantage.co/query"
    parameters = {
        "function": "TIME_SERIES_DAILY_ADJUSTED",      
        "symbol": SYMBOL,
        "outputsize": "compact",     # compact - last 100 value; full - all historical value
        "apikey": ALPHA_API_KEY,
        "datatype": "json"
    }
    
    r = requests.get(url, params=parameters, timeout=30)
    r.raise_for_status()
    data = r.json()
    if "Information" in data:
        raise RuntimeError({data['Information']})
    if "Error Message" in data:
        raise RuntimeError( {data['Error Message']})


    key = [k for k in data.keys() if "Time Series" in k]
    
    assert key, f"Unexpected response keys: {list(data.keys())}"
    series = data[key[0]]
    
    df = (
        pd.DataFrame(series).T
        .rename_axis("date")
        .reset_index()
    )

    df = df[['date', '1. open', '2. high', '3. low', '4. close','5. volume']].rename(
        columns={
        '1. open': 'Open',
        '2. high': 'High',
        '3. low': 'Low',
        '4. close': 'Close',
        '5. volume': 'Volume'
        })

    df['date'] = pd.to_datetime(df['date'])
    for col in ['open','high','low','close','volume']:
        df[col] = pd.to_numeric(df[col])
        
else: #use yahoofinance
    import yfinance as yf 
    df = (yf.download(SYMBOL, period="6mo", interval="1d").reset_index())[['Date','Open','High','Low','Close','Volume']]
    

df = df.sort_values('date').reset_index(drop=True)

msgs = validate_df(
    df,
    required_cols=['date','open','high','low','close','volume'],
    dtypes_map={
        'date':'datetime64[ns]',
        'open':'float',
        'high':'float',
        'low':'float',
        'close':'float',
        'volume':'float'
    })
print(msgs)

file_name = safe_filename(prefix="api", meta={"source": "alpha" 
                                              if use_alpha 
                                              else "YahooFinance", "symbol": 'AAPL'})

out_path = DATA_RAW/file_name #saving it somewhere
df.to_csv(out_path, index=False)
print("Saved:", out_path)

RuntimeError: {'Thank you for using Alpha Vantage! This is a premium endpoint. You may subscribe to any of the premium plans at https://www.alphavantage.co/premium/ to instantly unlock all premium endpoints'}

In [25]:
#ALPHA VANTAGE - PREMIUM ENDPOINT

SYMBOL = "AAPL"
use_alpha = bool(ALPHA_API_KEY)

if not use_alpha:
    print("No Alpha Vantage API key found — currently falling back to Yahoo Finance")

if use_alpha: #Using API to pull out values
    url = "https://www.alphavantage.co/query"
    parameters = {
        "function": "TIME_SERIES_DAILY_ADJUSTED",      
        "symbol": SYMBOL,
        "outputsize": "compact",     # compact - last 100 value; full - all historical value
        "apikey": ALPHA_API_KEY,
        "datatype": "json"
    }
    
    r = requests.get(url, params=parameters, timeout=30)
    r.raise_for_status()
    data = r.json()
    if "Information" in data:
        raise RuntimeError({data['Information']})
    if "Error Message" in data:
        raise RuntimeError( {data['Error Message']})


    key = [k for k in data.keys() if "Time Series" in k]
    
    assert key, f"Unexpected response keys: {list(data.keys())}"
    series = data[key[0]]
    
    df = (
        pd.DataFrame(series).T
        .rename_axis("date")
        .reset_index()
    )

    df = df[['date', '1. open', '2. high', '3. low', '4. close','5. volume']].rename(
        columns={
        '1. open': 'Open',
        '2. high': 'High',
        '3. low': 'Low',
        '4. close': 'Close',
        '5. volume': 'Volume'
        })

    df['date'] = pd.to_datetime(df['date'])
    for col in ['open','high','low','close','volume']:
        df[col] = pd.to_numeric(df[col])
        
else: #use yahoofinance
    import yfinance as yf 
    df = (yf.download(SYMBOL, period="6mo", interval="1d").reset_index())[['Date','Open','High','Low','Close','Volume']]
    

df = df.sort_values('date').reset_index(drop=True)

msgs = validate_df(
    df,
    required_cols=['date','open','high','low','close','volume'],
    dtypes_map={
        'date':'datetime64[ns]',
        'open':'float',
        'high':'float',
        'low':'float',
        'close':'float',
        'volume':'float'
    })
print(msgs)

file_name = safe_filename(prefix="api", meta={"source": "alpha" 
                                              if use_alpha 
                                              else "YahooFinance", "symbol": 'AAPL'})

out_path = DATA_RAW/file_name #saving it somewhere
df.to_csv(out_path, index=False)
print("Saved:", out_path)

RuntimeError: {'Thank you for using Alpha Vantage! This is a premium endpoint. You may subscribe to any of the premium plans at https://www.alphavantage.co/premium/ to instantly unlock all premium endpoints'}

In [36]:
#checking NINJA API
import os
from dotenv import load_dotenv

load_dotenv()
NINJA_API_KEY = os.getenv("NINJA_API_KEY")

if not NINJA_API_KEY:
    raise ValueError("No Ninja API key found in .env")

In [38]:
#NINJA API

SYMBOL = "AAPL"
use_ninja = bool(NINJA_API_KEY)

if not use_ninja:
    print("No NINJA API key found — currently falling back to Yahoo Finance")

if use_ninja: #Using NINJA API to pull values
    import requests
    url = 'https://api.api-ninjas.com/v1/stockprice?symbol={}'.format(SYMBOL)
    r = requests.get(url, headers={'X-Api-Key': NINJA_API_KEY})
                                              
    #r = requests.get(url, params=parameters, timeout=30)
    r.raise_for_status()
    data = r.json()

    key = [k for k in data.keys() if "Time Series" in k]
    
    assert key, f"Unexpected response keys: {list(data.keys())}"
    series = data[key[0]]
    
    df = (
        pd.DataFrame(series).T
        .rename_axis("date")
        .reset_index() 
    )

    df = df[['date', '1. open', '2. high', '3. low', '4. close','5. volume']].rename(
        columns={
        '1. open': 'Open',
        '2. high': 'High',
        '3. low': 'Low',
        '4. close': 'Close',
        '5. volume': 'Volume'
        })

    df['date'] = pd.to_datetime(df['date'])
    for col in ['open','high','low','close','volume']:
        df[col] = pd.to_numeric(df[col])
        
else: #use yahoofinance
    import yfinance as yf 
    df = (yf.download(SYMBOL, period="6mo", interval="1d").reset_index())[['Date','Open','High','Low','Close','Volume']]


df = df.sort_values('date').reset_index(drop=True)

msgs = validate_df(
    df,
    required_cols=['date','open','high','low','close','volume'],
    dtypes_map={
        'date':'datetime64[ns]',
        'open':'float',
        'high':'float',
        'low':'float',
        'close':'float',
        'volume':'float'
    })
print(msgs)

file_name = safe_filename(prefix="api", meta={"source": "NINJA" if use_ninja else "YahooFinance", "symbol": 'AAPL'})

out_path = DATA_RAW/file_name #saving it somewhere
df.to_csv(out_path, index=False)
print("Saved:", out_path)


HTTPError: 400 Client Error: Bad Request for url: https://api.api-ninjas.com/v1/stockprice?symbol=AAPL

In [62]:
#investing.com refuse to give values

SYMBOL = "AAPL"
API_KEY = bool(API_KEY)
print("Using Alpha Vantage:", use_alpha)

if use_alpha: #Using API to pull out values
    url = "https://www.alphavantage.co/query"
    parameters = {
        "function": "TIME_SERIES_DAILY_ADJUSTED",      
        "from_symbol": SYMBOL,
        "outputsize": "compact",     # compact - last 100 value; full - all historical value
        "apikey": JFTZDNVI2ATUQEFB,
        "datatype": "json"
    }
    
    r = requests.get(url, params=parameters, timeout=30)
    r.raise_for_status()
    data = r.json()
   
    key = [k for k in data.keys() if "TIME_SERIES_DAILY_ADJUSTED" in k]
    
    assert key, f"Unexpected response keys: {list(data.keys())}"
    series = data[key[0]]
    
    df = (
        pd.DataFrame(series).T
        .rename_axis("date")
        .reset_index()
    )

    df = df[['date', '1. open', '2. high', '3. low', '4. close']].rename(
        columns={
        '1. open': 'open',
        '2. high': 'high',
        '3. low': 'low',
        '4. close': 'close'
        })

    df['date'] = pd.to_datetime(df['date'])
    for col in ['open','high','low','close']:
        df[col] = pd.to_numeric(df[col])
        
else: #use investing.com  live data
    import investpy as inv 

    search_result = investpy.search_quotes(text='apple', products=['stocks'],countries=['united states'], n_results=1)
    recent_data = search_result.retrieve_recent_data()
    print(recent_data.head())

    df = inv.get_currency_cross_recent_data(
        'XAU/USD', as_json=True, order='ascending', interval='Daily')
    df.columns =['date','open','high','low','close']

    
df = df.sort_values('date').reset_index(drop=True)
msgs = validate_df(
    df,
    required_cols=['date','open','high','low','close'],
    dtypes_map={'date':'datetime64[ns]','open':'float','high':'float','low':'float','close':'float'}
)
print(msgs)

file_name = safe_filename(prefix="api", meta={"source": "alpha" if use_alpha else "investing.com", "symbol": 'XAU/USD'})
out_path = DATA_RAW/file_name #saving it somewhere
df.to_csv(out_path, index=False)
print("Saved:", out_path)

Using Alpha Vantage: False


ConnectionError: ERR#0015: error 403, try again later.

In [19]:
url = 'https://www.alphavantage.co/query?function=TIME_SERIES_DAILY&symbol=IBM&interval=1day&apikey=SYMBOL'
r = requests.get(url)
data = r.json()
print(data)

{'Meta Data': {'1. Information': 'Daily Prices (open, high, low, close) and Volumes', '2. Symbol': 'IBM', '3. Last Refreshed': '2025-08-19', '4. Output Size': 'Compact', '5. Time Zone': 'US/Eastern'}, 'Time Series (Daily)': {'2025-08-19': {'1. open': '240.0000', '2. high': '242.8300', '3. low': '239.4900', '4. close': '241.2800', '5. volume': '3328305'}, '2025-08-18': {'1. open': '239.5700', '2. high': '241.4200', '3. low': '239.1158', '4. close': '239.4500', '5. volume': '3569594'}, '2025-08-15': {'1. open': '237.6100', '2. high': '240.6200', '3. low': '236.7700', '4. close': '239.7200', '5. volume': '4344322'}, '2025-08-14': {'1. open': '238.2500', '2. high': '239.0000', '3. low': '235.6200', '4. close': '237.1100', '5. volume': '4556725'}, '2025-08-13': {'1. open': '236.2000', '2. high': '240.8411', '3. low': '236.2000', '4. close': '240.0700', '5. volume': '5663562'}, '2025-08-12': {'1. open': '236.5300', '2. high': '237.9600', '3. low': '233.3600', '4. close': '234.7700', '5. volu

In [None]:
    url = "https://api.api-ninjas.com/v1/stockpricehistorical?ticker=AAPL&period=1h&start=1706000000&end=1706302801"
    parameters = {
        "function": "TIME_SERIES_DAILY_ADJUSTED",      
        "ticker": SYMBOL,
        "outputsize": "compact",     # compact - last 100 value; full - all historical value
        "apikey": NINJA_API_KEY,
        "datatype": "json"
    }

    
