Imports

In [95]:
import requests
import time
from secret import POLYGON_API_KEY
import datetime
import polars as pl
import logging
import numpy as np

name: get_paginated_request
function: retrieves all data from a paginated polygon api request in one polars dataframe
inputs:
    request_url (required) [string]: the initial http request to execute
    df_schema (optional) [dictionary]: a dictionary of column: type pairs to define the schema for the results dataframe. defaults to no schema, which could lead to dataframe stacking errors
output: a polars dataframe containing all the results of the http request merged

In [96]:
def _retry_request_after_wait(request_url, retry_wait):
    """Wait retry_wait seconds, re-issue request_url once and return its JSON body.

    Raises:
        ValueError: if the retried response still reports status 'ERROR'.
    """
    time.sleep(retry_wait)
    request_json = requests.get(request_url).json()
    if request_json.get('status') == 'ERROR':
        raise ValueError(f'http request failed: {request_url}')
    return request_json


def get_paginated_request(request_url, df_schema = None, retry_wait = 70):
    """Retrieve every page of a paginated polygon api request as one polars dataframe.

    Args:
        request_url (str): the initial http request to execute (api key included).
        df_schema (dict, optional): column -> type pairs used to build every page's
            dataframe. Without a schema, pages whose inferred schemas differ cannot be stacked.
        retry_wait (int | float, optional): seconds to wait before the single retry
            that follows a response whose body reports status 'ERROR'. Defaults to 70.

    Returns:
        polars.DataFrame: the rows of all pages stacked in request order.

    Raises:
        ValueError: if the first response is not HTTP 200, or if a request still
            reports status 'ERROR' after the retry.
    """
    request_response = requests.get(request_url)
    if request_response.status_code != 200:
        raise ValueError(f'http request failed: {request_url}')
    request_json = request_response.json()
    if request_json.get('status') == 'ERROR':
        # The original slept here but never re-issued the request.
        request_json = _retry_request_after_wait(request_url, retry_wait)
    results_df = pl.DataFrame(schema=df_schema, data=request_json.get('results') or [])

    # Follow next_url until the api stops returning one (a missing/empty value ends the loop).
    while request_json.get('next_url'):
        # next_url does not carry the api key, so it is appended to every follow-up request.
        request_url = f"{request_json['next_url']}&apiKey={POLYGON_API_KEY}"
        request_json = requests.get(request_url).json()
        if request_json.get('status') == 'ERROR':
            request_json = _retry_request_after_wait(request_url, retry_wait)
        page_rows = request_json.get('results') or []
        if page_rows:
            page_df = pl.DataFrame(schema=df_schema, data=page_rows)
            results_df.vstack(page_df, in_place=True)
    return results_df

name: _set_ticker_types (private)
function: works as a helper function for get_historical_tickers to normalize the ticker_types arg, converting it to an np array of strings from a string, list, np array, or None (None selects every stock ticker type)

In [97]:
def _set_ticker_types(ticker_types):
    if ticker_list == None:
        request_url = f'https://api.polygon.io/v3/reference/tickers/types?asset_class=stocks&locale=us&apiKey={POLYGON_API_KEY}'
        ticker_types_df = get_paginated_request(request_url=request_url)
        ticker_list = ticker_types_df.select(pl.col('code')).to_numpy()
        return ticker_list
    elif isinstance(ticker_list, str):
        ticker_list = np.array(ticker_list)
        return ticker_list
    elif isinstance(ticker_list, np.ndarray):
        if ticker_list.dtype.type == np.str_:
            return ticker_list
        else: raise TypeError('ticker list must be strings')
    elif isinstance(ticker_list, list):
        ticker_list = np.array(ticker_list)
        if ticker_list.dtype.type == np.str_:
            return ticker_list
        else: raise TypeError('ticker list must be strings')
    else:
        raise TypeError('ticker list must be string, or list/np array of strings')


name: get_historical_tickers
function: Returns a full dataframe of listed and delisted tickers including all ticker types
inputs: 
    ticker_types (optional): The types of tickers to include in the final list of tickers. Can be any value(s) in ["CS", "PFD", "WARRANT", "RIGHT", "BOND", "ETF", "ETN", "ETV", "SP", "ADRC", "ADRP", "ADRW", "ADRR", "FUND", "BASKET", "UNIT", "LT", "OS", "GDR", "OTHER", "NYRS", "AGEN", "EQLK", "ETS"]. Defaults to all ticker types
    include_delisted (optional): whether or not to include delisted tickers, default true
output: a polars dataframe containing a row for every ticker with a schema of {'ticker': str, 'name': str, 'market': str, 'locale': str, 'primary_exchange': str, 'type': str, 'active': bool, 'currency_name': str, 'cik': str, 'last_updated_utc': str, 'delisted_utc': str, 'composite_figi': str, 'share_class_figi': str}

In [98]:
def get_historical_tickers(ticker_types = None, include_delisted = True):
    """Build one dataframe of tickers for the requested ticker types.

    Args:
        ticker_types (optional): the ticker type code(s) to include; passed through
            _set_ticker_types before use. Defaults to None.
        include_delisted (optional): when truthy, the inactive (active=false) tickers
            of each type are requested right after the active ones. Defaults to True.

    Returns:
        polars.DataFrame: every page of every request stacked in request order,
        using the schema defined below.
    """
    ticker_types = _set_ticker_types(ticker_types)
    df_schema = {
        'ticker': str,
        'name': str,
        'market': str,
        'locale': str,
        'primary_exchange': str,
        'type': str,
        'active': bool,
        'currency_name': str,
        'cik': str,
        'last_updated_utc': str,
        'delisted_utc': str,
        'composite_figi': str,
        'share_class_figi': str,
    }
    full_df = pl.DataFrame(schema = df_schema)

    for ticker_type in ticker_types:
        listed_url = f'https://api.polygon.io/v3/reference/tickers?type={ticker_type}&market=stocks&active=true&order=asc&limit=1000&sort=ticker&apiKey={POLYGON_API_KEY}'
        delisted_url = f'https://api.polygon.io/v3/reference/tickers?type={ticker_type}&market=stocks&active=false&order=asc&limit=1000&sort=ticker&apiKey={POLYGON_API_KEY}'
        # Active tickers are always fetched; the delisted request is optional.
        urls_to_fetch = [listed_url, delisted_url] if include_delisted else [listed_url]
        for request_url in urls_to_fetch:
            tickers_df = get_paginated_request(request_url=request_url, df_schema=df_schema)
            full_df.vstack(tickers_df, in_place=True)
    return full_df

name: _get_data_input_validator (private)
function: works as a helper function for get_data by checking types and values of inputs, in order to make the main get_data function more readable

In [99]:
def _get_data_input_validator(start_date, end_date, timeframe, multiplier):
    if(end_date < start_date):
        raise ValueError('start_date must be earlier than end_date')
    valid_timeframes = {'hour', 'day', 'minute', 'week', 'month', 'quarter', 'year', 'second'}
    if timeframe not in valid_timeframes:
        raise ValueError(f'invalid timeframe argument, must be one of {valid_timeframes}.')
    if not isinstance(multiplier, int) or multiplier < 1:
        raise ValueError('multiplier must be an integer greater than 0')

name: _set_tickers (private)
function: works as a helper function for get_data by processing the input for the "tickers" arg, and either converting it to a numpy string array or throwing an exception

In [100]:
def _set_tickers(tickers):
    if tickers == None:
        return pl.read_parquet('Data/tickers.parquet').select(pl.col('ticker')).to_numpy()
    elif isinstance(tickers, np.ndarray):
        if tickers.dtype.type == str:
            return tickers
        else: raise TypeError('arg tickers must be an array of strings')
    elif isinstance(tickers, str):
        return np.array(tickers)
    elif isinstance(tickers, list):
        tickers = np.array(tickers)
        if tickers.dtype.type == str:
            return tickers
        else: raise TypeError('arg tickers must be an array of strings')
    elif isinstance(tickers, pl.dataframe.frame.DataFrame):
        try:
            arr = tickers.select(pl.col('ticker')).to_numpy()
            return arr
        except pl.dataframe.frame.ColumnNotFoundError:
            logging.exception('the dataframe passed for tickers arg does not contain a column named "ticker"')
    else:
        raise TypeError('invalid tickers argument: must be of type string, string list, or polars dataframe with a "ticker" column. leave empty for all tickers')

name: _set_date (private)
function: works as a helper function for get_data by processing date arguments, and converting them into isoformat from unix millisecond timestamp or datetime objects

In [101]:
def _set_date(date, position):
    if isinstance(date, str):
        try:
            datetime.date.fromisoformat(date)
            return date
        except (ValueError, TypeError):
            logging.exception(f'{position}_date arg must be in isoformat: YYYY-MM-DD')
    elif isinstance(date, int):
        try:
            date = datetime.date.fromtimestamp(date)
            return date.isoformat()
        except ValueError:
            logging.exception(f'{position}_date arg unix timestamp out of range')
    elif isinstance(date, datetime.date):
        return date.isoformat()
    elif isinstance(date, datetime.datetime):
        return date.date().isoformat()
    else:
        raise TypeError(f'{position}_date arg must be isoformat date string, unix millisecond timestamp, date object, or datetime object')
            
    

name: get_data
function: gathers historical data into a polars dataframe for analysis or exportation
inputs:
    start_date (optional) [string]: an isoformat date representing the day to start collecting stock data, defaults to 2000-01-01
    end_date (optional) [string]: an isoformat date representing the day to stop collecting stock data. Must occur after start_date, defaults to today's date
    tickers (optional) [string || string list || polars dataframe]: the tickers to gather data for over the date range. if a polars dataframe is given it must have a "ticker" column. defaults to all tickers in history
    timeframe (optional) [string]: the base timeframe to use for aggregates. Can be any of [second, minute, hour, day, week, month, quarter, year]. defaults to "day"
    multiplier (optional) [integer]: A multiplier for the timeframe parameter. ex: multiplier: 5, timeframe: minute gets 5 minute bars. defaults to 1
output:
    polars dataframe with columns [ticker, timestamp, o, h, l, c, v, vw, n, otc, t] for each ohlc bar
    ticker (required) [string]: the ticker of the stock traded in the ohlc bar
    timestamp (required) [string]: a string containing the date and time of the ohlc bar
    o (required) [float]: the open price of the bar
    h (required) [float]: the high price of the bar
    l (required) [float]: the low price of the bar
    c (required) [float]: the closing price of the bar
    v (required) [float]: the number of shares traded during the bar 
    vw (required) [float]: the volume weighted average price of the stock 
    n (optional) [integer]: the number of transactions made during the bar
    otc (optional) [boolean]: true if the stock is an otc ticker, null otherwise
    t (required) [integer]: the unix millisecond timestamp of the bar

In [102]:
def get_data(start_date = '2000-01-01', end_date = None, tickers = None, timeframe = 'day', multiplier = 1):
    """Normalize and validate the arguments used to gather historical data.

    Args:
        start_date (optional): first day of the range, in any form accepted by
            _set_date. Defaults to '2000-01-01'.
        end_date (optional): last day of the range, in any form accepted by
            _set_date. Defaults to None, meaning today's date at call time.
        tickers (optional): the tickers to gather data for, in any form accepted by
            _set_tickers. Defaults to None.
        timeframe (optional): the base timeframe of the bars. Defaults to 'day'.
        multiplier (optional): the multiplier for the timeframe. Defaults to 1.
    """
    # Resolve "today" per call: a default of datetime.date.today() in the signature
    # is evaluated once, when the cell defining the function runs.
    if end_date is None:
        end_date = datetime.date.today()
    # Keep the normalized values so the validator compares like with like.
    start_date = _set_date(start_date, 'start')
    end_date = _set_date(end_date, 'end')
    tickers = _set_tickers(tickers)
    _get_data_input_validator(start_date, end_date, timeframe, multiplier)

    print(tickers)