In [3]:
import warnings
warnings.filterwarnings('ignore')
from pathlib import Path
import requests
from io import BytesIO
from zipfile import ZipFile, BadZipFile

import numpy as np
import pandas as pd
import pandas_datareader.data as web
from sklearn.datasets import fetch_openml

# Keep wide DataFrame reprs on one row of columns instead of wrapping them across several lines.
pd.set_option('display.expand_frame_repr', False)

In [4]:
# HDF5 file that the Stooq price and ticker tables are written to with DataFrame.to_hdf.
DATA_STORE = Path('data/assets.h5')

In [5]:
# Root directory for everything Stooq-related (raw txt files, ticker lists, price data).
stooq_path = Path('data/stooq')
# parents=True also creates a missing 'data' directory (a bare mkdir() raises
# FileNotFoundError on a fresh checkout); exist_ok=True keeps the cell safe to re-run.
stooq_path.mkdir(parents=True, exist_ok=True)

In [68]:
# The ticker lists have to be downloaded manually from https://stooq.com/db/l/?g={code},
# where {code} is the number stored below for each (market, asset class) pair.
# The conversion cell that follows reads them from data/stooq/txt/{asset_class}.txt.
metadata_dict = {
    ('jp', 'tse etfs'): 34,
    ('jp', 'tse stocks'): 32,
    ('us', 'nasdaq etfs'): 69,
    ('us', 'nasdaq stocks'): 27,
    ('us', 'nyse etfs'): 70,
    ('us', 'nyse stocks'): 28,
    ('us', 'nysemkt stocks'): 26
}

In [73]:
# Convert every manually downloaded ticker list (<stooq_path>/txt/<asset_class>.txt)
# into a de-duplicated CSV at <stooq_path>/tickers/<market>/<asset_class>.csv.
# Only the (market, asset_class) keys are needed here; the numeric download code is not used.
for market, asset_class in metadata_dict:
    print(market, asset_class)
    try:
        # A separator longer than one character is only supported by the Python
        # parser engine, so request it explicitly instead of relying on the
        # (warning-emitting) automatic fallback from the C engine.
        df = pd.read_csv(stooq_path / 'txt' / f'{asset_class}.txt',
                         sep=', ', header=None, engine='python')
        df.columns = ['ticker', 'name']
        df = df.drop_duplicates('ticker').dropna()

        path = stooq_path / 'tickers' / market
        path.mkdir(parents=True, exist_ok=True)
        df.to_csv(path / f'{asset_class}.csv', index=False)
    except Exception as e:
        # Best effort: report the failing list and continue with the remaining ones.
        print(e, market, asset_class)


jp tse etfs
jp tse stocks
us nasdaq etfs
us nasdaq stocks
us nyse etfs
us nyse stocks
us nysemkt stocks


In [75]:
def get_stooq_prices_and_tickers(frequency='daily', market='us', asset_class='nasdaq etfs'):
    """Load all Stooq price files of one market / asset class into a single frame.

    Reads the ticker list from ``stooq_path/tickers/<market>/<asset_class>.csv`` and
    every ``*.txt`` file found (recursively) below
    ``stooq_path/data/<frequency>/<market>/<asset_class>``.

    WARNING: this function deletes files. A price file whose stem is not a
    (lower-cased) ticker from the ticker list, or that is empty, is removed
    from disk.

    Parameters
    ----------
    frequency : str
        Sub-directory name of the data frequency. For ``'5 min'`` and ``'hourly'``
        the ``date`` and ``time`` columns are combined into a ``date_time`` index
        level; for any other value only ``date`` is used.
    market : str
        Market sub-directory, e.g. ``'us'``.
    asset_class : str
        Asset-class sub-directory / ticker file name, e.g. ``'nasdaq etfs'``.

    Returns
    -------
    prices : pandas.DataFrame
        Columns open, high, low, close, volume (coerced to numeric, unparsable
        values become NaN), indexed by ``('ticker', 'date')`` or
        ``('ticker', 'date_time')``.
    tickers : pandas.DataFrame
        The ticker list as read from the CSV file.

    Raises
    ------
    ValueError
        If no usable price file was found below the data directory.
    """
    tickers = pd.read_csv(stooq_path / 'tickers' / market / f'{asset_class}.csv')
    # Build the lookup set once instead of once per price file.
    known_tickers = set(tickers.ticker.str.lower())

    intraday = frequency in ['5 min', 'hourly']
    date_label = 'date_time' if intraday else 'date'

    names = ['ticker', 'freq', 'date', 'time', 'open', 'high', 'low', 'close', 'volume', 'openint']
    # usecols must be a flat list of column names; 'time' is only needed to
    # build the intraday timestamp.
    usecols = ['ticker', 'date', 'open', 'high', 'low', 'close', 'volume']
    if intraday:
        usecols.append('time')

    path = stooq_path / 'data' / frequency / market / asset_class
    print(path.as_posix())

    prices = []
    for i, file in enumerate(path.glob('**/*.txt'), 1):
        if i % 500 == 0:
            print(i)
        if file.stem not in known_tickers:
            print(file.stem, 'not available')
            file.unlink()
            continue
        try:
            if intraday:
                # Read both parts as text and join them with a space before parsing,
                # rather than passing a nested list to parse_dates (deprecated in pandas).
                df = pd.read_csv(file, names=names, usecols=usecols, header=0,
                                 dtype={'date': str, 'time': str})
                df[date_label] = pd.to_datetime(df.pop('date') + ' ' + df.pop('time'))
            else:
                df = pd.read_csv(file, names=names, usecols=usecols, header=0,
                                 parse_dates=['date'])
        except pd.errors.EmptyDataError:
            print('\tdata missing', file.stem)
            file.unlink()
        else:
            prices.append(df)

    if not prices:
        # pd.concat([]) would raise the same exception type with a message that
        # does not say which directory was empty.
        raise ValueError(f'no usable price files found in {path.as_posix()}')

    prices = (pd.concat(prices, ignore_index=True)
              .rename(columns=str.lower)
              .set_index(['ticker', date_label])
              .apply(lambda x: pd.to_numeric(x, errors='coerce')))
    return prices, tickers

In [76]:
# Load the Japanese TSE stocks and all US asset classes, keep 2000-2023,
# and store prices and ticker lists in the HDF5 file DATA_STORE.
markets = {'jp': ['tse stocks'],
           'us': ['nasdaq etfs', 'nasdaq stocks', 'nyse etfs', 'nyse stocks', 'nysemkt stocks']}
frequency = 'daily'

idx = pd.IndexSlice
for market, asset_classes in markets.items():
    for asset_class in asset_classes:
        print(f'\n{asset_class}')
        prices, tickers = get_stooq_prices_and_tickers(frequency=frequency, market=market, asset_class=asset_class)

        # Slicing the date level of the MultiIndex requires a sorted index.
        prices = prices.sort_index().loc[idx[:, '2000': '2023'], :]
        # Move the index into columns so (ticker, date) takes part in the duplicate check.
        names = prices.index.names
        prices = (prices.reset_index().drop_duplicates().set_index(names).sort_index())

        print('\nNo. of observations per asset')
        print(prices.groupby('ticker').size().describe())
        key = f'stooq/{market}/{asset_class.replace(" ", "/")}/'

        # DataFrame.info() prints its report itself and returns None, so it is not
        # wrapped in print(); `show_counts` replaces the removed `null_counts` argument.
        prices.info(show_counts=True)

        prices.to_hdf(DATA_STORE, key=key + 'prices', format='t')

        tickers.info()
        tickers.to_hdf(DATA_STORE, key=key + 'tickers', format='t')


tse stocks
stooq/data/daily/jp/tse stocks
500
1000
1500
2000
2500
3000
3500

No. of observations per asset
count    3889.000000
mean     3254.058113
std      1523.918803
min         1.000000
25%      2094.000000
50%      3830.000000
75%      4433.000000
max      5717.000000
dtype: float64
<class 'pandas.core.frame.DataFrame'>
MultiIndex: 12655032 entries, ('1301.JP', Timestamp('2005-03-22 00:00:00')) to ('9997.JP', Timestamp('2023-05-01 00:00:00'))
Data columns (total 5 columns):
 #   Column  Non-Null Count     Dtype  
---  ------  --------------     -----  
 0   open    12655032 non-null  float64
 1   high    12655032 non-null  float64
 2   low     12655032 non-null  float64
 3   close   12655032 non-null  float64
 4   volume  12655032 non-null  float64
dtypes: float64(5)
memory usage: 531.4+ MB
None
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 3889 entries, 0 to 3888
Data columns (total 2 columns):
 #   Column  Non-Null Count  Dtype 
---  ------  --------------  ----- 
 0   tic