In [5]:
import warnings
warnings.filterwarnings('ignore')

%matplotlib inline

from time import time
from io import StringIO
import sys, os
from tqdm import tqdm

from itertools import product
from pathlib import Path

import numpy as np
import pandas as pd
import statsmodels.api as sm

import matplotlib.pyplot as plt
import seaborn as sns

import lightgbm as lgb

from sklearn.linear_model import LinearRegression
from scipy.stats import spearmanr

sys.path.insert(1, os.path.join(sys.path[0], '..'))

sns.set_style('whitegrid')
np.random.seed(42)

YEAR = 252
idx = pd.IndexSlice
DATA_DIR = Path('data')

results_path = Path('results', 'return_predictions')
if not results_path.exists():
    results_path.mkdir(parents=True)

In [14]:
# load feature engineering
data = pd.read_hdf(DATA_DIR / 'data.h5', 'stooq/japan/equities')
data.info(null_counts=True), len(data.index.unique('ticker'))

<class 'pandas.core.frame.DataFrame'>
MultiIndex: 3081645 entries, ('9984.JP', Timestamp('2010-01-04 00:00:00')) to ('3053.JP', Timestamp('2023-05-01 00:00:00'))
Data columns (total 23 columns):
 #   Column           Non-Null Count    Dtype  
---  ------           --------------    -----  
 0   ret_1            3080700 non-null  float64
 1   ret_rel_perc_1   3080700 non-null  float64
 2   ret_5            3076920 non-null  float64
 3   ret_rel_perc_5   3076920 non-null  float64
 4   ret_10           3072195 non-null  float64
 5   ret_rel_perc_10  3072195 non-null  float64
 6   ret_21           3061800 non-null  float64
 7   ret_rel_perc_21  3061800 non-null  float64
 8   ret_63           3022110 non-null  float64
 9   ret_rel_perc_63  3022110 non-null  float64
 10  PPO              3058020 non-null  float64
 11  NATR             3068415 non-null  float64
 12  RSI              3068415 non-null  float64
 13  bbl              3077865 non-null  float64
 14  bbu              3077865 non-nul

(None, 945)

In [13]:
data.tail()

Unnamed: 0_level_0,Unnamed: 1_level_0,ret_1,ret_rel_perc_1,ret_5,ret_rel_perc_5,ret_10,ret_rel_perc_10,ret_21,ret_rel_perc_21,ret_63,ret_rel_perc_63,...,bbl,bbu,weekday,month,year,fwd_ret_01,fwd_ret_05,fwd_ret_10,fwd_ret_21,fwd_ret_63
ticker,date,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1,Unnamed: 22_level_1
3053.JP,2023-04-25,-0.006135,6.0,-0.024096,5.0,0.0,9.0,-0.074286,4.0,-0.079545,6.0,...,1.011415,1.023632,1,4,2023,-0.006173,,,,
3053.JP,2023-04-26,-0.006173,6.0,-0.024242,5.0,-0.006173,8.0,-0.085227,4.0,-0.090395,6.0,...,1.007083,1.02194,2,4,2023,0.018634,,,,
3053.JP,2023-04-27,0.018634,16.0,0.0,9.0,0.018634,12.0,-0.057471,5.0,-0.068182,7.0,...,1.025849,1.003246,3,4,2023,-0.012195,,,,
3053.JP,2023-04-28,-0.012195,4.0,0.006211,11.0,0.006211,10.0,-0.05814,5.0,-0.074286,6.0,...,1.010225,1.015059,4,4,2023,0.012346,,,,
3053.JP,2023-05-01,0.012346,15.0,0.006135,11.0,0.012346,11.0,-0.057471,5.0,-0.073446,7.0,...,1.02372,1.006098,0,5,2023,,,,,


In [15]:
prices = (pd.read_hdf(DATA_DIR / 'assets.h5', 'stooq/jp/tse/stocks/prices').loc[idx[:, '2010':'2023'],:])
prices.tail()

Unnamed: 0_level_0,Unnamed: 1_level_0,open,high,low,close,volume
ticker,date,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
9997.JP,2023-04-25,727.0,734.0,724.0,727.0,125600.0
9997.JP,2023-04-26,728.0,738.0,726.0,734.0,150100.0
9997.JP,2023-04-27,733.0,733.0,727.0,732.0,73800.0
9997.JP,2023-04-28,738.0,745.0,738.0,745.0,127000.0
9997.JP,2023-05-01,749.0,753.0,741.0,746.0,131200.0


In [18]:
dollar_vol = prices.close.mul(prices.volume)
dollar_vol_rank = dollar_vol.groupby(level='ticker').rank(ascending=False)
universe = dollar_vol_rank.groupby(level='ticker').mean().nsmallest(250).index
universe[-10:]

Index(['7131.JP', '4413.JP', '9252.JP', '4412.JP', '9253.JP', '9251.JP',
       '9250.JP', '4379.JP', '4378.JP', '9249.JP'],
      dtype='object', name='ticker')