This notebook combines some ideas from [JPX simple overfitting model - LB≃3](https://www.kaggle.com/code/paulorzp/jpx-simple-overfitting-model-lb-3) with my own ideas.

In [None]:
import random
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import jpx_tokyo_market_prediction
from sklearn.compose import ColumnTransformer
from sklearn.impute import SimpleImputer
from sklearn.ensemble import ExtraTreesRegressor
from sklearn.tree import plot_tree
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.pipeline import Pipeline
from sklearn.decomposition import PCA
from skopt import gp_minimize
from skopt.utils import use_named_args
from skopt.space import Real, Integer
from tqdm.notebook import tqdm
import optuna
optuna.logging.set_verbosity(optuna.logging.CRITICAL)

In [None]:
# Fix the seeds of both global random generators (standard library and NumPy)
# so that the shuffles and random choices made later in the notebook repeat.
random.seed(0)
np.random.seed(0)

In [None]:
%%time
# Loading Stock Prices
# `df_prices` (training file) keeps only rows that have a Target and whose Date
# lies strictly between 2019-10-01 and 2021-10-01; `prices` (supplemental file)
# keeps the rows dated 2021-10-01 or later.
path = "../input/jpx-tokyo-stock-exchange-prediction/"
df_prices = (
    pd.read_csv(f"{path}train_files/stock_prices.csv")
    .loc[lambda frame: ~frame["Target"].isnull()]
    .loc[lambda frame: frame.Date > "2019-10-01"]
    .loc[lambda frame: frame.Date < "2021-10-01"]
)
prices = (
    pd.read_csv(f"{path}supplemental_files/stock_prices.csv")
    .loc[lambda frame: frame.Date >= "2021-10-01"]
)
df_prices.info(show_counts=True)
prices.info(show_counts=True)

In [None]:
%%time
def _add_price_features(frame):
    """Add calendar and price-ratio columns to ``frame`` in place.

    New columns: ``Month`` and ``DayOfWeek`` (taken from ``Date``; unparseable
    dates are coerced to NaT), ``Open_Close`` (Open / Close) and ``High_Low``
    (High / Low).
    """
    parsed_dates = pd.to_datetime(frame['Date'], errors='coerce')
    frame['Month'] = parsed_dates.dt.month
    frame['DayOfWeek'] = parsed_dates.dt.dayofweek
    frame['Open_Close'] = frame['Open'] / frame['Close']
    frame['High_Low'] = frame['High'] / frame['Low']


# The same feature recipe is applied to the training and the validation frame.
_add_price_features(df_prices)
_add_price_features(prices)
pd.options.display.float_format = '{:,.6g}'.format

In [None]:
%%time
# Split the companies at random: 90% stay in the training frame (`df_prices`),
# the remaining 10% are the only ones kept in the validation frame (`prices`).
training_codes = sorted(df_prices["SecuritiesCode"].unique().tolist())
random.shuffle(training_codes)
# Printed before the split, so this is the count of all companies found.
print(f'Number of companies for training is {len(training_codes)}.')
assert len(training_codes) > 100
n = int(round(0.9 * len(training_codes)))
training_codes, validation_codes = training_codes[:n], training_codes[n:]
prices = prices[prices['SecuritiesCode'].isin(validation_codes)]
df_prices = df_prices[df_prices['SecuritiesCode'].isin(training_codes)]

In [None]:
# Summary statistics of the training frame (after the company split above).
df_prices.describe()

In [None]:
# Summary statistics of the validation frame (after the company split above).
prices.describe()

In [None]:
# Rebuild both code lists from the filtered frames so they contain only the
# companies that are actually present, in ascending order.
training_codes = sorted(df_prices["SecuritiesCode"].drop_duplicates().tolist())
validation_codes = sorted(prices["SecuritiesCode"].drop_duplicates().tolist())
print(f'Number of companies for training is {len(training_codes)}.')
print(f'Number of companies for validation is {len(validation_codes)}.')

In [None]:
# Utilities 

def calc_spread_return_per_day(df, portfolio_size, toprank_weight_ratio):
    """Return the weighted long/short spread of ``Target`` for one group of rows.

    Rows are ordered by ``Rank`` (ascending). The first ``portfolio_size`` rows
    form the long leg and the last ``portfolio_size`` rows form the short leg.
    Weights run linearly from ``toprank_weight_ratio`` down to 1; the short leg
    uses them reversed, so the last-ranked row gets the largest weight.

    Args:
        df: frame with ``Rank`` and ``Target`` columns.
        portfolio_size: number of rows in each leg.
        toprank_weight_ratio: weight given to the extreme rank of each leg.

    Returns:
        Weighted long return minus weighted short return, each divided by the
        mean weight.
    """
    rank_weights = np.linspace(start=toprank_weight_ratio, stop=1, num=portfolio_size)
    normaliser = rank_weights.mean()
    targets_by_rank = df.sort_values(by='Rank')['Target']
    long_leg = targets_by_rank[:portfolio_size] * rank_weights
    short_leg = targets_by_rank[-portfolio_size:] * rank_weights[::-1]
    return long_leg.sum() / normaliser - short_leg.sum() / normaliser

def calc_spread_return_sharpe(df, portfolio_size=200, toprank_weight_ratio=2):
    """Return the Sharpe ratio of the daily spread returns and the daily series.

    Args:
        df: frame with ``Date``, ``Rank`` and ``Target`` columns.
        portfolio_size: rows per leg; reduced to half of the smallest per-date
            Target count when a date cannot hold two full legs.
        toprank_weight_ratio: forwarded to ``calc_spread_return_per_day``.

    Returns:
        ``(sharpe_ratio, daily_spreads)``, or ``(0, None)`` when the reduced
        portfolio size would be below 1.
    """
    per_date = df.groupby('Date')
    smallest_day = per_date["Target"].count().min()
    if smallest_day < 2 * portfolio_size:
        # Not enough rows on at least one date for two full legs: shrink them.
        portfolio_size = smallest_day // 2
        if portfolio_size < 1:
            return 0, None
    daily_spreads = per_date.apply(
        calc_spread_return_per_day, portfolio_size, toprank_weight_ratio
    )
    return daily_spreads.mean() / daily_spreads.std(), daily_spreads

def add_rank(df, col_name="pred"):
    """Write a zero-based, per-date descending rank of ``col_name`` into ``Rank``.

    Ties are broken by order of appearance (``method="first"``). The frame is
    modified in place and also returned.
    """
    per_date_rank = df.groupby("Date")[col_name].rank(ascending=False, method="first")
    df["Rank"] = (per_date_rank - 1).astype("int")
    return df

In [None]:
## By Yuike - https://www.kaggle.com/code/ikeppyo/examples-of-higher-scores-than-perfect-predictions

# This function adjusts the predictions so that the daily spread return approaches a certain value.
        
def adjuster(df):
    """Adjust the ``Target`` column date by date and return the adjusted values.

    For every ``Date`` group a 3-trial Optuna study (TPE sampler) searches the
    parameters ``x``, ``y`` and ``z`` of ``calc_pred`` so that the spread
    returned by ``calc_spread_return_per_day(group, 200, 2)`` is as close to 3
    as possible. The best parameters are then applied to that group.

    The sampler seed is read from the module-level name ``SD``, which must be
    defined before this function is called.

    Args:
        df: frame with ``Date`` and ``Target`` columns. ``Target`` holds the
            values to adjust (the caller in this notebook stores model
            predictions there).

    Returns:
        The result of the per-date ``groupby(...).apply(...)`` with its first
        index level dropped.
    """
    def calc_pred(df, x, y, z):
        # Values with |Target| < x are kept; all others are replaced by
        # Target * y + sign(Target) * z.
        return df['Target'].where(df['Target'].abs() < x, df['Target'] * y + np.sign(df['Target']) * z)

    def objective(trial, df):
        # `suggest_float` replaces the deprecated `suggest_uniform`; with no
        # `step`/`log` argument it samples the same uniform range.
        x = trial.suggest_float('x', 0, 0.2)
        y = trial.suggest_float('y', 0, 0.05)
        z = trial.suggest_float('z', 0, 1e-3)
        # The spread metric sorts by "Rank", so rank the adjusted values first.
        df["Rank"] = calc_pred(df, x, y, z).rank(ascending=False, method="first") - 1
        return calc_spread_return_per_day(df, 200, 2)

    def predictor_per_day(df):
        study = optuna.create_study(direction='minimize', sampler=optuna.samplers.TPESampler(seed=SD))
        study.optimize(lambda trial: abs(objective(trial, df) - 3), 3)
        # Look the parameters up by name rather than relying on dict order.
        best = study.best_params
        return calc_pred(df, best['x'], best['y'], best['z'])

    return df.groupby("Date").apply(predictor_per_day).reset_index(level=0, drop=True)

def _predictor_base(feature_df, model, pre_, post_):
    """Predict from the ``feats`` columns of ``feature_df``.

    The feature columns are converted to a float64 array, passed through
    ``pre_.transform`` and ``model.predict``, and the predictions are mapped
    back with ``post_.inverse_transform``.

    Returns:
        A flat 1-D array with one prediction per row of ``feature_df``.
    """
    raw_features = feature_df[feats].to_numpy(dtype=np.float64, na_value=np.nan)
    scaled_pred = model.predict(pre_.transform(raw_features))
    # inverse_transform is given a single-column 2-D array.
    as_column = scaled_pred.reshape((scaled_pred.shape[0], 1))
    return post_.inverse_transform(as_column).flatten()

def _predictor_with_adjuster(feature_df, model, pre_, post_):
    """Predict like ``_predictor_base`` and then pass the result through ``adjuster``.

    The raw predictions are stored in the ``Target`` column of a copy of
    ``feature_df`` (the input frame itself is not modified), because
    ``adjuster`` reads its input values from that column.

    Returns:
        The values of the Series returned by ``adjuster``.
    """
    df_pred = feature_df.copy()
    df_pred["Target"] = _predictor_base(feature_df, model, pre_, post_)
    return adjuster(df_pred).values.T

In [None]:
def find_best_extratrees(X: np.ndarray, y: np.ndarray,
                         test_data: pd.DataFrame) -> ExtraTreesRegressor:
    """Tune an ExtraTreesRegressor with Bayesian optimisation and refit the best one.

    ``gp_minimize`` searches ``ccp_alpha``, ``min_samples_split`` and
    ``min_samples_leaf``. Each candidate is fitted on ``(X, y)`` and scored by
    the negated Sharpe ratio of the daily spread return on ``test_data``.

    The fitted ``preprocessor`` and ``postprocessor`` are read from module
    scope; they must exist before this function is called.

    Args:
        X: training features, already preprocessed.
        y: training targets, already scaled.
        test_data: validation frame with the ``feats`` columns plus ``Date``
            and ``Target``. It is modified in place: ``pred`` and ``Rank``
            columns are (re)written on every evaluation.

    Returns:
        An ExtraTreesRegressor fitted on ``(X, y)`` with the best parameters.
    """
    space = [
        Real(1e-6, 1e-3, 'log-uniform', name='ccp_alpha'),
        Integer(2, 16, name='min_samples_split'),
        Integer(3, 10, name='min_samples_leaf')
    ]
    predictor = _predictor_base

    def _fit_model(ccp_alpha, min_samples_split, min_samples_leaf):
        # Single place for the fixed settings shared by every candidate and
        # by the final model.
        return ExtraTreesRegressor(
            min_samples_leaf=min_samples_leaf,
            min_samples_split=min_samples_split,
            ccp_alpha=ccp_alpha, bootstrap=False,
            criterion="squared_error", max_features=(X.shape[1] // 3),
            n_estimators=100, random_state=42, verbose=True, n_jobs=-1
        ).fit(X, y)

    @use_named_args(space)
    def objective_f(ccp_alpha: float, min_samples_split: int,
                    min_samples_leaf: int) -> float:
        test_et = _fit_model(ccp_alpha, min_samples_split, min_samples_leaf)
        # Predict from the frame that is being scored (the `test_data`
        # argument), not from a module-level frame, so predictions and
        # targets always refer to the same rows.
        test_data["pred"] = predictor(test_data, test_et, preprocessor, postprocessor)
        score, _ = calc_spread_return_sharpe(add_rank(test_data))
        # gp_minimize minimises, so the Sharpe ratio is negated.
        return -score

    print(f'X.shape = {X.shape}')
    print(f'y.shape = {y.shape}')
    res_gp = gp_minimize(
        objective_f, space,
        n_calls=32, n_random_starts=8,
        n_restarts_optimizer=4, random_state=42,
        verbose=True, n_jobs=1
    )
    # res_gp.x lists the best point in the same order as `space`.
    best_parameters = {
        'ccp_alpha': float(res_gp.x[0]),
        'min_samples_split': int(res_gp.x[1]),
        'min_samples_leaf': int(res_gp.x[2])
    }
    print(f'Best parameters are: {best_parameters}')
    return _fit_model(**best_parameters)

In [None]:
%%time
# Feature columns, in the order the positional indices below refer to:
# positions 0-6 are Open .. Volume, positions 7-8 are Month and DayOfWeek.
feats = ["Open", "High", "Low", "Close", "Open_Close", "High_Low",
         "Volume", "Month", "DayOfWeek"]
X_train = df_prices[feats].to_numpy(dtype=np.float64, na_value=np.nan)
y_train = df_prices["Target"].to_numpy(dtype=np.float64, na_value=np.nan)
# Feature pipeline: median imputation of all columns, then standardisation
# followed by whitening PCA for columns 0-6 and one-hot encoding for 7-8.
preprocessor = Pipeline(steps=[
    ("imputer", SimpleImputer(strategy='median')),
    ("transformer", ColumnTransformer(
        [
            ("numerical", Pipeline(steps=[
                ("standard", StandardScaler(with_mean=True, with_std=True)),
                ("decorr", PCA(random_state=42, svd_solver='full', whiten=True))
            ]), [0, 1, 2, 3, 4, 5, 6]),
            # NOTE(review): newer scikit-learn releases rename `sparse` to
            # `sparse_output` - confirm against the installed version.
            ("categorical", OneHotEncoder(
                sparse=False, handle_unknown='ignore', drop='first'
            ), [7, 8])
        ]
    ))
])
# Both transformers are fitted on the raw training arrays BEFORE those arrays
# are replaced by their transformed versions below.
preprocessor.fit(X_train)
# The target gets its own scaler so predictions can be mapped back later.
postprocessor = StandardScaler()
postprocessor.fit(y_train.reshape((y_train.shape[0], 1)))
X_train = preprocessor.transform(X_train)
y_train = postprocessor.transform(
    y_train.reshape((y_train.shape[0], 1))
).flatten()
# A copy of the validation frame is passed in; `prices` itself is not handed
# to the search.
best_et = find_best_extratrees(X_train, y_train, prices.copy())

In [None]:
# Switch off the verbose flag that was set when the model was constructed.
best_et.verbose = False
# Report depth and leaf count of every tree in the fitted ensemble.
for tree_idx, cur_tree in enumerate(best_et.estimators_):
    print(f'Tree {tree_idx + 1}:')
    print(f'    the depth            = {cur_tree.get_depth()};')
    print(f'    the number of leaves = {cur_tree.get_n_leaves()}.')
    print('')

In [None]:
# Predictions vs. targets for one randomly chosen company from the training set.
selected_code = random.choice(training_codes)
demo_rows = df_prices[df_prices.SecuritiesCode == selected_code]
X_demo = preprocessor.transform(
    demo_rows[feats].to_numpy(dtype=np.float64, na_value=np.nan)
)
y_demo = demo_rows.Target.to_numpy(dtype=np.float64, na_value=np.nan)
# The model output is mapped back through the target scaler before plotting.
demo_predictions = postprocessor.inverse_transform(
    best_et.predict(X_demo).reshape(-1, 1)
).flatten()
plt.figure(1, figsize=(9, 9))
plt.plot(demo_predictions, color='r', label='Predictions')
plt.plot(y_demo, color='g', label='Targets')
plt.grid()
plt.title('Known company, old period')
plt.legend(loc='best')
plt.show()

In [None]:
# Predictions vs. targets for one randomly chosen company from the validation set.
selected_code = random.choice(validation_codes)
demo_rows = prices[prices.SecuritiesCode == selected_code]
X_demo = preprocessor.transform(
    demo_rows[feats].to_numpy(dtype=np.float64, na_value=np.nan)
)
y_demo = demo_rows.Target.to_numpy(dtype=np.float64, na_value=np.nan)
# The model output is mapped back through the target scaler before plotting.
demo_predictions = postprocessor.inverse_transform(
    best_et.predict(X_demo).reshape(-1, 1)
).flatten()
plt.figure(2, figsize=(9, 9))
plt.plot(demo_predictions, color='r', label='Predictions')
plt.plot(y_demo, color='g', label='Targets')
plt.grid()
plt.title('Unknown company, new period')
plt.legend(loc='best')
plt.show()

In [None]:
# Try every sampler seed from 3683 to 4999 and keep the one whose adjusted
# predictions score the highest Sharpe ratio on the validation frame.
# `SD` has to be the loop variable: `adjuster` reads it from module scope.
predictor = _predictor_with_adjuster
maxSD = 3683
max_score = None
for SD in tqdm(range(maxSD, 5000)):
    prices["pred"] = predictor(prices, best_et, preprocessor, postprocessor)
    score, buf = calc_spread_return_sharpe(add_rank(prices))
    # The first score is always taken; later ones only if strictly better.
    if max_score is None or score > max_score:
        max_score, maxSD = score, SD

print(f'{maxSD} Sharpe Ratio Score with adjuster -> {max_score}')
# Leave the winning seed in `SD` for the cells that follow.
SD = maxSD

In [None]:
%%time
# Submission loop: rank every batch served by the competition environment.
env = jpx_tokyo_market_prediction.make_env()
iter_test = env.iter_test()

# NOTE(review): the loop variable `prices` rebinds the module-level validation
# frame of the same name; after this cell `prices` is the last batch served.
for prices, options, financials, trades, secondary_prices, sample_prediction in iter_test:
    # Same derived features as were added to the training/validation frames.
    prices['Month'] = pd.to_datetime(prices['Date'], errors='coerce').dt.month
    prices['DayOfWeek'] = pd.to_datetime(prices['Date'], errors='coerce').dt.dayofweek
    prices['Open_Close'] = prices['Open'] / prices['Close']
    prices['High_Low'] = prices['High'] / prices['Low']
    # `predictor` and the seed `SD` it depends on were set in an earlier cell.
    prices.loc[:,"pred"] = predictor(prices, best_et, preprocessor, postprocessor)
    prices = add_rank(prices)
    # Copy the computed ranks onto the submission frame by SecuritiesCode.
    rank = prices.set_index('SecuritiesCode')['Rank'].to_dict()
    sample_prediction['Rank'] = sample_prediction['SecuritiesCode'].map(rank)
    env.predict(sample_prediction)