# imports

In [1]:
import math
import os
from scipy import stats
import lightgbm as lgb
import pandas as pd
import numpy as np
import warnings
import random

warnings.filterwarnings('ignore')
pd.set_option('display.max_rows', None)

# data loading and preprocessing

In [2]:
financials1=pd.read_csv(r"G:\.shortcut-targets-by-id\1VYwE_M93mefT96L13j17FvaBdNSNnykZ\Kaggle\jpx-tokyo-stock-exchange-prediction\train_files\financials.csv")
financials2=pd.read_csv(r"G:\.shortcut-targets-by-id\1VYwE_M93mefT96L13j17FvaBdNSNnykZ\Kaggle\jpx-tokyo-stock-exchange-prediction\supplemental_files\financials.csv")
stock_prices1=pd.read_csv(r"G:\.shortcut-targets-by-id\1VYwE_M93mefT96L13j17FvaBdNSNnykZ\Kaggle\jpx-tokyo-stock-exchange-prediction\train_files\stock_prices.csv")
stock_prices2=pd.read_csv(r"G:\.shortcut-targets-by-id\1VYwE_M93mefT96L13j17FvaBdNSNnykZ\Kaggle\jpx-tokyo-stock-exchange-prediction\supplemental_files\stock_prices.csv")


stock_info=pd.read_csv(r"G:\.shortcut-targets-by-id\1VYwE_M93mefT96L13j17FvaBdNSNnykZ\Kaggle\jpx-tokyo-stock-exchange-prediction\stock_list.csv")

In [3]:
financials=pd.concat([financials1,financials2])
stock_prices=pd.concat([stock_prices1,stock_prices2])

In [4]:
def proprocessing_financials(financials):
    def finance_check(x):
        if "_Consolidated_" in x:
            return 1
        else:
            return 0
    
    
    financials["TypeOfDocument"]=financials["TypeOfDocument"].astype(str)
    financials["OperatingProfit"]=financials["OperatingProfit"].apply(convert_float)
    financials["NetSales"]=financials["NetSales"].apply(convert_float)
    financials['EquityToAssetRatio']= financials['EquityToAssetRatio'].apply(convert_float)
    
    
    financials["check"]=financials["TypeOfDocument"].apply(lambda x: 1 if "_Consolidated_" in x else 0)
    financials=financials[financials["check"]==1]
    financials=financials.fillna(0)
    financials=financials.replace([np.inf, -np.inf], 0)
    financials["OperationMargin"] = financials["OperatingProfit"]/financials["NetSales"]
    financials=financials.replace([np.inf, -np.inf], 0)
    financials["Statement_Date"]=1
   
    
    return financials[['Date', 'SecuritiesCode','OperationMargin','EquityToAssetRatio','Statement_Date']]

In [11]:
def convert_float(x):
    try:
        return float(x)
    except:
        return None
    
def seq_count(row):
    if pd.isna(row["close4"]):
        return None
    elif row["4_day_compare"]==0:
        return row["count"]*(-1)
    else:
        return row["count"]
    
def convert_str(x):
    try:
        return str(x)
    except:
        return None


In [6]:
def stock_info_preprocessing(stock_info):

    #keep 33 sector code to capture difference in industries, and issued shares in order to calculate daily transaction ration
    
    return stock_info[["SecuritiesCode","33SectorCode","IssuedShares"]]

In [7]:
def real_close(prices):
    prices=prices.sort_values("Date", ascending=False)
    prices["CumAdjustFactor"] = prices["AdjustmentFactor"].cumprod()
    prices["real_close"] = prices["CumAdjustFactor"]*prices["Close"]
    prices= prices.sort_values("Date")
    prices[prices["real_close"]==0]["real_close"] = np.nan
    prices["real_close"] = prices["real_close"].ffill()
    return prices

In [87]:
def price_preprocess(prices):
    prices.fillna(0)
    prices.replace([np.inf, -np.inf], 0)
    prices = prices.sort_values(["SecuritiesCode", "Date"])
    
    prices['volume_ratio']=prices["Volume"]/prices["IssuedShares"]  # transaction ratio
    prices['daily_price_change']=(prices["High"]-prices["Low"])/prices["Open"]  # the price voltaility  over a day
    prices["price/volume"]=prices["daily_price_change"]/prices["volume_ratio"] #  
    
    df=pd.DataFrame()
    stock_code_list=prices.SecuritiesCode.unique().tolist()
    


    
    for codei in stock_code_list:

        stocki_df=prices[prices["SecuritiesCode"]==codei]
        
     
        #daily price change AR(n)s
        stocki_df["daily_price_change1"]=stocki_df["daily_price_change"].shift(1)
        stocki_df["daily_price_change2"]=stocki_df["daily_price_change"].shift(2)
        stocki_df["daily_price_change3"]=stocki_df["daily_price_change"].shift(3)
        stocki_df["daily_price_change4"]=stocki_df["daily_price_change"].shift(4)
        
        
        # if a stock has a sequencial gain or loss compares with 4 days ago
        stocki_df["close4"]=stocki_df["real_close"].shift(4)
        stocki_df["4_day_compare"]=(stocki_df["real_close"]>=stocki_df["close4"]).astype(int)
        groups = (stocki_df['4_day_compare'] != stocki_df['4_day_compare'].shift()).cumsum()
        stocki_df["count"]=stocki_df.groupby(["4_day_compare",groups]).cumcount()+1
        stocki_df["seq_count"]=stocki_df.apply(seq_count,axis=1)

        #brein band 
        stocki_df["brein_up"] = stocki_df['real_close'].rolling(window=20).mean() +2*stocki_df['real_close'].rolling(window=20).std()

        stocki_df["brein_down"] = stocki_df['real_close'].rolling(window=20).mean() -2*stocki_df['real_close'].rolling(window=20).std()
        stocki_df["brein_width"] = stocki_df['real_close'].rolling(window=20).mean() +4*stocki_df['real_close'].rolling(window=20).std()
        #stocki_df["distance_brein_up"] = np.abs(stocki_df['real_close'] -stocki_df["brein_up"]) / stocki_df["brein_width"]
        #stocki_df["distance_brein_down"] = np.abs(stocki_df['real_close'] -stocki_df["brein_down"]) / stocki_df["brein_width"]


        # RSI -14days

        stocki_df["real_close_diff1"]=stocki_df["real_close"].diff(1)
        stocki_df["gain"]=stocki_df["real_close_diff1"].clip(lower=0)
        stocki_df["loss"]=stocki_df["real_close_diff1"].clip(upper=0).abs()
        stocki_df["avg_gain"]=stocki_df["gain"].rolling(window=14).mean()
        stocki_df["avg_loss"]=stocki_df["loss"].rolling(window=14).mean()
        #stocki_df["RS"]=stocki_df["avg_gain"]/stocki_df["avg_loss"]
        #stocki_df["RSI"]= 100-(100/(1+stocki_df["RS"]))
       



        

        #targets ARs

        stocki_df["future_close1"]=stocki_df["real_close"].shift(-1)
        stocki_df["future_close2"]=stocki_df["real_close"].shift(-2)
        stocki_df["calculated_target"]=(stocki_df["future_close2"] - stocki_df["future_close1"])/stocki_df["future_close1"]
        stocki_df["target_3"]=stocki_df["calculated_target"].shift(3)
        stocki_df["target_4"]=stocki_df["calculated_target"].shift(4)
        stocki_df["target_5"]=stocki_df["calculated_target"].shift(5)

     
        
        #price moving average ratio
        stocki_df['5_day_price_MA_ratio']  = stocki_df['real_close']/stocki_df['real_close'].rolling(window=5).mean()
        stocki_df['10_day_price_MA_ratio'] = stocki_df['real_close']/stocki_df['real_close'].rolling(window=10).mean()
        stocki_df['20_day_price_MA_ratio'] = stocki_df['real_close']/stocki_df['real_close'].rolling(window=20).mean()
        stocki_df['40_day_price_MA_ratio'] = stocki_df['real_close']/stocki_df['real_close'].rolling(window=40).mean()
        stocki_df['60_day_price_MA_ratio'] = stocki_df['real_close']/stocki_df['real_close'].rolling(window=60).mean()
        stocki_df['120_day_price_MA_ratio'] = stocki_df['real_close']/stocki_df['real_close'].rolling(window=120).mean()
        
        
        # price volatilities
        #stocki_df['5_day_price_volatility'] = np.log(stocki_df['real_close']).diff().rolling(window=5).std()
        #stocki_df['10_day_price_volatility'] = np.log(stocki_df['real_close']).diff().rolling(window=10).std()
        stocki_df['20_day_price_volatility'] = np.log(stocki_df['real_close']).diff().rolling(window=20).std()
        stocki_df['40_day_price_volatility'] = np.log(stocki_df['real_close']).diff().rolling(window=40).std()
        stocki_df['60_day_price_volatility'] = np.log(stocki_df['real_close']).diff().rolling(window=60).std()
        stocki_df['120_day_price_volatility'] = np.log(stocki_df['real_close']).diff().rolling(window=120).std()
        
        # return AR(n)s
        
        stocki_df["5_day_return"]=stocki_df["real_close"].pct_change(5)
        stocki_df["10_day_return"]=stocki_df["real_close"].pct_change(10)
        stocki_df["20_day_return"]=stocki_df["real_close"].pct_change(20)
        stocki_df["40_day_return"]=stocki_df["real_close"].pct_change(40)
        stocki_df["60_day_return"]=stocki_df["real_close"].pct_change(60)
        stocki_df["120_day_return"]=stocki_df["real_close"].pct_change(120)
     
       # volume moving average
        
        
        stocki_df["5_day_volume_ratio_MA"]=stocki_df["volume_ratio"].rolling(window=5).mean()
        stocki_df["10_day_volume_ratio_MA"]=stocki_df["volume_ratio"].rolling(window=10).mean()
        stocki_df["20_day_volume_ratio_MA"]=stocki_df["volume_ratio"].rolling(window=20).mean()
        stocki_df["40_day_volume_ratio_MA"]=stocki_df["volume_ratio"].rolling(window=40).mean()
      
       # volumn volatilities
        
        #stocki_df["5_day_volume_ratio_std"]=stocki_df["volume_ratio"].rolling(window=5).std()
        #stocki_df["10_day_volume_ratio_std"]=stocki_df["volume_ratio"].rolling(window=10).std()
        stocki_df["20_day_volume_ratio_std"]=stocki_df["volume_ratio"].rolling(window=20).std()
        stocki_df["40_day_volume_ratio_std"]=stocki_df["volume_ratio"].rolling(window=40).std()
        stocki_df["60_day_volume_ratio_std"]=stocki_df["volume_ratio"].rolling(window=60).std()
        stocki_df["120_day_volume_ratio_std"]=stocki_df["volume_ratio"].rolling(window=120).std()

       #
        
        

        df=pd.concat([df,stocki_df]) 

    df["distance_brein_up"] = round(np.abs(df['real_close'] -df["brein_up"]) / df["brein_width"],2)
    df["distance_brein_down"] = round(np.abs(df['real_close'] - df["brein_down"]) / df["brein_width"],2)

    df["RS"]=round(df["avg_gain"]/df["avg_loss"],2)
    df["RSI"]= round(100-(100/(1+df["RS"])),0)

        
    return df

In [88]:
def train_data_preprocess(stock_prices,stock_info,financials):
    stock_info=stock_info_preprocessing(stock_info)
    finance=proprocessing_financials(financials)
    prices=stock_prices.merge(stock_info,left_on=["SecuritiesCode"],right_on=["SecuritiesCode"],how="left")
    prices = prices.sort_values(["SecuritiesCode", "Date"])
    prices = prices.groupby("SecuritiesCode").apply(real_close).reset_index(drop=True)
    prices=price_preprocess(prices)
    prices=prices.merge(finance,left_on=["Date","SecuritiesCode"],right_on=["Date","SecuritiesCode"],how="left")
    prices['OperationMargin']=prices['OperationMargin'].fillna(method="ffill")
    prices['EquityToAssetRatio']=prices['EquityToAssetRatio'].fillna(method="ffill")
    prices["Volume"]=prices["Volume"].astype(float)
    prices["33SectorCode"]= prices["33SectorCode"].astype(int)
    prices["SupervisionFlag"]= prices["SupervisionFlag"].astype(int)
    
    return prices

In [90]:
data=train_data_preprocess(stock_prices,stock_info,financials)

In [38]:
#customized lightgmb loss funtion

def l2_loss(y, data):
    t = data.get_label()
    grad = y - t 
    hess = np.ones_like(y)
    return grad, hess

def l2_eval(y, data):
    t = data.get_label()
    loss = (y - t) ** 2 
    return 'l2', loss.mean(), False  

def feval_pearsonr(y_pred, lgb_train):
    y_true = lgb_train.get_label()
    return 'pearsonr', stats.pearsonr(y_true, y_pred)[0], True

# hyperpara tuning

In [95]:
from hyperopt import hp, fmin, tpe, Trials, partial
from hyperopt.early_stop import no_progress_loss
import os



In [96]:
def seed_everything(seed):
    os.environ['PYTHONHASHSEED'] = str(seed)
    np.random.seed(seed)

SEED=42
seed_everything(SEED)

In [119]:
# use sharpe ratio for hyperparameter tuning
def calc_spread_return_per_day(df, portfolio_size=200, toprank_weight_ratio=2):
    assert df['Rank'].min() == 0
    assert df['Rank'].max() == len(df['Rank']) - 1
    weights = np.linspace(start=toprank_weight_ratio, stop=1, num=portfolio_size)
    purchase = (df.sort_values(by='Rank')['Target'][:portfolio_size] * weights).sum() / weights.mean()
    short = (df.sort_values(by='Rank', ascending=False)['Target'][:portfolio_size] * weights).sum() / weights.mean()
    return purchase - short

def sharpe_ratio(preds,test):
    weights=np.linspace(2,1,200)
    temp_pred=test[['Date','SecuritiesCode','Target']].copy()
    temp_pred["PTarget"]=preds
    temp_pred["Rank"]=temp_pred.groupby(["Date"])["PTarget"].rank(ascending=False, method="first") - 1
    temp_pred["Rank"]=temp_pred["Rank"].astype(int)
    buf=temp_pred.groupby('Date').apply(calc_spread_return_per_day,portfolio_size=200, toprank_weight_ratio=2)
   
    return -buf.mean()/buf.std()

In [121]:
data["Date"].min(), data["Date"].max()

('2017-01-04', '2022-06-24')

In [112]:
#train validation split, randomly select 1400 stocks for training, 600 stocks for validating
code_list=data.SecuritiesCode.unique().tolist()
train_list=random.sample(code_list,1400)
validation_list=np.setxor1d(code_list,train_list)

In [113]:
train=data[data['SecuritiesCode'].isin(train_list)]
test=data[data['SecuritiesCode'].isin(validation_list)]

In [114]:
# define features and datasets

features=[ 'ExpectedDividend', 'Volume',"daily_price_change","daily_price_change1",
          "daily_price_change2","daily_price_change3","daily_price_change4",
          '33SectorCode','OperationMargin','EquityToAssetRatio','volume_ratio',"distance_brein_up","distance_brein_down","RSI",
          'seq_count','target_3','target_4','target_5',"SupervisionFlag",
          '5_day_price_MA_ratio',
          '10_day_price_MA_ratio', '20_day_price_MA_ratio',
          '40_day_price_MA_ratio', '60_day_price_MA_ratio',
          '120_day_price_MA_ratio', 
               
               '20_day_price_volatility','40_day_price_volatility',
               '60_day_price_volatility', '120_day_price_volatility',
                '5_day_return', '10_day_return',
               '20_day_return', '40_day_return', '60_day_return', '120_day_return',
               "5_day_volume_ratio_MA","10_day_volume_ratio_MA","20_day_volume_ratio_MA","40_day_volume_ratio_MA",
                '20_day_volume_ratio_std','40_day_volume_ratio_std',
               '60_day_volume_ratio_std', '120_day_volume_ratio_std',"price/volume"
               ]

tr_dataset = lgb.Dataset(train[features],train["Target"],feature_name = features,categorical_feature=['33SectorCode'],free_raw_data=False )


In [115]:
# define tuning object, minimize -sharpe ration
def object(params):

    params_lgb={'metric':'None',
            'objective': 'regression',
            'force_col_wise':True,
            'learning_rate': params["learning_rate"],
            'num_leaves': params["num_leaves"],
            'min_data_in_leaf':params['min_data_in_leaf'],
            #'max_depth': params["max_depth"],
            'subsample': params["subsample"],
            'feature_fraction': params['feature_fraction'],
            'bagging_fraction': params['bagging_fraction'],
            'lambda_l1': params['lambda_l1'],
            'lambda_l2': params['lambda_l2'],
          
            }  

    

    model = lgb.train(  params = params_lgb, 
                train_set = tr_dataset,                
                num_boost_round = 500, 
                feval=feval_pearsonr
               )
    
    preds=model.predict(test[features])

    return sharpe_ratio(preds,test)


# define parameter seach space

space = {
    'num_leaves':  hp.choice('num_leaves', range(50,100)),
    'min_data_in_leaf':  hp.choice('min_data_in_leaf', range(100,500)),
    'max_depth':  hp.choice('max_depth', range(3,10)),
    'feature_fraction':  hp.uniform('feature_fraction', 0.2, 0.9),
    'subsample':  hp.uniform('subsample', 0.5, 0.9),
    'bagging_fraction':  hp.uniform('bagging_fraction', 0.5, 0.9),    
    'learning_rate':  hp.uniform('learning_rate', 0.001, 0.1),
    'lambda_l1':  hp.uniform('lambda_l1', 0.0001, 1),
    'lambda_l2':  hp.uniform('lambda_l2', 0.0001, 1),
    
}  



In [116]:
# define tuning process
def param_hyperopt(max_evals,space):

    trials=Trials()

    early_stop_fn= no_progress_loss(200)

    params_best= fmin(object, space=space,algo=tpe.suggest, max_evals=max_evals,verbose=True,trials=trials,early_stop_fn=early_stop_fn)

    print("best params:", params_best)

    return params_best,trials



In [117]:
# start tuning
params_best,trails=param_hyperopt(3000,space)

[LightGBM] [Info] Total Bins 9845                       
[LightGBM] [Info] Number of data points in the train set: 1826086, number of used features: 44
[LightGBM] [Info] Start training from score 0.000367    
[LightGBM] [Info] Total Bins 9845                                                      
[LightGBM] [Info] Number of data points in the train set: 1826086, number of used features: 44
[LightGBM] [Info] Start training from score 0.000367                                   
[LightGBM] [Info] Total Bins 9845                                                     
[LightGBM] [Info] Number of data points in the train set: 1826086, number of used features: 44
[LightGBM] [Info] Start training from score 0.000367                                  
[LightGBM] [Info] Total Bins 9845                                                      
[LightGBM] [Info] Number of data points in the train set: 1826086, number of used features: 44
[LightGBM] [Info] Start training from score 0.000367                

In [118]:
# the best sharpe-ration on validation set is 0.47
params_best

{'bagging_fraction': 0.8129300896075456,
 'feature_fraction': 0.8997963192996467,
 'lambda_l1': 0.48667613936866305,
 'lambda_l2': 0.16960014569414728,
 'learning_rate': 0.09998532471948109,
 'max_depth': 0,
 'min_data_in_leaf': 185,
 'num_leaves': 47,
 'subsample': 0.5366895744256853}

In [80]:
object(params_best)

[LightGBM] [Info] Total Bins 8817
[LightGBM] [Info] Number of data points in the train set: 2370539, number of used features: 39
[LightGBM] [Info] Start training from score 0.000440


-0.05964972372031924

# Final sumission on kaggle

## imports and seeds

In [104]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
for dirname, _, filenames in os.walk('/kaggle/input'):
    for filename in filenames:
        print(os.path.join(dirname, filename))

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

('2017-01-04', '2022-06-24')

In [20]:
import math
import os
from scipy import stats
import lightgbm as lgb
import jpx_tokyo_market_prediction
import random

import warnings

warnings.filterwarnings('ignore')

def seed_everything(seed):
    os.environ['PYTHONHASHSEED'] = str(seed)
    np.random.seed(seed)

SEED=42
seed_everything(SEED)


## data preprocessing

In [21]:
def proprocessing_financials(financials):
    def finance_check(x):
        if "_Consolidated_" in x:
            return 1
        else:
            return 0
    
    
    financials["TypeOfDocument"]=financials["TypeOfDocument"].astype(str)
    financials["OperatingProfit"]=financials["OperatingProfit"].apply(convert_float)
    financials["NetSales"]=financials["NetSales"].apply(convert_float)
    financials['EquityToAssetRatio']= financials['EquityToAssetRatio'].apply(convert_float)
    
    
    financials["check"]=financials["TypeOfDocument"].apply(lambda x: 1 if "_Consolidated_" in x else 0)
    financials=financials[financials["check"]==1]
    financials=financials.fillna(0)
    financials=financials.replace([np.inf, -np.inf], 0)
    financials["OperationMargin"] = financials["OperatingProfit"]/financials["NetSales"]
    financials=financials.replace([np.inf, -np.inf], 0)
    financials["Statement_Date"]=1
   
    
    return financials[['Date', 'SecuritiesCode','OperationMargin','EquityToAssetRatio','Statement_Date']]

In [None]:
def convert_float(x):
    try:
        return float(x)
    except:
        return None
    
def seq_count(row):
    if pd.isna(row["close4"]):
        return None
    elif row["4_day_compare"]==0:
        return row["count"]*(-1)
    else:
        return row["count"]
    
def convert_str(x):
    try:
        return str(x)
    except:
        return None

In [43]:
def real_close(prices):
    prices=prices.sort_values("Date", ascending=False)
    prices["CumAdjustFactor"] = prices["AdjustmentFactor"].cumprod()
    prices["real_close"] = prices["CumAdjustFactor"]*prices["Close"]
    prices= prices.sort_values("Date")
    prices[prices["real_close"]==0]["real_close"] = np.nan
    prices["real_close"] = prices["real_close"].ffill()
    return prices

<lightgbm.basic.Booster at 0x237049da770>

In [18]:
def price_preprocess(prices):
    prices.fillna(0)
    prices.replace([np.inf, -np.inf], 0)
    prices = prices.sort_values(["SecuritiesCode", "Date"])
    
    prices['volume_ratio']=prices["Volume"]/prices["IssuedShares"]
    prices['daily_price_change']=(prices["High"]-prices["Low"])/prices["Open"]
    prices["price/volume"]=prices["daily_price_change"]/prices["volume_ratio"]
    
    df=pd.DataFrame()
    stock_code_list=prices.SecuritiesCode.unique().tolist()
    


    
    for codei in stock_code_list:

        stocki_df=prices[prices["SecuritiesCode"]==codei]
        
     
        #daily price change
        stocki_df["daily_price_change1"]=stocki_df["daily_price_change"].shift(1)
        stocki_df["daily_price_change2"]=stocki_df["daily_price_change"].shift(2)
        stocki_df["daily_price_change3"]=stocki_df["daily_price_change"].shift(3)
        stocki_df["daily_price_change4"]=stocki_df["daily_price_change"].shift(4)
        
        
        #
        stocki_df["close4"]=stocki_df["real_close"].shift(4)
        stocki_df["4_day_compare"]=(stocki_df["real_close"]>=stocki_df["close4"]).astype(int)
        groups = (stocki_df['4_day_compare'] != stocki_df['4_day_compare'].shift()).cumsum()
        stocki_df["count"]=stocki_df.groupby(["4_day_compare",groups]).cumcount()+1
        stocki_df["seq_count"]=stocki_df.apply(seq_count,axis=1)

        #brein band
        stocki_df["brein_up"] = stocki_df['real_close'].rolling(window=20).mean() +2*stocki_df['real_close'].rolling(window=20).std()

        stocki_df["brein_down"] = stocki_df['real_close'].rolling(window=20).mean() -2*stocki_df['real_close'].rolling(window=20).std()
        stocki_df["brein_width"] = stocki_df['real_close'].rolling(window=20).mean() +4*stocki_df['real_close'].rolling(window=20).std()
        #stocki_df["distance_brein_up"] = np.abs(stocki_df['real_close'] -stocki_df["brein_up"]) / stocki_df["brein_width"]
        #stocki_df["distance_brein_down"] = np.abs(stocki_df['real_close'] -stocki_df["brein_down"]) / stocki_df["brein_width"]


        # RSI -14

        stocki_df["real_close_diff1"]=stocki_df["real_close"].diff(1)
        stocki_df["gain"]=stocki_df["real_close_diff1"].clip(lower=0)
        stocki_df["loss"]=stocki_df["real_close_diff1"].clip(upper=0).abs()
        stocki_df["avg_gain"]=stocki_df["gain"].rolling(window=14).mean()
        stocki_df["avg_loss"]=stocki_df["loss"].rolling(window=14).mean()
        #stocki_df["RS"]=stocki_df["avg_gain"]/stocki_df["avg_loss"]
        #stocki_df["RSI"]= 100-(100/(1+stocki_df["RS"]))
       



        

        #targets ARs

        stocki_df["future_close1"]=stocki_df["real_close"].shift(-1)
        stocki_df["future_close2"]=stocki_df["real_close"].shift(-2)
        stocki_df["calculated_target"]=(stocki_df["future_close2"] - stocki_df["future_close1"])/stocki_df["future_close1"]
        stocki_df["target_3"]=stocki_df["calculated_target"].shift(3)
        stocki_df["target_4"]=stocki_df["calculated_target"].shift(4)
        stocki_df["target_5"]=stocki_df["calculated_target"].shift(5)

     
        
        #price moving average ratio
        stocki_df['5_day_price_MA_ratio']  = stocki_df['real_close']/stocki_df['real_close'].rolling(window=5).mean()
        stocki_df['10_day_price_MA_ratio'] = stocki_df['real_close']/stocki_df['real_close'].rolling(window=10).mean()
        stocki_df['20_day_price_MA_ratio'] = stocki_df['real_close']/stocki_df['real_close'].rolling(window=20).mean()
        stocki_df['40_day_price_MA_ratio'] = stocki_df['real_close']/stocki_df['real_close'].rolling(window=40).mean()
        stocki_df['60_day_price_MA_ratio'] = stocki_df['real_close']/stocki_df['real_close'].rolling(window=60).mean()
        stocki_df['120_day_price_MA_ratio'] = stocki_df['real_close']/stocki_df['real_close'].rolling(window=120).mean()
        
        
        # volatilities
        #stocki_df['5_day_price_volatility'] = np.log(stocki_df['real_close']).diff().rolling(window=5).std()
        #stocki_df['10_day_price_volatility'] = np.log(stocki_df['real_close']).diff().rolling(window=10).std()
        stocki_df['20_day_price_volatility'] = np.log(stocki_df['real_close']).diff().rolling(window=20).std()
        stocki_df['40_day_price_volatility'] = np.log(stocki_df['real_close']).diff().rolling(window=40).std()
        stocki_df['60_day_price_volatility'] = np.log(stocki_df['real_close']).diff().rolling(window=60).std()
        stocki_df['120_day_price_volatility'] = np.log(stocki_df['real_close']).diff().rolling(window=120).std()
        
        # return 
        
        stocki_df["5_day_return"]=stocki_df["real_close"].pct_change(5)
        stocki_df["10_day_return"]=stocki_df["real_close"].pct_change(10)
        stocki_df["20_day_return"]=stocki_df["real_close"].pct_change(20)
        stocki_df["40_day_return"]=stocki_df["real_close"].pct_change(40)
        stocki_df["60_day_return"]=stocki_df["real_close"].pct_change(60)
        stocki_df["120_day_return"]=stocki_df["real_close"].pct_change(120)
     
       # volume moving average
        
        
        stocki_df["5_day_volume_ratio_MA"]=stocki_df["volume_ratio"].rolling(window=5).mean()
        stocki_df["10_day_volume_ratio_MA"]=stocki_df["volume_ratio"].rolling(window=10).mean()
        stocki_df["20_day_volume_ratio_MA"]=stocki_df["volume_ratio"].rolling(window=20).mean()
        stocki_df["40_day_volume_ratio_MA"]=stocki_df["volume_ratio"].rolling(window=40).mean()
      
       # volumn volatilities
        
        #stocki_df["5_day_volume_ratio_std"]=stocki_df["volume_ratio"].rolling(window=5).std()
        #stocki_df["10_day_volume_ratio_std"]=stocki_df["volume_ratio"].rolling(window=10).std()
        stocki_df["20_day_volume_ratio_std"]=stocki_df["volume_ratio"].rolling(window=20).std()
        stocki_df["40_day_volume_ratio_std"]=stocki_df["volume_ratio"].rolling(window=40).std()
        stocki_df["60_day_volume_ratio_std"]=stocki_df["volume_ratio"].rolling(window=60).std()
        stocki_df["120_day_volume_ratio_std"]=stocki_df["volume_ratio"].rolling(window=120).std()

       #
        
        

        df=pd.concat([df,stocki_df]) 

    df["distance_brein_up"] = round(np.abs(df['real_close'] -df["brein_up"]) / df["brein_width"],2)
    df["distance_brein_down"] = round(np.abs(df['real_close'] - df["brein_down"]) / df["brein_width"],2)

    df["RS"]=round(df["avg_gain"]/df["avg_loss"],2)
    df["RSI"]= round(100-(100/(1+df["RS"])),0)

        
    return df

In [24]:
def train_data_preprocess(stock_prices,stock_info,financials):
    stock_info=stock_info_preprocessing(stock_info)
    finance=proprocessing_financials(financials)
    prices=stock_prices.merge(stock_info,left_on=["SecuritiesCode"],right_on=["SecuritiesCode"],how="left")
    prices = prices.sort_values(["SecuritiesCode", "Date"])
    prices = prices.groupby("SecuritiesCode").apply(real_close).reset_index(drop=True)
    prices=price_preprocess(prices)
    prices=prices.merge(finance,left_on=["Date","SecuritiesCode"],right_on=["Date","SecuritiesCode"],how="left")
    prices['OperationMargin']=prices['OperationMargin'].fillna(method="ffill")
    prices['EquityToAssetRatio']=prices['EquityToAssetRatio'].fillna(method="ffill")
    prices["Volume"]=prices["Volume"].astype(float)
    prices["33SectorCode"]= prices["33SectorCode"].astype(int)
    prices["SupervisionFlag"]= prices["SupervisionFlag"].astype(int)
    
    return prices
    

In [25]:
stock_prices=pd.concat([stock_prices,stock_prices2])
financials=pd.concat([financials,financials2])

In [46]:
data=train_data_preprocess(stock_prices,stock_info,financials)

## modeling training

In [None]:
# customized loss functions
def l2_loss(y, data):
    t = data.get_label()
    grad = y - t 
    hess = np.ones_like(y)
    return grad, hess

def l2_eval(y, data):
    t = data.get_label()
    loss = (y - t) ** 2 
    return 'l2', loss.mean(), False  

def feval_pearsonr(y_pred, lgb_train):
    y_true = lgb_train.get_label()
    return 'pearsonr', stats.pearsonr(y_true, y_pred)[0], True


In [None]:
#train validation split
code_list=data.SecuritiesCode.unique().tolist()
train_list=random.sample(code_list,1400)
validation_list=np.setxor1d(code_list,train_list)

In [None]:
features=[ 'ExpectedDividend', 'Volume',"daily_price_change","daily_price_change1",
          "daily_price_change2","daily_price_change3","daily_price_change4",
          '33SectorCode','OperationMargin','EquityToAssetRatio','volume_ratio',"distance_brein_up","distance_brein_down","RSI",
          'seq_count','target_3','target_4','target_5',"SupervisionFlag",
          '5_day_price_MA_ratio',
          '10_day_price_MA_ratio', '20_day_price_MA_ratio',
          '40_day_price_MA_ratio', '60_day_price_MA_ratio',
          '120_day_price_MA_ratio',     '20_day_price_volatility','40_day_price_volatility','60_day_price_volatility', '120_day_price_volatility','5_day_return', '10_day_return',
        '20_day_return', '40_day_return', '60_day_return', '120_day_return',
        "5_day_volume_ratio_MA","10_day_volume_ratio_MA","20_day_volume_ratio_MA","40_day_volume_ratio_MA",
        '20_day_volume_ratio_std','40_day_volume_ratio_std',
        '60_day_volume_ratio_std', '120_day_volume_ratio_std',"price/volume"
               ]


In [47]:
tr_dataset = lgb.Dataset(data[data['SecuritiesCode'].isin(train_list)][features],data[data['SecuritiesCode'].isin(train_list)]["Target"],feature_name = features,categorical_feature=['33SectorCode','SupervisionFlag'] )
vl_dataset = lgb.Dataset(data[data['SecuritiesCode'].isin(validation_list)][features], data[data['SecuritiesCode'].isin(validation_list)]["Target"],feature_name = features,categorical_feature=['33SectorCode','SupervisionFlag'])


params_lgb={'metric':'None',
'objective': 'regression',
'force_col_wise':True,
'bagging_fraction': 0.813,
 'feature_fraction': 0.90,
 'lambda_l1': 0.487,
 'lambda_l2': 0.170,
 'learning_rate': 0.1,
 #'max_depth': 6,
 'min_data_in_leaf': 185,
 'num_leaves': 47,
 'subsample': 0.537}


model = lgb.train(  params = params_lgb, 
                train_set = tr_dataset, 
                valid_sets = [tr_dataset, vl_dataset],                
                num_boost_round = 5000, 
                feval=feval_pearsonr,
                
                callbacks=[ lgb.early_stopping(stopping_rounds=300 , verbose=True), lgb.log_evaluation(period=100)]
               )

## submission

In [None]:
sample_submission = pd.read_csv("../input/jpx-tokyo-stock-exchange-prediction/example_test_files/sample_submission.csv")

env = jpx_tokyo_market_prediction.make_env()   # initialize the environment
iter_test = env.iter_test()    # an iterator which loops over the test files

In [None]:
stock_prices_all= stock_prices.copy()
financials_all=financials.copy()

In [None]:
counter=0

for (prices, options, financials, trades, secondary_prices, sample_prediction) in iter_test: 
    
    current_date=prices.Date.min()
    sample_prediction_date=sample_prediction["Date"].iloc[0]
    
    print(f"current_date: {current_date}, sample_prediction_date: {sample_prediction_date}")
    
    if counter==0:
        stock_prices_all=stock_prices_all[stock_prices_all["Date"]<current_date]
        financials_all=financials_all[financials_all["Date"]<current_date]
    
        
    stock_prices_all = pd.concat([stock_prices_all,prices])
    financials_all= pd.concat([financials_all, financials])
    
    data=train_data_preprocess(stock_prices_all.copy(),stock_info.copy(),financials_all.copy())
    data=data[data["Date"]==current_date]
    
    data['pred_target'] = model.predict(data[features])
    
    data["Rank"]=data["pred_target"].rank(ascending=False, method="first") - 1
    
    rank_map= data.set_index('SecuritiesCode')['Rank'].to_dict()
    sample_prediction['Rank'] = sample_prediction['SecuritiesCode'].map(rank_map)
    sample_prediction["Rank"] = sample_prediction["Rank"].astype(int)
    
    assert sample_prediction["Rank"].notna().all()
    assert sample_prediction["Rank"].min() == 0
    assert sample_prediction["Rank"].max() == len(sample_prediction["Rank"]) - 1

    # register your predictions
    env.predict(sample_prediction)
    counter+=1

In [None]:
! head submission.csv