In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Print the full path of every file found under the Kaggle input directory.
input_file_paths = (
    os.path.join(folder, name)
    for folder, _subdirs, names in os.walk('/kaggle/input')
    for name in names
)
for input_file_path in input_file_paths:
    print(input_file_path)

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

In [None]:
# Preview the sample submission that ships with the example test files.
# NOTE: `df` is rebound to the training prices by a later cell (`df = load_data(PATH,FNAME)`).
df = pd.read_csv("/kaggle/input/jpx-tokyo-stock-exchange-prediction/example_test_files/sample_submission.csv")
df.head()

<span style="display:block;color:white;font-size:24px;background-color:#5499C7;font-weight:bold;">  1 | Introduction </span>

Success in any financial market requires one to identify solid investments. When a stock or derivative is undervalued, it makes sense to buy. If it's overvalued, perhaps it's time to sell. While these finance decisions were historically made manually by professionals, technology has ushered in new opportunities for retail investors. Data scientists, specifically, may be interested to explore quantitative trading, where decisions are executed programmatically based on predictions from trained models.

There are plenty of existing quantitative trading efforts used to analyze financial markets and formulate investment strategies. To create and execute such a strategy requires both historical and real-time data, which is difficult to obtain especially for retail investors. This competition will provide financial data for the Japanese market, allowing retail investors to analyze the market to the fullest extent.

Japan Exchange Group, Inc. (JPX) is a holding company operating one of the largest stock exchanges in the world, Tokyo Stock Exchange (TSE), and derivatives exchanges Osaka Exchange (OSE) and Tokyo Commodity Exchange (TOCOM). JPX is hosting this competition and is supported by AI technology company AlpacaJapan Co.,Ltd.

This competition will compare your models against real future returns after the training phase is complete. The competition will involve building portfolios from the stocks eligible for predictions (around 2,000 stocks). Specifically, each participant ranks the stocks from highest to lowest expected returns and is evaluated on the difference in returns between the top and bottom 200 stocks. You'll have access to financial data from the Japanese market, such as stock information and historical stock prices to train and test your model.

All winning models will be made public so that other participants can learn from the outstanding models. Excellent models also may increase the interest in the market among retail investors, including those who want to practice quantitative trading. At the same time, you'll gain your own insights into programmatic investment methods and portfolio analysis―and you may even discover you have an affinity for the Japanese market.

<span style="display:block;color:white;font-size:24px;background-color:#5499C7;font-weight:bold;"> 2 | Preprocessing  </span>

In [None]:
import warnings
warnings.filterwarnings("ignore")
import numpy as np
import pandas as pd
import os
from pathlib import Path
import seaborn as sb
import datetime
import matplotlib.pyplot as plt
import gc
import copy
from sklearn.linear_model import LinearRegression
from sklearn.ensemble import ExtraTreesRegressor
from sklearn.metrics import r2_score
import lightgbm as gb
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error
from sklearn.preprocessing import MinMaxScaler

In [None]:
def load_data(fpath,fname):
    """Read ``<fpath>/<fname>.csv`` into a DataFrame.

    Parameters
    ----------
    fpath : str
        Directory that contains the file.
    fname : str
        File name without the ``.csv`` extension.

    Returns
    -------
    pandas.DataFrame or None
        The loaded frame, or ``None`` (after printing a notice) when the
        file does not exist.
    """
    csv_path = os.path.join(fpath,fname+".csv")
    candidate = Path(csv_path)
    # Guard clause: report the missing file and hand back None.
    if not candidate.exists():
        print("No such file...")
        return None
    print(f"{candidate} is available")
    return pd.read_csv(csv_path)

In [None]:
#Set Path of Directory & Files
# Directory and base file name (no extension) of the training stock prices.
PATH = "/kaggle/input/jpx-tokyo-stock-exchange-prediction/train_files/"
FNAME = "stock_prices"

# load_data returns None when the file is missing, so `df` may be None here.
df = load_data(PATH,FNAME)

In [None]:
df.head()

In [None]:
print(f"Number of rows {df.shape[0]} and number of columns {df.shape[1]}")

In [None]:
#Load Supplement File
# Directory and base file name (no extension) of the supplemental stock prices.
SPATH="/kaggle/input/jpx-tokyo-stock-exchange-prediction/supplemental_files/"
SFNAME = "stock_prices"
# load_data returns None when the file is missing; .head() would then fail.
df_supplement = load_data(SPATH,SFNAME)
df_supplement.head()

In [None]:
#df = pd.concat([df,df_supplement],axis=0)
#df_temp = copy.copy(df)
#df.set_index("Date",inplace=True)
#df.tail()

In [None]:
# The concatenation with the supplemental prices is commented out in the
# previous cell, so this reports the shape of the training prices only.
print(f"Without concatenation. Number of rows {df.shape[0]} and number of columns {df.shape[1]}")

In [None]:
# NOTE: this rebinds PATH, which an earlier cell set to the train_files directory,
# to the supplemental_files directory.
PATH = "/kaggle/input/jpx-tokyo-stock-exchange-prediction/supplemental_files/"
SECOND_FNAME="secondary_stock_prices"
# load_data returns None when the file is missing; .head() would then fail.
df_secondary = load_data(PATH,SECOND_FNAME)
df_secondary.head()

In [None]:
def data_prep(data):
    """Cast, impute and clean a stock-price frame without touching the input.

    The work is done on a copy, so the frame passed by the caller keeps its
    original dtypes, missing values and columns.

    Steps
    -----
    1. ``Date`` -> datetime, ``SecuritiesCode`` -> nullable ``Int64``,
       price / volume / factor / dividend columns -> float.
    2. For each numeric column, print the percentage of missing values and
       fill them with ``0.0`` when at least 90 % are missing, otherwise with
       the column median.
    3. Replace +/-inf by NaN and drop every row that still holds a NaN in
       any column.
    4. Add ``Adj.Close`` = ``Close`` * ``AdjustmentFactor`` and drop
       ``AdjustmentFactor``.

    Parameters
    ----------
    data : pandas.DataFrame
        Must contain ``Date``, ``SecuritiesCode``, ``Open``, ``High``, ``Low``,
        ``Close``, ``Volume``, ``AdjustmentFactor`` and ``ExpectedDividend``.

    Returns
    -------
    pandas.DataFrame
        A new, cleaned frame.
    """
    num_col=["Open","High","Low","Close","Volume","AdjustmentFactor","ExpectedDividend"]

    # Copy first: the original version modified the caller's frame in place
    # (casts + imputation) and then continued on a different object.
    data = data.copy()
    data["Date"] = pd.to_datetime(data["Date"])
    data["SecuritiesCode"] = data["SecuritiesCode"].astype("Int64")
    data[num_col] = data[num_col].astype("float")

    # Detect missing values and impute.
    for col in num_col:
        per_miss = round(data[col].isnull().sum()*100/len(data),2)
        print(f"Missing Value in {col} is {per_miss} % ")
        # Almost-empty columns are zero-filled; the rest get the median.
        fill_value = 0.0 if per_miss >= 90 else data[col].median()
        data[col] = data[col].fillna(fill_value)

    data = data.replace([np.inf,-np.inf],np.nan).dropna()
    data["Adj.Close"]=data["Close"]*data["AdjustmentFactor"]
    data = data.drop("AdjustmentFactor",axis=1)

    print("\nPreporcessing of Data is over")
    return data

In [None]:
# Clean the training prices.
df_proc = data_prep(df)
# NOTE: this prints the row count of `df` (the frame passed in), not of `df_proc`.
print(f"Length {len(df)}")

The start and end dates in the dataset vary by stock. The majority of stocks start on 2017-01-04 and end on 2021-12-03. However, some stocks (for example, stock 4168) start on 2020-12-22 and end on 2021-12-03. We need to separate the stocks with only about one year of data from the stocks with about four years of data, and then preprocess the data further.

In [None]:
#stock_year={}
#stocks = [i for i in df_proc["SecuritiesCode"].unique()]
#for stock in stocks:
#    dt = df_proc[df_proc["SecuritiesCode"]==stock]
#    stock_year[stock] = [dt["Date"].iloc[0],dt["Date"].iloc[len(dt)-1]]
                        

In [None]:
def filter_data_by_year(data,stockyear,start_year=2017):
    """Drop stocks whose history starts after ``start_year``.

    Parameters
    ----------
    data : pandas.DataFrame
        Price frame with a ``SecuritiesCode`` column.
    stockyear : dict
        Maps a security code to a sequence whose first element is the first
        available date of that stock (anything ``pandas.Timestamp`` accepts).
        The mapping is only read, never modified, so the function can be
        called repeatedly with the same dictionary.
    start_year : int, default 2017
        Stocks whose first date falls in a later year are removed.

    Returns
    -------
    (list, pandas.DataFrame)
        The removed security codes and the filtered frame.
    """
    # pd.Timestamp() accepts Timestamp, datetime and date strings alike, so no
    # in-place conversion of the caller's dictionary is needed.
    dropped_stock=[
        key for key,val in stockyear.items()
        if pd.Timestamp(val[0]).year > start_year
    ]
    print("List of Stock with One Year Data.....\n")
    print(dropped_stock)
    data = data[~data["SecuritiesCode"].isin(dropped_stock)]
    return dropped_stock,data

In [None]:
#one_year_stock,df_proc = filter_data_by_year(df_proc,stock_year)

In [None]:
def detect_outliers(data,numeric_col,factor=3):
    """Replace extreme values by the column median (IQR fence rule).

    For every column in ``numeric_col`` the fences ``Q1 - factor*IQR`` and
    ``Q3 + factor*IQR`` are computed over the whole column; values outside
    the fences are replaced by the column median. NaN values are left as
    they are.

    The input frame is not modified; a cleaned copy is returned. The median
    is computed once, from the unmodified column, and used for both fences.

    Parameters
    ----------
    data : pandas.DataFrame
    numeric_col : iterable of str
        Columns to clean.
    factor : float, default 3
        Multiplier applied to the inter-quartile range.

    Returns
    -------
    pandas.DataFrame
    """
    data = data.copy()
    for col in numeric_col:
        values = data[col]
        q1 = values.quantile(0.25)
        q3 = values.quantile(0.75)
        spread = q3 - q1
        lower = q1 - factor*spread
        upper = q3 + factor*spread
        is_outlier = (values > upper) | (values < lower)
        data[col] = np.where(is_outlier,values.median(),values)
    return data

In [None]:
# Columns to be cleaned by detect_outliers; the submission loop reuses `num_col`.
num_col=["Open","High","Low","Volume","Close","Adj.Close","ExpectedDividend"]
df_proc_outliers = detect_outliers(df_proc,num_col)


In [None]:
df_proc_outliers.head()

<span style="display:block;color:white;font-size:24px;background-color:#5499C7;font-weight:bold;"> 3 | Data Visualization  </span>

In [None]:
#temp = copy.copy(df_proc_outliers)
#temp = temp.set_index("Date")
#fig,ax = plt.subplots(figsize=(12,4))
#ax.plot(temp[temp["SecuritiesCode"]==1301]["Adj.Close"],color="blue",label="1301",alpha=0.5)
#ax.plot(temp[temp["SecuritiesCode"]==1332]["Adj.Close"],color="red",label="1332",alpha=0.5)
#ax.plot(temp[temp["SecuritiesCode"]==1333]["Adj.Close"],color="green",label="1333",alpha=0.5)
#ax.plot(temp[temp["SecuritiesCode"]==1376]["Adj.Close"],color="orange",label="1376",alpha=0.5)
#ax.plot(temp[temp["SecuritiesCode"]==1377]["Adj.Close"],color="black",label="1377",alpha=0.5)
#plt.legend(loc='center left',bbox_to_anchor=(1,.5))
#plt.title("Adj.Closing Price")

In [None]:
#cols = [i for i in df_proc_outliers["SecuritiesCode"].unique()]
#areturn=[]
#fig,ax = plt.subplots(figsize=(12,4))
#step=20

#for i in range(0,step):
#        t1 = temp[temp["SecuritiesCode"]==cols[i]]
#        temp[f"Daily_Return_{cols[i]}"] = (t1["Close"]/t1["Close"].shift(1))-1
#        areturn.append(0.25*temp[f"Daily_Return_{cols[i]}"].sum())
#temp.dropna(inplace=True)    
#ax.plot(temp["Daily_Return_1301"],color="blue",alpha=0.5,label="1301")
#ax.plot(temp["Daily_Return_1332"],color="red",alpha=0.5,label="1332")
#ax.plot(temp["Daily_Return_1414"],color="green",alpha=0.5,label="1414")
#ax.plot(temp["Daily_Return_1435"],color="black",alpha=0.5,label="1435")
#ax.plot(temp["Daily_Return_1407"],color="orange",alpha=0.5,label="1407")
#plt.legend(loc='center left',bbox_to_anchor=(1,.5)) 
#plt.title("Daily Return")


In [None]:
   
#df = pd.DataFrame({"Annual_Return":areturn,"Stock":cols[0:step]})
#df = df.set_index("Stock")
#df = df.sort_values(by="Annual_Return",ascending=False)
#df.plot(kind="bar",title="Total Return (2017-2021)")
    

In [None]:
#!pip install finplot

In [None]:
#import finplot as fl
#temp = copy.copy(df_proc_outliers)
#temp = temp.set_index("Date")

In [None]:
#temp = temp[["SecuritiesCode","Open","High","Low","Close","Volume","Adj.Close"]]


#temp.head()

Stock market data can be plotted in three different ways:

1. **Using mplfinance package**
2. **Using finplot package**
3. **Using plotly**

### Using mplfinance package

In [None]:
#!pip install --upgrade mplfinance

In [None]:
#import mplfinance as mpl 
#temp = copy.copy(df_proc_outliers)
#temp = temp.set_index("Date")

In [None]:
#temp = temp[["SecuritiesCode","Open","High","Low","Close","Volume"]]

Comparing the patterns of 5 different stocks using a candlestick chart.

In [None]:
def plot_10_candle_chart(data,start,end):
    """Draw a candlestick chart with a volume panel for a slice of securities.

    The security codes are taken from the ``SecuritiesCode`` column in row
    order (one entry per row, duplicates included); positions ``start:end``
    of that list select which codes are kept and are shown in the title.

    NOTE(review): ``mpl`` is not bound by any active import in this notebook
    (the ``import mplfinance as mpl`` line is commented out), so calling this
    function as-is raises NameError.
    NOTE(review): presumably ``data`` must be indexed by date, as the
    commented-out cells that build ``temp`` do — confirm before use.
    """
    # One code per row of `data`, not the unique codes.
    cols = [i for i in data["SecuritiesCode"].to_list()]
    # Keep only the rows belonging to the selected codes.
    data = data[data["SecuritiesCode"].isin(cols[start:end])]
    fig = mpl.figure(figsize=(20,14),style='yahoo',tight_layout = True)
    #left,bottom,width,height
    ax_p = fig.add_axes([0.05,0.75,0.5,0.25])  # price panel
    ax_v = fig.add_axes([0.05,0.50,0.5,0.25])  # volume panel, directly below
    mpl.plot(data, type='candle', ax=ax_p, volume=ax_v, show_nontrading=False,
                     datetime_format='%a %d-%m-%y', xrotation=0)
    plt.title(f"{cols[start:end]}")

In [None]:
#plot_10_candle_chart(temp,0,5)

### Using Plotly

In [None]:
#import plotly.graph_objects as go
#codes = [code for code in df_proc_outliers["SecuritiesCode"].to_list()]
#for code in codes[0:1] :
#    df = df_proc_outliers[df_proc_outliers["SecuritiesCode"]==code]
#    fig = go.Figure(data=[go.Candlestick(x=df.index,
#                open=df['Open'], high=df['High'],
#                low=df['Low'], close=df['Close'])
#                     ])

 #   fig.update_layout(
 #                       title=f'Japan Stock Price | Code {code} ',
 #                       yaxis_title=f'{code} Stock Price',
 #   
 #                   )

 #   fig.show()

Let us plot **Bollinger Band** to understand the volatility.

In [None]:
#!pip install cufflinks

In [None]:
#!pip install chart_studio

In [None]:
#import cufflinks as cf
#from plotly.offline import iplot

In [None]:
#codes = [code for code in df_proc_outliers["SecuritiesCode"].to_list()]
#for code in codes[0:1] :
#    df = df_proc_outliers[df_proc_outliers["SecuritiesCode"]==code]
#    qf = cf.QuantFig(df,title=f"BB of Stock {code}")
#    qf.add_bollinger_bands()
#    qf.add_atr()
#    qf.add_rsi()
#    qf.add_cci()
#    cf.go_offline()
#    qf.iplot(theme='white')

**Effect of Closing Price** 

In [None]:
def effect_CLP(data,stock_1,stock_2):
    """Regress the closing price of ``stock_2`` on that of ``stock_1`` and plot the fit.

    When ``data`` has a ``Date`` column the two price series are paired on
    their common dates, so stocks with different histories are compared
    day by day. Without a ``Date`` column the series are paired by row
    position, which requires both stocks to have the same number of rows.

    Parameters
    ----------
    data : pandas.DataFrame
        Needs ``SecuritiesCode`` and ``Close`` columns (``Date`` optional).
    stock_1, stock_2 : security codes of the explanatory and the explained stock.

    Raises
    ------
    ValueError
        If there is no observation to fit on.
    """
    first = data[data["SecuritiesCode"]==stock_1]
    second = data[data["SecuritiesCode"]==stock_2]

    if "Date" in data.columns:
        # Inner join keeps only the days on which both stocks were quoted.
        paired = first[["Date","Close"]].merge(
            second[["Date","Close"]],on="Date",suffixes=("_1","_2")
        )
        X = paired["Close_1"].to_numpy().reshape(-1,1)
        Y = paired["Close_2"].to_numpy().reshape(-1,1)
    else:
        X = first["Close"].to_numpy().reshape(-1,1)
        Y = second["Close"].to_numpy().reshape(-1,1)

    if len(X) == 0:
        raise ValueError(f"No common observations for stocks {stock_1} and {stock_2}")

    model = LinearRegression()
    model.fit(X,Y)
    Y_hat = model.predict(X)
    r2=r2_score(Y,Y_hat)
    alpha = model.intercept_[0]
    beta = model.coef_[0, 0]

    fig,ax = plt.subplots()
    ax.scatter(X,Y,color='blue',edgecolor='k',alpha=0.6)
    ax.plot(X,Y_hat,color='red')
    ax.set_title(f"'{stock_2}' = {round(alpha,2)} + {round(beta,2)}*'{stock_1}'|R^2:{round(r2,2)}")

In [None]:
#effect_CLP(df_proc_outliers,1332,1333)

In [None]:
#effect_CLP(df_proc_outliers,1376,1377)

**Influence of different stocks**

**Top 30 Stock** in each sector.

<span style="display:block;color:white;font-size:24px;background-color:#5499C7;font-weight:bold;">  4 | Feature Engineering </span>

In [None]:
def generate_features(data,targetcol=True):
    """Add simple features to ``data`` IN PLACE and strip the identifier columns.

    Two features are added: ``weekday`` (day of week of ``Date``) and ``span``
    (``Close`` - ``Open``). Missing values are then filled with 0.0 column by
    column, and the columns ``RowId``, ``Date``, ``SecuritiesCode`` and
    ``SupervisionFlag`` (plus ``Target`` when ``targetcol`` is True) are
    dropped.

    The frame that is passed in is modified and is the same object that is
    returned.

    Rolling-window features (moving averages, volume averages / ratios /
    standard deviations, log-return volatility) were prototyped here earlier
    and are currently disabled.

    Returns
    -------
    ``(data, target, securitiescode, dates)`` when ``targetcol`` is True,
    otherwise ``(data, securitiescode, dates)``. ``securitiescode`` and
    ``dates`` are plain lists captured before the columns are dropped.
    """
    data["weekday"] = data["Date"].dt.weekday
    data["span"] = data["Close"]-data["Open"]

    # Remember the identifiers of every row before they are removed below.
    securitiescode = data["SecuritiesCode"].to_list()
    dates = data["Date"].to_list()

    for name in list(data.columns):
        data[name] = data[name].fillna(0.0)
    data.dropna(inplace=True)

    identifier_cols = ['RowId','Date','SecuritiesCode','SupervisionFlag']
    with_target = targetcol==True
    if with_target:
        target = data["Target"]
        data.drop(identifier_cols+['Target'],axis=1,inplace=True)
    else:
        data.drop(identifier_cols,axis=1,inplace=True)

    print(f"Length of Date: {len(dates)}")
    print(f"Length of Securties Code: {len(securitiescode)}")
    if with_target:
        print(f"Length of Target: {len(target)}")
        return data,target,securitiescode,dates
    return data,securitiescode,dates

In [None]:
# Build a {17-sector name: [security codes]} lookup from the stock list,
# leaving out the entries whose sector name is "-".
df_spec = pd.read_csv("/kaggle/input/jpx-tokyo-stock-exchange-prediction/stock_list.csv")
df_spec["17SectorName"] = df_spec["17SectorName"].str.strip()
sectorname = [name for name in df_spec["17SectorName"].unique() if name != "-"]
sectorwise_stock = {
    name: df_spec.loc[df_spec["17SectorName"]==name,"SecuritiesCode"].astype("int").to_list()
    for name in sectorname
}

<span style="display:block;color:white;font-size:24px;background-color:#5499C7;font-weight:bold;">  5 | Model Selection </span>

Remove highly correlated columns. 'Volatility60' and 'Volatility120' are highly correlated. 'Volatility40' and 'Volatility60' are highly correlated. 'Volatility10' and 'Volatility20' are highly correlated.

In [None]:
def regressor_baseline(X,Y):
    """Fit a baseline LightGBM regressor and plot its top feature importances.

    ``X`` must be a DataFrame (its column names label the importances).
    Features and target are both min-max scaled to [-1, 1]; the model is
    fitted and evaluated on the same rows, so the printed MSE / RMSE are
    training errors in the scaled target space.

    Returns
    -------
    pandas.DataFrame
        One row per feature with the columns ``Value`` (importance) and
        ``Feature`` (name), unsorted.
    """
    feature_names = X.columns

    lgbm_params = {
               'boosting': 'gbdt',
               'objective': 'regression',
               'reg_alpha': 0.011389929142769662,
               'reg_lambda': 1.660255383543303,
               'colsample_bytree': 0.9782098395588904,
               'subsample': 0.770465865631085,
               'learning_rate': 0.02464790052467249,
               'max_depth': 20,
               'num_leaves': 875,
               'min_child_samples': 274,
               'n_estimators':500
              }

    # One scaler object is fitted on the features and then re-fitted on the target.
    scaler = MinMaxScaler(feature_range=(-1,1))
    features = scaler.fit_transform(np.array(X))
    labels = scaler.fit_transform(np.array(Y).reshape(-1,1))

    model = gb.LGBMRegressor(**lgbm_params)
    model.fit(features,labels)

    fitted = model.predict(features)
    r2 = r2_score(labels,fitted)  # computed but not reported
    mse = mean_squared_error(labels,fitted)
    rmse = mse**0.5
    print(f"MSE {mse}")
    print(f"RMSE {rmse}")

    # Show the 10 most important features.
    top_n = 10
    feature_imp = pd.DataFrame({'Value':model.booster_.feature_importance(),'Feature':feature_names})
    ranked = feature_imp.sort_values(by="Value",ascending=False).head(top_n)
    plt.figure(figsize=(5,4))
    sb.barplot(x="Value", y="Feature", data=ranked)
    plt.title('Feature Importance')
    plt.tight_layout()
    plt.show()

    return feature_imp
    

In [None]:
%%time
#feature_imp = regressor_baseline(df_proc_outliers_features[numeric_cols],target)

In [None]:
#dt = feature_imp.sort_values(by="Value",ascending=False)[0:10]
#numeric_cols_updated = dt["Feature"].to_list()
#numeric_cols_updated

In [None]:
def lgbm_regressor(X,Y,price):
    """Train LightGBM on ``(X, Y)`` and predict the target for ``price``.

    Parameters
    ----------
    X : array-like, shape (n_samples, n_features)
        Training features.
    Y : array-like, shape (n_samples,)
        Training target.
    price : array-like, shape (n_rows, n_features)
        Rows to predict, with the same feature columns as ``X``.

    Returns
    -------
    numpy.ndarray, shape (n_rows, 1)
        Predictions expressed in the original target units.

    Notes
    -----
    The feature scaler is fitted on the training features only and then
    applied to ``price`` with ``transform``. Re-fitting it on ``price`` (as
    was done before) maps the prediction rows onto a different scale than
    the one the model was trained on. A second, independent scaler handles
    the target so that ``inverse_transform`` is unambiguous.
    The printed MSE / RMSE are training errors in the scaled target space.
    """
    X = np.array(X)
    Y = np.array(Y).reshape(-1,1)
    price = np.array(price)

    feature_scaler = MinMaxScaler(feature_range=(-1,1))
    target_scaler = MinMaxScaler(feature_range=(-1,1))
    X = feature_scaler.fit_transform(X)
    Y = target_scaler.fit_transform(Y)

    params = {
               'boosting': 'gbdt',
               'objective': 'regression',
               'reg_alpha': 0.011389929142769662,
               'reg_lambda': 1.660255383543303,
               'colsample_bytree': 0.9782098395588904,
               'subsample': 0.770465865631085,
               'learning_rate': 0.02464790052467249,
               'max_depth': 20,
               'num_leaves': 875,
               'min_child_samples': 274,
                'n_estimators':500
              }

    model = gb.LGBMRegressor(**params)
    # ravel(): the estimator expects a 1-D target.
    model.fit(X,Y.ravel())

    preds = model.predict(X)
    mse = mean_squared_error(Y,preds)
    rmse = mse**0.5
    print(f"Light GBM MSE {mse}")
    print(f"Light GBM RMSE {rmse}")

    # Nothing to predict (e.g. a sector without rows on this day).
    if len(price) == 0:
        return np.empty((0,1))

    price = feature_scaler.transform(price)
    preds = model.predict(price).reshape(-1,1)
    return target_scaler.inverse_transform(preds)

In [None]:
%%time
#preds_lgbm,test_y = lgbm_regressor(df_proc_outliers_features[numeric_cols_updated],target)

In [None]:
#lgbm = pd.Series([preds_lgbm[i][0] for i in range(len(preds_lgbm))],name="LGBM").to_frame()
#test = pd.Series([test_y[i][0] for i in range(len(test_y))],name="Actual").to_frame()
#output_lgbm = pd.concat([test,lgbm],axis=1)
#output_lgbm.head()

In [None]:
def extratrees_regressor(X,Y,price):
    """Train an ExtraTrees regressor on ``(X, Y)`` and predict the target for ``price``.

    Parameters
    ----------
    X : array-like, shape (n_samples, n_features)
        Training features.
    Y : array-like, shape (n_samples,)
        Training target.
    price : array-like, shape (n_rows, n_features)
        Rows to predict, with the same feature columns as ``X``.

    Returns
    -------
    numpy.ndarray, shape (n_rows, 1)
        Predictions expressed in the original target units.

    Notes
    -----
    The feature scaler is fitted on the training features only and applied
    to ``price`` with ``transform``; re-fitting it on ``price`` (as was done
    before) puts the prediction rows on a different scale than the training
    rows. The target has its own scaler, used for ``inverse_transform``.
    The printed MSE / RMSE are training errors in the scaled target space.
    No ``random_state`` is set, so results vary between runs.
    """
    X = np.array(X)
    Y = np.array(Y).reshape(-1,1)
    price = np.array(price)

    feature_scaler = MinMaxScaler(feature_range=(-1,1))
    target_scaler = MinMaxScaler(feature_range=(-1,1))
    X = feature_scaler.fit_transform(X)
    Y = target_scaler.fit_transform(Y)

    params = {
               'bootstrap': True,
               'oob_score': True,
               'n_jobs': -1,
               'max_features': 'sqrt',
                'max_depth': 20,
                'n_estimators':500
              }
    model = ExtraTreesRegressor(**params)
    # ravel(): the estimator expects a 1-D target.
    model.fit(X,Y.ravel())

    preds = model.predict(X)
    mse = mean_squared_error(Y,preds)
    rmse = mse**0.5
    print(f"EXTRA TREE MSE = {mse} ")
    print(f"EXTRA TREE RMSE = {rmse}")

    # Nothing to predict (e.g. a sector without rows on this day).
    if len(price) == 0:
        return np.empty((0,1))

    price = feature_scaler.transform(price)
    preds = model.predict(price).reshape(-1,1)
    return target_scaler.inverse_transform(preds)

In [None]:
# Module-level accumulators that sectorwise_stock_prediction extends
# (`actual` is only referenced from commented-out code).
lgbmpred, actual, extratrpred, securitiescode, dates = [], [], [], [], []

In [None]:
def sectorwise_stock_prediction(col,data,price,count,numeric_cols_updated):
    """Train both regressors on one sector and collect predictions for ``price``.

    Parameters
    ----------
    col : key of the module-level ``sectorwise_stock`` dict (a sector name).
    data : training frame; must still contain ``SecuritiesCode`` and the
        columns that generate_features drops (including ``Target``).
    price : frame with the rows to predict (no ``Target`` column needed).
    count : call counter; feature selection only runs when it is 0.
    numeric_cols_updated : list of selected feature names; an empty list on
        the first call triggers the selection.

    Returns
    -------
    The (possibly newly selected) ``numeric_cols_updated`` list.

    Side effects
    ------------
    Extends the module-level lists ``lgbmpred``, ``extratrpred``,
    ``securitiescode`` and ``dates`` with this sector's results, prints
    progress information and, on the first call, shows a correlation heatmap.

    NOTE(review): the later ``data[...]`` / ``price[...]`` column selections
    use feature columns, so they rely on generate_features having modified
    the filtered ``data`` / ``price`` frames in place.
    NOTE(review): the parameter ``col`` is rebound by the two ``for col in``
    loops below; it is not used as the sector name after them.
    """
    print(f">>>>>>>>{col}>>>>>>>>>>")    
    # Keep only the rows of the securities that belong to this sector.
    data = data[data["SecuritiesCode"].isin(sectorwise_stock[col])]
    price = price[price["SecuritiesCode"].isin(sectorwise_stock[col])]
    
    data_features,target,scode,dte = generate_features(data,True)
    price_features,price_scode,price_dte = generate_features(price,False)
    #print(f"Length of Date-1: {len(dte)}")
    #data_features.head()
    corr = data_features.corr()
    price_corr = price_features.corr()
    # Heatmap of the training-feature correlations, first call only.
    if count==0 and len(numeric_cols_updated)==0:
        plt.figure(figsize=(5,5))
        umask = np.triu(corr)
        sb.heatmap(corr,annot=True,fmt="0.1g",cmap="viridis",mask=umask)
        plt.show()
    threshold = 0.8 # to delete corr value above 0.8
    numeric_cols = [i for i in data_features.columns]
    price_numeric_cols = [i for i in price_features.columns]
    drop_numeric_cols=[]
    pair=[]
    price_pair=[]
    drop_price=[]
    # A column is marked for dropping when it correlates (|r| >= threshold) with
    # another column and has not yet been recorded in `pair`.
    for col in numeric_cols:
        for i in range(len(corr)):
            if abs(corr[col].iloc[i]) >= threshold and col != numeric_cols[i] :
                #print(f"{col} and {numeric_cols[i]} are highly correlated...") 
                if col not in pair:
                    pair.append(col)
                    pair.append(numeric_cols[i])
                    drop_numeric_cols.append(col) 
    
    
    # Same procedure on the prediction features; the result is only printed.
    for col in price_numeric_cols:
        for i in range(len(price_corr)):
            if abs(price_corr[col].iloc[i]) >= threshold and col != price_numeric_cols[i] :
                #print(f"{col} and {numeric_cols[i]} are highly correlated...") 
                if col not in price_pair:
                    price_pair.append(col)
                    price_pair.append(price_numeric_cols[i])
                    drop_price.append(col)         
    
    # set difference: the resulting column order is not guaranteed.
    numeric_cols = list(set(numeric_cols)-set(drop_numeric_cols))
    price_numeric_cols = list(set(price_numeric_cols)-set(drop_price))
    print(f"Dropping highly correlated columns in training : {drop_numeric_cols}")
    print(f"Dropping highly correlated columns in testing : {drop_price}")
    
    del drop_numeric_cols
    del drop_price
    del price_pair
    del pair
    gc.collect()
    
    # First call only: keep the 10 features the baseline model ranks highest.
    if count==0 and len(numeric_cols_updated)==0:
        feature_imp = regressor_baseline(data[numeric_cols],target)
        dt = feature_imp.sort_values(by="Value",ascending=False)[0:10]
        numeric_cols_updated = dt["Feature"].to_list()
    preds_lgbm = lgbm_regressor(data[numeric_cols_updated],target,price[numeric_cols_updated])
    # Flatten the (n, 1) prediction array into a plain list.
    lgbm = [preds_lgbm[i][0] for i in range(len(preds_lgbm))]
    #test = [test_y[i][0] for i in range(len(test_y))]
    print("=========Verify Length of Data ============")
    print(f"Length of Date: {len(price_dte)}")
    print(f"Length of Securties Code: {len(price_scode)}")
    print(f"Length of Pred: {len(lgbm)}")
    print("===========================================")
    extratr_preds = extratrees_regressor(data[numeric_cols_updated],target,price[numeric_cols_updated])
    extratree = [extratr_preds[i][0] for i in range(len(extratr_preds))]
    # Append this sector's results to the module-level accumulators.
    lgbmpred.extend(lgbm)
    #actual.extend(test)
    extratrpred.extend(extratree)
    securitiescode.extend(price_scode)
    dates.extend(price_dte)
    
    #result_lgbm["ENSEMBLE"] = 1.17*preds_lgbm + .35*extratrees_preds
    #mse = mean_squared_error(output_lgbm["Actual"],output_lgbm["ENSEMBLE"])
    #rmse = mse**0.5
    #print(f"Sector {col} RMSE :{rmse}")
    return numeric_cols_updated

In [None]:
#count=0
#numeric_cols_updated=[]
#for sectorname,code in sectorwise_stock.items():
#         numeric_cols_updated=sectorwise_stock_prediction(sectorname,df_proc_outliers,count,numeric_cols_updated)
#         count +=1

In [None]:
%%time
#extratrees_preds = extratrees_regressor(df_proc_outliers_features[numeric_cols_updated],target)

In [None]:
#extratree = pd.Series([extratrees_preds[i][0] for i in range(len(extratrees_preds))],name="EXTRA_TREES").to_frame()
#extratree

In [None]:
#output = pd.DataFrame({"Actual":actual,"LGBM":lgbmpred,"ExtraTree":extratrpred})
#output["ENSEMBLE"] = 1.17*output["LGBM"] + .35*output["ExtraTree"]
#mse = mean_squared_error(output["Actual"],output["ENSEMBLE"])
#rmse = mse**0.5
#rmse

### Rank the result

In [None]:
#output["RANK"] = output["ENSEMBLE"].rank(method = "dense",na_option="top",ascending=False).astype("int")
#output.head()


### Submission

In [None]:
import jpx_tokyo_market_prediction
env = jpx_tokyo_market_prediction.make_env()
iter_test = env.iter_test()

In [None]:
%%time
# Submission loop: for every batch served by the competition API, train the
# sector models, predict the batch and submit a ranking.
#
# Ranks are attached to the submission by SecuritiesCode. Assigning
# `output["Rank"]` directly to `sample_prediction["Rank"]` would align on the
# row index labels, which refer to different securities in the two frames.
# Ranks are also derived from the actual number of rows instead of a
# hard-coded 2000.
data = df_proc_outliers.copy()
count=0
numeric_cols_updated=[]
iterations=1
for (prices, options, financials, trades, secondary_prices, sample_prediction) in iter_test:
    print(sample_prediction.head())

    prices = data_prep(prices)
    prices = detect_outliers(prices,num_col)

    # Start every batch with empty accumulators; sectorwise_stock_prediction
    # extends these module-level lists.
    lgbmpred=[]
    extratrpred=[]
    securitiescode=[]
    dates=[]

    for sector in sectorwise_stock:
        numeric_cols_updated=sectorwise_stock_prediction(sector,data,prices,count,numeric_cols_updated)
        count +=1

    output = pd.DataFrame({"Date":dates,"SecuritiesCode":securitiescode,"LGBM":lgbmpred,"ExtraTree":extratrpred})
    # Fixed blending weights of the two models.
    output["ENSEMBLE"] = 1.17*output["LGBM"] + .35*output["ExtraTree"]

    # One prediction per security code (mean guards against duplicated codes).
    pred_by_code = output.groupby("SecuritiesCode")["ENSEMBLE"].mean()

    sample_prediction = sample_prediction.sort_values(by = "SecuritiesCode", ascending=True)
    ensemble_pred = sample_prediction["SecuritiesCode"].map(pred_by_code)
    # Highest prediction -> rank 0. method="first" keeps ranks unique on ties;
    # securities without a prediction are ranked last.
    sample_prediction["Rank"] = (
        ensemble_pred.rank(method="first",ascending=False,na_option="bottom").astype("int") - 1
    )
    submission = sample_prediction[["Date","SecuritiesCode","Rank"]]

    print(len(submission))
    print(submission.head())
    env.predict(submission)
    print(f">>>>>>>>>>>>>>Completed Iteration - {iterations}>>>>>>>>>>>>>")
    iterations+=1
    

In [None]:
#output_lgbm = pd.concat([output_lgbm,extratree],axis=1)
#output_lgbm["ENSEMBLE"] = 1.17*preds_lgbm + .35*extratrees_preds
#output_lgbm.head()

In [None]:
#mse = mean_squared_error(output_lgbm["Actual"],output_lgbm["ENSEMBLE"])
#rmse = mse**0.5
#rmse

In [None]:
#r2 = r2_score(output_lgbm["Actual"],output_lgbm["ENSEMBLE"])
#r2

In [None]:
#output_lgbm["RANK"] = output_lgbm["ENSEMBLE"].rank(method = "dense",na_option="top",ascending=False).astype("int")
#output_lgbm

Work in progress. This version contains changes made for testing.



**References**

Going to cite