In [None]:
import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)
import os
import xgboost as xgb
import jpx_tokyo_market_prediction
import warnings; warnings.filterwarnings("ignore")
from sklearn.preprocessing import LabelEncoder
label_encoder = LabelEncoder()
import gc
from decimal import ROUND_HALF_UP, Decimal
from datetime import datetime, timedelta

In [None]:
# Training-period inputs: primary-market prices, secondary-market prices,
# the stock master list and the financial statements.
prices = pd.read_csv("../input/jpx-tokyo-stock-exchange-prediction/train_files/stock_prices.csv")
sprices = pd.read_csv("../input/jpx-tokyo-stock-exchange-prediction/train_files/secondary_stock_prices.csv")
stock_list = pd.read_csv("../input/jpx-tokyo-stock-exchange-prediction/stock_list.csv")
financials = pd.read_csv("../input/jpx-tokyo-stock-exchange-prediction/train_files/financials.csv")
# Supplemental files for the final run; they are appended to the training
# frames above in the "Combine files" cell.
supplemental_prices = pd.read_csv("../input/jpx-tokyo-stock-exchange-prediction/supplemental_files/stock_prices.csv")
supplemental_sprices = pd.read_csv("../input/jpx-tokyo-stock-exchange-prediction/supplemental_files/secondary_stock_prices.csv")
supplemental_financials = pd.read_csv("../input/jpx-tokyo-stock-exchange-prediction/supplemental_files/financials.csv")

**Combine files**

In [None]:
# Stack the primary, secondary and supplemental price files into one frame.
# pd.concat is used because DataFrame.append was deprecated in pandas 1.4 and
# removed in pandas 2.0; a single concat also avoids three intermediate copies.
prices = pd.concat(
    [prices, sprices, supplemental_prices, supplemental_sprices],
    ignore_index=True,
)
financials = pd.concat([financials, supplemental_financials], ignore_index=True)

# The source frames are no longer needed once they are part of the combined frames.
del supplemental_financials
del supplemental_sprices
del supplemental_prices
del sprices

# RowId and ExpectedDividend are not used by the feature pipeline; rows that
# still contain any missing value are discarded.
prices = prices.drop(['RowId', 'ExpectedDividend'], axis=1)
prices = prices.dropna()
gc.collect()

In [None]:
def adjust_price(price):
    """
    Add split-adjusted closing prices to a stock price table.

    Args:
        price (pd.DataFrame): stock prices with at least the columns Date
            (parseable as "%Y-%m-%d"), SecuritiesCode, Close and
            AdjustmentFactor. The caller's frame is left untouched.
    Returns:
        price DataFrame (pd.DataFrame): a new frame sorted by SecuritiesCode
            and Date with a fresh RangeIndex, Date rendered as "%Y-%m-%d"
            strings, and the added columns CumulativeAdjustmentFactor and
            AdjustedClose.
    """
    def round_half_up(value):
        # Round to one decimal with ROUND_HALF_UP (not banker's rounding).
        return float(
            Decimal(str(value)).quantize(Decimal('0.1'), rounding=ROUND_HALF_UP)
        )

    # Build a new frame instead of writing through `.loc[:, "Date"]`: that form
    # mutates the caller's data and, on pandas 2.x, keeps the column as object
    # dtype so the `.dt` accessor below would fail.
    price = price.assign(Date=pd.to_datetime(price["Date"], format="%Y-%m-%d"))

    # Newest row first within each security so the cumulative product of the
    # adjustment factors runs backwards in time.
    price = price.sort_values(["SecuritiesCode", "Date"], ascending=[True, False])
    price["CumulativeAdjustmentFactor"] = (
        price.groupby("SecuritiesCode")["AdjustmentFactor"].cumprod()
    )
    price["AdjustedClose"] = (
        price["CumulativeAdjustmentFactor"] * price["Close"]
    ).map(round_half_up)

    # Back to chronological order.
    price = price.sort_values(["SecuritiesCode", "Date"]).reset_index(drop=True)

    # A zero adjusted close is treated as missing and filled with the previous
    # valid value of the same security.
    price.loc[price["AdjustedClose"] == 0, "AdjustedClose"] = np.nan
    price["AdjustedClose"] = price.groupby("SecuritiesCode")["AdjustedClose"].ffill()

    price["Date"] = price["Date"].dt.strftime("%Y-%m-%d")
    return price

**RSI (Relative Strength Index)**

In [None]:
#rsi
def RSI_create(df,period):
    """
    Compute the Relative Strength Index of AdjustedClose for every security.

    Args:
        df (pd.DataFrame): rows with the columns Date, SecuritiesCode and
            AdjustedClose, in chronological order within each security.
        period (int): look-back length used to seed and smooth the averages.
    Returns:
        pd.DataFrame: columns Date, SecuritiesCode and rsi, keeping the index
            labels of `df`. Rows are grouped by security in order of first
            appearance. rsi is NaN for the warm-up rows and for securities
            with fewer than `period` price changes.
    """
    def RSI(series,period):
        delta = series.diff().dropna()
        # Not enough history to seed the averages: report NaN explicitly
        # instead of relying on a bare `except` around an IndexError.
        if len(delta) < period:
            return pd.Series(np.nan, index=series.index, dtype=float)
        gains = delta.where(delta > 0, 0.0)
        losses = (-delta).where(delta < 0, 0.0)
        # Seed the smoothing with the simple mean of the first `period`
        # values, then discard the rows before the seed.
        gains.iloc[period - 1] = gains.iloc[:period].mean()
        gains = gains.iloc[period - 1:]
        losses.iloc[period - 1] = losses.iloc[:period].mean()
        losses = losses.iloc[period - 1:]
        rs = gains.ewm(com=period - 1, adjust=False).mean() / \
             losses.ewm(com=period - 1, adjust=False).mean()
        return 100 - 100 / (1 + rs)

    # Collect one frame per security and concatenate once; growing a frame
    # with DataFrame.append is quadratic and was removed in pandas 2.0.
    parts = []
    for _, group in df.groupby('SecuritiesCode', sort=False):
        rsi = RSI(group['AdjustedClose'], period)#5 19 best
        # assign() aligns on the index, so warm-up rows become NaN.
        parts.append(group[['Date', 'SecuritiesCode']].assign(rsi=rsi))
    if not parts:
        return pd.DataFrame(columns=['Date', 'SecuritiesCode', 'rsi'])
    return pd.concat(parts)

def rsi_class(x):
    """
    Bucket an RSI value into "hi" (above 70), "med" (above 50) or "low".

    Anything that is not above 50, including exactly 50 and NaN, is "low".
    """
    if x > 70:
        return "hi"
    if x > 50:
        return "med"
    return "low"

In [None]:
def reduce_memory(train_data):
    """
    Downcast the numeric columns of a DataFrame in place to save memory.

    Integer columns (signed or unsigned) are cast to the smallest signed
    integer type whose range strictly contains the column's min and max.
    Float columns are cast to float16, else float32, else float64 by the same
    range test. The float test looks only at range, not precision, so values
    cast to float16/float32 may be rounded.

    Columns with any other dtype (object/str, bool, datetime, category,
    pandas extension dtypes) are left untouched rather than being compared
    against float limits, which raises for non-numeric values.

    Args:
        train_data (pd.DataFrame): frame to shrink; modified in place.
    Returns:
        pd.DataFrame: the same object that was passed in.
    """
    start_mem = train_data.memory_usage().sum() / 1024**2
    print('Memory usage of dataframe is {:.2f} MB'.format(start_mem))

    int_candidates = (np.int8, np.int16, np.int32, np.int64)
    float_candidates = (np.float16, np.float32)

    for col in train_data.columns:
        col_type = train_data[col].dtype
        # Only plain numpy integer/float columns are downcast.
        if not isinstance(col_type, np.dtype):
            continue
        is_int = np.issubdtype(col_type, np.integer)
        is_float = np.issubdtype(col_type, np.floating)
        if not (is_int or is_float):
            continue

        c_min = train_data[col].min()
        c_max = train_data[col].max()
        if is_int:
            # No candidate fits (e.g. huge uint64 values): keep the dtype.
            for candidate in int_candidates:
                limits = np.iinfo(candidate)
                if c_min > limits.min and c_max < limits.max:
                    train_data[col] = train_data[col].astype(candidate)
                    break
        else:
            # An all-NaN column fails every comparison and stays float64.
            target = np.float64
            for candidate in float_candidates:
                limits = np.finfo(candidate)
                if c_min > limits.min and c_max < limits.max:
                    target = candidate
                    break
            train_data[col] = train_data[col].astype(target)

    end_mem = train_data.memory_usage().sum() / 1024**2
    print('Memory usage after optimization is: {:.2f} MB'.format(end_mem))
    print('Decreased by {:.1f}%'.format(100 * (start_mem - end_mem) / start_mem))

    return train_data

In [None]:
def create_tables(df):
    """
    Pivot the long price table into five Date x SecuritiesCode tables.

    Returns a tuple (closes, opens, highs, lows, volumes) built from the
    columns AdjustedClose, Open, High, Low and Volume respectively.
    """
    def wide(field):
        return pd.pivot_table(df, values=field, index="Date", columns="SecuritiesCode")

    return tuple(wide(field) for field in ('AdjustedClose', "Open", "High", "Low", "Volume"))
def atr(highs,lows,closes,i):
    """
    Average True Range over `i` rows, returned in long format.

    The true range of a row is the largest of: high - low,
    |high - previous close| and |low - previous close|. The inputs are
    Date x SecuritiesCode tables; the result has one row per non-missing
    value with the columns Date, SecuritiesCode and atr_{i}.
    """
    previous_close = closes.shift(1)
    day_range = highs - lows
    gap_from_high = abs(highs - previous_close)
    gap_from_low = abs(lows - previous_close)
    # Element-wise maximum of the three candidates; a NaN in any of them
    # (e.g. the first row, which has no previous close) gives NaN.
    true_range = pd.DataFrame(
        np.max([day_range, gap_from_high, gap_from_low], axis = 0),
        index = day_range.index,
        columns = day_range.columns,
    )
    averaged = true_range.rolling(i).mean()
    long_format = pd.melt(averaged, ignore_index=False).reset_index()
    return long_format.rename(columns = {"value":f"atr_{i}"}).dropna()

In [None]:
def feature_stock_list(df):
    """
    Build static per-security features from the stock master list.

    Args:
        df (pd.DataFrame): stock list with the columns SecuritiesCode,
            Section/Products, 33SectorCode, 17SectorCode,
            NewIndexSeriesSizeCode, IssuedShares and MarketCapitalization.
            The caller's frame is not modified.
    Returns:
        pd.DataFrame: SecuritiesCode, the three sector/size codes as int64,
            IssuedShares and MarketCapitalization with gaps filled by the
            column mean, and Section/ProductsCode (label-encoded
            Section/Products).
    """
    sector_cols = ['33SectorCode','17SectorCode','NewIndexSeriesSizeCode']

    # Explicit copy: every assignment below targets `out`. Chained
    # `df[col].fillna(..., inplace=True)` on a column slice does nothing
    # under pandas copy-on-write.
    out = df[['SecuritiesCode','Section/Products','33SectorCode','17SectorCode','NewIndexSeriesSizeCode','IssuedShares','MarketCapitalization']].copy()

    # '-' cells are treated as missing values.
    out[out == '-'] = np.nan
    for col in ('IssuedShares', 'MarketCapitalization'):
        out[col] = out[col].fillna(out[col].mean())
    # Any remaining gap (the sector/size codes) becomes 0.
    out = out.fillna(0)

    out['Section/ProductsCode'] = LabelEncoder().fit_transform(out['Section/Products'])
    out = out.drop('Section/Products', axis=1)
    out[sector_cols] = out[sector_cols].astype(np.int64)
    return out

In [None]:
def adjust_financial(df_financial):
    """
    Clean the raw financial statements and align them to usable dates.

    Steps:
      * keep rows that have a DisclosedTime and whose TypeOfDocument contains
        'FinancialStatements_Consolidated';
      * move Date one calendar day forward when the disclosure (converted
        from DisclosedUnixTime to Asia/Tokyo) happened at 15:00 or later,
        since that information cannot be used on the same day;
      * coerce the numeric statement columns to numbers, turning anything
        unparsable into 0.

    NOTE(review): the one-day shift is a calendar day, so it can land on a
    date with no price row; confirm whether a trading calendar should be
    used instead.

    Args:
        df_financial (pd.DataFrame): raw statements; not modified.
    Returns:
        pd.DataFrame: the filtered rows (original index labels kept) with
            Date as "%Y-%m-%d" strings, SecuritiesCode, the cleaned numeric
            columns, AverageNumberOfShares and DisclosedDate.
    """
    numeric_cols = ['EarningsPerShare', 'ForecastEarningsPerShare', 'NetSales', 'Profit', 'ForecastProfit',
                    'EquityToAssetRatio', 'ForecastDividendPerShareAnnual',
                    'OrdinaryProfit', 'BookValuePerShare', 'Equity', 'TotalAssets', 'OperatingProfit',
                    'ForecastOperatingProfit', 'ForecastOrdinaryProfit', 'ForecastNetSales']

    # Filter first and copy, so none of the assignments below write into the
    # caller's frame or into a slice of it.
    has_time = df_financial['DisclosedTime'].notna()
    # na=False: a missing TypeOfDocument is simply not a consolidated report.
    is_consolidated = df_financial['TypeOfDocument'].str.contains('FinancialStatements_Consolidated', na=False)
    out = df_financial.loc[has_time & is_consolidated].copy()

    out['Date'] = pd.to_datetime(out['Date'])

    # If disclosure time >= 15:00, we can't use the information on the day
    disclosed_local = (
        pd.to_datetime(out['DisclosedUnixTime'], unit = 's')
        .dt.tz_localize('utc')
        .dt.tz_convert('Asia/Tokyo')
    )
    after_close = disclosed_local.dt.hour >= 15
    out['Date'] = out['Date'] + pd.to_timedelta(after_close.astype(int), unit = 'D')

    for col in numeric_cols:
        out[col] = pd.to_numeric(out[col], errors = 'coerce').fillna(0)

    out['Date'] = out['Date'].dt.strftime("%Y-%m-%d")
    return out[['Date', 'SecuritiesCode', 'EarningsPerShare', 'ForecastEarningsPerShare', 'NetSales', 'Profit', 'ForecastProfit',
                'EquityToAssetRatio', 'ForecastDividendPerShareAnnual',
                'OrdinaryProfit', 'BookValuePerShare','Equity','TotalAssets','OperatingProfit',
                'ForecastOperatingProfit','ForecastOrdinaryProfit','ForecastNetSales','AverageNumberOfShares','DisclosedDate']]
def create_financial_features(df_financial):
    """
    Derive ratio, change and forecast-surprise features from cleaned statements.

    Added columns, in this order (the order feeds the model's input matrix):
      * DaysSinceDisclosure, ForecastDividend, ForecastEarnings, eps_feps,
        total_liability;
      * r_/f_ expense1..3: step-wise differences between sales, operating,
        ordinary and net profit (reported / forecast);
      * r_/f_ pm, roe, roa 1..3: profit (net, ordinary, operating) over
        sales, equity and total assets;
      * r_/f_ cost1..3, r_turn, f_turn, equity_ratio;
      * d_<col>: row-to-row change of each base and ratio column within a
        security;
      * m_<name>: reported value minus the previous row's forecast within a
        security.
    Infinite or undefined results (e.g. division by zero) are stored as 0.

    Args:
        df_financial (pd.DataFrame): output of adjust_financial; rows whose
            AverageNumberOfShares is '－' are excluded. Not modified.
    Returns:
        pd.DataFrame: feature table without DisclosedDate, rows grouped by
            security in order of first appearance, numeric columns downcast
            by reduce_memory.
    """
    def _finite_or_zero(values):
        """Replace +/-inf and NaN with 0."""
        return values.replace([np.inf, -np.inf], np.nan).fillna(0)

    # Filter into a copy rather than dropping rows from the caller's frame.
    features = df_financial[df_financial['AverageNumberOfShares'] != '－'].copy()
    features['AverageNumberOfShares'] = features['AverageNumberOfShares'].astype(float)
    # Days Since Disclosure
    features['DaysSinceDisclosure'] = (pd.to_datetime(features['Date']) - pd.to_datetime(features['DisclosedDate'])).dt.days
    features = features.loc[:, ~features.columns.duplicated()].copy()

    # Base Amount Features
    features['ForecastDividend'] = features['AverageNumberOfShares'] * features['ForecastDividendPerShareAnnual']
    features['ForecastEarnings'] = features['AverageNumberOfShares'] * features['ForecastEarningsPerShare']
    features = features.drop('DisclosedDate', axis=1)
    features['eps_feps'] = _finite_or_zero(features['EarningsPerShare'] / features['ForecastEarningsPerShare'])
    features['total_liability'] = _finite_or_zero(features['TotalAssets'] - features['Equity'])

    # (prefix, sales, operating profit, ordinary profit, net profit)
    statements = (
        ("r", "NetSales", "OperatingProfit", "OrdinaryProfit", "Profit"),
        ("f", "ForecastNetSales", "ForecastOperatingProfit", "ForecastOrdinaryProfit", "ForecastProfit"),
    )

    for prefix, sales, operating, ordinary, net in statements:
        features[f"{prefix}_expense1"] = _finite_or_zero(features[sales] - features[operating])
        features[f"{prefix}_expense2"] = _finite_or_zero(features[operating] - features[ordinary])
        features[f"{prefix}_expense3"] = _finite_or_zero(features[ordinary] - features[net])

    # Level 1 = net profit, 2 = ordinary profit, 3 = operating profit.
    # Forecast ratios use the reported Equity / TotalAssets as denominators.
    for level, profit_col in enumerate(("Profit", "OrdinaryProfit", "OperatingProfit"), start=1):
        forecast_col = "Forecast" + profit_col
        features[f"r_pm{level}"] = _finite_or_zero(features[profit_col] / features["NetSales"])
        features[f"r_roe{level}"] = _finite_or_zero(features[profit_col] / features["Equity"])
        features[f"r_roa{level}"] = _finite_or_zero(features[profit_col] / features["TotalAssets"])
        features[f"f_pm{level}"] = _finite_or_zero(features[forecast_col] / features["ForecastNetSales"])
        features[f"f_roe{level}"] = _finite_or_zero(features[forecast_col] / features["Equity"])
        features[f"f_roa{level}"] = _finite_or_zero(features[forecast_col] / features["TotalAssets"])

    for prefix, sales, operating, ordinary, net in statements:
        features[f"{prefix}_cost1"] = _finite_or_zero((features[sales] - features[operating]) / features[sales])
        features[f"{prefix}_cost2"] = _finite_or_zero((features[operating] - features[ordinary]) / features[sales])
        features[f"{prefix}_cost3"] = _finite_or_zero((features[ordinary] - features[net]) / features[sales])

    features["r_turn"] = _finite_or_zero(features["NetSales"] / features["TotalAssets"])
    features["f_turn"] = _finite_or_zero(features["ForecastNetSales"] / features["TotalAssets"])
    features["equity_ratio"] = _finite_or_zero(features["Equity"] / features["TotalAssets"])

    feat2 = ["NetSales", "OperatingProfit", "OrdinaryProfit", "Profit", "ForecastNetSales", "ForecastOperatingProfit", "ForecastOrdinaryProfit", "ForecastProfit",
             "r_expense1", "r_expense2", "r_expense3", "f_expense1", "f_expense2", "f_expense3",
             "TotalAssets", "Equity"]
    feat3 = ["r_pm1", "r_roe1", "r_roa1", "f_pm1", "f_roe1", "f_roa1",
             "r_pm2", "r_roe2", "r_roa2", "f_pm2", "f_roe2", "f_roa2",
             "r_pm3", "r_roe3", "r_roa3", "f_pm3", "f_roe3", "f_roa3",
             "r_cost1", "r_cost2", "r_cost3", "f_cost1", "f_cost2", "f_cost3",
             "r_turn", "f_turn", "equity_ratio"]

    # (new column, reported column, forecast column)
    surprise_specs = [
        ("m_sales", "NetSales", "ForecastNetSales"),
        ("m_ope_income", "OperatingProfit", "ForecastOperatingProfit"),
        ("m_ord_income", "OrdinaryProfit", "ForecastOrdinaryProfit"),
        ("m_net_income", "Profit", "ForecastProfit"),
    ]
    surprise_specs += [
        (f"m_{stem}{k}", f"r_{stem}{k}", f"f_{stem}{k}")
        for stem in ("expense", "pm", "roe", "roa", "cost")
        for k in (1, 2, 3)
    ]

    features=reduce_memory(features)

    def _add_changes(single):
        """Add the d_ (row-to-row change) and m_ (vs. previous forecast) columns for one security."""
        single = single.copy()
        for f in feat2 + feat3:
            single["d_"+f] = _finite_or_zero(single[f].diff(1))
        for name, reported, forecast in surprise_specs:
            single[name] = single[reported] - single[forecast].shift(1)
        return single

    # One frame per security, concatenated once. DataFrame.append inside the
    # loop was quadratic and no longer exists in pandas >= 2.0.
    per_code = [_add_changes(group) for _, group in features.groupby('SecuritiesCode', sort=False)]
    # With no rows at all, still return the full set of columns.
    features = pd.concat(per_code) if per_code else _add_changes(features)

    # The first row of every security has no previous forecast.
    for name, _, _ in surprise_specs:
        features[name] = _finite_or_zero(features[name])

    return features

In [None]:
def feature_create(df,financials,stock_list):
    """
    Build the full model feature table from prices, financials and the stock list.

    Stages: adjusted close -> Bollinger bands -> calendar features ->
    intraday change rates -> RSI and its category -> static stock-list
    features -> forward-filled financial features -> rolling price/volume
    features for windows of 1, 5, 10, 20, 30, 50 and 100 rows.
    Column names and their order are part of the contract: the caller feeds
    `.values` to a saved model.

    Alignment: features computed on Date x SecuritiesCode tables are joined
    back with a LEFT merge on (SecuritiesCode, Date). An inner merge of a
    table with its NaN rows dropped returns fewer rows under a new 0..k-1
    index, so assigning its column back writes values onto unrelated rows.

    NOTE(review): because the alignment fix changes feature values, the
    saved model should be retrained on features regenerated with this
    function.

    Args:
        df (pd.DataFrame): price rows with Date, SecuritiesCode, Open, High,
            Low, Close, Volume, AdjustmentFactor (extra columns pass through).
        financials (pd.DataFrame): raw financial statements.
        stock_list (pd.DataFrame): raw stock master list.
    Returns:
        pd.DataFrame: one row per input price row, sorted by SecuritiesCode
            and Date, numeric columns downcast by reduce_memory.
    """
    onl=['SecuritiesCode','Date']

    def _to_long(wide, name):
        """Melt a Date x SecuritiesCode table into (Date, SecuritiesCode, name) rows."""
        return (
            pd.melt(wide, ignore_index=False)
            .reset_index()
            .dropna()
            .rename(columns = {"value": name})
        )

    def _aligned(frame, long, name):
        """Values of long[name], one per row of `frame`, NaN where there is no match."""
        # Keys in `long` are unique (they come from a pivot), so a left merge
        # keeps exactly the rows and order of `frame`.
        return frame[onl].merge(long, on=onl, how='left')[name].to_numpy()

    df= adjust_price(df)
    df = df.drop(['Close','CumulativeAdjustmentFactor','AdjustmentFactor'],axis=1)
    df = df.sort_values(by = ["SecuritiesCode","Date"], ascending = True).reset_index(drop = True)

    closes,opens,highs,lows,volumes= create_tables(df)

    # Bollinger bands: 20-row mean +/- 1.96 standard deviations. The standard
    # deviation is computed per security so a window never mixes two stocks.
    df['Middle Band'] = _aligned(df, _to_long(closes.rolling(window=20).mean(), "Middle Band"), "Middle Band")
    band_width = 1.96 * df.groupby("SecuritiesCode")['AdjustedClose'].rolling(window=20).std().reset_index(level=0, drop=True)
    df['Upper Band'] = df['Middle Band'] + band_width
    df['Lower Band'] = df['Middle Band'] - band_width

    #time features             year, month, day, day of year
    df['Year']=pd.to_numeric(df['Date'].str[0:4])
    df['Month']=pd.to_numeric(df['Date'].str[5:7])
    df['Day']=pd.to_numeric(df['Date'].str[8:10])
    df['Date'] = pd.to_datetime(df['Date'])
    df['Day_Of_Year'] = df['Date'].dt.dayofyear
    df['Date']= df['Date'].dt.strftime("%Y-%m-%d")

    #single row features
    df['Close_Open_change_rate'] = (df['AdjustedClose'] - df['Open']) / df['AdjustedClose']
    df['High_Low_change_rate'] = (df['High'] - df['Low']) / df['AdjustedClose']

    #rsi and cat
    dfa= RSI_create(df,5)
    df=df.merge(dfa,how='left',on=['Date','SecuritiesCode'],suffixes=('', 'b')).set_axis(df.index)
    # Fixed codes in alphabetical order, so a batch that lacks one of the
    # classes cannot shift the codes of the others.
    df['rsicat'] = df['rsi'].map(rsi_class).map({"hi": 0, "low": 1, "med": 2})
    print('finish rsi!')

    stock_list= feature_stock_list(stock_list)
    df=df.merge(stock_list,on='SecuritiesCode',how='left')

    financials=adjust_financial(financials)
    financials=create_financial_features(financials).sort_values(['Date','SecuritiesCode'])
    financials['SecuritiesCode']=financials['SecuritiesCode'].astype(int)
    # Several statements for one security on one date would duplicate the
    # price row in the merge below; keep a single statement per key.
    financials=financials.drop_duplicates(subset=onl, keep='last')
    df=df.merge(financials,on=['Date','SecuritiesCode'],how='left')

    # Carry the latest statement forward within each security; 0 before the
    # first statement. Assigned back explicitly, because chained
    # `df[col].fillna(..., inplace=True)` is a no-op under copy-on-write.
    fin_cols = [c for c in financials.columns if c not in onl]
    df[fin_cols] = df.groupby("SecuritiesCode")[fin_cols].ffill().fillna(0)
    df=reduce_memory(df)
    print('finish finanicals!')

    # Window-independent tables, computed once.
    previous_close = closes.shift(1)
    day_range = highs - lows
    gap_from_high = abs(highs - previous_close)
    gap_from_low = abs(lows - previous_close)
    close_return = closes.diff()/previous_close
    price_impact = closes.diff()/volumes
    turnover = volumes * (closes + opens) / 2

    #volatility/ rolling features
    period= [1, 5,10,20,30,50,100]
    for i in period:
        df[f"{i}D-EMA"]= _aligned(df, _to_long(closes.ewm(span=i,adjust=False).mean(), f"{i}D-EMA"), f"{i}D-EMA")
        if i!=1:
            # The rolling window is not grouped, but the first log-return of
            # every security is NaN, so a window that spans two securities
            # contains that NaN and evaluates to NaN.
            df["Volatility_{}_Day".format(i)] = np.log(df['AdjustedClose']).groupby(df["SecuritiesCode"]).diff().rolling(i).std().reset_index(level=0, drop=True)
        df["Return_{}_Day".format(i)] = df.groupby("SecuritiesCode")['AdjustedClose'].pct_change(i).reset_index(level=0, drop=True)
        if i!=1:
            # Return_1_Day exists because 1 is the first window.
            df["vola_{}".format(i)] = df.groupby("SecuritiesCode")["Return_1_Day"].rolling(i).std().reset_index(level=0, drop=True)
        df['Volumn_{}_Day'.format(i)] = df.groupby("SecuritiesCode")['Volume'].rolling(i).mean().reset_index(level=0, drop=True)
        df['Close{}_Day'.format(i)] = df.groupby("SecuritiesCode")['AdjustedClose'].rolling(i).mean().reset_index(level=0, drop=True)
        df[f'atr_{i}']= _aligned(df, atr(highs,lows,closes,i), f'atr_{i}')

        df[f"atrday_{i}"]= _aligned(df, _to_long(day_range.rolling(i).mean(), f"atrday_{i}"), f"atrday_{i}")
        df[f"qvol_{i}"]= _aligned(df, _to_long(turnover.rolling(i).sum(), f"qvol_{i}"), f"qvol_{i}")
        df[f'atrgap_{i}']= _aligned(df, _to_long(gap_from_high.rolling(i).mean(), f"atrgap_{i}"), f"atrgap_{i}")
        df[f"atrlow_{i}"]= _aligned(df, _to_long(gap_from_low.rolling(i).mean(), f"atrlow_{i}"), f"atrlow_{i}")
        if i!=1:
            df[f'variation_{i}']= _aligned(df, _to_long(close_return.rolling(i).std(), f"variation_{i}"), f"variation_{i}")
        df[f"HL_{i}"]= _aligned(df, _to_long(highs.rolling(i).max()-lows.rolling(i).min(), f"HL_{i}"), f"HL_{i}")
        df[f"market_impact_{i}"]= _aligned(df, _to_long(price_impact.rolling(i).mean(), f"market_impact_{i}"), f"market_impact_{i}")
    print(len(df.columns))
    return df

In [None]:
# Inference loop: for every test date, rebuild the features from the price
# history plus that day's rows, score them with the saved booster and submit
# a ranking.
env = jpx_tokyo_market_prediction.make_env()
iter_test = env.iter_test()
cols=['Date','SecuritiesCode','Open','High','Low','Close','Volume','AdjustmentFactor']
model = xgb.Booster()
model.load_model("../input/model-test/model.txt")
prices=reduce_memory(prices)
# Parse the history dates once instead of on every iteration.
price_dates = pd.to_datetime(prices["Date"])
for (Testprices, options, Testfinancials, trades, secondary_prices, df_pred) in iter_test:
    print('---------------------loop once-----------------------')
    Testprices=Testprices.drop(['RowId','ExpectedDividend'],axis=1)

    # NOTE(review): Testfinancials is not used, and `prices` is never extended
    # with earlier test days, so each date sees only the loaded history plus
    # its own rows. Confirm this is intended.
    finalfinancials=financials

    current_date=datetime.strptime(Testprices["Date"].iloc[0],'%Y-%m-%d')
    print("test date-------------------------->",current_date)
    totaldata = prices.loc[price_dates < current_date]
    totaldata=pd.concat([totaldata, Testprices[cols]]).reset_index(drop=True)
    totaldata=totaldata.drop("SupervisionFlag",axis=1)

    totaldata= feature_create(totaldata,finalfinancials,stock_list)
    totaldata=reduce_memory(totaldata)
    X_test=totaldata[pd.to_datetime(totaldata["Date"]) == current_date].drop(['Date','Target'],axis=1)
    print("finish featuring")

    # The booster receives a bare array, so the column order of X_test must
    # match the order used at training time.
    xgtest = xgb.DMatrix(X_test.values)
    y_pred = model.predict(xgtest)

    # Attach predictions by SecuritiesCode instead of by position, so the
    # result does not depend on df_pred and X_test sharing a row order.
    pred_by_code = pd.Series(y_pred, index=X_test["SecuritiesCode"].to_numpy())
    pred_by_code = pred_by_code[~pred_by_code.index.duplicated(keep="last")]
    df_pred['Target'] = df_pred['SecuritiesCode'].map(pred_by_code)

    # Rank 0 is the highest predicted target.
    df_pred = df_pred.sort_values(by = "Target", ascending = False)
    df_pred['Rank'] = np.arange(len(df_pred.index))
    df_pred = df_pred.sort_values(by = "SecuritiesCode", ascending = True)
    submission = df_pred[["Date", "SecuritiesCode", "Rank"]]
    print(submission.head(5))

    # Validate the frame that is actually submitted; the undefined name
    # `sample_prediction` raised NameError here.
    n_ranked = len(submission)
    assert submission["Rank"].nunique() == n_ranked, 'duplicate rank'
    assert submission["Rank"].notna().all(), 'na value'
    assert submission["Rank"].min() == 0, 'rank below 0'
    assert submission["Rank"].max() == n_ranked - 1, 'rank above 1999'

    env.predict(submission)
    print("finsih!!!!!!!!!!!!!!!")

In [None]:
# prices.drop("SupervisionFlag",axis=1,inplace=True)
# prices= feature_create(prices,financials,stock_list)
# prices=reduce_memory(prices)
# prices.to_pickle("totaldatafinal.pkl")


In [None]:
# totaldata.drop('Date',inplace=True,axis=1)
# totaldata.to_csv("totaldatafinal.csv")

In [None]:
# percent_missing = prices.isnull().sum() * 100 / len(prices)
# missing_value_df = pd.DataFrame({'column_name': prices.columns,
#                                  'percent_missing': percent_missing})
# missing_value_df.sort_values('percent_missing', inplace=True)

In [None]:
# missing_value_df[200:]

In [None]:
# model = xgb.XGBRegressor(
#     n_estimators=1500,
#     max_depth=6,#16
#     learning_rate=0.01,#try 0.015
# #     subsample=0.8,
#     colsample_bytree=0.1,
#     missing=-999,
#     random_state=2020,
#     min_child_weight=1 ,
#     gamma=0,
    
#     tree_method='gpu_hist' # THE MAGICAL PARAMETER
#     )

In [None]:
# from sklearn.model_selection import TimeSeriesSplit
# from sklearn.metrics import mean_squared_error,mean_absolute_error

**Cross-validation** (time-series split; the cells below are kept commented out)

In [None]:
# ts_fold = TimeSeriesSplit(n_splits=10,gap=10000)
# sharpe_ratio=[]
    
# for fold, (train_idx, val_idx) in enumerate(ts_fold.split(X, y)):
    
#     print("\n========================== Fold {} ==========================".format(fold+1))
#     if (fold+1)>8:
#         print(train_idx, val_idx)
#         X_train, y_train = X.iloc[train_idx,:], y[train_idx]
#         X_valid, y_val = X.iloc[val_idx,:], y[val_idx]

#         print("Train Date range: {} to {}".format(X_train.Date.min(),X_train.Date.max()))
#         print("Valid Date range: {} to {}".format(X_valid.Date.min(),X_valid.Date.max()))

#         X_train.drop(['Date','SecuritiesCode'], axis=1, inplace=True)
#         X_val=X_valid[X_valid.columns[~X_valid.columns.isin(['Date','SecuritiesCode'])]]
#         val_dates=X_valid.Date.unique()[1:-1]
#         print("\nTrain Shape: {} {}, Valid Shape: {} {}".format(X_train.shape, y_train.shape, X_val.shape, y_val.shape))

#         model.fit(X_train, y_train, early_stopping_rounds=20, eval_set=[(X_train, y_train),(X_val, y_val)], verbose=300)
#         gc.collect()
#         y_pred = model.predict(X_val)
#         rmse = np.sqrt(mean_squared_error(y_val, y_pred))
#         mae = mean_absolute_error(y_val, y_pred)

#         rank=[]
#         X_val_df=X_valid[X_valid.Date.isin(val_dates)]
#         for i in X_val_df.Date.unique():
#             temp_df = X_val_df[X_val_df.Date == i].drop(['Date','SecuritiesCode'],axis=1)
#             temp_df["pred"] = model.predict(temp_df)
#             temp_df["Rank"] = (temp_df["pred"].rank(method="first", ascending=False)-1).astype(int)
#             rank.append(temp_df["Rank"].values)

#         stock_rank=pd.Series([x for y in rank for x in y], name="Rank")
#         df=pd.concat([X_val_df.reset_index(drop=True),stock_rank,prices[prices.Date.isin(val_dates)]['Target'].reset_index(drop=True)], axis=1)
#         sharpe=calc_spread_return_sharpe(df)
#         sharpe_ratio.append(sharpe)
#         print("Valid Sharpe: {}, RMSE: {}, MAE: {}".format(sharpe,rmse,mae))

#         del X_train, y_train,  X_val, y_val
#         gc.collect()
#     else:
#         continue
# print("\nAverage cross-validation Sharpe Ratio: {:.4f}, standard deviation = {:.2f}.".format(np.mean(sharpe_ratio),np.std(sharpe_ratio)))