In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Walk the Kaggle input directory recursively and print the full path of every file found.
for dirname, _, filenames in os.walk('/kaggle/input'):
    for filename in filenames:
        print(os.path.join(dirname, filename))

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

In [None]:
import os
import gc
import warnings

import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import plotly.graph_objects as go
import plotly.express as px
from plotly.subplots import make_subplots

# Configuration
warnings.simplefilter('ignore')  # silence library warnings in the notebook output
# Use the fully-qualified option name. The bare pattern 'max_column' is matched as a regex
# against every registered option and raises OptionError on pandas versions where it is
# ambiguous (it matches both display.max_columns and styler.render.max_columns).
pd.set_option('display.max_columns', None)
sns.set_style('darkgrid')  # seaborn theme for the figures drawn below
colors = sns.color_palette('Set2')  # palette stored in `colors`; no visible cell reads it

In [None]:
# Dataset directories (relative paths); the loading cell joins file names onto these.
TRAIN_DIR = "../input/jpx-tokyo-stock-exchange-prediction/train_files"
TEST_DIR = "../input/jpx-tokyo-stock-exchange-prediction/example_test_files"

In [None]:
%%time
df_prices = pd.read_csv(os.path.join(TRAIN_DIR, 'stock_prices.csv'))
df_prices_sec = pd.read_csv(os.path.join(TRAIN_DIR, 'secondary_stock_prices.csv'))
df_fins = pd.read_csv(os.path.join(TRAIN_DIR, 'financials.csv'))
df_opts = pd.read_csv(os.path.join(TRAIN_DIR, 'options.csv'))
df_trades = pd.read_csv(os.path.join(TRAIN_DIR, 'trades.csv'))
stock_list = pd.read_csv(os.path.join("../input/jpx-tokyo-stock-exchange-prediction/stock_list.csv"))

df_test_options = pd.read_csv(os.path.join(TEST_DIR, 'stock_prices.csv'))
df_test_prices_sec = pd.read_csv(os.path.join(TEST_DIR, 'secondary_stock_prices.csv'))
df_test_fins = pd.read_csv(os.path.join(TEST_DIR, 'financials.csv'))
df_test_opts = pd.read_csv(os.path.join(TEST_DIR, 'options.csv'))
df_test_trades = pd.read_csv(os.path.join(TEST_DIR, 'trades.csv'))


In [None]:
def summarize(df, file_name, n_rows_to_show=5):
    """Print a short overview of a DataFrame and display its first rows.

    Parameters:
        df: pd.DataFrame, raw DataFrame
        file_name: str, name of the file (used only in the printed header)
        n_rows_to_show: int, number of leading rows to display
    """
    print(f"=====Summary of {file_name}=====")
    print(f"Shape: {df.shape}")

    # Percentage of missing values per column, largest first, shown as a one-row table.
    missing_pct = (pd.isna(df).sum() / len(df) * 100).sort_values(ascending=False)
    print("NaN ratio:")
    display(missing_pct.to_frame(name='NaN Ratio').T)

    display(df.head(n_rows_to_show))


# Overview of df_prices (loaded from stock_prices.csv): shape, NaN ratio and first rows.
summarize(df_prices, "stock_prices.csv")

In [None]:
# Sanity check: RowId must equal "<Date without dashes>_<SecuritiesCode>".
# Built with vectorized string ops instead of a per-row Python lambda.
row_id = (df_prices['Date'].str.replace('-', '', regex=False) + '_'
          + df_prices['SecuritiesCode'].astype(str))
assert (df_prices['RowId'] == row_id).all(), \
    "RowId does not match '<YYYYMMDD>_<SecuritiesCode>' for every row"

In [None]:
# Basic coverage figures for df_prices: distinct dates (with first/last) and distinct stocks.
date_column = df_prices['Date']
n_dates = date_column.nunique()
date_min = date_column.min()
date_max = date_column.max()
n_stocks = df_prices['SecuritiesCode'].nunique()

coverage_message = (f"Number of unique dates: {n_dates} ({date_min} ~ {date_max})\n"
                    f"Number of unique stocks: {n_stocks}")
print(coverage_message)

In [None]:
def plot_volume_within_range(prime=True):
    """Plot the average traded volume per security as a scatter plot and a line plot.

    Parameters:
        prime: bool, if True aggregate `df_prices`; otherwise aggregate `df_prices_sec`.
    """
    # Fix: the frame selected by `prime` is now the one that is aggregated. Previously
    # `df_prices` was always used, so prime=False produced the same charts as prime=True.
    # No copy is needed because the frame is only read.
    df_ = df_prices if prime else df_prices_sec
    df_vol = df_.groupby("SecuritiesCode")['Volume'].sum().reset_index()
    # NOTE(review): n_dates is the notebook-level count of distinct dates in df_prices;
    # confirm it is also the right divisor for the secondary table.
    vol = df_vol['Volume'] / n_dates
    securities = df_vol['SecuritiesCode']

    plt.figure(figsize=(15, 15))
    plt.subplot(211)
    plt.scatter(securities, vol, color="orange", label="Scatter plot")
    plt.legend()
    plt.subplot(212)
    plt.plot(securities, vol, color="green", label="line plot")
    plt.legend()
    plt.suptitle('Volume Plots')
    plt.show()

In [None]:
# Draw the volume charts for the primary price table.
plot_volume_within_range(prime=True)
# Total traded volume per security, summed over every row of df_prices.
df_volume = df_prices.groupby(["SecuritiesCode"]).Volume.sum().reset_index()

In [None]:
# Print the per-security totals, then show their summary statistics as the cell output.
print(df_volume)
df_volume.describe()

In [None]:
# Keep the securities whose total traded volume lies strictly inside this band.
SAMPLE_VOLUME_MIN = 1.50e+08
SAMPLE_VOLUME_MAX = 1.70e+08

total_volume = df_volume['Volume']
sample_securities = df_volume.loc[(total_volume > SAMPLE_VOLUME_MIN)
                                  & (total_volume < SAMPLE_VOLUME_MAX)]
# The former mid-cell `sample_securities.describe()` was removed: its result was discarded
# because only the last expression of a cell is displayed.
df_sample = sample_securities  # alias read by the next cell
df_sample

In [None]:
# Rows and columns are selected in one .loc call (no intermediate three-column copy of the
# whole table), and the result is an explicit copy so later column assignments on it do not
# trigger SettingWithCopyWarning.
df_sample_prices = df_prices.loc[
    df_prices['SecuritiesCode'].isin(df_sample['SecuritiesCode']),
    ['Date', 'SecuritiesCode', 'Close'],
].copy()

import pandas as pd
#AAPL_DIR = "../input/aapl-2015-2020"
#df_sample_1 = pd.read_csv(os.path.join(AAPL_DIR, 'AAPL_2015_2020.csv'))
#print(df_sample_1)
df_sample_prices['SecuritiesCode']

In [None]:
# For each sampled security, count the missing cells (summed over all columns) in its rows
# of df_sample_prices.
number_null_date=[]
for i in sample_securities['SecuritiesCode']:
    number_n = df_sample_prices.loc[df_sample_prices['SecuritiesCode']==i].isnull().sum().sum()
    number_null_date.append(number_n)
#print(number_null_date)
#print(len(number_null_date))

In [None]:
# Renumber the rows from 0; the previous row labels are kept as a new column.
# NOTE(review): not idempotent — every re-run of this cell inserts the current index as
# yet another column.
df_sample_prices=df_sample_prices.reset_index()
#print(df_sample_prices)
#print(df_sample_prices[df_sample_prices['Close'].isnull()])
#print(len(df_sample_prices[df_sample_prices['Close'].isnull()]))


In [None]:
# One DataFrame per sampled security, in order of first appearance in df_sample_prices.
# A single groupby pass replaces re-filtering the whole frame once per security.
list_sample_each = [
    df_one_security
    for _, df_one_security in df_sample_prices.groupby('SecuritiesCode', sort=False)
]

# Show the last security rather than hard-coded position 52, which raised IndexError
# whenever the sample did not contain exactly 53 securities.
print(list_sample_each[-1])

In [None]:
#list_sample_each[0]['SecuritiesCode']
# list_sample_each is a list of dataframe object

In [None]:
# One Close-price Series per sampled security, with missing prices filled.
# Fixes over the previous version:
#   * iterates over whatever list_sample_each contains instead of a hard-coded range(53);
#   * forward-fills every missing Close with the previous value, instead of unconditionally
#     overwriting row 913 with row 912 (which left other gaps and clobbered a valid price
#     whenever row 913 was not missing);
#   * no longer rebinds `df_sample`, which an earlier cell uses for the sampled securities.
list_sample_fillna = []
for i, df_each in enumerate(list_sample_each):
    close_series = df_each['Close'].reset_index(drop=True)
    n_missing = int(close_series.isnull().sum())
    if n_missing:
        print(f"series {i}: forward-filled {n_missing} missing Close value(s)")
    list_sample_fillna.append(close_series.ffill())

print(np.array(list_sample_fillna[0]))
# Report the count and the distinct lengths instead of stacking all series into one array,
# which fails when the series have different lengths.
print(len(list_sample_fillna), sorted({len(s) for s in list_sample_fillna}))
print(np.array(list_sample_fillna[0]).reshape(-1,1).shape)
print(np.array(list_sample_fillna[0]).shape)

In [None]:
# Re-count missing values per series after filling.
number_null_date=[]
for i in list_sample_fillna:
    number_n = i.isnull().sum()
    number_null_date.append(number_n)
print(number_null_date)
print(len(number_null_date))
# Cell output: the last two filled series.
list_sample_fillna[-2:]

In [None]:
# df_sample_price --> list_sample_each --> list_sample_fillna --> list_sample_scaled: used to
# create the datasets for models

from sklearn.preprocessing import MinMaxScaler

# Fix: every series gets its own scaler, kept in list_scalers. Previously one shared scaler
# was refitted on each series in turn, so after the loop it only remembered the last series
# and inverse_transform of any earlier series returned wrong prices.
# See https://stackoverflow.com/questions/49885007/how-to-use-scikit-learn-inverse-transform-with-new-values
list_scalers = []
list_sample_scaled = []
scaler = MinMaxScaler(feature_range=(0,1))
for i in list_sample_fillna:
    scaler = MinMaxScaler(feature_range=(0,1))
    # `i` is rebound to the scaled (n, 1) array, as before.
    i = scaler.fit_transform(np.array(i).reshape(-1,1))
    list_scalers.append(scaler)
    list_sample_scaled.append(i)
# As before, `scaler` ends up fitted on the last series.

print(list_sample_scaled[0])
print(list_sample_fillna[0])
# Round trip with the matching scaler reproduces the original prices of series 0.
print(list_scalers[0].inverse_transform(list_sample_scaled[0]))

training_size=int(len(df_sample_1)*0.65)
test_size=len(df_sample_1)-training_size
train_data,test_data=df_sample_1[0:training_size,:],df_sample_1[training_size:len(df_sample_1),:]
print(train_data.shape,test_data.shape)

In [None]:
# 65/35 chronological split of the first scaled series.
print(list_sample_scaled[0][0:2])
split_point = int(len(list_sample_scaled[0])*0.65)
print(split_point)
# Fix: slice to the end of the series itself. The previous upper bound was len(i), where
# `i` was whatever an earlier cell's loop happened to leave behind.
train_data,test_data=list_sample_scaled[0][0:split_point,:],list_sample_scaled[0][split_point:,:]

training_size=int(len(df_sample_1)*0.65)
test_size=len(df_sample_1)-training_size
train_data,test_data=df_sample_1[0:training_size,:],df_sample_1[training_size:len(df_sample_1),:]
print(train_data.shape,test_data.shape)

train_D = []
test_D = []
for i in (list_sample_fillna):
    training_size=int(len(i)*0.65)
    test_size=len(i)-training_size
    train_data,test_data=i[0:training_size,:],i[training_size:len(i),:]
    print(train_data.shape,test_data.shape)
    #train_D.append(train_data)
    #test_D.append(test_data)

In [None]:
import numpy
def create_dataset(dataset, time_step=1):
    """Cut a 2-D array into sliding windows and their next-step targets.

    Parameters:
        dataset: 2-D array indexed as dataset[row, 0]; only column 0 is read
        time_step: int, number of consecutive rows in each input window

    Returns:
        (X, y) as numpy arrays: X[k] is rows k .. k+time_step-1 of column 0 and
        y[k] is the value at row k+time_step.
    """
    n_windows = len(dataset) - time_step
    windows = [dataset[start:start + time_step, 0] for start in range(n_windows)]
    targets = [dataset[start + time_step, 0] for start in range(n_windows)]
    return numpy.array(windows), numpy.array(targets)
# Window length: each sample holds `time_step` consecutive scaled values.
time_step = 100
# Per-series results, appended in the order of list_sample_scaled.
X_Train = []
y_Train = []
X_Test = []
y_Test = []
for i in list_sample_scaled:
    # First 65% of each series is the training part, the remainder the test part.
    training_size=int(len(i)*0.65)
    test_size=len(i)-training_size  # computed but not used below
    train_data,test_data=i[0:training_size],i[training_size:len(i)]
    #print(train_data.shape,test_data.shape)
    X_train, y_train = create_dataset(train_data, time_step)
    X_test, y_test = create_dataset(test_data, time_step)
    # Add a trailing axis of size 1: (samples, time_step) -> (samples, time_step, 1).
    X_train =X_train.reshape(X_train.shape[0],X_train.shape[1] , 1)
    X_test = X_test.reshape(X_test.shape[0],X_test.shape[1] , 1)
    X_Train.append(X_train)
    y_Train.append(y_train)
    X_Test.append(X_test)
    y_Test.append(y_test)
#print(X_train,y_train)
#print(X_test,y_test)#
#print(X_train.shape,y_train.shape)
#print(X_test.shape,y_test.shape)

In [None]:
#X_train =X_train.reshape(X_train.shape[0],X_train.shape[1] , 1)
#X_test = X_test.reshape(X_test.shape[0],X_test.shape[1] , 1)
#X_train =X_train.reshape(X_train.shape[0],100 , 1)
#X_test = X_test.reshape(X_test.shape[0],100 , 1)
# Shapes of the windowed inputs and targets of the first series.
print(X_Train[0].shape,y_Train[0].shape)
print(X_Test[0].shape,y_Test[0].shape)

In [None]:
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense
from tensorflow.keras.layers import LSTM
from keras.layers import Dropout
from tensorflow import keras

In [None]:
# Three stacked LSTM layers (50 units each) followed by a single-unit Dense output,
# trained with Adam on mean squared error.
model=Sequential()
# The input length follows `time_step` (the window length used to build X_Train) instead of
# repeating the literal 100, so the two cannot drift apart.
model.add(LSTM(50,return_sequences=True,input_shape=(time_step,1)))
model.add(LSTM(50,return_sequences=True))
model.add(LSTM(50))
model.add(Dense(1))
opt = keras.optimizers.Adam(learning_rate=0.001)
model.compile(loss='mean_squared_error',optimizer=opt)
# NOTE(review): the model is fitted on series 10 only, yet later cells use it to predict
# every series — confirm this is intended.
model.fit(X_Train[10],y_Train[10],epochs=50,batch_size=32)

In [None]:
# Inspect the first series at each stage: raw rows, filled Close values, first scaled value.
print(list_sample_each[0])
print(list_sample_fillna[0])
#print(scaler.inverse_transform(list_sample_scaled[0])) # ??? why is this different? one stock one scaler
print(list_sample_scaled[0][0:1])

In [None]:
# This block of code is not used later on. What I tried to do here is to create corresponding dataframe
# set to later on append the model prediction column. Combined dataframes will be later used.
import datetime

# len(X_Train[0]) is (training rows - time_step), so this position is the first row after
# the training part of the first series.
split_position = len(X_Train[0]) + time_step
# .iloc picks the row by position directly (replaces a one-element slice plus .values[0]).
split_date = list_sample_each[0]['Date'].iloc[split_position]
print(split_date)
str_date = split_date.split('-')
int_date = [int(part) for part in str_date]
print(int_date)
split_date = datetime.date(*int_date)
print(split_date)

In [None]:
# same purpose as above

# Rows of df_sample_prices dated strictly after split_date.
df_sample_to_predict = df_sample_prices.loc[pd.to_datetime(df_sample_prices['Date'])>pd.to_datetime(split_date)]
print(df_sample_to_predict)
#print(len(df_sample_to_predict.loc[df_sample_to_predict['SecuritiesCode']==1815]))
# Compare the last 21 rows of security 1815 with the tail of the first filled series.
# NOTE(review): presumably 1815 is the first sampled security — confirm.
print(df_sample_to_predict.loc[df_sample_to_predict['SecuritiesCode']==1815][-21:])
print(list_sample_fillna[0][-21:])


In [None]:
def create_dataset(dataset, time_step=1):
    """Cut a 2-D array into sliding windows and their next-step targets.

    NOTE(review): an identical `create_dataset` is defined in an earlier cell; this later
    definition shadows it.

    Parameters:
        dataset: 2-D array indexed as dataset[row, 0]; only column 0 is read
        time_step: int, number of consecutive rows in each input window

    Returns:
        (X, y) as numpy arrays: X[k] is rows k .. k+time_step-1 of column 0 and
        y[k] is the value at row k+time_step.
    """
    n_windows = len(dataset) - time_step
    windows = [dataset[start:start + time_step, 0] for start in range(n_windows)]
    targets = [dataset[start + time_step, 0] for start in range(n_windows)]
    return numpy.array(windows), numpy.array(targets)
# NOTE(review): this repeats, statement for statement, the window-building code of an
# earlier cell and rebuilds the same four lists.
time_step = 100
X_Train = []
y_Train = []
X_Test = []
y_Test = []
for i in list_sample_scaled:
    # First 65% of each series is the training part, the remainder the test part.
    training_size=int(len(i)*0.65)
    test_size=len(i)-training_size  # computed but not used below
    train_data,test_data=i[0:training_size],i[training_size:len(i)]
    #print(train_data.shape,test_data.shape)
    X_train, y_train = create_dataset(train_data, time_step)
    X_test, y_test = create_dataset(test_data, time_step)
    # Add a trailing axis of size 1: (samples, time_step) -> (samples, time_step, 1).
    X_train =X_train.reshape(X_train.shape[0],X_train.shape[1] , 1)
    X_test = X_test.reshape(X_test.shape[0],X_test.shape[1] , 1)
    X_Train.append(X_train)
    y_Train.append(y_train)
    X_Test.append(X_test)
    y_Test.append(y_test)

In [None]:
import matplotlib.pyplot as plt
from matplotlib.pyplot import figure

# Predict every series with the trained `model`, convert the predictions back to price
# units with a scaler fitted on that same series, and plot the first three securities.
list_test_Predicted = []
for i in range(len(X_Train)):  # was a hard-coded range(53)
    train_predict=model.predict(X_Train[i])
    test_predict=model.predict(X_Test[i])

    # One scaler per series, fitted once (it used to be fitted twice per iteration).
    scaler=MinMaxScaler(feature_range=(0,1))
    scaled_close = scaler.fit_transform(np.array(list_sample_fillna[i]).reshape(-1,1))
    train_predict=scaler.inverse_transform(train_predict)
    test_predict=scaler.inverse_transform(test_predict)
    list_test_Predicted.append(test_predict)

    # Round trip of the whole series through the scaler: the actual prices to plot.
    list_sample_unscaled = scaler.inverse_transform(scaled_close)
    # Positions on the x axis: the k-th prediction of a split targets the row `time_step`
    # after the split's start, and the test split starts len(X_Train[i]) + time_step rows in.
    x_train = range(time_step,len(X_Train[i])+time_step)
    x_test = range(len(X_Train[i])+2*time_step,len(X_Train[i])+len(X_Test[i])+2*time_step)

    if i < 3:
        figure(figsize=(12, 8), dpi=160)
        # Title is the security code itself; previously it was the multi-line text
        # representation of a one-row Series.
        plt.title(str(list_sample_each[i]['SecuritiesCode'].iloc[0]), fontdict=None, loc='center', pad=None)
        plt.plot(list_sample_unscaled,'b')
        plt.plot(x_train,train_predict,'orange')
        plt.plot(x_test,test_predict,'g')
        plt.ylabel('Close_Price')
        plt.show()


In [None]:
# import math
# from sklearn.metrics import mean_squared_error
# math.sqrt(mean_squared_error(y_train,train_predict))

In [None]:
# Display the rows of the first sampled security.
list_sample_each[0]

In [None]:
# code from JPX - Detailed EDA notebook
# Rows of the full price table whose Close is missing, plus two example rows from them.
df_no_prices = df_prices[df_prices['Close'].isna()]
print(f"Number of samples without prices: {len(df_no_prices)}")
print(df_no_prices[2000:2002])

In [None]:
# One DataFrame (Date, SecuritiesCode, Close) per sampled security, with missing Close
# values replaced by the previous available value.
# Fixes over the previous version:
#   * each frame is appended exactly once. The append used to sit inside the per-NaN loop,
#     so a frame without NaNs was skipped and a frame with several NaNs was added several
#     times, misaligning this list with the per-series predictions;
#   * forward-fill replaces chained `df['Close'].iloc[j] = ...` assignment;
#   * iterates over list_sample_each instead of a hard-coded range(53).
list_df_sample_fillna = []
for df_each in list_sample_each:
    df_sample = df_each[['Date','SecuritiesCode','Close']].reset_index()
    df_sample['Close'] = df_sample['Close'].ffill()
    list_df_sample_fillna.append(df_sample)

In [None]:
# Split every filled frame 65/35 by row position, using the same ratio as the window cells.
list_train_df, list_test_df = [],[]
for i in list_df_sample_fillna:
    training_size=int(len(i)*0.65)
    test_size=len(i)-training_size  # computed but not used below
    train_df,test_df=i[0:training_size],i[training_size:len(i)]
    list_train_df.append(train_df)
    list_test_df.append(test_df)
#print(len(list_train_df[52]))
#print(len(list_test_df[52]))
# Show the first train/test pair and the per-column NaN counts of the first test frame.
print((list_train_df[0]))
print((list_test_df[0]))
print((list_test_df[0]).isnull().sum())


## Calculate Target

$$r_{(k,\,t)} = \frac{C_{(k,\,t+2)} - C_{(k,\,t+1)}}{C_{(k,\,t+1)}}$$

In [None]:
# Compare the number of test predictions with the number of test rows for series 2.
print(len(list_test_Predicted[2]))
print(len(list_test_df[2]))

print(df_sample_to_predict)

In [None]:
#print((list_test_Predicted))
#print((list_test_df[2]))

In [None]:
# Drop the first `time_step` rows of every test frame: those rows only feed the first
# input window and therefore have no prediction.
list_test_Predicted_df = []
for i in list_test_df:
    # Explicit copy so a later cell can add columns without writing into a slice of
    # list_test_df; `time_step` replaces the literal 100.
    list_test_Predicted_df.append(i.iloc[time_step:].copy())
print(list_test_Predicted_df[0])

In [None]:
# Attach each series' test predictions to the matching test frame.
# Fail loudly if the two lists are out of step instead of pairing the wrong series.
assert len(list_test_Predicted_df) == len(list_test_Predicted), (
    f"{len(list_test_Predicted_df)} frames but {len(list_test_Predicted)} prediction arrays"
)
for df_pred, predicted in zip(list_test_Predicted_df, list_test_Predicted):
    # Predictions come as an (n, 1) array; flatten so the column holds plain scalars.
    df_pred['Predicted'] = np.asarray(predicted).ravel()
# Show the last frame (previously hard-coded index 50).
print(list_test_Predicted_df[-1])

$$r_{(k,\,t)} = \frac{C_{(k,\,t+2)} - C_{(k,\,t+1)}}{C_{(k,\,t+1)}}$$

In [None]:
#list_test_Predicted_df[52]['Predicted'].iloc[0]
#len(list_test_Predicted_df)

In [None]:
# RateOfChange at row t = (P[t+2] - P[t+1]) / P[t+1], computed on the predicted prices P.
# Fixes over the previous version:
#   * the last two rows, which have no t+1 / t+2, hold NaN instead of the string 'null'.
#     The strings turned the column into object dtype and made the later sort by rate
#     of change fail on mixed str/float values;
#   * vectorized shift replaces a per-element .iloc loop.
list_Rt = []
for df_pred in list_test_Predicted_df:
    predicted = df_pred['Predicted']
    next_price = predicted.shift(-1)
    df_pred['RateOfChange'] = (predicted.shift(-2) - next_price) / next_price
# Show the end of the first frame rather than dumping every frame.
print(list_test_Predicted_df[0].tail())

    

In [None]:
# Stack the per-security prediction frames into one long frame.
df_data = pd.concat(list_test_Predicted_df, sort= False)

In [None]:
# Per-date lists, kept for inspection.
df_ROC= df_data.groupby('Date')['RateOfChange'].apply(list)
df_SC = df_data.groupby('Date')['SecuritiesCode'].apply(list)
df_Date = df_data.groupby('Date')['Date'].apply(list)
#print(df_rank[['SecuritiesCode','RateOfChange']])
print(df_ROC)
print(df_SC)
print(df_Date)

# One ranking table per date, securities ordered by predicted rate of change (best first).
# Fixes over the previous version:
#   * iterates over the date groups themselves instead of indexing the date-labelled Series
#     with integers 0..n-1 taken from the length of the first frame;
#   * the rate of change is coerced to numeric, so non-numeric placeholders become NaN
#     and sort last instead of breaking the sort.
list_rank = []
for _, df_day in df_data.groupby('Date'):
    df_date = pd.DataFrame({'Date': df_day['Date'].to_numpy(),
                            'SecuritiesCode': df_day['SecuritiesCode'].to_numpy(),
                            'ROC': pd.to_numeric(df_day['RateOfChange'], errors='coerce').to_numpy()})
    df_date = df_date.sort_values(by = 'ROC',ascending = False)
    list_rank.append(df_date)
print(list_rank)

#print(df_sample_to_predict.groupby(['SecuritiesCode']))
#for key,item in df_sample_to_predict.groupby(['SecuritiesCode']):
#    print(df_sample_to_predict.groupby(['SecuritiesCode']).get_group(key)[-321:], "\n\n")

print(list_sample_each[0][-len(list_test_Predicted[0]):])

print(len(list_test_Predicted[0].flatten()))
list_sample_each[0]['Predicted_Close'] = 0
list_sample_each[0]['Predicted_Close'][-len(list_test_Predicted[0]):]=list_test_Predicted[0].flatten()
print(list_sample_each[0])

print(len(list_test_Predicted[0].flatten()))
j=0
for i in list_sample_each:
    i['Predicted_Close'][-len(list_test_Predicted[j]):]=list_test_Predicted[j].flatten()
    j=j+1
    #i['Predicted_Close']=i['Close']
print(list_sample_each[0])

for i in list_test_Predicted:
    df_sample_to_predict['SecuritiesCode'].unique()['Close'][-len(i):]=i
for key,item in df_sample_to_predict.groupby(['SecuritiesCode']):
    print(df_sample_to_predict.groupby(['SecuritiesCode']).get_group(key)[-321:], "\n\n")

list_sample_each = []

for i in df_sample_prices['SecuritiesCode'].unique():
    list_sample_each.append(df_sample_prices.loc[df_sample_prices['SecuritiesCode']==i])
print(len(list_sample_each))


    

df.join(df_sample_to_predict.groupby(['SecuritiesCode']))

import pandas as pd
import numpy as np
import plotly
import plotly.express as px
import plotly.graph_objects as go

df = px.data.stocks()
print(df)
fig = px.line(df, x='date', y=["MSFT","GOOG",'FB',"AMZN"])
fig.show()

# Evaluation function

import numpy as np
import pandas as pd


def calc_spread_return_sharpe(df: pd.DataFrame, portfolio_size: int = 200, toprank_weight_ratio: float = 2) -> float:
    """Sharpe ratio of the daily long/short spread return implied by a ranking.

    Args:
        df (pd.DataFrame): predicted results; the 'Date', 'Rank' and 'Target' columns are read
        portfolio_size (int): # of equities to buy/sell
        toprank_weight_ratio (float): the relative weight of the most highly ranked stock compared to the least.
    Returns:
        (float): mean of the daily spread returns divided by their standard deviation
    """
    def _daily_spread(day_df, portfolio_size, toprank_weight_ratio):
        """Weighted return of the top-ranked stocks minus that of the bottom-ranked ones.

        Args:
            day_df (pd.DataFrame): predicted results of a single date
            portfolio_size (int): # of equities to buy/sell
            toprank_weight_ratio (float): the relative weight of the most highly ranked stock compared to the least.
        Returns:
            (float): spread return
        """
        ranks = day_df['Rank']
        # Ranks must start at 0 and end at (number of rows - 1).
        assert ranks.min() == 0
        assert ranks.max() == len(ranks) - 1
        # Linearly decreasing weights from toprank_weight_ratio down to 1.
        weights = np.linspace(start=toprank_weight_ratio, stop=1, num=portfolio_size)
        best_targets = day_df.sort_values(by='Rank')['Target'][:portfolio_size]
        worst_targets = day_df.sort_values(by='Rank', ascending=False)['Target'][:portfolio_size]
        purchase = (best_targets * weights).sum() / weights.mean()
        short = (worst_targets * weights).sum() / weights.mean()
        return purchase - short

    daily_spreads = df.groupby('Date').apply(_daily_spread, portfolio_size, toprank_weight_ratio)
    return daily_spreads.mean() / daily_spreads.std()

# Fix: this line mixed a function signature (annotations, `-> float:`) into a call, which is
# not valid Python; arguments are now passed normally.
# NOTE(review): `df_sample_predict` is not defined in the visible notebook; the frame passed
# here must provide the 'Date', 'Rank' and 'Target' columns that the function reads.
calc_spread_return_sharpe(df_sample_predict, portfolio_size=5, toprank_weight_ratio=2)