In [1]:
import numpy as np
import pandas as pd

In [2]:
# Load the preprocessed daily chart CSV (path is relative to the notebook's
# working directory) and display it. The rendered frame shows code/name/date
# columns followed by open/high/low/close and z_d5 ... z_d30 columns.
df = pd.read_csv('../data/preprocessed_kospi31_daily_stock_chart.csv')
df

Unnamed: 0,code,name,date,open,high,low,close,z_d5,z_d10,z_d15,z_d20,z_d25,z_d30
0,U182,kospi50_index,20030102,-0.000572,0.003749,-0.017282,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000
1,U182,kospi50_index,20030103,-0.016228,0.002173,-0.020027,0.044230,-0.021178,-0.021178,-0.021178,-0.021178,-0.021178,-0.021178
2,U182,kospi50_index,20030106,-0.003413,0.002105,-0.014164,0.009321,-0.020145,-0.020145,-0.020145,-0.020145,-0.020145,-0.020145
3,U182,kospi50_index,20030107,0.042358,0.042912,-0.000457,-0.022060,0.001468,0.001468,0.001468,0.001468,0.001468,0.001468
4,U182,kospi50_index,20030108,0.007934,0.014432,-0.001829,-0.000962,0.001946,0.001946,0.001946,0.001946,0.001946,0.001946
...,...,...,...,...,...,...,...,...,...,...,...,...,...
147835,A005380,현대차,20210826,-0.002381,0.009524,-0.009524,0.002387,-0.014762,0.001190,0.017937,0.027381,0.036381,0.045238
147836,A005380,현대차,20210827,0.002398,0.004796,-0.016787,-0.007143,-0.000959,0.003357,0.020624,0.031535,0.040000,0.049081
147837,A005380,현대차,20210830,0.004762,0.011905,0.000000,0.007194,-0.002381,-0.007143,0.009365,0.022262,0.029619,0.038413
147838,A005380,현대차,20210831,-0.011765,0.000000,-0.021176,0.011905,-0.011294,-0.019059,-0.005176,0.008471,0.015247,0.023843


In [3]:
# Drop the very first row of the frame and renumber the index from 0.
# `reset_index(drop=True)` replaces the hand-rolled `df.index = np.arange(...)`.
# NOTE(review): this cell rebinds `df`, so re-running it removes one more row
# each time; re-run the load cell first to get back to a known state.
# NOTE(review): only the first row of the whole frame is removed (the first
# market-index row). The first row of each stock block looks like the same
# kind of all-zero placeholder and is kept - confirm this is intended.
df = df.iloc[1:].reset_index(drop=True)
df

Unnamed: 0,code,name,date,open,high,low,close,z_d5,z_d10,z_d15,z_d20,z_d25,z_d30
0,U182,kospi50_index,20030103,-0.016228,0.002173,-0.020027,0.044230,-0.021178,-0.021178,-0.021178,-0.021178,-0.021178,-0.021178
1,U182,kospi50_index,20030106,-0.003413,0.002105,-0.014164,0.009321,-0.020145,-0.020145,-0.020145,-0.020145,-0.020145,-0.020145
2,U182,kospi50_index,20030107,0.042358,0.042912,-0.000457,-0.022060,0.001468,0.001468,0.001468,0.001468,0.001468,0.001468
3,U182,kospi50_index,20030108,0.007934,0.014432,-0.001829,-0.000962,0.001946,0.001946,0.001946,0.001946,0.001946,0.001946
4,U182,kospi50_index,20030109,0.023883,0.025510,0.000000,-0.036831,0.038606,0.033550,0.033550,0.033550,0.033550,0.033550
...,...,...,...,...,...,...,...,...,...,...,...,...,...
147834,A005380,현대차,20210826,-0.002381,0.009524,-0.009524,0.002387,-0.014762,0.001190,0.017937,0.027381,0.036381,0.045238
147835,A005380,현대차,20210827,0.002398,0.004796,-0.016787,-0.007143,-0.000959,0.003357,0.020624,0.031535,0.040000,0.049081
147836,A005380,현대차,20210830,0.004762,0.011905,0.000000,0.007194,-0.002381,-0.007143,0.009365,0.022262,0.029619,0.038413
147837,A005380,현대차,20210831,-0.011765,0.000000,-0.021176,0.011905,-0.011294,-0.019059,-0.005176,0.008471,0.015247,0.023843


In [4]:
def market_stock_split(df):
    """Separate market-index rows from individual-stock rows.

    Rows whose ``code`` equals ``'U182'`` form the market frame; all other
    rows form the stock frame.

    Args:
        df: DataFrame with a ``code`` column.

    Returns:
        Tuple ``(market_df, stock_df)``; both keep the original columns and
        index labels of the rows they contain.
    """
    is_market = df.code == 'U182'
    return df[is_market], df[~is_market]

def stock_seperate(stocks):
    """Break a combined stock frame into one frame per distinct ``name``.

    Args:
        stocks: DataFrame with a ``name`` column.

    Returns:
        List of DataFrames, one per unique ``name`` value, in the order the
        names first appear. Each frame holds that name's rows with their
        original index labels.
    """
    return [stocks[stocks.name == label] for label in stocks['name'].unique()]

def train_test_split(stocks_df, test_pct=0.2, valid_pct=0.2):
    """Split each frame by row order into train / validation / test parts.

    No shuffling is done: the first rows become the training part, the next
    rows the validation part, and the trailing rows the test part.

    Args:
        stocks_df: A single sliceable frame, or a list/tuple of them. A single
            frame is treated as a one-element list.
        test_pct: Fraction of each frame's rows assigned to the test part.
        valid_pct: Fraction of each frame's rows assigned to the validation part.

    Returns:
        Tuple ``(stocks_train, stocks_valid, stocks_test)`` of three lists,
        each with one entry per input frame.

    Raises:
        ValueError: If a fraction is negative, or if the test and validation
            parts together would need more rows than a frame has.
    """
    if not isinstance(stocks_df, (list, tuple)):
        stocks_df = [stocks_df]

    if test_pct < 0 or valid_pct < 0:
        raise ValueError('test_pct and valid_pct must be non-negative')

    stocks_train, stocks_valid, stocks_test = [], [], []
    for stock in stocks_df:
        n_rows = len(stock)
        test_size = int(n_rows * test_pct)
        valid_size = int(n_rows * valid_pct)
        train_size = n_rows - test_size - valid_size
        if train_size < 0:
            # A negative train size would make the three slices overlap.
            raise ValueError('test_pct + valid_pct leave no room for training rows')
        valid_end = train_size + valid_size

        stocks_train.append(stock[:train_size])
        stocks_valid.append(stock[train_size:valid_end])
        # Slice from an explicit start offset: ``stock[-test_size:]`` would
        # return the whole frame when test_size is 0.
        stocks_test.append(stock[valid_end:])

    return stocks_train, stocks_valid, stocks_test

def input_target_split(stocks_df):
    """Pair each frame with a binary label series derived from ``close``.

    The label for a row is 1 when that row's ``close`` value is greater than
    0, otherwise 0. The input frames are returned as-is (the ``close`` column
    is not removed from them).

    Args:
        stocks_df: Iterable of DataFrames, each with a ``close`` column.

    Returns:
        Tuple ``(stocks_df, target)`` where ``target`` is a list of label
        Series, one per input frame.
    """
    def _is_positive(value):
        return int(value > 0)

    target = []
    for frame in stocks_df:
        target.append(frame['close'].apply(_is_positive))
    return stocks_df, target

def window_split(inputs, targets, window_size=10):
    """Build sliding-window samples from one frame and its label series.

    For every position ``end`` from ``window_size`` up to the last row, the
    sample is the ``window_size`` rows immediately before ``end`` and the
    label is the target at ``end``.

    Args:
        inputs: Object exposing ``.values`` (e.g. a DataFrame) with one row
            per time step.
        targets: Object exposing ``.values`` (e.g. a Series) aligned with
            ``inputs`` by position.
        window_size: Number of consecutive rows per sample.

    Returns:
        Tuple ``(X, y)`` of numpy arrays with one entry per window.
    """
    feature_rows = inputs.values
    label_rows = targets.values
    window_ends = range(window_size, len(feature_rows))

    X = [feature_rows[end - window_size:end] for end in window_ends]
    y = [label_rows[end] for end in window_ends]
    return np.array(X), np.array(y)

def stocks_window_split(stocks_inputs, stocks_target, window_size=10):
    """Build sliding-window samples across several stocks at once.

    For every window end position, one sample is collected per stock: the
    ``window_size`` rows immediately before the end position, and the target
    at the end position. Stocks are aligned by row position, not by date.

    Args:
        stocks_inputs: Sequence of objects exposing ``.values`` (e.g.
            DataFrames), one per stock.
        stocks_target: Sequence of objects exposing ``.values`` (e.g. Series),
            paired with ``stocks_inputs`` by position.
        window_size: Number of consecutive rows per sample.

    Returns:
        Tuple ``(X, y)`` of numpy arrays. With at least one window, ``X`` is
        indexed ``[window, stock, row_in_window, column]`` and ``y`` is
        indexed ``[window, stock]``. With no stocks or no complete window,
        both are empty arrays.
    """
    if len(stocks_inputs) == 0:
        return np.array([]), np.array([])

    # Convert each stock once up front; the conversion does not depend on the
    # window position, so there is no need to redo it inside the loops.
    pairs = [
        (stocks_inputs[i].values, stocks_target[i].values)
        for i in range(len(stocks_inputs))
    ]

    # Use the shortest series as the bound so every stock can supply a full
    # window and a target; bounding by the first stock alone raises IndexError
    # when a later stock has fewer rows.
    n_rows = min(min(len(rows), len(labels)) for rows, labels in pairs)

    X, y = [], []
    for end in range(window_size, n_rows):
        X.append([rows[end - window_size:end] for rows, _ in pairs])
        y.append([labels[end] for _, labels in pairs])
    return np.array(X), np.array(y)

def prepare_data(df, **kwargs):
    """Turn the combined frame into windowed train/valid/test arrays.

    Steps: split market rows from stock rows, split stocks by name, drop the
    ``code``/``name``/``date`` columns, split every frame by row order into
    train/valid/test, derive binary targets from ``close``, and build
    sliding windows. Intermediate container sizes are printed along the way.

    Args:
        df: Combined DataFrame with ``code``, ``name``, ``date`` and ``close``
            columns plus the feature columns.
        **kwargs: Optional settings. ``test_pct`` and ``valid_pct`` default to
            0.2 and ``window_size`` defaults to 10 when missing or ``None``.

    Returns:
        Tuple ``(market_data, stock_data)``. Each is a list of three
        ``(X, y)`` tuples, for the train, validation and test parts in that
        order.
    """
    # Resolve options here so a missing key falls back to a default instead of
    # forwarding ``None`` (which makes the split arithmetic raise TypeError).
    test_pct = kwargs.get('test_pct')
    if test_pct is None:
        test_pct = 0.2
    valid_pct = kwargs.get('valid_pct')
    if valid_pct is None:
        valid_pct = 0.2
    window_size = kwargs.get('window_size')
    if window_size is None:
        window_size = 10

    meta_columns = ['code', 'name', 'date']

    market, stocks = market_stock_split(df)
    stocks = stock_seperate(stocks)
    print(len(stocks))
    # Build new frames rather than dropping in place: the frames are filtered
    # slices of ``df``, and an in-place drop on them triggers pandas'
    # SettingWithCopyWarning.
    stocks = [stock.drop(meta_columns, axis=1) for stock in stocks]
    market = market.drop(meta_columns, axis=1)

    market_train_valid_test = train_test_split(market,
                                               test_pct=test_pct,
                                               valid_pct=valid_pct) # [m_train], [m_valid], [m_test]
    stock_train_valid_test = train_test_split(stocks,
                                              test_pct=test_pct,
                                              valid_pct=valid_pct) # [s_train1, ...], [s_valid1, ...], [s_test1, ...]

    print(f'market_train_valid_test: {len(market_train_valid_test)}')
    print(f'market_train_valid_test[0]: {len(market_train_valid_test[0])}')

    print(f'stock_train_valid_test: {len(stock_train_valid_test)}')
    print(f'stock_train_valid_test[0]: {len(stock_train_valid_test[0])}')
    print(f'stock_train_valid_test[0][0]: {stock_train_valid_test[0][0].shape}')

    market_data = [input_target_split(m) for m in market_train_valid_test] # [([m_train_X], [m_train_y]), (...), (...)]
    stock_data = [input_target_split(s) for s in stock_train_valid_test] # [([s_train1_X, ...], [s_train1_y, ...]), (...), (...)]

    print(f'market_data: {len(market_data)}')
    print(f'market_data[0]: {len(market_data[0])}')
    print(f'market_data[0][0]: {len(market_data[0][0])}')
    print(f'market_data[0][0][0]: {market_data[0][0][0].shape}')

    print(f'stock_data: {len(stock_data)}')
    print(f'stock_data[0]: {len(stock_data[0])}')
    print(f'stock_data[0][0]: {len(stock_data[0][0])}')
    print(f'stock_data[0][0][0]: {stock_data[0][0][0].shape}')

    # Each element is an (inputs, targets) pair of lists; window them together.
    market_data = [stocks_window_split(*m, window_size) for m in market_data]
    stock_data = [stocks_window_split(*s, window_size) for s in stock_data]
    return market_data, stock_data

In [7]:
# Window length and split fractions forwarded to prepare_data as keyword
# arguments; the result is one list of (X, y) pairs for the market frame and
# one for the stock frames.
options = dict(window_size=10, test_pct=0.2, valid_pct=0.2)
market_data, stock_data = prepare_data(df, **options)

31
market_train_valid_test: 3
market_train_valid_test[0]: 1
stock_train_valid_test: 3
stock_train_valid_test[0]: 31
stock_train_valid_test[0][0]: (2772, 10)
market_data: 3
market_data[0]: 2
market_data[0][0]: 1
market_data[0][0][0]: (2773, 10)
stock_data: 3
stock_data[0]: 2
stock_data[0][0]: 31
stock_data[0][0][0]: (2772, 10)


A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  return super().drop(


In [14]:
# Unpack the three (X, y) pairs returned by prepare_data for the market frame
# (suffix _m) and for the stock frames (suffix _s).
# NOTE(review): `vaild` and `tset` look like typos for `valid` and `test`.
# Later cells reference `X_vaild_s`, so a rename must be applied notebook-wide.
(X_train_m, y_train_m), (X_vaild_m, y_vaild_m), (X_tset_m, y_test_m) = market_data
(X_train_s, y_train_s), (X_vaild_s, y_vaild_s), (X_tset_s, y_test_s) = stock_data

In [16]:
# Shape of the windowed stock training inputs; the axes are built as
# (window, stock, row within window, column) by stocks_window_split.
X_train_s.shape

(2762, 31, 10, 10)

In [17]:
# Shape of the windowed stock validation inputs (`X_vaild_s` is the name
# bound by the unpacking cell; it appears to be a misspelling of "valid").
X_vaild_s.shape


(914, 31, 10, 10)

In [19]:
# Number of rows corresponding to 80% of the combined frame.
# NOTE(review): the recorded result 118271 matches a 147839-row frame, but the
# frame displayed after the first-row drop has 147838 rows (which would give
# 118270). This cell may have run against a different `df` state - confirm
# with Restart & Run All.
int(len(df)*0.8)

118271

In [21]:
# Display every row whose code is not 'U182', i.e. everything except the rows
# that market_stock_split treats as the market index.
df[df.code!='U182']

Unnamed: 0,code,name,date,open,high,low,close,z_d5,z_d10,z_d15,z_d20,z_d25,z_d30
4619,A030200,KT,20030102,0.018036,0.022044,-0.006012,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000
4620,A030200,KT,20030103,-0.034351,0.000000,-0.041985,0.050100,-0.023855,-0.023855,-0.023855,-0.023855,-0.023855,-0.023855
4621,A030200,KT,20030106,-0.009542,0.005725,-0.019084,0.000000,-0.015903,-0.015903,-0.015903,-0.015903,-0.015903,-0.015903
4622,A030200,KT,20030107,0.036893,0.036893,-0.003883,-0.017176,0.000971,0.000971,0.000971,0.000971,0.000971,0.000971
4623,A030200,KT,20030108,0.003846,0.019231,-0.007692,0.009709,-0.006923,-0.006923,-0.006923,-0.006923,-0.006923,-0.006923
...,...,...,...,...,...,...,...,...,...,...,...,...,...
147834,A005380,현대차,20210826,-0.002381,0.009524,-0.009524,0.002387,-0.014762,0.001190,0.017937,0.027381,0.036381,0.045238
147835,A005380,현대차,20210827,0.002398,0.004796,-0.016787,-0.007143,-0.000959,0.003357,0.020624,0.031535,0.040000,0.049081
147836,A005380,현대차,20210830,0.004762,0.011905,0.000000,0.007194,-0.002381,-0.007143,0.009365,0.022262,0.029619,0.038413
147837,A005380,현대차,20210831,-0.011765,0.000000,-0.021176,0.011905,-0.011294,-0.019059,-0.005176,0.008471,0.015247,0.023843
