In [1]:
import pandas as pd

from sklearn.preprocessing import MinMaxScaler, StandardScaler

In [2]:
# Read the price workbook and rename the timestamp column to the shorter 'Date'.
df = (
    pd.read_excel('./kakao-market-price.xlsx')
    .rename(columns={'Datetime(2022-10-13)': 'Date'})
)
df.head()

Unnamed: 0,Open,High,Low,Close,Date
0,64300,64500,64000,64500,2023-01-16 15:00:00
1,64800,64800,64200,64300,2023-01-16 14:00:00
2,64400,64800,64000,64700,2023-01-16 13:00:00
3,64800,65000,64300,64300,2023-01-16 12:00:00
4,64200,64800,64100,64800,2023-01-16 11:00:00


In [5]:
# Show the dtype of the timestamp column followed by each of the four price columns.
tuple(df[name].dtype for name in ('Date', 'Open', 'High', 'Low', 'Close'))

(dtype('<M8[ns]'),
 dtype('int64'),
 dtype('int64'),
 dtype('int64'),
 dtype('int64'))

In [7]:
# Put the rows in chronological (oldest-first) order and renumber them from 0.
# The sheet is loaded newest-first; sorting by 'Date' (rather than reversing with
# iloc[::-1]) keeps this cell idempotent, so re-running it cannot flip the frame
# back to newest-first. A stable sort keeps the relative order of equal timestamps.
# 'Date' stays a regular column; it is not moved into the index.
df = df.sort_values('Date', kind='mergesort').reset_index(drop=True)
df.head()

Unnamed: 0,Open,High,Low,Close,Date
0,49850,49850,49850,49850,2022-10-13 07:59:59.985
1,49450,49500,47300,48000,2022-10-13 08:59:59.990
2,48050,48150,47550,47850,2022-10-13 10:00:00.000
3,47800,47800,47400,47500,2022-10-13 11:00:00.000
4,47550,48200,47450,47900,2022-10-13 12:00:00.005


In [8]:
# List the distinct hour-of-day values present in the timestamps, in order of first appearance.
pd.unique(df['Date'].dt.hour)

array([ 7,  8, 10, 11, 12, 13, 14, 15,  9])

In [9]:
# Build a per-session key of the form '<date>_AM' / '<date>_PM' for every row.
#
# The stored timestamps carry sub-second noise (e.g. 07:59:59.985 for 08:00), so
# taking .dt.hour directly truncates some rows to the previous hour and can put
# two rows with the same nominal hour into different sessions. Round to the
# nearest second first so that date and hour reflect the nominal timestamp.
stamps = df['Date'].dt.round('s')

date = stamps.dt.date.astype(str)

# pd.cut uses right-closed intervals, so hours fall into:
#   [0, 6.5] -> _Closed, (6.5, 12] -> _AM, (12, 15] -> _PM, (15, 24] -> _Closed
bins = [0, 6.5, 12, 15, 24]
labels = ['_Closed', '_AM', '_PM', '_Closed']
# ordered=False is required because the '_Closed' label is used for two bins.
period = pd.cut(stamps.dt.hour, bins=bins, labels=labels, include_lowest=True, ordered=False)

# 'DayIndex' is kept as a regular column (not the index) so it can be used as
# a group-by column later; the previous set_index call discarded its result.
df['DayIndex'] = date.str.cat(period)
df.head()

Unnamed: 0,Open,High,Low,Close,Date,DayIndex
0,49850,49850,49850,49850,2022-10-13 07:59:59.985,2022-10-13_AM
1,49450,49500,47300,48000,2022-10-13 08:59:59.990,2022-10-13_AM
2,48050,48150,47550,47850,2022-10-13 10:00:00.000,2022-10-13_AM
3,47800,47800,47400,47500,2022-10-13 11:00:00.000,2022-10-13_AM
4,47550,48200,47450,47900,2022-10-13 12:00:00.005,2022-10-13_AM


In [11]:
# Show which session labels actually occur in the data.
pd.unique(period)

['_AM', '_PM']
Categories (3, object): ['_AM', '_Closed', '_PM']

In [12]:
# Collapse the rows into one OHLC row per 'DayIndex' value:
#   Open  = first Open in the group     High = highest High in the group
#   Close = last Close in the group     Low  = lowest Low in the group
# first/last follow the row order of df, so df must already be in
# chronological order for Open/Close to be the session's opening/closing price.
group = df.groupby('DayIndex', as_index=False)
# A single named aggregation replaces four separate passes that were stitched
# together by position; as_index=False keeps 'DayIndex' as the first column.
new_df = group.agg(
    Open=('Open', 'first'),
    High=('High', 'max'),
    Low=('Low', 'min'),
    Close=('Close', 'last'),
)
new_df.head()

Unnamed: 0,DayIndex,Open,High,Low,Close
0,2022-10-13_AM,49850,49850,47300,47900
1,2022-10-13_PM,47850,48100,47300,43700
2,2022-10-14_AM,49050,51300,49050,50900
3,2022-10-14_PM,51000,51500,50800,51400
4,2022-10-17_AM,51400,51400,46550,48750


In [13]:
# Preview only the four price columns of the aggregated frame.
new_df.loc[:, ['Open', 'High', 'Low', 'Close']].head()

Unnamed: 0,Open,High,Low,Close
0,49850,49850,47300,47900
1,47850,48100,47300,43700
2,49050,51300,49050,50900
3,51000,51500,50800,51400
4,51400,51400,46550,48750


In [14]:
# Pool every Open/High/Low/Close value into a single column vector and fit one
# scaler on it, so the scaler learns a single min/max across all four columns.
price_columns = ['Open', 'High', 'Low', 'Close']
eval_vector = new_df[price_columns].to_numpy().reshape(-1, 1)

# MinMax scaling is generally considered appropriate for time-series data, so it
# is applied here (StandardScaler was the alternative that was tried).
scaler = MinMaxScaler()
scaler = scaler.fit(eval_vector)

In [15]:
# Replace each price column with its scaled values, using the already-fitted scaler.
# Note: the columns are overwritten in place, so running this cell a second time
# would scale the already-scaled values again.
for col in ('Open', 'High', 'Low', 'Close'):
    new_df[col] = scaler.transform(new_df[col].to_numpy().reshape(-1, 1))
new_df.head()

Unnamed: 0,DayIndex,Open,High,Low,Close
0,2022-10-13_AM,0.748025,0.748025,0.705613,0.715593
1,2022-10-13_PM,0.714761,0.718919,0.705613,0.645738
2,2022-10-14_AM,0.734719,0.772141,0.734719,0.765489
3,2022-10-14_PM,0.767152,0.775468,0.763825,0.773805
4,2022-10-17_AM,0.773805,0.773805,0.693139,0.72973


In [18]:
# Write the scaled per-session frame to an Excel workbook in the working directory.
# The default integer index is written as well (no index=False is passed).
new_df.to_excel(excel_writer='kakao-stock-preprocessed.xlsx')