In [1]:
import pandas as pd

from sklearn.preprocessing import MinMaxScaler, StandardScaler

In [2]:
# Load the raw price sheet. The timestamp column arrives under the
# auto-generated header 'Unnamed: 4', so give it the meaningful name 'Date'.
df = (
    pd.read_excel('./kakao-sample2.xlsx')
    .rename(columns={'Unnamed: 4': 'Date'})
)
df.head()

Unnamed: 0,Open,High,Low,Close,Date
0,56900,57200,56800,57200,2023-01-06 15:00:00
1,57400,57500,56600,56900,2023-01-06 14:00:00
2,57400,57600,57300,57300,2023-01-06 13:00:00
3,57500,57700,57300,57400,2023-01-06 12:00:00
4,57300,57600,56900,57500,2023-01-06 11:00:00


In [3]:
# Sanity check: 'Date' must already be a datetime column, because the
# later cells rely on the `.dt` accessor.
df.dtypes['Date']

dtype('<M8[ns]')

In [4]:
# Reverse the row order (the sheet is stored newest-first) so that rows run
# oldest -> newest, then renumber the rows 0..n-1.
# NOTE: 'Date' stays a regular column here; it is NOT moved into the index.
# `drop=True` discards the old positional index directly instead of first
# materialising it as an 'index' column and deleting that column afterwards.
df = df.iloc[::-1].reset_index(drop=True)
df.head()

Unnamed: 0,Open,High,Low,Close,Date
0,55500,55500,55500,55500,2022-12-09 08:00:00
1,56100,57700,56000,57000,2022-12-09 09:00:00
2,56900,57700,56800,57700,2022-12-09 10:00:00
3,57600,58200,57600,58000,2022-12-09 11:00:00
4,58100,58100,57800,57900,2022-12-09 12:00:00


In [5]:
# List the distinct hours of day that occur in the data; this informs the
# AM/PM bin edges chosen in the next step.
hour_of_day = df['Date'].dt.hour
hour_of_day.unique()

array([ 8,  9, 10, 11, 12, 13, 14, 15])

In [6]:
# Build a half-day session key such as '2022-12-09_AM' for every row.
date = df['Date'].dt.date.astype(str)

# Hour-of-day bin edges (right-inclusive, lowest edge included):
#   [0, 7.5]  -> '_Closed'
#   (7.5, 12] -> '_AM'      (the 12:00 row is therefore counted as AM)
#   (12, 15]  -> '_PM'
#   (15, 24]  -> '_Closed'
bins = [0, 7.5, 12, 15, 24]
labels = ['_Closed', '_AM', '_PM', '_Closed']
# ordered=False is needed because '_Closed' is used as the label of two bins.
period = pd.cut(
    df['Date'].dt.hour,
    bins=bins,
    labels=labels,
    include_lowest=True,
    ordered=False,
)

# 'DayIndex' is kept as an ordinary column: the following groupby uses it by
# column name. (The previous `df.set_index('DayIndex')` call discarded its
# result and therefore had no effect, so it has been removed.)
df['DayIndex'] = date.str.cat(period)
df.head()

Unnamed: 0,Open,High,Low,Close,Date,DayIndex
0,55500,55500,55500,55500,2022-12-09 08:00:00,2022-12-09_AM
1,56100,57700,56000,57000,2022-12-09 09:00:00,2022-12-09_AM
2,56900,57700,56800,57700,2022-12-09 10:00:00,2022-12-09_AM
3,57600,58200,57600,58000,2022-12-09 11:00:00,2022-12-09_AM
4,58100,58100,57800,57900,2022-12-09 12:00:00,2022-12-09_AM


In [7]:
# Collapse the rows of each half-day session ('DayIndex') into one OHLC bar:
#   Open  = first Open of the session
#   High  = highest High
#   Low   = lowest Low
#   Close = last Close
# 'first'/'last' follow the row order inside each group, so this assumes the
# rows are already in chronological order (see the reversal step earlier).
# as_index=False keeps 'DayIndex' as a regular column of the result.
group = df.groupby('DayIndex', as_index=False)
new_df = group.agg(
    Open=('Open', 'first'),
    High=('High', 'max'),
    Low=('Low', 'min'),
    Close=('Close', 'last'),
)
# The former `new_df.set_index('DayIndex')` call discarded its result and had
# no effect; it has been removed so the code matches what actually happens.
new_df.head()

Unnamed: 0,DayIndex,Open,High,Low,Close
0,2022-12-09_AM,55500,58200,55500,57900
1,2022-12-09_PM,57900,58100,57500,58100
2,2022-12-12_AM,58100,58700,57200,58200
3,2022-12-12_PM,58100,58400,58100,58100
4,2022-12-13_AM,58100,59600,57800,58200


In [8]:
# Preview only the four price columns of the aggregated frame.
new_df.loc[:, ['Open', 'High', 'Low', 'Close']].head()

Unnamed: 0,Open,High,Low,Close
0,55500,58200,55500,57900
1,57900,58100,57500,58100
2,58100,58700,57200,58200
3,58100,58400,58100,58100
4,58100,59600,57800,58200


In [9]:
# Pool every Open/High/Low/Close value into one single-feature column vector
# so that a single shared min/max is learned across all four price columns.
ohlc_values = new_df[['Open', 'High', 'Low', 'Close']].to_numpy()
eval_vector = ohlc_values.reshape(-1, 1)

# Min-max scaling is generally considered appropriate for time-series data,
# so MinMaxScaler is applied here (StandardScaler would be the alternative).
scaler = MinMaxScaler()
scaler = scaler.fit(eval_vector)  # fit() returns the fitted estimator itself

In [10]:
# Rescale each price column with the shared scaler fitted on the pooled OHLC
# values. The scaler was fitted on a single feature, so every column is passed
# through it as an (n, 1) array and flattened back to 1-D before assignment.
# NOTE: this overwrites new_df in place, so re-running this cell without
# re-running the aggregation cell would scale already-scaled values again.
for col in ['Open', 'High', 'Low', 'Close']:
    column_vector = new_df[col].to_numpy().reshape(-1, 1)
    new_df[col] = scaler.transform(column_vector).ravel()
new_df.head()

Unnamed: 0,DayIndex,Open,High,Low,Close
0,2022-12-09_AM,0.5,0.829268,0.5,0.792683
1,2022-12-09_PM,0.792683,0.817073,0.743902,0.817073
2,2022-12-12_AM,0.817073,0.890244,0.707317,0.829268
3,2022-12-12_PM,0.817073,0.853659,0.817073,0.817073
4,2022-12-13_AM,0.817073,1.0,0.780488,0.829268


In [11]:
# Persist the scaled half-day bars. With the default settings the integer row
# index is written as well, as an unnamed first column of the sheet.
output_path = 'kakao-stock-processed2.xlsx'
new_df.to_excel(output_path)