## Import

In [46]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from collections import defaultdict
import seaborn as sns
import os
import datetime
import pickle

from decimal import Decimal

## Load Data

In [2]:
# Load the minute-bar CSV; its first column is used as the row index.
df = pd.read_csv(
    "../dataset/min_kospi200F(20200909).csv",
    index_col=0,
)
df

Unnamed: 0,date,time,open,high,low,close,prevClose,vol
0,20100216,901,207.55,207.65,207.50,207.60,207.50,3985
1,20100216,902,207.60,207.65,207.25,207.55,207.50,5095
2,20100216,903,207.55,207.80,207.50,207.60,207.50,2175
3,20100216,904,207.55,207.85,207.55,207.80,207.50,1301
4,20100216,905,207.80,208.15,207.80,208.05,207.50,3870
...,...,...,...,...,...,...,...,...
941511,20200403,1534,231.40,231.40,231.30,231.30,231.65,210
941512,20200403,1535,231.30,231.50,231.30,231.40,231.65,932
941513,20200403,1535,231.30,231.50,231.30,231.40,231.65,932
941514,20200403,1545,231.65,231.65,231.65,231.65,231.65,6748


## 기초 전처리 : 중복값 삭제 및 결측치 확인

In [None]:
# Remove rows whose column values are exact duplicates of an earlier row
# (the row index is not part of the comparison).
df = df.drop_duplicates()

In [5]:
# Preview the first ten rows after de-duplication.
df.iloc[:10]

Unnamed: 0,date,time,open,high,low,close,prevClose,vol
0,20100216,901,207.55,207.65,207.5,207.6,207.5,3985
1,20100216,902,207.6,207.65,207.25,207.55,207.5,5095
2,20100216,903,207.55,207.8,207.5,207.6,207.5,2175
3,20100216,904,207.55,207.85,207.55,207.8,207.5,1301
4,20100216,905,207.8,208.15,207.8,208.05,207.5,3870
5,20100216,906,208.05,208.15,207.95,208.15,207.5,1650
6,20100216,907,208.1,208.2,208.0,208.15,207.5,1447
7,20100216,908,208.15,208.3,208.05,208.15,207.5,2049
8,20100216,909,208.1,208.5,208.1,208.4,207.5,2558
9,20100216,910,208.4,208.65,208.35,208.65,207.5,2573


In [7]:
# Number of missing values in each column (column-wise sum is the default axis).
df.isna().sum()

date         0
time         0
open         0
high         0
low          0
close        0
prevClose    0
vol          0
dtype: int64

각 열마다 결측치는 존재하지 않는다.  

## timestep 변환

In [None]:
# Build a DatetimeIndex from the separate `date` (YYYYMMDD) and `time` (HMM/HHMM) columns.
#
# `time` may arrive as an int such as 908 or as a float, so it is normalised to a
# zero-padded 4-character string (908 -> '0908'). This is done with vectorized
# Series operations instead of a per-row Python lambda.
time_str = df['time'].astype(float).astype(int).astype(str).str.zfill(4)
date_str = df['date'].astype(str)

# Parse the concatenated 'YYYYMMDDHHMM' strings into timestamps.
timestamps = pd.to_datetime(date_str + time_str, format='%Y%m%d%H%M')

# `date` and `time` are now fully represented by the timestamps, so drop them
# and use the timestamps as the index (named 'datetime', as before).
df = df.drop(columns=['date', 'time'])
df.index = pd.DatetimeIndex(timestamps, name='datetime')
df.head()

Unnamed: 0_level_0,open,high,low,close,prevClose,vol
datetime,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
2010-02-16 09:01:00,207.55,207.65,207.5,207.6,207.5,3985
2010-02-16 09:02:00,207.6,207.65,207.25,207.55,207.5,5095
2010-02-16 09:03:00,207.55,207.8,207.5,207.6,207.5,2175
2010-02-16 09:04:00,207.55,207.85,207.55,207.8,207.5,1301
2010-02-16 09:05:00,207.8,208.15,207.8,208.05,207.5,3870


## 빠진 시간이 없는지 확인하기 

In [12]:
# Bucket the rows by clock hour and count how many rows fall into each bucket.
minute_counts = df.resample('h').size()

# Keep only the hours with fewer rows than the cutoff.
# NOTE: the cutoff is `< 59` (not `< 60`), so hours holding exactly 59 rows are not flagged.
incomplete_hours = minute_counts[minute_counts < 59]

# Header for the result listing
print("다 채워지지 않은 1시간 구간들:")

# One [bucket start, time of day, row count] entry per incomplete hour;
# buckets that contain no rows at all (count 0) are skipped.
under_60_data_list = [
    [ts, ts.time(), count]
    for ts, count in incomplete_hours.items()
    if count != 0
]

다 채워지지 않은 1시간 구간들:


In [15]:
# Tabulate the incomplete hours: bucket start timestamp, its time of day, and row count.
pd.DataFrame(under_60_data_list, columns=['datetime', 'hour', 'cnt'])

Unnamed: 0,datetime,hour,cnt
0,2010-02-16 15:00:00,15:00:00,7
1,2010-02-17 15:00:00,15:00:00,7
2,2010-02-18 15:00:00,15:00:00,7
3,2010-02-19 15:00:00,15:00:00,7
4,2010-02-22 15:00:00,15:00:00,8
...,...,...,...
2560,2020-03-30 15:00:00,15:00:00,37
2561,2020-03-31 15:00:00,15:00:00,37
2562,2020-04-01 15:00:00,15:00:00,37
2563,2020-04-02 15:00:00,15:00:00,37


In [44]:
# For every calendar day present in `df`, list the expected minute stamps that have no row.

# Reference grid: every minute from 09:01 through 14:59 (09:00 itself is excluded).
# It does not depend on the day, so it is built once instead of once per group.
expected_times = [
    datetime.time(h, m)
    for h in range(9, 15)
    for m in range(60) if (h, m) != (9, 0)
]

# Maps a date to the list of expected times that are absent on that date.
missing_times_by_day = {}

# Iterate day by day
for day, group in df.groupby(df.index.date):
    # Times of day actually present on this date
    actual_times = group.index.time
    # A set gives constant-time membership tests; testing against the object
    # array directly would scan the whole array for every expected minute.
    seen_times = set(actual_times)

    # Expected minutes with no matching row
    missing = [t for t in expected_times if t not in seen_times]

    if missing:
        missing_times_by_day[day] = missing

# Print the result: one header line plus the comma-separated missing times per day
for day, missing in missing_times_by_day.items():
    print(f"{day} 누락된 시각 {len(missing)}개:")
    print(", ".join(t.strftime("%H:%M") for t in missing))
    print()

2010-07-16 누락된 시각 359개:
09:01, 09:02, 09:03, 09:04, 09:05, 09:06, 09:07, 09:08, 09:09, 09:10, 09:11, 09:12, 09:13, 09:14, 09:15, 09:16, 09:17, 09:18, 09:19, 09:20, 09:21, 09:22, 09:23, 09:24, 09:25, 09:26, 09:27, 09:28, 09:29, 09:30, 09:31, 09:32, 09:33, 09:34, 09:35, 09:36, 09:37, 09:38, 09:39, 09:40, 09:41, 09:42, 09:43, 09:44, 09:45, 09:46, 09:47, 09:48, 09:49, 09:50, 09:51, 09:52, 09:53, 09:54, 09:55, 09:56, 09:57, 09:58, 09:59, 10:00, 10:01, 10:02, 10:03, 10:04, 10:05, 10:06, 10:07, 10:08, 10:09, 10:10, 10:11, 10:12, 10:13, 10:14, 10:15, 10:16, 10:17, 10:18, 10:19, 10:20, 10:21, 10:22, 10:23, 10:24, 10:25, 10:26, 10:27, 10:28, 10:29, 10:30, 10:31, 10:32, 10:33, 10:34, 10:35, 10:36, 10:37, 10:38, 10:39, 10:40, 10:41, 10:42, 10:43, 10:44, 10:45, 10:46, 10:47, 10:48, 10:49, 10:50, 10:51, 10:52, 10:53, 10:54, 10:55, 10:56, 10:57, 10:58, 10:59, 11:00, 11:01, 11:02, 11:03, 11:04, 11:05, 11:06, 11:07, 11:08, 11:09, 11:10, 11:11, 11:12, 11:13, 11:14, 11:15, 11:16, 11:17, 11:18, 11:19, 11:

2020-03-13에는 10시 44분, 2020-03-19에는 12시 7분에 서킷브레이커가 발생했다.  
서킷브레이커가 발동되면 30분 동안 장이 정지된다.  
출처 : https://www.moef.go.kr/sisa/dictionary/detail?idx=1427

In [None]:
# 2010-07-16 holds only the single closing-time row; no other data exists for that day.
df.loc[df.index.date == datetime.date(2010, 7, 16)]

Unnamed: 0_level_0,open,high,low,close,prevClose,vol
datetime,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
2010-07-16 15:15:00,227.05,229.95,227.05,229.1,229.65,3901


In [53]:
# Last five rows of 2010-07-19, to see how the end of the session is recorded.
df.loc[df.index.date == datetime.date(2010, 7, 19)].iloc[-5:]

Unnamed: 0_level_0,open,high,low,close,prevClose,vol
datetime,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
2010-07-19 15:03:00,225.95,226.0,225.85,225.9,227.05,742
2010-07-19 15:04:00,225.85,225.9,225.8,225.85,227.05,1547
2010-07-19 15:05:00,225.8,226.0,225.75,225.75,227.05,2228
2010-07-19 15:06:00,225.8,225.8,225.8,225.8,227.05,2
2010-07-19 15:15:00,225.6,225.6,225.6,225.6,227.05,2331


In [84]:
# 15:05 rows dated before 2016-08-01, excluding the Korean SAT days.
# NOTE: `event_days` is defined in a later cell of this file; that cell must run first.
df.loc[
    np.isin(df.index.date, event_days['KoreanSAT'], invert=True)
    & (df.index.time == datetime.time(15, 5))
    & (df.index.date < datetime.date(2016, 8, 1))
]

Unnamed: 0_level_0,open,high,low,close,prevClose,vol
datetime,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
2010-02-16 15:05:00,209.40,209.50,209.35,209.50,207.50,1457
2010-02-17 15:05:00,213.30,213.45,213.30,213.40,209.65,2002
2010-02-18 15:05:00,212.75,212.80,212.65,212.75,213.60,1878
2010-02-19 15:05:00,208.70,209.00,208.65,209.00,212.85,2546
2010-02-22 15:05:00,213.15,213.45,213.10,213.40,209.00,2601
...,...,...,...,...,...,...
2016-07-25 15:05:00,250.90,250.95,250.85,250.90,250.75,334
2016-07-26 15:05:00,253.20,253.30,253.15,253.30,250.85,851
2016-07-27 15:05:00,253.15,253.25,253.15,253.20,253.30,705
2016-07-28 15:05:00,252.15,252.20,252.10,252.15,253.35,702


In [88]:
# 15:06 rows dated before 2016-08-01, excluding the Korean SAT days.
# NOTE: `event_days` is defined in a later cell of this file; that cell must run first.
df.loc[
    np.isin(df.index.date, event_days['KoreanSAT'], invert=True)
    & (df.index.time == datetime.time(15, 6))
    & (df.index.date < datetime.date(2016, 8, 1))
]

Unnamed: 0_level_0,open,high,low,close,prevClose,vol
datetime,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
2010-02-22 15:06:00,213.45,213.45,213.45,213.45,209.00,1
2010-02-24 15:06:00,210.95,210.95,210.95,210.95,213.45,10
2010-02-26 15:06:00,207.65,207.65,207.65,207.65,206.90,1
2010-03-09 15:06:00,218.10,218.10,218.10,218.10,217.60,1
2010-03-19 15:06:00,221.35,221.35,221.35,221.35,220.10,1
...,...,...,...,...,...,...
2013-05-22 15:06:00,260.45,260.45,260.45,260.45,259.10,1
2013-07-19 15:06:00,243.15,243.15,243.10,243.10,243.45,3
2013-08-21 15:06:00,242.00,242.00,242.00,242.00,245.25,1
2013-09-27 15:06:00,265.20,265.20,265.20,265.20,265.60,3


In [77]:
# All rows that fall on a Korean SAT day.
# NOTE: `event_days` is defined in a later cell of this file; that cell must run first.
df.loc[np.isin(df.index.date, event_days['KoreanSAT'])]

Unnamed: 0_level_0,open,high,low,close,prevClose,vol
datetime,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
2010-11-18 10:01:00,248.45,249.15,248.40,248.90,247.45,5201
2010-11-18 10:02:00,248.90,249.20,248.65,248.70,247.45,3059
2010-11-18 10:03:00,248.75,249.00,248.70,248.95,247.45,2626
2010-11-18 10:04:00,248.90,249.40,248.90,249.35,247.45,4316
2010-11-18 10:05:00,249.35,249.50,249.00,249.05,247.45,3382
...,...,...,...,...,...,...
2019-11-14 16:32:00,284.00,284.15,284.00,284.10,281.85,1983
2019-11-14 16:33:00,284.10,284.15,284.05,284.05,281.85,1640
2019-11-14 16:34:00,284.05,284.10,283.95,284.05,281.85,1175
2019-11-14 16:35:00,284.00,284.10,283.95,284.10,281.85,1600


In [None]:
# Calendar dates with an irregular session, grouped by the kind of event.
event_days = {
    # Days on which a circuit breaker was triggered (both in March 2020).
    'CircuitBreaker': [datetime.date(2020, 3, day) for day in (13, 19)],
    # Korean SAT days, one per year.
    # NOTE(review): 2017 lists both 11-16 and 11-23 - presumably the originally
    # scheduled date and a rescheduled one; confirm.
    'KoreanSAT': [
        datetime.date(*ymd)
        for ymd in (
            (2010, 11, 18),
            (2011, 11, 10),
            (2012, 11, 8),
            (2013, 11, 7),
            (2014, 11, 13),
            (2015, 11, 12),
            (2016, 11, 17),
            (2017, 11, 16),
            (2017, 11, 23),
            (2018, 11, 15),
            (2019, 11, 14),
        )
    ],
}

[datetime.date(2010, 11, 18),
 datetime.date(2011, 11, 10),
 datetime.date(2012, 11, 8),
 datetime.date(2013, 11, 7),
 datetime.date(2014, 11, 13),
 datetime.date(2015, 11, 12),
 datetime.date(2016, 11, 17),
 datetime.date(2017, 11, 16),
 datetime.date(2017, 11, 23),
 datetime.date(2018, 11, 15),
 datetime.date(2019, 11, 14)]

15:45(장 마감 시각) 행은 매일 하나씩 존재한다. 마감 10분 전부터는 분 단위 거래가 표시되지 않다가, 마감 시각에 한 건만 기록된다.  
 
