# 시계열 데이터 기초

## 1. 날짜 및 시간 자료형

### (1) datetime
#### 날짜, 시간을 위한 자료형(파이썬 표준 라이브러리)

In [1]:
from datetime import datetime

# datetime.now() is called without a tz argument, so it returns the current
# local date and time as a naive datetime (no tzinfo attached).
now = datetime.now()
now  # bare expression: the notebook echoes its repr as the cell output

datetime.datetime(2024, 4, 12, 16, 15, 0, 551174)

In [2]:
# Individual calendar fields of `now`; the bare tuple is echoed as the cell output.
now.year, now.month, now.day

(2024, 4, 12)

#### 시간 차이 계산

In [3]:
# Subtracting two datetimes yields a timedelta. The left operand is a past
# date, so the result is negative. A negative timedelta is stored normalized:
# `days` carries the sign while `seconds` and `microseconds` stay non-negative
# (see the recorded output: days=-792, seconds=27768).
delta = datetime(2022, 2, 11) - datetime.now()
delta

datetime.timedelta(days=-792, seconds=27768, microseconds=940758)

In [4]:
# The three stored fields of the timedelta. For a negative delta these are the
# normalized components, not the elapsed amount; delta.total_seconds() gives a
# single signed duration instead.
delta.days, delta.seconds, delta.microseconds

(-792, 27768, 940758)

### (2) datetime.timedelta

In [5]:
# timedelta represents a duration; adding it to or subtracting it from a
# datetime shifts that datetime by the given amount.
from datetime import timedelta

In [6]:
# Shift `now` forward by 100 days (timedelta's first positional argument is days).
now + timedelta(days=100)

datetime.datetime(2024, 7, 21, 16, 15, 0, 551174)

### (3) datetime을 문자형으로 변경

In [8]:
# Build a date-only datetime (the time part defaults to midnight) and render
# it with str(), which gives the 'YYYY-MM-DD HH:MM:SS' form.
stamp = datetime(year=2024, month=4, day=12)
str(stamp)

'2024-04-12 00:00:00'

### (4) 포맷을 지정한 변환: strftime(날짜 → 문자열), strptime(문자열 → 날짜)

In [9]:
# strftime goes from datetime to string: format `stamp` using the given pattern.
stamp.strftime('%Y-%m-%d')

'2024-04-12'

In [11]:
value = '2024-04-12'
# strptime goes from string to datetime: parse `value` according to the pattern.
datetime.strptime(value, '%Y-%m-%d')

datetime.datetime(2024, 4, 12, 0, 0)

### datetime 포맷 규칙

In [12]:
# 포맷     설명
#---------------
# %Y    년도 4자리
# %y    년도 2자리
# %m    월 2자리
# %d    일 2자리
# %H    24시간 표시형식(0~23시)
# %I    12시간 표시형식(01~12시)
# %M    2자리 분
# %S    초
# %w    요일 (0(일요일) - 6)
# %F    %Y-%m-%d 형식의 의미
# %D    %m/%d/%y  (예: 04/12/24)

## 2. Parser

### (1) 날짜 및 시간 파싱

In [17]:
from dateutil.parser import parse
# parse() works out the layout of the string itself and returns a datetime,
# so no explicit format pattern is passed.
parse('2024-04-12')

datetime.datetime(2024, 4, 12, 0, 0)

In [18]:
# Free-form text with a month name and an AM/PM marker is recognized as well.
parse('Jan 31, 2024 12:30 PM')

datetime.datetime(2024, 1, 31, 12, 30)

In [19]:
# dayfirst=True reads the ambiguous '5/12/2030' as day/month/year,
# i.e. 5 December 2030.
parse('5/12/2030', dayfirst=True)

datetime.datetime(2030, 12, 5, 0, 0)

### (2) 많은 날짜를 한꺼번에 처리

In [20]:
import pandas as pd

# Several timestamp strings converted in one call; a list input produces a
# DatetimeIndex.
datelist = [
    '2023-07-06 12:00:00',
    '2024-07-06 12:00:00',
    '2025-07-06 12:00:00',
]
pd.to_datetime(datelist)

DatetimeIndex(['2023-07-06 12:00:00', '2024-07-06 12:00:00',
               '2025-07-06 12:00:00'],
              dtype='datetime64[ns]', freq=None)

### NaT(Not a Time) : 누락된 타임스탬프 데이터

In [21]:
# A None entry has no timestamp value and shows up as NaT in the resulting
# DatetimeIndex.
pd.to_datetime(datelist + [None])

DatetimeIndex(['2023-07-06 12:00:00', '2024-07-06 12:00:00',
               '2025-07-06 12:00:00', 'NaT'],
              dtype='datetime64[ns]', freq=None)

### (3) date_range

#### 시작날짜 + 종료날짜를 지정하여 날짜 범위 생성

In [23]:
# Both endpoints given: one entry per calendar day from start to end, inclusive.
index = pd.date_range(start='2024-01-01', end='2024-04-30')
index

DatetimeIndex(['2024-01-01', '2024-01-02', '2024-01-03', '2024-01-04',
               '2024-01-05', '2024-01-06', '2024-01-07', '2024-01-08',
               '2024-01-09', '2024-01-10',
               ...
               '2024-04-21', '2024-04-22', '2024-04-23', '2024-04-24',
               '2024-04-25', '2024-04-26', '2024-04-27', '2024-04-28',
               '2024-04-29', '2024-04-30'],
              dtype='datetime64[ns]', length=121, freq='D')

#### 시작 날짜 + 기간을 사용해서 날짜 범위 생성

In [24]:
# Start plus a count: 10 consecutive days beginning at the start date.
pd.date_range(start='2024-04-01', periods=10)

DatetimeIndex(['2024-04-01', '2024-04-02', '2024-04-03', '2024-04-04',
               '2024-04-05', '2024-04-06', '2024-04-07', '2024-04-08',
               '2024-04-09', '2024-04-10'],
              dtype='datetime64[ns]', freq='D')

In [25]:
# End plus a count: 10 consecutive days finishing on the end date.
pd.date_range(end='2024-04-01', periods=10)

DatetimeIndex(['2024-03-23', '2024-03-24', '2024-03-25', '2024-03-26',
               '2024-03-27', '2024-03-28', '2024-03-29', '2024-03-30',
               '2024-03-31', '2024-04-01'],
              dtype='datetime64[ns]', freq='D')

### BM : BusinessMonthEnd (월 영업 마감일)

In [26]:
# 'BM' yields the last business day of each month inside the range. December
# is absent because the range stops at 2024-12-30, before that month's last
# business day (2024-12-31).
# NOTE(review): pandas 2.2+ deprecates the 'BM' alias in favor of 'BME' —
# confirm the installed pandas version before changing it.
pd.date_range('2024-01-01', '2024-12-30', freq='BM')

DatetimeIndex(['2024-01-31', '2024-02-29', '2024-03-29', '2024-04-30',
               '2024-05-31', '2024-06-28', '2024-07-31', '2024-08-30',
               '2024-09-30', '2024-10-31', '2024-11-29'],
              dtype='datetime64[ns]', freq='BM')

## 3. 시계열 데이터 기초

In [31]:
import pandas as pd
import numpy as np

# yfinance: package that scrapes stock data from the Yahoo Finance site
# install: pip install yfinance
import yfinance as yfin
# install pandas-datareader: conda install pandas-datareader
from pandas_datareader import data  # loads Yahoo Finance data into pandas

# Presumably routes pandas-datareader's Yahoo reader through yfinance.
# NOTE(review): newer yfinance releases appear to deprecate pdr_override() in
# favor of calling yfin.download() directly — confirm against the installed
# version before upgrading.
yfin.pdr_override()

# Price data for ticker 005930.KS, requested with the start/end dates below.
df = data.get_data_yahoo('005930.KS', start='2024-01-01', end='2024-04-13')

[*********************100%%**********************]  1 of 1 completed


In [32]:
# The row labels of df: a DatetimeIndex of the dates present in the data.
df.index

DatetimeIndex(['2024-01-02', '2024-01-03', '2024-01-04', '2024-01-05',
               '2024-01-08', '2024-01-09', '2024-01-10', '2024-01-11',
               '2024-01-12', '2024-01-15', '2024-01-16', '2024-01-17',
               '2024-01-18', '2024-01-19', '2024-01-22', '2024-01-23',
               '2024-01-24', '2024-01-25', '2024-01-26', '2024-01-29',
               '2024-01-30', '2024-01-31', '2024-02-01', '2024-02-02',
               '2024-02-05', '2024-02-06', '2024-02-07', '2024-02-08',
               '2024-02-13', '2024-02-14', '2024-02-15', '2024-02-16',
               '2024-02-19', '2024-02-20', '2024-02-21', '2024-02-22',
               '2024-02-23', '2024-02-26', '2024-02-27', '2024-02-28',
               '2024-02-29', '2024-03-04', '2024-03-05', '2024-03-06',
               '2024-03-07', '2024-03-08', '2024-03-11', '2024-03-12',
               '2024-03-13', '2024-03-14', '2024-03-15', '2024-03-18',
               '2024-03-19', '2024-03-20', '2024-03-21', '2024-03-22',
      

In [33]:
# Step slice over rows: keeps every second row, starting with the first.
df[::2]

Unnamed: 0_level_0,Open,High,Low,Close,Adj Close,Volume
Date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
2024-01-02,78200.0,79800.0,78200.0,79600.0,79600.0,17142847
2024-01-04,76100.0,77300.0,76100.0,76600.0,76600.0,15324439
2024-01-08,77000.0,77500.0,76400.0,76500.0,76500.0,11088724
2024-01-10,75000.0,75200.0,73200.0,73600.0,73600.0,20259529
2024-01-12,73000.0,74100.0,72800.0,73100.0,73100.0,13038939
2024-01-16,73500.0,73700.0,72500.0,72600.0,72600.0,14760415
2024-01-18,71600.0,72000.0,70700.0,71700.0,71700.0,17853397
2024-01-22,75900.0,76000.0,75000.0,75100.0,75100.0,19673375
2024-01-24,75200.0,75200.0,73500.0,74000.0,74000.0,12860661
2024-01-26,73700.0,74500.0,73300.0,73400.0,73400.0,11160062


In [34]:
# dtype of the index; '<M8[ns]' is NumPy's code for datetime64[ns].
df.index.dtype

dtype('<M8[ns]')

In [35]:
# A single element taken from the DatetimeIndex is a pandas Timestamp.
df.index[0]

Timestamp('2024-01-02 00:00:00')

In [36]:
# Binds `stamp` to the third index entry, which is a pandas Timestamp.
stamp = df.index[2]
stamp

Timestamp('2024-01-04 00:00:00')

### 월 선택

In [37]:
# Partial-string indexing: a 'YYYY-MM' label selects every row in that month.
df.loc['2024-04']

Unnamed: 0_level_0,Open,High,Low,Close,Adj Close,Volume
Date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
2024-04-01,83200.0,83300.0,82000.0,82000.0,82000.0,20116513
2024-04-02,82900.0,85000.0,82900.0,85000.0,85000.0,37077944
2024-04-03,84300.0,85000.0,83500.0,84100.0,84100.0,30493347
2024-04-04,85200.0,85500.0,84300.0,85300.0,85300.0,25248934
2024-04-05,84500.0,85000.0,83800.0,84500.0,84500.0,18883752
2024-04-08,85200.0,86000.0,84500.0,84500.0,84500.0,18953232
2024-04-09,84500.0,84900.0,83100.0,83600.0,83600.0,23725956
2024-04-11,83200.0,84700.0,82500.0,84100.0,84100.0,25538009
2024-04-12,84700.0,84900.0,83200.0,83700.0,83700.0,16983378


### 기간 선택

In [38]:
from datetime import datetime

# A datetime object works as a slice bound on the DatetimeIndex: rows from
# 2024-02-01 through the end of the data.
df[datetime(2024, 2, 1):]

Unnamed: 0_level_0,Open,High,Low,Close,Adj Close,Volume
Date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
2024-02-01,73000.0,74200.0,72900.0,73600.0,73600.0,19881033
2024-02-02,74000.0,75200.0,73700.0,75200.0,75200.0,14955881
2024-02-05,74200.0,74800.0,73500.0,74300.0,74300.0,19026021
2024-02-06,74300.0,74700.0,73300.0,74400.0,74400.0,14559254
2024-02-07,74600.0,75500.0,74300.0,75000.0,75000.0,16566445
2024-02-08,75000.0,75200.0,73600.0,74100.0,74100.0,20810708
2024-02-13,74800.0,75200.0,74400.0,75200.0,75200.0,21966745
2024-02-14,73700.0,74300.0,73700.0,74000.0,74000.0,12434945
2024-02-15,74200.0,74400.0,73000.0,73200.0,73200.0,14120600
2024-02-16,73300.0,73400.0,72500.0,72800.0,72800.0,13444781


In [39]:
# Date strings also work as slice bounds on the DatetimeIndex; label-based
# slices include both endpoints.
df['2024-02-01':'2024-02-29']

Unnamed: 0_level_0,Open,High,Low,Close,Adj Close,Volume
Date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
2024-02-01,73000.0,74200.0,72900.0,73600.0,73600.0,19881033
2024-02-02,74000.0,75200.0,73700.0,75200.0,75200.0,14955881
2024-02-05,74200.0,74800.0,73500.0,74300.0,74300.0,19026021
2024-02-06,74300.0,74700.0,73300.0,74400.0,74400.0,14559254
2024-02-07,74600.0,75500.0,74300.0,75000.0,75000.0,16566445
2024-02-08,75000.0,75200.0,73600.0,74100.0,74100.0,20810708
2024-02-13,74800.0,75200.0,74400.0,75200.0,75200.0,21966745
2024-02-14,73700.0,74300.0,73700.0,74000.0,74000.0,12434945
2024-02-15,74200.0,74400.0,73000.0,73200.0,73200.0,14120600
2024-02-16,73300.0,73400.0,72500.0,72800.0,72800.0,13444781


### 데이터 삭제 (truncate)

In [40]:
# truncate(after=...) drops the rows dated after the given label and returns
# the remaining rows; the result is only displayed, df is not reassigned.
df.truncate(after='2024-04-01')

Unnamed: 0_level_0,Open,High,Low,Close,Adj Close,Volume
Date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
2024-01-02,78200.0,79800.0,78200.0,79600.0,79600.0,17142847
2024-01-03,78500.0,78800.0,77000.0,77000.0,77000.0,21753644
2024-01-04,76100.0,77300.0,76100.0,76600.0,76600.0,15324439
2024-01-05,76700.0,77100.0,76400.0,76600.0,76600.0,11304316
2024-01-08,77000.0,77500.0,76400.0,76500.0,76500.0,11088724
...,...,...,...,...,...,...
2024-03-26,79700.0,80100.0,79200.0,79900.0,79900.0,30551494
2024-03-27,79200.0,80000.0,79200.0,79800.0,79800.0,17424595
2024-03-28,79400.0,81000.0,79200.0,80800.0,80800.0,25084812
2024-03-29,81200.0,82500.0,80900.0,82400.0,82400.0,27126366
