- Most financial data are time series.

- In other words, they are related to time.

In [1]:
import pandas as pd

# Load the tick data with default settings: no index column is selected and
# no dates are parsed, so the timestamps arrive as an ordinary column of strings.
df = pd.read_csv('data/test.csv')
df.head()  # preview the first five rows

Unnamed: 0.1,Unnamed: 0,price,volume,buy,sell
0,2018-11-21 09:00:04.278571,266.200012,1.0,0.0,1.0
1,2018-11-21 09:00:08.557143,266.200012,1.0,0.0,1.0
2,2018-11-21 09:00:12.835714,266.25,21.0,21.0,0.0
3,2018-11-21 09:00:17.114286,266.299988,1.0,1.0,0.0
4,2018-11-21 09:00:21.392857,266.350006,2.0,2.0,0.0


#### [Datetime](https://docs.python.org/3/library/datetime.html)

- datetime object : The time of a specific moment.

- Timedelta object : The interval between two datetime objects.

In [2]:
from datetime import datetime as dt
from datetime import timedelta as td

# datetime: a single point in time
print(dt.now())                          # the current date and time
print(type(dt.now()))                    # datetime.datetime
print(dt(2021, 7, 20))                   # omitted time fields default to 00:00:00
print(dt(2021, 7, 20, 20, 31, 10, 333))  # the 7th positional argument is microseconds

2021-09-17 14:23:18.855235
<class 'datetime.datetime'>
2021-07-20 00:00:00
2021-07-20 20:31:10.000333


In [3]:
# timedelta: subtracting one datetime from another yields the span between them
delta = dt(year=2021, month=7, day=20) - dt(year=2021, month=1, day=1)
print(delta)
print(type(delta))

200 days, 0:00:00
<class 'datetime.timedelta'>


In [4]:
# Adding a timedelta to a datetime shifts it forward; the first positional
# argument of timedelta is a number of days, spelled out here as a keyword.
dt(2021, 1, 1) + td(days=200)

datetime.datetime(2021, 7, 20, 0, 0)

### Strings $\rightarrow$ Datetime object (Strings $\leftarrow$ Datetime object)
- `strftime`, `strptime`, `parse`, `pd.to_datetime`


In [4]:
# A sample datetime (midnight, 1 January 2021) reused by the formatting cells below.
date = dt(year=2021, month=1, day=1)
date

datetime.datetime(2021, 1, 1, 0, 0)

In [5]:
# Demonstration: strftime() cannot be called without a format string.
# The TypeError is caught and printed so the error is still shown but
# "Restart & Run All" does not stop at this cell.
try:
    date.strftime()
except TypeError as err:
    print('TypeError:', err)

TypeError: strftime() missing required argument 'format' (pos 1)

In [9]:
# Format codes control how the date is rendered:
# %Y = year, %m = month, %d = day of the month.
text = date.strftime('%Y-%m-%d')
text

'2021-01-01'

In [11]:
# Demonstration: strptime() needs the format as a second argument.
# The TypeError is caught and printed so the error is still shown but
# "Restart & Run All" does not stop at this cell.
try:
    dt.strptime(text)
except TypeError as err:
    print('TypeError:', err)

TypeError: strptime() takes exactly 2 arguments (1 given)

In [12]:
# strptime takes the string and the format it is written in, and returns a datetime.
dt.strptime(text, '%Y-%m-%d')

datetime.datetime(2021, 1, 1, 0, 0)

In [13]:
from dateutil.parser import parse  # works out the format by itself, so differently written dates all become datetime objects

print(parse('2018-09-01'))
print(parse('Jan. 21, 2015  11:00 P.M'))

2018-09-01 00:00:00
2015-01-21 23:00:00




In [14]:
# Select the first column by position; it holds the timestamps.
df.iloc[:,0]

0         2018-11-21 09:00:04.278571
1         2018-11-21 09:00:08.557143
2         2018-11-21 09:00:12.835714
3         2018-11-21 09:00:17.114286
4         2018-11-21 09:00:21.392857
                     ...            
865025    2018-12-28 15:35:56.900000
865026    2018-12-28 15:35:27.950000
865027    2018-12-28 15:35:57.900000
865028    2018-12-28 15:35:28.950000
865029    2018-12-28 15:35:58.900000
Name: Unnamed: 0, Length: 865030, dtype: object

In [15]:
# Inspect the Python type of the first timestamp. A single positional lookup
# (row 0, column 0) avoids chaining a positional column selection with a
# label-based `[0]`, which would depend on the index containing the label 0.
type(df.iloc[0, 0])

str

In [16]:
# Convert the whole column of timestamp strings at once; the result is a
# new Series (df itself is not modified by this call).
pd.to_datetime(df.iloc[:,0])

0        2018-11-21 09:00:04.278571
1        2018-11-21 09:00:08.557143
2        2018-11-21 09:00:12.835714
3        2018-11-21 09:00:17.114286
4        2018-11-21 09:00:21.392857
                    ...            
865025   2018-12-28 15:35:56.900000
865026   2018-12-28 15:35:27.950000
865027   2018-12-28 15:35:57.900000
865028   2018-12-28 15:35:28.950000
865029   2018-12-28 15:35:58.900000
Name: Unnamed: 0, Length: 865030, dtype: datetime64[ns]

## Exercise

- When analyzing real financial data, errors often come from how timestamps are stored and parsed.

In [1]:
import pandas as pd

# Re-load the file, this time using the first column as the index and asking
# pandas to parse that index into datetimes.
df = pd.read_csv(
    'data/test.csv',
    index_col=0,
    parse_dates=True,
)
df.head()

Unnamed: 0,price,volume,buy,sell
2018-11-21 09:00:04.278571,266.200012,1.0,0.0,1.0
2018-11-21 09:00:08.557143,266.200012,1.0,0.0,1.0
2018-11-21 09:00:12.835714,266.25,21.0,21.0,0.0
2018-11-21 09:00:17.114286,266.299988,1.0,1.0,0.0
2018-11-21 09:00:21.392857,266.350006,2.0,2.0,0.0


- The index must be a `DatetimeIndex` to avoid errors; time-based operations such as `resample` require one.

In [2]:
# Group the ticks into 5-minute bins and sum every column within each bin.
# '5min' is used because the 't' alias for minutes is deprecated in
# pandas >= 2.2; 'min' is accepted by older versions as well.
df.resample('5min').sum()

Unnamed: 0,price,volume,buy,sell
2018-11-21 09:00:00,628554.503082,13897.0,6837.0,7060.0
2018-11-21 09:05:00,501096.249268,10763.0,5210.0,5553.0
2018-11-21 09:10:00,272312.548187,6052.0,2727.0,3325.0
2018-11-21 09:15:00,253077.450592,6281.0,3356.0,2925.0
2018-11-21 09:20:00,143832.248688,3959.0,1868.0,2091.0
...,...,...,...,...
2018-12-28 15:15:00,101365.750122,2265.0,948.0,1317.0
2018-12-28 15:20:00,98997.949554,2916.0,1383.0,1533.0
2018-12-28 15:25:00,107369.349915,2815.0,662.0,2153.0
2018-12-28 15:30:00,180406.699310,5101.0,2098.0,3003.0


In [21]:
# Check the type of the first index label (the statement was duplicated; a
# single call is enough because only the last expression is displayed).
# NOTE(review): the recorded output `str` looks stale -- the frame was loaded
# with parse_dates=True and resample() already worked; re-run to confirm.
type(df.index[0])

str

In [5]:
# Make sure the index holds datetimes (a no-op if it already does).
df.index = pd.to_datetime(df.index)
# resample regroups the rows by time interval, which is very useful for
# time-series data; here the last tick of every 5-minute bin is kept.
# '5min' replaces the 't' alias, which is deprecated in pandas >= 2.2.
df.resample('5min').last()

Unnamed: 0,price,volume,buy,sell
2018-11-21 09:00:00,265.850006,2.0,0.0,2.0
2018-11-21 09:05:00,266.250000,2.0,0.0,2.0
2018-11-21 09:10:00,265.850006,35.0,0.0,35.0
2018-11-21 09:15:00,266.399994,5.0,5.0,0.0
2018-11-21 09:20:00,266.350006,1.0,1.0,0.0
...,...,...,...,...
2018-12-28 15:15:00,262.549988,17.0,0.0,17.0
2018-12-28 15:20:00,262.649994,2.0,2.0,0.0
2018-12-28 15:25:00,262.299988,1.0,1.0,0.0
2018-12-28 15:30:00,262.100006,5.0,0.0,5.0


In [6]:
# Many aggregations can follow resample: min, max, prod, ohlc, and so on.
# '5min' replaces the 't' alias, which is deprecated in pandas >= 2.2.
df.resample('5min').min()

Unnamed: 0,price,volume,buy,sell
2018-11-21 09:00:00,265.799988,1.0,0.0,0.0
2018-11-21 09:05:00,265.500000,1.0,0.0,0.0
2018-11-21 09:10:00,265.799988,1.0,0.0,0.0
2018-11-21 09:15:00,265.750000,1.0,0.0,0.0
2018-11-21 09:20:00,266.250000,1.0,0.0,0.0
...,...,...,...,...
2018-12-28 15:15:00,262.500000,1.0,0.0,0.0
2018-12-28 15:20:00,262.450012,1.0,0.0,0.0
2018-12-28 15:25:00,262.250000,1.0,0.0,0.0
2018-12-28 15:30:00,262.049988,1.0,0.0,0.0


In [7]:
# Open/high/low/close of every column within each 5-minute bin.
# '5min' replaces the 't' alias, which is deprecated in pandas >= 2.2.
df.resample('5min').ohlc()

Unnamed: 0_level_0,price,price,price,price,volume,volume,volume,volume,buy,buy,buy,buy,sell,sell,sell,sell
Unnamed: 0_level_1,open,high,low,close,open,high,low,close,open,high,low,close,open,high,low,close
2018-11-21 09:00:00,266.200012,266.649994,265.799988,265.850006,1.0,200.0,1.0,2.0,0.0,200.0,0.0,0.0,1.0,126.0,0.0,2.0
2018-11-21 09:05:00,265.950012,266.450012,265.500000,266.250000,1.0,158.0,1.0,2.0,0.0,119.0,0.0,0.0,1.0,158.0,0.0,2.0
2018-11-21 09:10:00,266.100006,266.450012,265.799988,265.850006,3.0,139.0,1.0,35.0,3.0,73.0,0.0,0.0,0.0,139.0,0.0,35.0
2018-11-21 09:15:00,266.149994,266.450012,265.750000,266.399994,6.0,119.0,1.0,5.0,0.0,111.0,0.0,5.0,6.0,119.0,0.0,0.0
2018-11-21 09:20:00,266.399994,266.450012,266.250000,266.350006,4.0,100.0,1.0,1.0,4.0,100.0,0.0,1.0,0.0,96.0,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2018-12-28 15:15:00,262.600006,262.649994,262.500000,262.549988,50.0,80.0,1.0,17.0,50.0,59.0,0.0,0.0,0.0,80.0,0.0,17.0
2018-12-28 15:20:00,262.600006,262.750000,262.450012,262.649994,4.0,140.0,1.0,2.0,0.0,140.0,0.0,2.0,4.0,100.0,0.0,0.0
2018-12-28 15:25:00,262.649994,262.700012,262.250000,262.299988,1.0,184.0,1.0,1.0,1.0,184.0,0.0,1.0,0.0,175.0,0.0,0.0
2018-12-28 15:30:00,262.350006,262.399994,262.049988,262.100006,91.0,144.0,1.0,5.0,91.0,110.0,0.0,0.0,0.0,144.0,0.0,5.0


In [24]:
# Keep only rows stamped strictly after 2018-12-01 00:00:00.
# pd.Timestamp is used so the cell relies only on pandas, which this section
# imports, rather than on the `dt` alias imported in an earlier section.
df.loc[df.index > pd.Timestamp('2018-12-01')]

Unnamed: 0,price,volume,buy,sell
2018-12-03 09:00:03.523529,275.600006,2.0,2.0,0.0
2018-12-03 09:00:07.047059,275.600006,1.0,1.0,0.0
2018-12-03 09:00:10.570588,275.600006,20.0,20.0,0.0
2018-12-03 09:00:14.094118,275.600006,6.0,6.0,0.0
2018-12-03 09:00:17.617647,275.600006,6.0,6.0,0.0
...,...,...,...,...
2018-12-28 15:35:56.900000,262.100006,10.0,0.0,10.0
2018-12-28 15:35:27.950000,262.100006,2.0,0.0,2.0
2018-12-28 15:35:57.900000,262.100006,5.0,0.0,5.0
2018-12-28 15:35:28.950000,262.100006,4.0,0.0,4.0
