In [1]:
from datetime import datetime
import pandas as pd
import numpy as np

In [2]:
# Six irregularly spaced observation dates in January 2011.
dates = [datetime(2011, 1, day) for day in (2, 5, 7, 8, 10, 12)]

# Series of six random draws, labelled by the dates above.
ts = pd.Series(np.random.randn(6), index=dates)
ts

2011-01-02   -1.152247
2011-01-05   -0.401223
2011-01-07    0.468132
2011-01-08   -1.086417
2011-01-10   -0.553148
2011-01-12    0.459614
dtype: float64

In [3]:
# Inspect the index: the Series built from datetime objects carries a DatetimeIndex.
ts.index

DatetimeIndex(['2011-01-02', '2011-01-05', '2011-01-07', '2011-01-08',
               '2011-01-10', '2011-01-12'],
              dtype='datetime64[ns]', freq=None)

In [4]:
# Arithmetic between time series objects
# is aligned on the dates before computing;
# dates that are missing from one operand come out as NaN.
ts+ts[::2] # ts[::2] selects every second element of ts

2011-01-02   -2.304495
2011-01-05         NaN
2011-01-07    0.936263
2011-01-08         NaN
2011-01-10   -1.106297
2011-01-12         NaN
dtype: float64

In [5]:
# Scalar values taken from a DatetimeIndex are pandas Timestamp objects
stamp=ts.index[0]
stamp

Timestamp('2011-01-02 00:00:00')

## 1. 색인, 선택, 부분 선택

In [6]:
# Indexing behaves the same as for any other pandas.Series:
# take the third index label and use it as the lookup key.
stamp=ts.index[2]
ts[stamp]

0.468131534424338

In [7]:
# For convenience, a string that can be interpreted as a date may be passed as the key
ts['1/10/2011']

-0.5531484146816324

In [8]:
# Same lookup as above, with the date written as a compact YYYYMMDD string
ts['20110110']

-0.5531484146816324

In [9]:
# Same lookup as above, with the date written as year/month/day
ts['2011/1/10']

-0.5531484146816324

* 긴 시계열에서 연도만 넘기거나 연, 월만 넘겨서 데이터의 일부 구간만 선택하기

In [10]:
# A longer series: 1000 random values on consecutive daily timestamps from 2000-01-01.
longer_ts = pd.Series(
    data=np.random.randn(1000),
    index=pd.date_range(start='1/1/2000', periods=1000),
)
longer_ts

2000-01-01    0.721842
2000-01-02   -0.753256
2000-01-03   -1.349576
2000-01-04    0.188647
2000-01-05    0.442854
                ...   
2002-09-22    1.650765
2002-09-23    1.361112
2002-09-24   -0.404672
2002-09-25    2.503329
2002-09-26   -0.215319
Freq: D, Length: 1000, dtype: float64

In [11]:
# Pass only a year to select every row that falls in that year
longer_ts['2001']

2001-01-01   -0.073773
2001-01-02    0.241955
2001-01-03    0.916662
2001-01-04    0.625445
2001-01-05   -1.473348
                ...   
2001-12-27    0.475122
2001-12-28   -2.093555
2001-12-29    0.766515
2001-12-30   -1.305065
2001-12-31   -1.856759
Freq: D, Length: 365, dtype: float64

In [12]:
# Pass only a year and month to select every row in that month
longer_ts['2001-05']

2001-05-01   -0.304244
2001-05-02   -0.453332
2001-05-03   -0.033898
2001-05-04    0.187220
2001-05-05    0.610196
2001-05-06   -1.957052
2001-05-07    0.229351
2001-05-08    0.126025
2001-05-09    0.375029
2001-05-10    0.513293
2001-05-11    1.573093
2001-05-12   -0.218375
2001-05-13    1.098726
2001-05-14   -0.576037
2001-05-15   -0.245277
2001-05-16   -0.957771
2001-05-17   -2.301972
2001-05-18   -0.480609
2001-05-19   -0.192436
2001-05-20   -1.005291
2001-05-21    0.891503
2001-05-22    0.248560
2001-05-23    0.371685
2001-05-24   -0.322627
2001-05-25   -1.103487
2001-05-26   -0.668375
2001-05-27    0.918294
2001-05-28    0.204888
2001-05-29    0.387182
2001-05-30   -0.821803
2001-05-31    1.489970
Freq: D, dtype: float64

In [13]:
# Slice the data using a datetime object as the start of the range
ts[datetime(2011,1,7):]

2011-01-07    0.468132
2011-01-08   -1.086417
2011-01-10   -0.553148
2011-01-12    0.459614
dtype: float64

In [14]:
# Slicing a Series by timestamps: display the full series first for reference
ts

2011-01-02   -1.152247
2011-01-05   -0.401223
2011-01-07    0.468132
2011-01-08   -1.086417
2011-01-10   -0.553148
2011-01-12    0.459614
dtype: float64

In [15]:
# Range slice with date strings; the endpoints need not be labels present in the index
ts['1/6/2011':'1/11/2011']

2011-01-07    0.468132
2011-01-08   -1.086417
2011-01-10   -0.553148
dtype: float64

In [16]:
# truncate()
# Cuts a time series between two dates; only `after` is given here,
# so rows later than 2011-01-09 are dropped.
ts.truncate(after='1/9/2011')

2011-01-02   -1.152247
2011-01-05   -0.401223
2011-01-07    0.468132
2011-01-08   -1.086417
dtype: float64

In [17]:
# Same truncation, with the cutoff date written as year/month/day
ts.truncate(after='2011/1/9')

2011-01-02   -1.152247
2011-01-05   -0.401223
2011-01-07    0.468132
2011-01-08   -1.086417
dtype: float64

* DataFrame에 적용하기

In [18]:
# 100 weekly timestamps anchored on Wednesdays, counted from 2000-01-01.
dates = pd.date_range(start='1/1/2000', periods=100, freq='W-WED')

# 100x4 frame of random values: one row per weekly timestamp, one column per state.
long_df = pd.DataFrame(
    data=np.random.randn(100, 4),
    index=dates,
    columns=['Colorado', 'Texas', 'New York', 'Ohio'],
)

# A year/month string passed to .loc picks out the rows dated May 2001.
long_df.loc['2001/5']

Unnamed: 0,Colorado,Texas,New York,Ohio
2001-05-02,0.496015,0.573603,1.370304,0.106483
2001-05-09,-0.788377,-0.694467,-0.120536,-0.358053
2001-05-16,-1.790909,-0.609281,0.213161,-0.687685
2001-05-23,1.164675,-0.141247,0.26399,0.32654
2001-05-30,-0.374922,-0.584842,0.167927,-1.807363


## 2. 중복된 색인을 갖는 시계열

In [19]:
# Index in which the label 2000-01-02 occurs three times.
dates = pd.DatetimeIndex(
    data=['1/1/2000', '1/2/2000', '1/2/2000', '1/2/2000', '1/3/2000']
)

# Integers 0 through 4 attached to the non-unique index.
dup_ts = pd.Series(data=np.arange(5), index=dates)
dup_ts

2000-01-01    0
2000-01-02    1
2000-01-02    2
2000-01-02    3
2000-01-03    4
dtype: int32

In [20]:
# Check whether every label in the index is unique
dup_ts.index.is_unique

False

In [21]:
dup_ts['1/2/2000'] # this label is duplicated, so several rows come back

2000-01-02    1
2000-01-02    2
2000-01-02    3
dtype: int32

In [22]:
# Aggregate data whose timestamps are not unique:
# pass level=0 (the single index level) to groupby

grouped=dup_ts.groupby(level=0)
grouped.mean()

2000-01-01    0
2000-01-02    2
2000-01-03    4
dtype: int32

In [23]:
# Number of observations recorded under each timestamp
grouped.count()

2000-01-01    1
2000-01-02    3
2000-01-03    1
dtype: int64