# 时间序列

## 日期和时间数据类型及工具

In [1]:
from datetime import datetime
now = datetime.now()
now

datetime.datetime(2019, 7, 5, 8, 34, 21, 509790)

In [2]:
now.year, now.month, now.day

(2019, 7, 5)

In [3]:
# datetime以毫秒形式存储日期和时间
delta = datetime(2011, 1, 7) - datetime(2008, 6, 24, 8, 15)
delta

datetime.timedelta(926, 56700)

In [4]:
# datetime.timedelta(926, 56700)第一项是天数，第二项是不足一天的秒数
delta.days

926

In [5]:
from datetime import timedelta
start = datetime(2011, 1, 7)
# timedelta的单位是天
start + timedelta(12)

datetime.datetime(2011, 1, 19, 0, 0)

In [6]:
start - 2 * timedelta(12)

datetime.datetime(2010, 12, 14, 0, 0)

In [7]:
# 表10-1：datetime模块中的数据类型

### 字符串和datetime的相互转换

In [8]:
stamp = datetime(2011, 1, 3)
str(stamp)

'2011-01-03 00:00:00'

In [9]:
stamp.strftime('%Y-%m-%d')

'2011-01-03'

In [10]:
# 表10-2列出了全部的格式化编码
# datetime.strptime可以用这些格式化编码将字符串转换为日期
value = '2011-01-03'
datetime.strptime(value, '%Y-%m-%d')

datetime.datetime(2011, 1, 3, 0, 0)

In [12]:
datestrs = ['7/6/2011', '8/6/2011']
[datetime.strptime(x, '%m/%d/%Y') for x in datestrs]

[datetime.datetime(2011, 7, 6, 0, 0), datetime.datetime(2011, 8, 6, 0, 0)]

In [13]:
# 使用dateutil中的parser.parse方法解析常见日期格式
from dateutil.parser import parse
parse('2011-01-03')

datetime.datetime(2011, 1, 3, 0, 0)

In [14]:
parse('Jan 31, 1997 10:45 PM')

datetime.datetime(1997, 1, 31, 22, 45)

In [15]:
# 国际通用格式
# 传入dayfirst参数
parse('6/12/2011', dayfirst=True)

datetime.datetime(2011, 12, 6, 0, 0)

In [19]:
import pandas as pd
from pandas import Series, DataFrame
import numpy as np
pd.to_datetime(datestrs)

DatetimeIndex(['2011-07-06', '2011-08-06'], dtype='datetime64[ns]', freq=None)

In [18]:
idx = pd.to_datetime(datestrs + [None])
idx
# NaT是pandas中时间戳数据的NA值

DatetimeIndex(['2011-07-06', '2011-08-06', 'NaT'], dtype='datetime64[ns]', freq=None)

## 时间序列基础

In [20]:
from datetime import datetime
dates = [datetime(2011, 1, 2), datetime(2011, 1, 5), datetime(2011, 1, 7),
         datetime(2011, 1, 8), datetime(2011, 1, 10), datetime(2011, 1, 12)]
ts = Series(np.random.randn(6), index=dates)
ts

2011-01-02   -0.533765
2011-01-05    1.184326
2011-01-07    0.397664
2011-01-08   -1.343647
2011-01-10   -0.572669
2011-01-12   -1.471191
dtype: float64

In [21]:
type(ts)

pandas.core.series.Series

In [22]:
ts.index

DatetimeIndex(['2011-01-02', '2011-01-05', '2011-01-07', '2011-01-08',
               '2011-01-10', '2011-01-12'],
              dtype='datetime64[ns]', freq=None)

### 索引、选取、子集构造

In [24]:
# 对于较长的时间序列，只需传入“年”或年月就可以选取数据的切片
long_ts = Series(np.random.randn(1000), 
                 index=pd.date_range('1/1/2000', periods=1000))
long_ts

2000-01-01    0.509824
2000-01-02   -0.799639
2000-01-03    2.474567
2000-01-04   -1.502340
2000-01-05    0.056066
2000-01-06   -2.285385
2000-01-07   -1.004748
2000-01-08    1.571629
2000-01-09   -0.379101
2000-01-10   -0.514256
2000-01-11   -1.384684
2000-01-12   -0.059792
2000-01-13    0.182032
2000-01-14    0.170821
2000-01-15    0.746919
2000-01-16   -0.575850
2000-01-17    0.151939
2000-01-18    0.774145
2000-01-19    1.925844
2000-01-20   -0.303430
2000-01-21    0.443049
2000-01-22   -0.104173
2000-01-23   -0.455586
2000-01-24    1.416013
2000-01-25   -0.146295
2000-01-26   -0.749886
2000-01-27   -0.286144
2000-01-28   -0.194314
2000-01-29   -1.443533
2000-01-30    0.324189
                ...   
2002-08-28   -1.253780
2002-08-29    0.272573
2002-08-30    0.015996
2002-08-31   -0.000628
2002-09-01   -0.540295
2002-09-02   -0.762195
2002-09-03   -1.742780
2002-09-04   -0.481506
2002-09-05   -1.588878
2002-09-06    1.844198
2002-09-07   -0.295374
2002-09-08    1.012691
2002-09-09 

In [25]:
long_ts['2001-05']

2001-05-01    0.770601
2001-05-02   -0.995692
2001-05-03   -0.355303
2001-05-04   -0.518345
2001-05-05   -0.812146
2001-05-06   -0.313681
2001-05-07   -0.852360
2001-05-08    1.199617
2001-05-09    0.119462
2001-05-10   -1.520039
2001-05-11    0.112727
2001-05-12    1.039279
2001-05-13    1.072820
2001-05-14   -0.874968
2001-05-15    1.458567
2001-05-16    0.094654
2001-05-17    0.543652
2001-05-18    0.069243
2001-05-19   -1.425627
2001-05-20    0.169042
2001-05-21    0.155580
2001-05-22    1.499070
2001-05-23   -0.175373
2001-05-24   -0.824612
2001-05-25    2.469361
2001-05-26    1.306188
2001-05-27   -0.146518
2001-05-28    0.060123
2001-05-29   -1.040753
2001-05-30   -1.446815
2001-05-31   -2.035973
Freq: D, dtype: float64

In [26]:
# 通过日期进行切片的方式只对规则Series有效
long_ts[datetime(2001, 1, 7):]

2001-01-07    1.251929
2001-01-08    0.165487
2001-01-09    0.664212
2001-01-10   -1.054928
2001-01-11   -1.872465
2001-01-12    0.801979
2001-01-13    1.167239
2001-01-14   -1.116650
2001-01-15    1.947516
2001-01-16    0.560517
2001-01-17    0.151258
2001-01-18    1.928815
2001-01-19    0.679731
2001-01-20   -0.341354
2001-01-21    1.162335
2001-01-22    0.142067
2001-01-23   -1.271721
2001-01-24    1.008962
2001-01-25   -1.371334
2001-01-26   -1.529587
2001-01-27    1.561775
2001-01-28    1.207142
2001-01-29   -0.690115
2001-01-30    1.052904
2001-01-31   -0.927070
2001-02-01   -0.057532
2001-02-02   -1.999323
2001-02-03   -1.544660
2001-02-04    0.448385
2001-02-05    0.084785
                ...   
2002-08-28   -1.253780
2002-08-29    0.272573
2002-08-30    0.015996
2002-08-31   -0.000628
2002-09-01   -0.540295
2002-09-02   -0.762195
2002-09-03   -1.742780
2002-09-04   -0.481506
2002-09-05   -1.588878
2002-09-06    1.844198
2002-09-07   -0.295374
2002-09-08    1.012691
2002-09-09 

In [27]:
# 剪除2002-09-01后的记录
long_ts.truncate(after='9/1/2002')

2000-01-01    0.509824
2000-01-02   -0.799639
2000-01-03    2.474567
2000-01-04   -1.502340
2000-01-05    0.056066
2000-01-06   -2.285385
2000-01-07   -1.004748
2000-01-08    1.571629
2000-01-09   -0.379101
2000-01-10   -0.514256
2000-01-11   -1.384684
2000-01-12   -0.059792
2000-01-13    0.182032
2000-01-14    0.170821
2000-01-15    0.746919
2000-01-16   -0.575850
2000-01-17    0.151939
2000-01-18    0.774145
2000-01-19    1.925844
2000-01-20   -0.303430
2000-01-21    0.443049
2000-01-22   -0.104173
2000-01-23   -0.455586
2000-01-24    1.416013
2000-01-25   -0.146295
2000-01-26   -0.749886
2000-01-27   -0.286144
2000-01-28   -0.194314
2000-01-29   -1.443533
2000-01-30    0.324189
                ...   
2002-08-03   -1.372252
2002-08-04    0.059774
2002-08-05   -0.798326
2002-08-06   -0.622135
2002-08-07    0.169688
2002-08-08    1.111875
2002-08-09    0.738475
2002-08-10   -0.028846
2002-08-11   -0.303552
2002-08-12   -2.153486
2002-08-13    0.970732
2002-08-14    0.291492
2002-08-15 

In [28]:
# date_range的用法
help(pd.date_range)

Help on function date_range in module pandas.core.indexes.datetimes:

date_range(start=None, end=None, periods=None, freq=None, tz=None, normalize=False, name=None, closed=None, **kwargs)
    Return a fixed frequency DatetimeIndex.
    
    Parameters
    ----------
    start : str or datetime-like, optional
        Left bound for generating dates.
    end : str or datetime-like, optional
        Right bound for generating dates.
    periods : integer, optional
        Number of periods to generate.
    freq : str or DateOffset, default 'D' (calendar daily)
        Frequency strings can have multiples, e.g. '5H'. See
        :ref:`here <timeseries.offset_aliases>` for a list of
        frequency aliases.
    tz : str or tzinfo, optional
        Time zone name for returning localized DatetimeIndex, for example
        'Asia/Hong_Kong'. By default, the resulting DatetimeIndex is
        timezone-naive.
    normalize : bool, default False
        Normalize start/end dates to midnight befo

In [30]:
pd.date_range("11:00", "21:30", freq="30min")

DatetimeIndex(['2019-07-05 11:00:00', '2019-07-05 11:30:00',
               '2019-07-05 12:00:00', '2019-07-05 12:30:00',
               '2019-07-05 13:00:00', '2019-07-05 13:30:00',
               '2019-07-05 14:00:00', '2019-07-05 14:30:00',
               '2019-07-05 15:00:00', '2019-07-05 15:30:00',
               '2019-07-05 16:00:00', '2019-07-05 16:30:00',
               '2019-07-05 17:00:00', '2019-07-05 17:30:00',
               '2019-07-05 18:00:00', '2019-07-05 18:30:00',
               '2019-07-05 19:00:00', '2019-07-05 19:30:00',
               '2019-07-05 20:00:00', '2019-07-05 20:30:00',
               '2019-07-05 21:00:00', '2019-07-05 21:30:00'],
              dtype='datetime64[ns]', freq='30T')

In [31]:
pd.date_range("11:00", "21:30", freq="30min").time

array([datetime.time(11, 0), datetime.time(11, 30), datetime.time(12, 0),
       datetime.time(12, 30), datetime.time(13, 0), datetime.time(13, 30),
       datetime.time(14, 0), datetime.time(14, 30), datetime.time(15, 0),
       datetime.time(15, 30), datetime.time(16, 0), datetime.time(16, 30),
       datetime.time(17, 0), datetime.time(17, 30), datetime.time(18, 0),
       datetime.time(18, 30), datetime.time(19, 0), datetime.time(19, 30),
       datetime.time(20, 0), datetime.time(20, 30), datetime.time(21, 0),
       datetime.time(21, 30)], dtype=object)

### 带有重复索引的时间序列

In [32]:
dates =pd.DatetimeIndex(['1/1/2000', '1/2/2000', '1/2/2000', '1/2/2000', '1/3/2000'])
dup_ts = Series(np.arange(5), index=dates)
dup_ts

2000-01-01    0
2000-01-02    1
2000-01-02    2
2000-01-02    3
2000-01-03    4
dtype: int32

In [34]:
dup_ts.index.is_unique

False

In [36]:
grouped = dup_ts.groupby(level=0)
grouped.mean()

2000-01-01    0
2000-01-02    2
2000-01-03    4
dtype: int32

In [37]:
grouped.count()

2000-01-01    1
2000-01-02    3
2000-01-03    1
dtype: int64

## 日期的范围、频率以及移动

In [38]:
ts

2011-01-02   -0.533765
2011-01-05    1.184326
2011-01-07    0.397664
2011-01-08   -1.343647
2011-01-10   -0.572669
2011-01-12   -1.471191
dtype: float64

In [39]:
# 频率的转换（或重采样）是一个比较大的主题
ts.resample('D')

DatetimeIndexResampler [freq=<Day>, axis=0, closed=left, label=left, convention=start, base=0]

### 生成日期范围

In [40]:
index = pd.date_range('4/1/2012', '6/1/2012')
index

DatetimeIndex(['2012-04-01', '2012-04-02', '2012-04-03', '2012-04-04',
               '2012-04-05', '2012-04-06', '2012-04-07', '2012-04-08',
               '2012-04-09', '2012-04-10', '2012-04-11', '2012-04-12',
               '2012-04-13', '2012-04-14', '2012-04-15', '2012-04-16',
               '2012-04-17', '2012-04-18', '2012-04-19', '2012-04-20',
               '2012-04-21', '2012-04-22', '2012-04-23', '2012-04-24',
               '2012-04-25', '2012-04-26', '2012-04-27', '2012-04-28',
               '2012-04-29', '2012-04-30', '2012-05-01', '2012-05-02',
               '2012-05-03', '2012-05-04', '2012-05-05', '2012-05-06',
               '2012-05-07', '2012-05-08', '2012-05-09', '2012-05-10',
               '2012-05-11', '2012-05-12', '2012-05-13', '2012-05-14',
               '2012-05-15', '2012-05-16', '2012-05-17', '2012-05-18',
               '2012-05-19', '2012-05-20', '2012-05-21', '2012-05-22',
               '2012-05-23', '2012-05-24', '2012-05-25', '2012-05-26',
      

In [41]:
# date_range会产生按天计算的时间点
pd.date_range(start='4/1/2012', periods=20)

DatetimeIndex(['2012-04-01', '2012-04-02', '2012-04-03', '2012-04-04',
               '2012-04-05', '2012-04-06', '2012-04-07', '2012-04-08',
               '2012-04-09', '2012-04-10', '2012-04-11', '2012-04-12',
               '2012-04-13', '2012-04-14', '2012-04-15', '2012-04-16',
               '2012-04-17', '2012-04-18', '2012-04-19', '2012-04-20'],
              dtype='datetime64[ns]', freq='D')

In [42]:
pd.date_range(end='6/1/2012', periods=20)

DatetimeIndex(['2012-05-13', '2012-05-14', '2012-05-15', '2012-05-16',
               '2012-05-17', '2012-05-18', '2012-05-19', '2012-05-20',
               '2012-05-21', '2012-05-22', '2012-05-23', '2012-05-24',
               '2012-05-25', '2012-05-26', '2012-05-27', '2012-05-28',
               '2012-05-29', '2012-05-30', '2012-05-31', '2012-06-01'],
              dtype='datetime64[ns]', freq='D')

### 频率和日期偏移量

In [43]:
from pandas.tseries.offsets import Hour, Minute
hour = Hour()
hour

<Hour>

In [44]:
four_hour = Hour(4)
four_hour

<4 * Hours>

In [45]:
# 大部分偏移对象都可以通过加法进行连接
Hour(2) + Minute(30)

<150 * Minutes>

In [46]:
# 表10-4：时间序列的基础频率
# '2h30min'
pd.date_range('1/1/2000', periods=10, freq='2h30min')

DatetimeIndex(['2000-01-01 00:00:00', '2000-01-01 02:30:00',
               '2000-01-01 05:00:00', '2000-01-01 07:30:00',
               '2000-01-01 10:00:00', '2000-01-01 12:30:00',
               '2000-01-01 15:00:00', '2000-01-01 17:30:00',
               '2000-01-01 20:00:00', '2000-01-01 22:30:00'],
              dtype='datetime64[ns]', freq='150T')

### WOM日期

In [49]:
# WOM（Week of Month）是一种非常实用的频率**类**
rng = pd.date_range('1/2/2012', '9/2/2012', freq='WOM-3FRI')
rng

DatetimeIndex(['2012-01-20', '2012-02-17', '2012-03-16', '2012-04-20',
               '2012-05-18', '2012-06-15', '2012-07-20', '2012-08-17'],
              dtype='datetime64[ns]', freq='WOM-3FRI')

## 移动（超前和滞后）数据

In [50]:
ts = Series(np.random.randn(4),
            index=pd.date_range('1/1/2000', periods=4, freq='M'))  # 每月最后一个日历日
ts.shift(2)

2000-01-31         NaN
2000-02-29         NaN
2000-03-31   -0.135126
2000-04-30   -0.641378
Freq: M, dtype: float64

In [51]:
ts/ts.shift(1) - 1

2000-01-31         NaN
2000-02-29    3.746503
2000-03-31   -0.292053
2000-04-30   -0.367183
Freq: M, dtype: float64

In [52]:
ts.shift(2, freq='M')

2000-03-31   -0.135126
2000-04-30   -0.641378
2000-05-31   -0.454062
2000-06-30   -0.287338
Freq: M, dtype: float64

In [53]:
ts

2000-01-31   -0.135126
2000-02-29   -0.641378
2000-03-31   -0.454062
2000-04-30   -0.287338
Freq: M, dtype: float64

### 通过偏移量对日期进行位移

In [54]:
from pandas.tseries.offsets import Day, MonthEnd
now = datetime(2011, 11, 17)
now + 3 * Day()

Timestamp('2011-11-20 00:00:00')

In [55]:
# 第一次增量
now + MonthEnd()

Timestamp('2011-11-30 00:00:00')

In [56]:
# 第二次增量
now + MonthEnd(2)

Timestamp('2011-12-31 00:00:00')

In [57]:
# rollback和rollforward方法
# 显式的将日期向前或向后“滚动”
offset = MonthEnd()
offset.rollforward(now)

Timestamp('2011-11-30 00:00:00')

In [58]:
# 结合groupby实用这两个“滚动”方法
ts = Series(np.random.randn(20), 
            index=pd.date_range('1/15/2000', periods=20, freq='4d'))
ts.groupby(offset.rollforward).mean()

2000-01-31   -0.224748
2000-02-29    0.439092
2000-03-31    0.177991
dtype: float64

In [59]:
ts

2000-01-15   -1.437266
2000-01-19   -1.183783
2000-01-23   -1.516815
2000-01-27    1.849083
2000-01-31    1.165038
2000-02-04   -0.395847
2000-02-08    1.100797
2000-02-12    0.086345
2000-02-16    0.140107
2000-02-20    0.237469
2000-02-24    1.599722
2000-02-28    0.305051
2000-03-03    0.740336
2000-03-07    0.633311
2000-03-11   -0.391379
2000-03-15    0.261968
2000-03-19   -0.200353
2000-03-23    1.528305
2000-03-27   -1.341824
2000-03-31    0.193566
Freq: 4D, dtype: float64

## 时区处理

In [60]:
import pytz

In [61]:
pytz.common_timezones[-5:]

['US/Eastern', 'US/Hawaii', 'US/Mountain', 'US/Pacific', 'UTC']

In [62]:
# 从pytz获取时区对象
tz = pytz.timezone('US/Eastern')
tz

<DstTzInfo 'US/Eastern' LMT-1 day, 19:04:00 STD>

### 本地化和转换

In [63]:
# pandas中的时间序列是单纯的（naive）时区
rng = pd.date_range('3/9/2012 9:30', periods=6, freq='D')
ts = Series(np.random.randn(len(rng)), index=rng)

In [64]:
print((ts.index.tz))

None


In [65]:
# 在生成日期范围时还可以加上一个时区集
pd.date_range('3/9/2012 9:30', periods=10, freq='D', tz='UTC')

DatetimeIndex(['2012-03-09 09:30:00+00:00', '2012-03-10 09:30:00+00:00',
               '2012-03-11 09:30:00+00:00', '2012-03-12 09:30:00+00:00',
               '2012-03-13 09:30:00+00:00', '2012-03-14 09:30:00+00:00',
               '2012-03-15 09:30:00+00:00', '2012-03-16 09:30:00+00:00',
               '2012-03-17 09:30:00+00:00', '2012-03-18 09:30:00+00:00'],
              dtype='datetime64[ns, UTC]', freq='D')

In [66]:
ts_utc = ts.tz_localize('UTC')

In [67]:
ts_utc

2012-03-09 09:30:00+00:00   -0.324207
2012-03-10 09:30:00+00:00    0.589462
2012-03-11 09:30:00+00:00    0.548389
2012-03-12 09:30:00+00:00   -0.912224
2012-03-13 09:30:00+00:00    0.584207
2012-03-14 09:30:00+00:00    0.435593
Freq: D, dtype: float64

In [68]:
ts_utc.index

DatetimeIndex(['2012-03-09 09:30:00+00:00', '2012-03-10 09:30:00+00:00',
               '2012-03-11 09:30:00+00:00', '2012-03-12 09:30:00+00:00',
               '2012-03-13 09:30:00+00:00', '2012-03-14 09:30:00+00:00'],
              dtype='datetime64[ns, UTC]', freq='D')

In [69]:
ts_utc.tz_convert('US/Eastern')

2012-03-09 04:30:00-05:00   -0.324207
2012-03-10 04:30:00-05:00    0.589462
2012-03-11 05:30:00-04:00    0.548389
2012-03-12 05:30:00-04:00   -0.912224
2012-03-13 05:30:00-04:00    0.584207
2012-03-14 05:30:00-04:00    0.435593
Freq: D, dtype: float64

In [70]:
ts_eastern = ts.tz_localize('US/Eastern')
ts_eastern.tz_convert('UTC')

2012-03-09 14:30:00+00:00   -0.324207
2012-03-10 14:30:00+00:00    0.589462
2012-03-11 13:30:00+00:00    0.548389
2012-03-12 13:30:00+00:00   -0.912224
2012-03-13 13:30:00+00:00    0.584207
2012-03-14 13:30:00+00:00    0.435593
Freq: D, dtype: float64

In [71]:
ts_eastern.tz_convert('Europe/Berlin')

2012-03-09 15:30:00+01:00   -0.324207
2012-03-10 15:30:00+01:00    0.589462
2012-03-11 14:30:00+01:00    0.548389
2012-03-12 14:30:00+01:00   -0.912224
2012-03-13 14:30:00+01:00    0.584207
2012-03-14 14:30:00+01:00    0.435593
Freq: D, dtype: float64

In [72]:
# tz_localize和tz_convert也是DatetimeIndex的实例方法
ts.index.tz_localize('Asia/Shanghai')

DatetimeIndex(['2012-03-09 09:30:00+08:00', '2012-03-10 09:30:00+08:00',
               '2012-03-11 09:30:00+08:00', '2012-03-12 09:30:00+08:00',
               '2012-03-13 09:30:00+08:00', '2012-03-14 09:30:00+08:00'],
              dtype='datetime64[ns, Asia/Shanghai]', freq='D')

### 操作时区意识型Timestamp对象
**时区意识型**：即这种obj能够自动关注是否存在夏令时

In [74]:
stamp = pd.Timestamp('2011-03-12 04:00')
stamp_utc = stamp.tz_localize('utc')
stamp_utc.tz_convert('US/Eastern')

Timestamp('2011-03-11 23:00:00-0500', tz='US/Eastern')

In [75]:
stamp = pd.Timestamp('2011-03-12 04:00', tz='Asia/Shanghai')
# stamp_utc = stamp.tz_localize('utc')
stamp_utc.tz_convert('US/Eastern')

Timestamp('2011-03-11 23:00:00-0500', tz='US/Eastern')

In [76]:
# UNIX时间纪元（1970年1月1日起）
print(stamp_utc.value)
print(stamp_utc.tz_convert('US/Eastern').value)

1299902400000000000
1299902400000000000


In [77]:
# pandas的DateOffset对象执行时间算术运算时，会自动关注是否存在夏令时转变期
# 夏令时转变前30分钟，例如2012年3月12日01:30 AM
# 夏令时转变前90分钟，例如2012年11月4日00:30 AM
stamp = pd.Timestamp('2012-03-12 01:30', tz='US/Eastern')
stamp

Timestamp('2012-03-12 01:30:00-0400', tz='US/Eastern')

In [78]:
stamp + Hour()

Timestamp('2012-03-12 02:30:00-0400', tz='US/Eastern')

In [79]:
stamp = pd.Timestamp('2012-11-04 00:30', tz='US/Eastern')
stamp

Timestamp('2012-11-04 00:30:00-0400', tz='US/Eastern')

In [80]:
stamp + 2 * Hour()

Timestamp('2012-11-04 01:30:00-0500', tz='US/Eastern')

### 不同时区之间的运算

In [81]:
rng = pd.date_range('3/7/2012 09:30', periods=10, freq='B')  # Business：每工作日
ts = Series(np.random.randn(len(rng)), index=rng)
ts

2012-03-07 09:30:00    0.834917
2012-03-08 09:30:00   -1.019013
2012-03-09 09:30:00   -0.624170
2012-03-12 09:30:00   -1.519392
2012-03-13 09:30:00   -0.671608
2012-03-14 09:30:00    0.207043
2012-03-15 09:30:00    0.493061
2012-03-16 09:30:00    0.600239
2012-03-19 09:30:00   -0.866666
2012-03-20 09:30:00    1.188318
Freq: B, dtype: float64

In [82]:
ts1 = ts[7:].tz_localize('Europe/London')
ts2 = ts[2:].tz_localize('Europe/Moscow')
result = ts1 + ts2

In [83]:
# 两个时间序列的时区不同，最终合并结果就会是UTC
result.index

DatetimeIndex(['2012-03-09 05:30:00+00:00', '2012-03-12 05:30:00+00:00',
               '2012-03-13 05:30:00+00:00', '2012-03-14 05:30:00+00:00',
               '2012-03-15 05:30:00+00:00', '2012-03-16 05:30:00+00:00',
               '2012-03-16 09:30:00+00:00', '2012-03-19 05:30:00+00:00',
               '2012-03-19 09:30:00+00:00', '2012-03-20 05:30:00+00:00',
               '2012-03-20 09:30:00+00:00'],
              dtype='datetime64[ns, UTC]', freq=None)

## 时期及其算术运算

In [84]:
p = pd.Period(2007, freq='A-DEC')
p

Period('2007', 'A-DEC')

In [85]:
p + 5

Period('2012', 'A-DEC')

In [86]:
pd.Period('2014', freq='A-DEC') - p

7

In [88]:
# PeriodIndex类的构造对象还允许直接使用一组字符串
values = ['2001Q3', '2002Q2', '2003Q1']
index = pd.PeriodIndex(values, freq='Q-DEC')
index

PeriodIndex(['2001Q3', '2002Q2', '2003Q1'], dtype='period[Q-DEC]', freq='Q-DEC')

### 时间的频率转换

In [89]:
# 将一个年度时期转化为当年的一个月度时期
p = pd.Period('2007', freq='A-DEC')
p.asfreq('M', how='start')

Period('2007-01', 'M')

In [90]:
# 在A-JUN频率中，月份“2007年8月”实际上是属于周期“2008年”的
p = pd.Period('2007-08', 'M')
p.asfreq('A-JUN')

Period('2008', 'A-JUN')

### 按季度计算的时期频率

In [91]:
p = pd.Period('2012Q4', freq='Q-JAN')
p

Period('2012Q4', 'Q-JAN')

In [92]:
# 在以1月结束的财年中，2012Q4是从11月到1月
p.asfreq('D', 'start')

Period('2011-11-01', 'D')

In [93]:
p4pm = (p.asfreq('B', 'e') - 1).asfreq('T', 's') + 16 * 60
p4pm

Period('2012-01-30 16:00', 'T')

In [94]:
p4pm.to_timestamp()

Timestamp('2012-01-30 16:00:00')

### 将Timestamp转换为Period（及其反向过程）

In [96]:
rng = pd.date_range('1/1/2000', periods=3, freq='M')
ts = Series(np.random.randn(3), index=rng)
pts = ts.to_period()  # 要转化为时间戳，用to_timestamp即可
ts

2000-01-31   -0.723357
2000-02-29   -0.130231
2000-03-31    0.654935
Freq: M, dtype: float64

In [97]:
pts

2000-01   -0.723357
2000-02   -0.130231
2000-03    0.654935
Freq: M, dtype: float64

### 通过数组创建PeriodIndex

In [98]:
data = pd.read_csv('macrodata.csv')
data

Unnamed: 0,year,quarter,realgdp,realcons,realinv,realgovt,realdpi,cpi,m1,tbilrate,unemp,pop,infl,realint
0,1959.0,1.0,2710.349,1707.4,286.898,470.045,1886.9,28.980,139.7,2.82,5.8,177.146,0.00,0.00
1,1959.0,2.0,2778.801,1733.7,310.859,481.301,1919.7,29.150,141.7,3.08,5.1,177.830,2.34,0.74
2,1959.0,3.0,2775.488,1751.8,289.226,491.260,1916.4,29.350,140.5,3.82,5.3,178.657,2.74,1.09
3,1959.0,4.0,2785.204,1753.7,299.356,484.052,1931.3,29.370,140.0,4.33,5.6,179.386,0.27,4.06
4,1960.0,1.0,2847.699,1770.5,331.722,462.199,1955.5,29.540,139.6,3.50,5.2,180.007,2.31,1.19
5,1960.0,2.0,2834.390,1792.9,298.152,460.400,1966.1,29.550,140.2,2.68,5.2,180.671,0.14,2.55
6,1960.0,3.0,2839.022,1785.8,296.375,474.676,1967.8,29.750,140.9,2.36,5.6,181.528,2.70,-0.34
7,1960.0,4.0,2802.616,1788.2,259.764,476.434,1966.6,29.840,141.1,2.29,6.3,182.287,1.21,1.08
8,1961.0,1.0,2819.264,1787.7,266.405,475.854,1984.5,29.810,142.1,2.37,6.8,182.992,-0.40,2.77
9,1961.0,2.0,2872.005,1814.3,286.246,480.328,2014.4,29.920,142.9,2.29,7.0,183.691,1.47,0.81


In [99]:
index = pd.PeriodIndex(year=data.year, quarter=data.quarter, freq='Q-DEC')
index

PeriodIndex(['1959Q1', '1959Q2', '1959Q3', '1959Q4', '1960Q1', '1960Q2',
             '1960Q3', '1960Q4', '1961Q1', '1961Q2',
             ...
             '2007Q2', '2007Q3', '2007Q4', '2008Q1', '2008Q2', '2008Q3',
             '2008Q4', '2009Q1', '2009Q2', '2009Q3'],
            dtype='period[Q-DEC]', length=203, freq='Q-DEC')

In [100]:
data.index = index
data.infl

1959Q1    0.00
1959Q2    2.34
1959Q3    2.74
1959Q4    0.27
1960Q1    2.31
1960Q2    0.14
1960Q3    2.70
1960Q4    1.21
1961Q1   -0.40
1961Q2    1.47
1961Q3    0.80
1961Q4    0.80
1962Q1    2.26
1962Q2    0.13
1962Q3    2.11
1962Q4    0.79
1963Q1    0.53
1963Q2    2.75
1963Q3    0.78
1963Q4    2.46
1964Q1    0.13
1964Q2    0.90
1964Q3    1.29
1964Q4    2.05
1965Q1    1.28
1965Q2    2.54
1965Q3    0.89
1965Q4    2.90
1966Q1    4.99
1966Q2    2.10
          ... 
2002Q2    1.56
2002Q3    2.66
2002Q4    3.08
2003Q1    1.31
2003Q2    1.09
2003Q3    2.60
2003Q4    3.02
2004Q1    2.35
2004Q2    3.61
2004Q3    3.58
2004Q4    2.09
2005Q1    4.15
2005Q2    1.85
2005Q3    9.14
2005Q4    0.40
2006Q1    2.60
2006Q2    3.97
2006Q3   -1.58
2006Q4    3.30
2007Q1    4.58
2007Q2    2.75
2007Q3    3.45
2007Q4    6.38
2008Q1    2.82
2008Q2    8.53
2008Q3   -3.16
2008Q4   -8.79
2009Q1    0.94
2009Q2    3.37
2009Q3    3.56
Freq: Q-DEC, Name: infl, Length: 203, dtype: float64

## 重采样及频率转换

In [101]:
# pandas对象都有一个resample方法
# 它是各种频率转换工作的主力函数
rng = pd.date_range('1/1/2000', periods=100, freq='D')
ts = Series(np.random.randn(len(rng)), index=rng)
ts.resample('M', how='mean')

the new syntax is .resample(...).mean()
  """


2000-01-31    0.272642
2000-02-29   -0.022578
2000-03-31    0.160658
2000-04-30    0.137490
Freq: M, dtype: float64

In [102]:
ts.resample('M', kind='period').mean()

2000-01    0.272642
2000-02   -0.022578
2000-03    0.160658
2000-04    0.137490
Freq: M, dtype: float64

### 降采样

In [103]:
rng = pd.date_range('1/1/2000', periods=12, freq='T')
ts = Series(np.arange(12), index=rng)
ts

2000-01-01 00:00:00     0
2000-01-01 00:01:00     1
2000-01-01 00:02:00     2
2000-01-01 00:03:00     3
2000-01-01 00:04:00     4
2000-01-01 00:05:00     5
2000-01-01 00:06:00     6
2000-01-01 00:07:00     7
2000-01-01 00:08:00     8
2000-01-01 00:09:00     9
2000-01-01 00:10:00    10
2000-01-01 00:11:00    11
Freq: T, dtype: int32

In [105]:
# 通过求和的方式将这些数据聚合到“5分钟”块中
# 实际工作中常用到closed和label参数
# 但真正应该关注的是要如何对数据分段
ts.resample('5min', how='sum')

the new syntax is .resample(...).sum()
  


2000-01-01 00:00:00    10
2000-01-01 00:05:00    35
2000-01-01 00:10:00    21
Freq: 5T, dtype: int32

#### OHLC重采样

In [106]:
ts.resample('5min', how='ohlc')

the new syntax is .resample(...).ohlc()
  """Entry point for launching an IPython kernel.


Unnamed: 0,open,high,low,close
2000-01-01 00:00:00,0,4,0,4
2000-01-01 00:05:00,5,9,5,9
2000-01-01 00:10:00,10,11,10,11


#### 通过groupby进行重采样

In [107]:
rng = pd.date_range('1/1/2000', periods=100, freq='D')
ts = Series(np.arange(100), index=rng)
ts.groupby(lambda x: x.month).mean()

1    15
2    45
3    75
4    95
dtype: int32

In [108]:
ts.groupby(lambda x: x.weekday).mean()

0    47.5
1    48.5
2    49.5
3    50.5
4    51.5
5    49.0
6    50.0
dtype: float64

### 升采样和插值

In [109]:
frame = DataFrame(np.random.randn(2, 4),
                  index=pd.date_range('1/1/2000', periods=2, freq='W-WED'),
                  columns=['Colorado', 'Texas', 'New York', 'Ohio'])
frame[:5]

Unnamed: 0,Colorado,Texas,New York,Ohio
2000-01-05,-0.383729,1.768354,1.809222,0.804973
2000-01-12,1.989752,0.568102,1.473225,-1.152727


In [112]:
# 将其重采样到日频率，默认会引入缺失值
df_daily = frame.resample('D')
df_daily

DatetimeIndexResampler [freq=<Day>, axis=0, closed=left, label=left, convention=start, base=0]

In [113]:
frame.resample('D', fill_method='ffill')

the new syntax is .resample(...).ffill()
  """Entry point for launching an IPython kernel.


Unnamed: 0,Colorado,Texas,New York,Ohio
2000-01-05,-0.383729,1.768354,1.809222,0.804973
2000-01-06,-0.383729,1.768354,1.809222,0.804973
2000-01-07,-0.383729,1.768354,1.809222,0.804973
2000-01-08,-0.383729,1.768354,1.809222,0.804973
2000-01-09,-0.383729,1.768354,1.809222,0.804973
2000-01-10,-0.383729,1.768354,1.809222,0.804973
2000-01-11,-0.383729,1.768354,1.809222,0.804973
2000-01-12,1.989752,0.568102,1.473225,-1.152727


### 通过时期进行重采样

在*降采样*中，目标频率必须是源频率的**子时期（subperiod）**  
在*升采样*中，目标频率必须是源频率的**超时期（superperiod）**

## 时间序列绘图