# Time Series / Date functionality

1. `Timestamp` 与 `Period`
2. `Timestamp` 转换
3. `date_range`
4. `Timestamp` 限制
5. 索引
6. 时间和日期的组成部分
7. `DateOffset` 对象
8. 时间序列相关方法
9. `resample`
10. `period`
11. 各类时间表示之间的转换
12. 代表越界范围
13. 时区处理

In [1]:
import random
import time

from datetime import datetime, timedelta
import pandas as pd
import numpy as np

In [2]:
# 生成时间序列
rng = pd.date_range('1/1/2011', periods=72, freq='H')
rng[:5]

DatetimeIndex(['2011-01-01 00:00:00', '2011-01-01 01:00:00',
               '2011-01-01 02:00:00', '2011-01-01 03:00:00',
               '2011-01-01 04:00:00'],
              dtype='datetime64[ns]', freq='H')

In [3]:
ts = pd.Series(np.random.randn(len(rng)), index=rng)
ts.head()

2011-01-01 00:00:00    0.596171
2011-01-01 01:00:00   -1.085139
2011-01-01 02:00:00    0.966557
2011-01-01 03:00:00    1.252029
2011-01-01 04:00:00    1.896551
Freq: H, dtype: float64

In [4]:
# 转换为指定时间间隔的频率值
converted = ts.asfreq('45Min', method='pad')
converted.head()

2011-01-01 00:00:00    0.596171
2011-01-01 00:45:00    0.596171
2011-01-01 01:30:00   -1.085139
2011-01-01 02:15:00    0.966557
2011-01-01 03:00:00    1.252029
Freq: 45T, dtype: float64

In [5]:
# 按指定的频数汇总
ts.resample('D').mean()

2011-01-01    0.196271
2011-01-02    0.129170
2011-01-03   -0.072738
Freq: D, dtype: float64

## 1 创建时间序列方法
![./file/创建时间序列.png](attachment:%E5%88%9B%E5%BB%BA%E6%97%B6%E9%97%B4%E5%BA%8F%E5%88%97.png)

### 1.1 `Timestamps` vs `Time Spans`

In [6]:
display(
    pd.Timestamp(datetime(2012, 5, 1)),
    pd.Timestamp('2012-05-01'),
    pd.Timestamp(2012, 5, 1)
)

Timestamp('2012-05-01 00:00:00')

Timestamp('2012-05-01 00:00:00')

Timestamp('2012-05-01 00:00:00')

In [7]:
display(
    pd.Period('2011-01'),
    pd.Period('2012-05', freq='D'),
)

Period('2011-01', 'M')

Period('2012-05-01', 'D')

- 转换成对应的 DatetimeIndex 和 PeriodIndex 

In [8]:
dates = [pd.Timestamp('2012-05-01'), 
         pd.Timestamp('2012-05-02'), 
         pd.Timestamp('2012-05-03')]
ts = pd.Series(np.random.randn(3), dates)

display(
    ts, 
    type(ts.index),
)

2012-05-01    1.338682
2012-05-02    0.782727
2012-05-03   -0.749772
dtype: float64

pandas.core.indexes.datetimes.DatetimeIndex

In [9]:
periods = [pd.Period('2012-01'), 
           pd.Period('2012-02'), 
           pd.Period('2012-03')]
ts = pd.Series(np.random.randn(3), periods)

display(
    ts, 
    type(ts.index),
    ts.index,
)

2012-01   -0.312061
2012-02   -0.543057
2012-03   -2.309281
Freq: M, dtype: float64

pandas.core.indexes.period.PeriodIndex

PeriodIndex(['2012-01', '2012-02', '2012-03'], dtype='period[M]', freq='M')

### 1.2 转换为 `Timestamps`

In [10]:
# 转换为 Timestamp
display(
    pd.to_datetime('Jul 31, 2009'),
    pd.to_datetime('2010-01-10'),
    pd.to_datetime('2005/11/23'),
    pd.to_datetime('01-14-2012'),
)

Timestamp('2009-07-31 00:00:00')

Timestamp('2010-01-10 00:00:00')

Timestamp('2005-11-23 00:00:00')

Timestamp('2012-01-14 00:00:00')

In [11]:
display(
    pd.to_datetime(['2005/11/23', '2010.12.31']),
    # dayfirst=True takes effect here: '04-01-2012' is read as 4 January 2012.
    pd.to_datetime(['04-01-2012 10:00'], dayfirst=True),
    # dayfirst is not strict: '01-14-2012' cannot be day-first (there is no
    # month 14), so it is still parsed as 14 January 2012.
    pd.to_datetime(['14-01-2012', '01-14-2012'], dayfirst=True),
    # A None entry becomes NaT in the resulting Series.
    pd.to_datetime(pd.Series(['Jul 31, 2009', '2010-01-10', None])),
)

DatetimeIndex(['2005-11-23', '2010-12-31'], dtype='datetime64[ns]', freq=None)

DatetimeIndex(['2012-01-04 10:00:00'], dtype='datetime64[ns]', freq=None)

DatetimeIndex(['2012-01-14', '2012-01-14'], dtype='datetime64[ns]', freq=None)

0   2009-07-31
1   2010-01-10
2          NaT
dtype: datetime64[ns]

### 1.3 格式化解析

In [12]:
display(
    pd.to_datetime('2010/11/12', format='%Y/%m/%d'),
    pd.to_datetime('12-11-2010 00:00', format='%d-%m-%Y %H:%M')
)

Timestamp('2010-11-12 00:00:00')

Timestamp('2010-11-12 00:00:00')

### 1.4 拼接复合日期数据

In [13]:
df = pd.DataFrame({'year': [2015, 2016],
                   'month': [2, 3],
                   'day': [4, 5],
                   'hour': [2, 3]})
display(df)

print('='*50)

df1 = pd.to_datetime(df)
display(df1, type(df1))

print('='*50)

df2 = pd.to_datetime(df[['year', 'month', 'day']])
display(df2, type(df2))

Unnamed: 0,year,month,day,hour
0,2015,2,4,2
1,2016,3,5,3




0   2015-02-04 02:00:00
1   2016-03-05 03:00:00
dtype: datetime64[ns]

pandas.core.series.Series



0   2015-02-04
1   2016-03-05
dtype: datetime64[ns]

pandas.core.series.Series

### 1.5 无效数据处理

In [14]:
try:
    pd.to_datetime(['2009/07/31', 'asd'])
except Exception as e:
    print(f"Error: {e}")

Error: ('Unknown string format:', 'asd')


In [15]:
display(
    pd.to_datetime(['2009/07/31', 'asd'], errors='ignore'),  # 保留数据
    pd.to_datetime(['2009/07/31', 'asd'], errors='coerce'),  # 转为NaT
)

array(['2009/07/31', 'asd'], dtype=object)

DatetimeIndex(['2009-07-31', 'NaT'], dtype='datetime64[ns]', freq=None)

### 1.6 将 `浮点数时间戳` 转换为 `Timestamps`

In [16]:
display(
    pd.to_datetime([1349720105, 1349806505, 1349892905, 
                1349979305, 1350065705], unit='s'),
    pd.to_datetime([1349720105100, 1349720105200, 1349720105300, 
                1349720105400, 1349720105500 ], unit='ms')
)

DatetimeIndex(['2012-10-08 18:15:05', '2012-10-09 18:15:05',
               '2012-10-10 18:15:05', '2012-10-11 18:15:05',
               '2012-10-12 18:15:05'],
              dtype='datetime64[ns]', freq=None)

DatetimeIndex(['2012-10-08 18:15:05.100000', '2012-10-08 18:15:05.200000',
               '2012-10-08 18:15:05.300000', '2012-10-08 18:15:05.400000',
               '2012-10-08 18:15:05.500000'],
              dtype='datetime64[ns]', freq=None)

### 1.7 将 `Timestamps` 转换为 `浮点数时间戳`

In [17]:
stamps = pd.date_range('2012-10-08 18:15:05', periods=4, freq='D')
f_s = (stamps - pd.Timestamp("1970-01-01")) // pd.Timedelta('1s')

f_s

Int64Index([1349720105, 1349806505, 1349892905, 1349979305], dtype='int64')

### 1.8 使用 `origin` 参数

In [18]:
# 起始点
display(
    pd.to_datetime([1, 2, 3], 
                   unit='D', 
                   origin=pd.Timestamp('1960-01-01')),
    pd.to_datetime([1, 2, 3], 
                   unit='D',   # 1970-01-01 00:00:00
                  ) 
)

DatetimeIndex(['1960-01-02', '1960-01-03', '1960-01-04'], dtype='datetime64[ns]', freq=None)

DatetimeIndex(['1970-01-02', '1970-01-03', '1970-01-04'], dtype='datetime64[ns]', freq=None)

## 2 生成 `时间戳` 范围

In [19]:
dates = [datetime(2012, 5, 1), 
         datetime(2012, 5, 2), 
         datetime(2012, 5, 3)]
index = pd.DatetimeIndex(dates)
index1 = pd.Index(dates)

display(index, index1)

DatetimeIndex(['2012-05-01', '2012-05-02', '2012-05-03'], dtype='datetime64[ns]', freq=None)

DatetimeIndex(['2012-05-01', '2012-05-02', '2012-05-03'], dtype='datetime64[ns]', freq=None)

In [20]:
# 根据范围生成时间戳
start = datetime(2011, 1, 1)
end = datetime(2012, 1, 1)

index = pd.date_range(start, end)  # 日历日
index1 = pd.bdate_range(start, end)   # 营业日

display(index, index1)

DatetimeIndex(['2011-01-01', '2011-01-02', '2011-01-03', '2011-01-04',
               '2011-01-05', '2011-01-06', '2011-01-07', '2011-01-08',
               '2011-01-09', '2011-01-10',
               ...
               '2011-12-23', '2011-12-24', '2011-12-25', '2011-12-26',
               '2011-12-27', '2011-12-28', '2011-12-29', '2011-12-30',
               '2011-12-31', '2012-01-01'],
              dtype='datetime64[ns]', length=366, freq='D')

DatetimeIndex(['2011-01-03', '2011-01-04', '2011-01-05', '2011-01-06',
               '2011-01-07', '2011-01-10', '2011-01-11', '2011-01-12',
               '2011-01-13', '2011-01-14',
               ...
               '2011-12-19', '2011-12-20', '2011-12-21', '2011-12-22',
               '2011-12-23', '2011-12-26', '2011-12-27', '2011-12-28',
               '2011-12-29', '2011-12-30'],
              dtype='datetime64[ns]', length=260, freq='B')

In [21]:
# Parameter combinations for date_range / bdate_range: start, end, periods, freq.
index1 = pd.date_range(start, end, freq='BM')  # 'BM': last business day of each month
index2 = pd.date_range(start, end, freq='W')   # 'W': weekly, anchored on Sunday
index3 = pd.bdate_range(end=end, periods=20)   # 20 business days ending at or before `end`
index4 = pd.bdate_range(start=start, periods=20)  # 20 business days starting at or after `start`

display(index1, index2, index3, index4)

DatetimeIndex(['2011-01-31', '2011-02-28', '2011-03-31', '2011-04-29',
               '2011-05-31', '2011-06-30', '2011-07-29', '2011-08-31',
               '2011-09-30', '2011-10-31', '2011-11-30', '2011-12-30'],
              dtype='datetime64[ns]', freq='BM')

DatetimeIndex(['2011-01-02', '2011-01-09', '2011-01-16', '2011-01-23',
               '2011-01-30', '2011-02-06', '2011-02-13', '2011-02-20',
               '2011-02-27', '2011-03-06', '2011-03-13', '2011-03-20',
               '2011-03-27', '2011-04-03', '2011-04-10', '2011-04-17',
               '2011-04-24', '2011-05-01', '2011-05-08', '2011-05-15',
               '2011-05-22', '2011-05-29', '2011-06-05', '2011-06-12',
               '2011-06-19', '2011-06-26', '2011-07-03', '2011-07-10',
               '2011-07-17', '2011-07-24', '2011-07-31', '2011-08-07',
               '2011-08-14', '2011-08-21', '2011-08-28', '2011-09-04',
               '2011-09-11', '2011-09-18', '2011-09-25', '2011-10-02',
               '2011-10-09', '2011-10-16', '2011-10-23', '2011-10-30',
               '2011-11-06', '2011-11-13', '2011-11-20', '2011-11-27',
               '2011-12-04', '2011-12-11', '2011-12-18', '2011-12-25',
               '2012-01-01'],
              dtype='datetime64[ns]', freq='W-SUN')

DatetimeIndex(['2011-12-05', '2011-12-06', '2011-12-07', '2011-12-08',
               '2011-12-09', '2011-12-12', '2011-12-13', '2011-12-14',
               '2011-12-15', '2011-12-16', '2011-12-19', '2011-12-20',
               '2011-12-21', '2011-12-22', '2011-12-23', '2011-12-26',
               '2011-12-27', '2011-12-28', '2011-12-29', '2011-12-30'],
              dtype='datetime64[ns]', freq='B')

DatetimeIndex(['2011-01-03', '2011-01-04', '2011-01-05', '2011-01-06',
               '2011-01-07', '2011-01-10', '2011-01-11', '2011-01-12',
               '2011-01-13', '2011-01-14', '2011-01-17', '2011-01-18',
               '2011-01-19', '2011-01-20', '2011-01-21', '2011-01-24',
               '2011-01-25', '2011-01-26', '2011-01-27', '2011-01-28'],
              dtype='datetime64[ns]', freq='B')

In [22]:
index1 = pd.date_range('2018-01-01', '2018-01-05', periods=5)
index2 = pd.date_range('2018-01-01', '2018-01-05', periods=10)

display(index1, index2)

DatetimeIndex(['2018-01-01', '2018-01-02', '2018-01-03', '2018-01-04',
               '2018-01-05'],
              dtype='datetime64[ns]', freq=None)

DatetimeIndex(['2018-01-01 00:00:00', '2018-01-01 10:40:00',
               '2018-01-01 21:20:00', '2018-01-02 08:00:00',
               '2018-01-02 18:40:00', '2018-01-03 05:20:00',
               '2018-01-03 16:00:00', '2018-01-04 02:40:00',
               '2018-01-04 13:20:00', '2018-01-05 00:00:00'],
              dtype='datetime64[ns]', freq=None)

In [23]:
# bdate_range
weekmask = 'Mon Wed Fri'
holidays = [datetime(2011, 1, 5), datetime(2011, 3, 14)]
index1 = pd.bdate_range(start, end, 
                        freq='C', 
                        weekmask=weekmask, 
                        holidays=holidays,)
index2 = pd.bdate_range(start, end, 
                        freq='CBMS', 
                        weekmask=weekmask, 
                        holidays=holidays,)

display(index1, index2)

DatetimeIndex(['2011-01-03', '2011-01-07', '2011-01-10', '2011-01-12',
               '2011-01-14', '2011-01-17', '2011-01-19', '2011-01-21',
               '2011-01-24', '2011-01-26',
               ...
               '2011-12-09', '2011-12-12', '2011-12-14', '2011-12-16',
               '2011-12-19', '2011-12-21', '2011-12-23', '2011-12-26',
               '2011-12-28', '2011-12-30'],
              dtype='datetime64[ns]', length=154, freq='C')

DatetimeIndex(['2011-01-03', '2011-02-02', '2011-03-02', '2011-04-01',
               '2011-05-02', '2011-06-01', '2011-07-01', '2011-08-01',
               '2011-09-02', '2011-10-03', '2011-11-02', '2011-12-02'],
              dtype='datetime64[ns]', freq='CBMS')

### 2.1 `Timestamp`限制

In [24]:
display(
    pd.Timestamp.min,
    pd.Timestamp.max,
)

Timestamp('1677-09-21 00:12:43.145225')

Timestamp('2262-04-11 23:47:16.854775807')

## 3 `index`

In [25]:
rng = pd.date_range(start, end, freq='BM')
ts = pd.Series(np.random.randn(len(rng)), index=rng)

# 再索引
display(
    ts.index,
    ts[:5].index,  # 支持切片操作
    ts[::2].index,
    ts['1/31/2011'],  # 字符串为索引
    ts[datetime(2011, 12, 25):],   # 时间戳为索引
    ts['10/31/2011':'12/31/2011']   # 字符串范围
)

DatetimeIndex(['2011-01-31', '2011-02-28', '2011-03-31', '2011-04-29',
               '2011-05-31', '2011-06-30', '2011-07-29', '2011-08-31',
               '2011-09-30', '2011-10-31', '2011-11-30', '2011-12-30'],
              dtype='datetime64[ns]', freq='BM')

DatetimeIndex(['2011-01-31', '2011-02-28', '2011-03-31', '2011-04-29',
               '2011-05-31'],
              dtype='datetime64[ns]', freq='BM')

DatetimeIndex(['2011-01-31', '2011-03-31', '2011-05-31', '2011-07-29',
               '2011-09-30', '2011-11-30'],
              dtype='datetime64[ns]', freq='2BM')

-0.3227839950756312

2011-12-30    0.010401
Freq: BM, dtype: float64

2011-10-31   -1.503697
2011-11-30    0.864834
2011-12-30    0.010401
Freq: BM, dtype: float64

In [26]:
display(
    ts['2011'],  # 年份
    ts['2011-6'],   # 月份
)

2011-01-31   -0.322784
2011-02-28   -0.871209
2011-03-31   -1.266387
2011-04-29    0.712583
2011-05-31    0.063998
2011-06-30    0.043064
2011-07-29    0.156769
2011-08-31    1.487245
2011-09-30   -0.854961
2011-10-31   -1.503697
2011-11-30    0.864834
2011-12-30    0.010401
Freq: BM, dtype: float64

2011-06-30    0.043064
Freq: BM, dtype: float64

In [27]:
dft = pd.DataFrame(
    np.random.randn(100000,1),
    columns=['A'],
    index=pd.date_range('20130101',periods=100000,freq='T')
)

display(
    dft.shape,
    dft.head(),
)

(100000, 1)

Unnamed: 0,A
2013-01-01 00:00:00,1.312935
2013-01-01 00:01:00,1.312621
2013-01-01 00:02:00,0.193682
2013-01-01 00:03:00,2.355485
2013-01-01 00:04:00,0.094012


In [28]:
dft['2013-01']

Unnamed: 0,A
2013-01-01 00:00:00,1.312935
2013-01-01 00:01:00,1.312621
2013-01-01 00:02:00,0.193682
2013-01-01 00:03:00,2.355485
2013-01-01 00:04:00,0.094012
2013-01-01 00:05:00,-1.301788
2013-01-01 00:06:00,-0.253025
2013-01-01 00:07:00,1.902301
2013-01-01 00:08:00,-0.813458
2013-01-01 00:09:00,-0.442346


In [29]:
dft['2013-01-01':'2013-01-05']

Unnamed: 0,A
2013-01-01 00:00:00,1.312935
2013-01-01 00:01:00,1.312621
2013-01-01 00:02:00,0.193682
2013-01-01 00:03:00,2.355485
2013-01-01 00:04:00,0.094012
2013-01-01 00:05:00,-1.301788
2013-01-01 00:06:00,-0.253025
2013-01-01 00:07:00,1.902301
2013-01-01 00:08:00,-0.813458
2013-01-01 00:09:00,-0.442346


In [30]:
dft['2013-1':'2013-2-28 00:00:00']

Unnamed: 0,A
2013-01-01 00:00:00,1.312935
2013-01-01 00:01:00,1.312621
2013-01-01 00:02:00,0.193682
2013-01-01 00:03:00,2.355485
2013-01-01 00:04:00,0.094012
2013-01-01 00:05:00,-1.301788
2013-01-01 00:06:00,-0.253025
2013-01-01 00:07:00,1.902301
2013-01-01 00:08:00,-0.813458
2013-01-01 00:09:00,-0.442346


In [31]:
dft['2013-1-15':'2013-1-15 12:30:00']

Unnamed: 0,A
2013-01-15 00:00:00,0.735112
2013-01-15 00:01:00,1.576990
2013-01-15 00:02:00,0.756380
2013-01-15 00:03:00,0.281304
2013-01-15 00:04:00,0.331131
2013-01-15 00:05:00,-0.186559
2013-01-15 00:06:00,0.745284
2013-01-15 00:07:00,-1.229007
2013-01-15 00:08:00,0.886235
2013-01-15 00:09:00,-1.758058


- `MultiIndex` 复合索引

In [32]:
index = pd.MultiIndex.from_product([pd.date_range('20130101', periods=10, freq='12H'),
                                    ['a', 'b']])
dft2 = pd.DataFrame(np.random.randn(20, 1),
                    columns=['A'],
                    index=index,
                   )
dft2

Unnamed: 0,Unnamed: 1,A
2013-01-01 00:00:00,a,1.408554
2013-01-01 00:00:00,b,1.793799
2013-01-01 12:00:00,a,0.785783
2013-01-01 12:00:00,b,-1.915734
2013-01-02 00:00:00,a,-0.837299
2013-01-02 00:00:00,b,0.396379
2013-01-02 12:00:00,a,-1.440247
2013-01-02 12:00:00,b,0.069101
2013-01-03 00:00:00,a,-0.20342
2013-01-03 00:00:00,b,-0.13624


In [33]:
dft2.loc['2013-01-05']

Unnamed: 0,Unnamed: 1,A
2013-01-05 00:00:00,a,-0.664415
2013-01-05 00:00:00,b,-0.04194
2013-01-05 12:00:00,a,-0.366098
2013-01-05 12:00:00,b,0.348256


In [34]:
idx = pd.IndexSlice
dft2 = dft2.swaplevel(0, 1).sort_index()

dft2.loc[idx[:, '2013-01-05'], :]

Unnamed: 0,Unnamed: 1,A
a,2013-01-05 00:00:00,-0.664415
a,2013-01-05 12:00:00,-0.366098
b,2013-01-05 00:00:00,-0.04194
b,2013-01-05 12:00:00,0.348256


### 3.2 切片与精度匹配

In [35]:
series_minute = pd.Series([1, 2, 3],
                          pd.DatetimeIndex(['2011-12-31 23:59:00',
                                            '2012-01-01 00:00:00',
                                            '2012-01-01 00:02:00']))
series_minute.index.resolution

'minute'

In [36]:
# 时间戳字符串的精度低于一分钟，视为切片
display(
    series_minute['2011-12-31 23'],
    series_minute['2011-12-31 23:59'],
    series_minute['2011-12-31 23:59:00'],
)

2011-12-31 23:59:00    1
dtype: int64

1

1

In [37]:
series_second = pd.Series([1, 2, 3],
                          pd.DatetimeIndex(['2011-12-31 23:59:01',
                                            '2012-01-01 00:00:20',
                                            '2012-01-01 00:02:31']))
series_second.index.resolution

'second'

In [38]:
# The index resolution is 'second' here, so strings less precise than a second
# (hour- or minute-level) are treated as slices and return a Series, while a
# string with second precision is an exact match and returns a scalar.
display(
    series_second['2011-12-31 23'],
    series_second['2011-12-31 23:59'],
    series_second['2011-12-31 23:59:01'],
)

2011-12-31 23:59:01    1
dtype: int64

2011-12-31 23:59:01    1
dtype: int64

1

### 3.3 精确索引

In [39]:
dft[datetime(2013, 1, 1):datetime(2013,2,28)]

Unnamed: 0,A
2013-01-01 00:00:00,1.312935
2013-01-01 00:01:00,1.312621
2013-01-01 00:02:00,0.193682
2013-01-01 00:03:00,2.355485
2013-01-01 00:04:00,0.094012
2013-01-01 00:05:00,-1.301788
2013-01-01 00:06:00,-0.253025
2013-01-01 00:07:00,1.902301
2013-01-01 00:08:00,-0.813458
2013-01-01 00:09:00,-0.442346


In [40]:
dft[datetime(2013, 1, 1, 10, 12, 0):datetime(2013, 2, 28, 10, 12, 0)]

Unnamed: 0,A
2013-01-01 10:12:00,-0.138471
2013-01-01 10:13:00,0.675326
2013-01-01 10:14:00,0.640600
2013-01-01 10:15:00,1.128659
2013-01-01 10:16:00,1.518796
2013-01-01 10:17:00,-0.535805
2013-01-01 10:18:00,0.368785
2013-01-01 10:19:00,-0.991692
2013-01-01 10:20:00,1.020157
2013-01-01 10:21:00,-0.115631


### 3.2 截断和新建索引

In [41]:
rng2 = pd.date_range('2011-01-01', '2012-01-01', freq='W')
ts2 = pd.Series(np.random.randn(len(rng2)), index=rng2)
ts2.truncate(before='2011-11', after='2011-12')

2011-11-06   -0.302119
2011-11-13   -0.142922
2011-11-20    1.130825
2011-11-27    0.325270
Freq: W-SUN, dtype: float64

In [42]:
ts2['2011-11':'2011-12']

2011-11-06   -0.302119
2011-11-13   -0.142922
2011-11-20    1.130825
2011-11-27    0.325270
2011-12-04    0.061550
2011-12-11    0.405607
2011-12-18   -0.282557
2011-12-25   -1.452589
Freq: W-SUN, dtype: float64

In [43]:
ts2[[0, 2, 6]].index

DatetimeIndex(['2011-01-02', '2011-01-16', '2011-02-13'], dtype='datetime64[ns]', freq=None)

### 3.4 时间日期组成部分

- Timestamp属性：
![Timestamp属性.png](attachment:Timestamp%E5%B1%9E%E6%80%A7.png)

### 3.5 `DateOffset` 对象

![DateSet%E5%B1%9E%E6%80%A7.png](attachment:DateSet%E5%B1%9E%E6%80%A7.png)

In [44]:
from pandas.tseries.offsets import *

d = datetime(2008, 8, 18, 9, 0)
d + DateOffset(months=4, days=5)

Timestamp('2008-12-23 09:00:00')

In [45]:
day = Day()
display(day.apply(pd.Timestamp('2014-01-01 09:00')))

day = Day(normalize=True)
display(day.apply(pd.Timestamp('2014-01-01 09:00')))

hour = Hour()
display(hour.apply(pd.Timestamp('2014-01-01 22:00')))

hour = Hour(normalize=True)
display(hour.apply(pd.Timestamp('2014-01-01 22:00')))
display(hour.apply(pd.Timestamp('2014-01-01 23:00')))
display(hour.apply(pd.Timestamp('2014-01-01 20:00')))

Timestamp('2014-01-02 09:00:00')

Timestamp('2014-01-02 00:00:00')

Timestamp('2014-01-01 23:00:00')

Timestamp('2014-01-01 00:00:00')

Timestamp('2014-01-02 00:00:00')

Timestamp('2014-01-01 00:00:00')

- `Parametric Offsets`

## 4 `Time Series相关方法`

### 4.1 `shifting/lagging`

In [46]:
dft = pd.DataFrame(
    np.random.randn(100,1),
    columns=['A'],
    index=pd.date_range('20180701',periods=100,freq='D')
)

display(
    dft.shape,
    dft.head(),
)

(100, 1)

Unnamed: 0,A
2018-07-01,1.045989
2018-07-02,0.487476
2018-07-03,0.035368
2018-07-04,1.397846
2018-07-05,1.060655


In [47]:
dft1 = dft[:5]
display(
    dft1, 
    dft1.shift(1)   # 移动1天
)

Unnamed: 0,A
2018-07-01,1.045989
2018-07-02,0.487476
2018-07-03,0.035368
2018-07-04,1.397846
2018-07-05,1.060655


Unnamed: 0,A
2018-07-01,
2018-07-02,1.045989
2018-07-03,0.487476
2018-07-04,0.035368
2018-07-05,1.397846


In [48]:
dft1.shift(5, freq=pd.offsets.BDay())   #

Unnamed: 0,A
2018-07-06,1.045989
2018-07-09,0.487476
2018-07-10,0.035368
2018-07-11,1.397846
2018-07-12,1.060655


In [49]:
dft1.shift(5, freq='BM')   # 按所需周期移动索引

Unnamed: 0,A
2018-11-30,1.045989
2018-11-30,0.487476
2018-11-30,0.035368
2018-11-30,1.397846
2018-11-30,1.060655


In [50]:
dft1.tshift(5, freq='D')   # 移动时间索引

Unnamed: 0,A
2018-07-06,1.045989
2018-07-07,0.487476
2018-07-08,0.035368
2018-07-09,1.397846
2018-07-10,1.060655


### 4.2 `Frequency Conversion`

In [51]:
dr = pd.date_range('7/1/2018', periods=3, freq=3 * pd.offsets.BDay())
dr

DatetimeIndex(['2018-07-02', '2018-07-05', '2018-07-10'], dtype='datetime64[ns]', freq='3B')

In [52]:
ts = pd.Series(np.random.randn(3), index=dr)
ts

2018-07-02   -0.551723
2018-07-05    0.394037
2018-07-10   -1.123109
Freq: 3B, dtype: float64

In [53]:
ts.asfreq(BDay())    # 将时间序列转换为指定的频率

2018-07-02   -0.551723
2018-07-03         NaN
2018-07-04         NaN
2018-07-05    0.394037
2018-07-06         NaN
2018-07-09         NaN
2018-07-10   -1.123109
Freq: B, dtype: float64

In [54]:
ts.asfreq(BDay(), method='bfill')   # 指定插值方法
# method: 'pad' / 'ffill' / 'backfill' / 'bfill'

2018-07-02   -0.551723
2018-07-03    0.394037
2018-07-04    0.394037
2018-07-05    0.394037
2018-07-06   -1.123109
2018-07-09   -1.123109
2018-07-10   -1.123109
Freq: B, dtype: float64

### 4.3 `DatetimeIndex` 转换为 `python datetime`数组

    - `to_pydatetime`

## 5 `resample`

- 基于时间的组，重采样；
- 重采样后执行聚合操作：
    - `sum`, `mean`, `std`
    - `max`, `min`, `median`, `first`, `last`
    - `sem`, `ohlc`

In [55]:
rng = pd.date_range('7/1/2018', periods=100, freq='S')
ts = pd.Series(np.random.randint(0, 500, len(rng)), index=rng)

display(
    ts.resample('5Min').sum(),
    ts.resample('5Min').mean(),
    ts.resample('5Min').ohlc(),   # open / high / low / close of each bin
    ts.resample('5Min').max(),
    # closed='right': each bin includes its right edge, so the first sample at
    # 00:00:00 lands in the bin labelled 2018-06-30 23:55:00.
    ts.resample('5Min', closed='right').mean(),
    # closed='left' gives the same result as the plain call above:
    # all 100 samples fall into the single bin labelled 2018-07-01.
    ts.resample('5Min', closed='left').mean(),
)

2018-07-01    25919
Freq: 5T, dtype: int32

2018-07-01    259.19
Freq: 5T, dtype: float64

Unnamed: 0,open,high,low,close
2018-07-01,458,494,7,106


2018-07-01    494
Freq: 5T, dtype: int32

2018-06-30 23:55:00    458.000000
2018-07-01 00:00:00    257.181818
Freq: 5T, dtype: float64

2018-07-01    259.19
Freq: 5T, dtype: float64

In [56]:
rng2 = pd.date_range('1/1/2018', end='7/31/2018', freq='D')
ts2 = pd.Series(range(len(rng2)), index=rng2)
ts2.head()

2018-01-01    0
2018-01-02    1
2018-01-03    2
2018-01-04    3
2018-01-05    4
Freq: D, dtype: int64

In [57]:
ts2.resample('M').max()

2018-01-31     30
2018-02-28     58
2018-03-31     89
2018-04-30    119
2018-05-31    150
2018-06-30    180
2018-07-31    211
Freq: M, dtype: int64

In [58]:
# 'SM' is the semi-month-end frequency (reported as SM-15): bin edges fall on
# the 15th and on the last day of each month. In the result each bin is
# labelled by, and includes, its left edge (e.g. 2017-12-31 covers Jan 1-14).
ts2.resample('SM').max()

2017-12-31     13
2018-01-15     29
2018-01-31     44
2018-02-15     57
2018-02-28     72
2018-03-15     88
2018-03-31    103
2018-04-15    118
2018-04-30    133
2018-05-15    149
2018-05-31    164
2018-06-15    179
2018-06-30    194
2018-07-15    210
2018-07-31    211
Freq: SM-15, dtype: int64

In [59]:
# Same semi-month bins, but each bin now includes its right edge and is
# labelled by it (e.g. 2018-01-15 covers Jan 1-15). The trailing 2018-08-15
# bin holds no data, which yields NaN and turns the result dtype into float.
ts2.resample('SM', label='right', closed='right').max()

2018-01-15     14.0
2018-01-31     30.0
2018-02-15     45.0
2018-02-28     58.0
2018-03-15     73.0
2018-03-31     89.0
2018-04-15    104.0
2018-04-30    119.0
2018-05-15    134.0
2018-05-31    150.0
2018-06-15    165.0
2018-06-30    180.0
2018-07-15    195.0
2018-07-31    211.0
2018-08-15      NaN
Freq: SM-15, dtype: float64

### 5.1 `upsampling`

In [60]:
ts2[:2].resample('50T').asfreq()

2018-01-01 00:00:00    0.0
2018-01-01 00:50:00    NaN
2018-01-01 01:40:00    NaN
2018-01-01 02:30:00    NaN
2018-01-01 03:20:00    NaN
2018-01-01 04:10:00    NaN
2018-01-01 05:00:00    NaN
2018-01-01 05:50:00    NaN
2018-01-01 06:40:00    NaN
2018-01-01 07:30:00    NaN
2018-01-01 08:20:00    NaN
2018-01-01 09:10:00    NaN
2018-01-01 10:00:00    NaN
2018-01-01 10:50:00    NaN
2018-01-01 11:40:00    NaN
2018-01-01 12:30:00    NaN
2018-01-01 13:20:00    NaN
2018-01-01 14:10:00    NaN
2018-01-01 15:00:00    NaN
2018-01-01 15:50:00    NaN
2018-01-01 16:40:00    NaN
2018-01-01 17:30:00    NaN
2018-01-01 18:20:00    NaN
2018-01-01 19:10:00    NaN
2018-01-01 20:00:00    NaN
2018-01-01 20:50:00    NaN
2018-01-01 21:40:00    NaN
2018-01-01 22:30:00    NaN
2018-01-01 23:20:00    NaN
Freq: 50T, dtype: float64

In [61]:
ts2[:2].resample('50T').ffill()

2018-01-01 00:00:00    0
2018-01-01 00:50:00    0
2018-01-01 01:40:00    0
2018-01-01 02:30:00    0
2018-01-01 03:20:00    0
2018-01-01 04:10:00    0
2018-01-01 05:00:00    0
2018-01-01 05:50:00    0
2018-01-01 06:40:00    0
2018-01-01 07:30:00    0
2018-01-01 08:20:00    0
2018-01-01 09:10:00    0
2018-01-01 10:00:00    0
2018-01-01 10:50:00    0
2018-01-01 11:40:00    0
2018-01-01 12:30:00    0
2018-01-01 13:20:00    0
2018-01-01 14:10:00    0
2018-01-01 15:00:00    0
2018-01-01 15:50:00    0
2018-01-01 16:40:00    0
2018-01-01 17:30:00    0
2018-01-01 18:20:00    0
2018-01-01 19:10:00    0
2018-01-01 20:00:00    0
2018-01-01 20:50:00    0
2018-01-01 21:40:00    0
2018-01-01 22:30:00    0
2018-01-01 23:20:00    0
Freq: 50T, dtype: int64

In [62]:
ts2[:2].resample('50T').ffill(limit=2)

2018-01-01 00:00:00    0.0
2018-01-01 00:50:00    0.0
2018-01-01 01:40:00    0.0
2018-01-01 02:30:00    NaN
2018-01-01 03:20:00    NaN
2018-01-01 04:10:00    NaN
2018-01-01 05:00:00    NaN
2018-01-01 05:50:00    NaN
2018-01-01 06:40:00    NaN
2018-01-01 07:30:00    NaN
2018-01-01 08:20:00    NaN
2018-01-01 09:10:00    NaN
2018-01-01 10:00:00    NaN
2018-01-01 10:50:00    NaN
2018-01-01 11:40:00    NaN
2018-01-01 12:30:00    NaN
2018-01-01 13:20:00    NaN
2018-01-01 14:10:00    NaN
2018-01-01 15:00:00    NaN
2018-01-01 15:50:00    NaN
2018-01-01 16:40:00    NaN
2018-01-01 17:30:00    NaN
2018-01-01 18:20:00    NaN
2018-01-01 19:10:00    NaN
2018-01-01 20:00:00    NaN
2018-01-01 20:50:00    NaN
2018-01-01 21:40:00    NaN
2018-01-01 22:30:00    NaN
2018-01-01 23:20:00    NaN
Freq: 50T, dtype: float64

### 5.2 稀疏重采样

In [63]:
rng = pd.date_range('2018-5-1', periods=50, freq='D') + pd.Timedelta('1s')
ts = pd.Series(range(50), index=rng)
ts

2018-05-01 00:00:01     0
2018-05-02 00:00:01     1
2018-05-03 00:00:01     2
2018-05-04 00:00:01     3
2018-05-05 00:00:01     4
2018-05-06 00:00:01     5
2018-05-07 00:00:01     6
2018-05-08 00:00:01     7
2018-05-09 00:00:01     8
2018-05-10 00:00:01     9
2018-05-11 00:00:01    10
2018-05-12 00:00:01    11
2018-05-13 00:00:01    12
2018-05-14 00:00:01    13
2018-05-15 00:00:01    14
2018-05-16 00:00:01    15
2018-05-17 00:00:01    16
2018-05-18 00:00:01    17
2018-05-19 00:00:01    18
2018-05-20 00:00:01    19
2018-05-21 00:00:01    20
2018-05-22 00:00:01    21
2018-05-23 00:00:01    22
2018-05-24 00:00:01    23
2018-05-25 00:00:01    24
2018-05-26 00:00:01    25
2018-05-27 00:00:01    26
2018-05-28 00:00:01    27
2018-05-29 00:00:01    28
2018-05-30 00:00:01    29
2018-05-31 00:00:01    30
2018-06-01 00:00:01    31
2018-06-02 00:00:01    32
2018-06-03 00:00:01    33
2018-06-04 00:00:01    34
2018-06-05 00:00:01    35
2018-06-06 00:00:01    36
2018-06-07 00:00:01    37
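2018-06-08 00:00:01    38
2018-06-09 00:00:01    39
2018-06-10 00:00:01    40
2018-06-11 00:00:01    41
2018-06-12 00:00:01    42
2018-06-13 00:00:01    43
2018-06-14 00:00:01    44
2018-06-15 00:00:01    45
2018-06-16 00:00:01    46
2018-06-17 00:00:01    47
2018-06-18 00:00:01    48
2018-06-19 00:00:01    49
Freq: D, dtype: int64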

In [64]:
ts.resample('100H').sum()

2018-05-01 00:00:00     10
2018-05-05 04:00:00     26
2018-05-09 08:00:00     42
2018-05-13 12:00:00     58
2018-05-17 16:00:00     74
2018-05-21 20:00:00     90
2018-05-26 00:00:00    135
2018-05-30 04:00:00    126
2018-06-03 08:00:00    142
2018-06-07 12:00:00    158
2018-06-11 16:00:00    174
2018-06-15 20:00:00    190
Freq: 100H, dtype: int64

In [65]:
from functools import partial
from pandas.tseries.frequencies import to_offset

def round(t, freq):
    """Floor ``t`` down to a whole multiple of ``freq`` counted from the Unix epoch.

    NOTE: despite its name this floors (integer division) rather than rounds,
    and the name shadows the builtin ``round`` once defined. The name is kept
    so that existing references to it keep working.

    Parameters
    ----------
    t : pd.Timestamp
        Any object exposing ``.value`` as integer nanoseconds since the epoch.
    freq : str, timedelta or offset
        Anything ``to_offset`` accepts that resolves to a fixed-length
        frequency (for example ``'10h'``).

    Returns
    -------
    pd.Timestamp
        Built from the floored integer alone, so no timezone is attached.
        Bins are anchored at the epoch, not at the start of the day.
    """
    # ``Tick.nanos`` replaces ``Tick.delta.value``: ``delta`` is deprecated in
    # recent pandas, and the step only needs to be computed once.
    step_ns = to_offset(freq).nanos
    return pd.Timestamp((t.value // step_ns) * step_ns)

# ts.groupby(partial(round, freq='100H')).sum()

# Worked example of the flooring arithmetic: nanoseconds since the epoch are
# integer-divided by the step length and multiplied back.
t = pd.Timestamp('2018-05-01 00:00:00')
# Lowercase 'h' is accepted by old and new pandas; uppercase 'H' is deprecated.
freq = to_offset('10h')

# ``freq.nanos`` is the step length in nanoseconds (replaces the deprecated
# ``freq.delta.value``).
t.value, freq.nanos, pd.Timestamp((t.value // freq.nanos) * freq.nanos)

(1525132800000000000, 36000000000000, Timestamp('2018-04-30 16:00:00'))

### 5.4 `aggregation` 聚集

In [66]:
df = pd.DataFrame(np.random.randn(1000, 3),
                  index=pd.date_range('1/1/2018', freq='H', periods=1000),
                  columns=['A', 'B', 'C'])
df.head()

Unnamed: 0,A,B,C
2018-01-01 00:00:00,-0.641483,0.376164,0.054979
2018-01-01 01:00:00,-1.889191,-0.525938,0.998815
2018-01-01 02:00:00,-2.12037,-0.425816,-0.887046
2018-01-01 03:00:00,0.010533,-0.533166,0.369447
2018-01-01 04:00:00,1.05921,-1.55153,0.415163


In [67]:
r = df.resample('30H')
r.mean()

Unnamed: 0,A,B,C
2018-01-01 00:00:00,-0.145521,-0.056345,0.127398
2018-01-02 06:00:00,-0.199006,0.20672,0.144012
2018-01-03 12:00:00,-0.08799,-0.039319,-0.132885
2018-01-04 18:00:00,-0.102854,0.123491,0.077739
2018-01-06 00:00:00,-0.344653,0.243819,0.085964
2018-01-07 06:00:00,0.252343,0.201871,-0.150269
2018-01-08 12:00:00,-0.035449,0.033659,-0.233286
2018-01-09 18:00:00,0.084194,0.128494,-0.576378
2018-01-11 00:00:00,-0.10234,-0.1225,-0.095192
2018-01-12 06:00:00,-0.090403,0.107877,0.275836


In [68]:
r['A'].mean()

2018-01-01 00:00:00   -0.145521
2018-01-02 06:00:00   -0.199006
2018-01-03 12:00:00   -0.087990
2018-01-04 18:00:00   -0.102854
2018-01-06 00:00:00   -0.344653
2018-01-07 06:00:00    0.252343
2018-01-08 12:00:00   -0.035449
2018-01-09 18:00:00    0.084194
2018-01-11 00:00:00   -0.102340
2018-01-12 06:00:00   -0.090403
2018-01-13 12:00:00    0.216959
2018-01-14 18:00:00    0.219870
2018-01-16 00:00:00   -0.509056
2018-01-17 06:00:00   -0.231657
2018-01-18 12:00:00    0.041246
2018-01-19 18:00:00   -0.025503
2018-01-21 00:00:00    0.045037
2018-01-22 06:00:00    0.177681
2018-01-23 12:00:00    0.203597
2018-01-24 18:00:00   -0.206251
2018-01-26 00:00:00   -0.306234
2018-01-27 06:00:00    0.200071
2018-01-28 12:00:00    0.089901
2018-01-29 18:00:00    0.063395
2018-01-31 00:00:00    0.024920
2018-02-01 06:00:00   -0.043509
2018-02-02 12:00:00   -0.262457
2018-02-03 18:00:00   -0.061859
2018-02-05 00:00:00   -0.398728
2018-02-06 06:00:00    0.066443
2018-02-07 12:00:00    0.027308
2018-02-

In [69]:
r['A'].agg([np.sum, np.mean, np.std])

Unnamed: 0,sum,mean,std
2018-01-01 00:00:00,-4.365619,-0.145521,1.233464
2018-01-02 06:00:00,-5.970191,-0.199006,0.916564
2018-01-03 12:00:00,-2.639714,-0.08799,0.997241
2018-01-04 18:00:00,-3.085624,-0.102854,0.780231
2018-01-06 00:00:00,-10.339576,-0.344653,1.081032
2018-01-07 06:00:00,7.570282,0.252343,0.839099
2018-01-08 12:00:00,-1.063462,-0.035449,0.974744
2018-01-09 18:00:00,2.525827,0.084194,0.896909
2018-01-11 00:00:00,-3.070194,-0.10234,1.130699
2018-01-12 06:00:00,-2.712082,-0.090403,1.167818


In [70]:
r['A'].agg([np.sum, np.mean])

Unnamed: 0,sum,mean
2018-01-01 00:00:00,-4.365619,-0.145521
2018-01-02 06:00:00,-5.970191,-0.199006
2018-01-03 12:00:00,-2.639714,-0.08799
2018-01-04 18:00:00,-3.085624,-0.102854
2018-01-06 00:00:00,-10.339576,-0.344653
2018-01-07 06:00:00,7.570282,0.252343
2018-01-08 12:00:00,-1.063462,-0.035449
2018-01-09 18:00:00,2.525827,0.084194
2018-01-11 00:00:00,-3.070194,-0.10234
2018-01-12 06:00:00,-2.712082,-0.090403


In [71]:
r['A'].agg(['sum', 'mean'])

Unnamed: 0,sum,mean
2018-01-01 00:00:00,-4.365619,-0.145521
2018-01-02 06:00:00,-5.970191,-0.199006
2018-01-03 12:00:00,-2.639714,-0.08799
2018-01-04 18:00:00,-3.085624,-0.102854
2018-01-06 00:00:00,-10.339576,-0.344653
2018-01-07 06:00:00,7.570282,0.252343
2018-01-08 12:00:00,-1.063462,-0.035449
2018-01-09 18:00:00,2.525827,0.084194
2018-01-11 00:00:00,-3.070194,-0.10234
2018-01-12 06:00:00,-2.712082,-0.090403


In [72]:
r.agg({'A' : np.sum,
       'B' : lambda x: np.std(x, ddof=1)})

Unnamed: 0,A,B
2018-01-01 00:00:00,-4.365619,1.110206
2018-01-02 06:00:00,-5.970191,0.855337
2018-01-03 12:00:00,-2.639714,0.864737
2018-01-04 18:00:00,-3.085624,0.780075
2018-01-06 00:00:00,-10.339576,0.814235
2018-01-07 06:00:00,7.570282,1.117953
2018-01-08 12:00:00,-1.063462,0.922071
2018-01-09 18:00:00,2.525827,1.054654
2018-01-11 00:00:00,-3.070194,0.952821
2018-01-12 06:00:00,-2.712082,0.839172


In [73]:
r.agg({'A' : 'sum', 'B' : 'std'})

Unnamed: 0,A,B
2018-01-01 00:00:00,-4.365619,1.110206
2018-01-02 06:00:00,-5.970191,0.855337
2018-01-03 12:00:00,-2.639714,0.864737
2018-01-04 18:00:00,-3.085624,0.780075
2018-01-06 00:00:00,-10.339576,0.814235
2018-01-07 06:00:00,7.570282,1.117953
2018-01-08 12:00:00,-1.063462,0.922071
2018-01-09 18:00:00,2.525827,1.054654
2018-01-11 00:00:00,-3.070194,0.952821
2018-01-12 06:00:00,-2.712082,0.839172


In [74]:
r.agg({'A' : ['sum','std'], 
       'B' : ['mean','std'] })

Unnamed: 0_level_0,A,A,B,B
Unnamed: 0_level_1,sum,std,mean,std
2018-01-01 00:00:00,-4.365619,1.233464,-0.056345,1.110206
2018-01-02 06:00:00,-5.970191,0.916564,0.20672,0.855337
2018-01-03 12:00:00,-2.639714,0.997241,-0.039319,0.864737
2018-01-04 18:00:00,-3.085624,0.780231,0.123491,0.780075
2018-01-06 00:00:00,-10.339576,1.081032,0.243819,0.814235
2018-01-07 06:00:00,7.570282,0.839099,0.201871,1.117953
2018-01-08 12:00:00,-1.063462,0.974744,0.033659,0.922071
2018-01-09 18:00:00,2.525827,0.896909,0.128494,1.054654
2018-01-11 00:00:00,-3.070194,1.130699,-0.1225,0.952821
2018-01-12 06:00:00,-2.712082,1.167818,0.107877,0.839172


In [75]:
index = pd.MultiIndex.from_arrays(
    [[1,2,3,4,5], 
    pd.date_range('2015-01-01', freq='W', periods=5)], 
    names=['v','d'])

data = {'date': pd.date_range('2015-01-01', freq='W', periods=5), 
        'a': np.arange(5)}

df = pd.DataFrame(data=data,index=index)
df

Unnamed: 0_level_0,Unnamed: 1_level_0,date,a
v,d,Unnamed: 2_level_1,Unnamed: 3_level_1
1,2015-01-04,2015-01-04,0
2,2015-01-11,2015-01-11,1
3,2015-01-18,2015-01-18,2
4,2015-01-25,2015-01-25,3
5,2015-02-01,2015-02-01,4


In [76]:
df.resample('M', on='date').sum()

Unnamed: 0_level_0,a
date,Unnamed: 1_level_1
2015-01-31,6
2015-02-28,4


## 6 `Period` 时间跨度

- 整数加减会按 `Period` 自身的 `freq` 平移；当 `freq` 为日或更高频率 `(D, H, T, S, L, U, N)` 时，还可以加减 `offset` 和 `timedelta` 类对象（结果须保持相同的 `freq`）;

In [77]:
p = pd.Period('2018', freq='A-DEC')

display(p, p + 1)

Period('2018', 'A-DEC')

Period('2019', 'A-DEC')

In [78]:
p = pd.Period('2018-1-1', freq='D')

display(p, p-3)

Period('2018-01-01', 'D')

Period('2017-12-29', 'D')

In [79]:
p = pd.Period('2018-1-1 19:00', freq='H')

In [80]:
pd.Period('2018-1-1 19:00', freq='5H')

Period('2018-01-01 19:00', '5H')

- `offset` 和 `timedelta` 的区别

In [81]:
p = pd.Period('2014-07-01 09:00', freq='H')

p1 = p + Hour(2)
p2 = p + timedelta(minutes=120)
p3 = p + np.timedelta64(7200, 's')

display(
    p,
    p1,
    p2,
    p3,
)

# 只有频率相同的才可以相加
try:
    p + Minute(5)
except Exception as e:
    print(f"Error: {e}")

Period('2014-07-01 09:00', 'H')

Period('2014-07-01 11:00', 'H')

Period('2014-07-01 11:00', 'H')

Period('2014-07-01 11:00', 'H')

Error: Input cannot be converted to Period(freq=H)


In [82]:
# 月份差异
p = pd.Period('2014-07', freq='M')
p1 = p + MonthEnd(3)
try:
    p2 = p + MonthBegin(3)
except Exception as e:
    print(f"Error: {e}")

display(p1)

Error: Input has different freq=3MS from Period(freq=M)


Period('2014-10', 'M')

In [83]:
p1 = pd.Period('2012', freq='A-DEC')
p2 = pd.Period('2002', freq='A-DEC')

display(p2 - p1)

-10

### 6.2 `PeriodIndex` 周期索引与 `period_range` 周期范围

In [84]:
prng = pd.period_range('1/1/2011', '1/1/2012', 
                       freq='M')
prng

PeriodIndex(['2011-01', '2011-02', '2011-03', '2011-04', '2011-05', '2011-06',
             '2011-07', '2011-08', '2011-09', '2011-10', '2011-11', '2011-12',
             '2012-01'],
            dtype='period[M]', freq='M')

In [85]:
pd.PeriodIndex(['2011-1', '2011-2', '2011-3'], 
               freq='M')

PeriodIndex(['2011-01', '2011-02', '2011-03'], dtype='period[M]', freq='M')

In [86]:
# pd.PeriodIndex(start=..., periods=...) is no longer supported by pandas;
# pd.period_range is the supported way to generate a regular range of periods.
pd.period_range(start='2014-01',
                freq='3M',
                periods=4)

PeriodIndex(['2014-01', '2014-04', '2014-07', '2014-10'], dtype='period[3M]', freq='3M')

In [87]:
# pd.PeriodIndex(start=..., end=...) is no longer supported by pandas;
# pd.period_range takes the same start/end/freq arguments.
# The quarterly endpoints are converted to the monthly target frequency.
pd.period_range(
    start=pd.Period('2017Q1', freq='Q'),
    end=pd.Period('2017Q2', freq='Q'),
    freq='M')

PeriodIndex(['2017-03', '2017-04', '2017-05', '2017-06'], dtype='period[M]', freq='M')

In [88]:
# 创建 series
ps = pd.Series(np.random.randn(len(prng)), prng)
ps

2011-01   -0.534796
2011-02    0.591118
2011-03   -1.171746
2011-04   -1.731343
2011-05    0.942469
2011-06    0.264055
2011-07    0.097198
2011-08    1.623017
2011-09   -0.499368
2011-10   -0.503337
2011-11   -0.081254
2011-12    0.438441
2012-01    0.464529
Freq: M, dtype: float64

In [89]:
# PeriodIndex 支持加减法
idx = pd.period_range('2014-07-01 09:00', periods=5, freq='H')

display(
    idx,
    idx+Hour(2),
)

PeriodIndex(['2014-07-01 09:00', '2014-07-01 10:00', '2014-07-01 11:00',
             '2014-07-01 12:00', '2014-07-01 13:00'],
            dtype='period[H]', freq='H')

PeriodIndex(['2014-07-01 11:00', '2014-07-01 12:00', '2014-07-01 13:00',
             '2014-07-01 14:00', '2014-07-01 15:00'],
            dtype='period[H]', freq='H')

In [90]:
idx = pd.period_range('2014-07', periods=5, freq='M')

display(
    idx,
    idx + MonthEnd(3),
)

PeriodIndex(['2014-07', '2014-08', '2014-09', '2014-10', '2014-11'], dtype='period[M]', freq='M')

PeriodIndex(['2014-10', '2014-11', '2014-12', '2015-01', '2015-02'], dtype='period[M]', freq='M')

### 6.3 `Period Dtype`

- `PeriodIndex` 具有自定义的 `period` dtype，它是一种 pandas 扩展 dtype，类似于时区感知的 dtype（`datetime64[ns, tz]`）

In [91]:
pi = pd.period_range('2016-01-01', periods=3, freq='M')

display(
    pi,
    pi.dtype,
    pi.astype('period[D]'),
    pi.astype('datetime64[ns]'),  #转换为datetime
)

PeriodIndex(['2016-01', '2016-02', '2016-03'], dtype='period[M]', freq='M')

period[M]

PeriodIndex(['2016-01-31', '2016-02-29', '2016-03-31'], dtype='period[D]', freq='D')

DatetimeIndex(['2016-01-01', '2016-02-01', '2016-03-01'], dtype='datetime64[ns]', freq='MS')

In [92]:
# datetimeIndex => periodIndex
dti = pd.date_range('2011-01-01', freq='M', periods=3)

display(
    dti,
    dti.astype('period[M]')
)

DatetimeIndex(['2011-01-31', '2011-02-28', '2011-03-31'], dtype='datetime64[ns]', freq='M')

PeriodIndex(['2011-01', '2011-02', '2011-03'], dtype='period[M]', freq='M')

### 6.4 `Period` 索引

In [93]:
display(
    ps,
    ps['2011-01'],
    ps[datetime(2011, 12, 25):],
    ps['10/31/2011':'12/31/2011'],
)

2011-01   -0.534796
2011-02    0.591118
2011-03   -1.171746
2011-04   -1.731343
2011-05    0.942469
2011-06    0.264055
2011-07    0.097198
2011-08    1.623017
2011-09   -0.499368
2011-10   -0.503337
2011-11   -0.081254
2011-12    0.438441
2012-01    0.464529
Freq: M, dtype: float64

-0.5347959538144712

2011-12    0.438441
2012-01    0.464529
Freq: M, dtype: float64

2011-10   -0.503337
2011-11   -0.081254
2011-12    0.438441
Freq: M, dtype: float64

In [94]:
# 传递频率低于 PeriodIndex 的切片数据
ps['2011']

2011-01   -0.534796
2011-02    0.591118
2011-03   -1.171746
2011-04   -1.731343
2011-05    0.942469
2011-06    0.264055
2011-07    0.097198
2011-08    1.623017
2011-09   -0.499368
2011-10   -0.503337
2011-11   -0.081254
2011-12    0.438441
Freq: M, dtype: float64

In [95]:
dfp = pd.DataFrame(
    np.random.randn(600,1),               
    columns=['A'],
    index=pd.period_range('2013-01-01 9:00', periods=600, freq='T'))

dfp['2013-01-01 10H']

Unnamed: 0,A
2013-01-01 10:00,0.204561
2013-01-01 10:01,0.328049
2013-01-01 10:02,-1.019019
2013-01-01 10:03,1.404413
2013-01-01 10:04,-1.649659
2013-01-01 10:05,0.680598
2013-01-01 10:06,-0.284703
2013-01-01 10:07,-0.909449
2013-01-01 10:08,-0.75688
2013-01-01 10:09,-0.316692


In [96]:
dfp['2013-01-01 10H':'2013-01-01 11H']

Unnamed: 0,A
2013-01-01 10:00,0.204561
2013-01-01 10:01,0.328049
2013-01-01 10:02,-1.019019
2013-01-01 10:03,1.404413
2013-01-01 10:04,-1.649659
2013-01-01 10:05,0.680598
2013-01-01 10:06,-0.284703
2013-01-01 10:07,-0.909449
2013-01-01 10:08,-0.756880
2013-01-01 10:09,-0.316692


### 6.5 带 `PeriodIndex` 的频率转换与重采样

In [97]:
# 频率转换
p = pd.Period('2011', freq='A-DEC')

display(
    p,
    p.asfreq('M', how='start'),
    p.asfreq('M', how='end'),
    p.asfreq('M', 's'),
    p.asfreq('M', 'e'),

)

Period('2011', 'A-DEC')

Period('2011-01', 'M')

Period('2011-12', 'M')

Period('2011-01', 'M')

Period('2011-12', 'M')

In [98]:
# 转换为超周期
p = pd.Period('2011-12', freq='M')
p.asfreq('A-NOV')  # 年度频率

Period('2012', 'A-NOV')

In [99]:
# 定义日历季度
p = pd.Period('2012Q1', freq='Q-DEC')

display(
    p.asfreq('D', 's'),
    p.asfreq('D', 'e')
)

Period('2012-01-01', 'D')

Period('2012-03-31', 'D')

In [100]:
# 定义3月底的财政年度
p = pd.Period('2011Q4', freq='Q-MAR')

display(
    p.asfreq('D', 's'),
    p.asfreq('D', 'e')
)

Period('2011-01-01', 'D')

Period('2011-03-31', 'D')

## 7 各类时间表示之间的转换

In [101]:
rng = pd.date_range('1/1/2012', periods=5, freq='M')
ts = pd.Series(np.random.randn(len(rng)), index=rng)

ts

2012-01-31    1.180348
2012-02-29   -0.580144
2012-03-31    1.660662
2012-04-30    0.832288
2012-05-31    0.614858
Freq: M, dtype: float64

In [102]:
ps = ts.to_period()

ps

2012-01    1.180348
2012-02   -0.580144
2012-03    1.660662
2012-04    0.832288
2012-05    0.614858
Freq: M, dtype: float64

In [103]:
ps.to_timestamp()

2012-01-01    1.180348
2012-02-01   -0.580144
2012-03-01    1.660662
2012-04-01    0.832288
2012-05-01    0.614858
Freq: MS, dtype: float64

In [104]:
prng = pd.period_range('1990Q1', '2000Q4', freq='Q-NOV')
ts = pd.Series(np.random.randn(len(prng)), prng)
ts.index = (prng.asfreq('M', 'e') + 1).asfreq('H', 's') + 9
ts.head()

1990-03-01 09:00    0.048045
1990-06-01 09:00   -1.001549
1990-09-01 09:00   -0.585561
1990-12-01 09:00   -0.642919
1991-03-01 09:00   -0.864791
Freq: H, dtype: float64

## 8 代表越界范围

In [105]:
span = pd.period_range('1215-01-01', '1381-01-01', freq='D')
span

PeriodIndex(['1215-01-01', '1215-01-02', '1215-01-03', '1215-01-04',
             '1215-01-05', '1215-01-06', '1215-01-07', '1215-01-08',
             '1215-01-09', '1215-01-10',
             ...
             '1380-12-23', '1380-12-24', '1380-12-25', '1380-12-26',
             '1380-12-27', '1380-12-28', '1380-12-29', '1380-12-30',
             '1380-12-31', '1381-01-01'],
            dtype='period[D]', length=60632, freq='D')

In [106]:
s = pd.Series([20121231, 20141130, 99991231])

def conv(x):
    """Build a daily ``pd.Period`` from an integer date written as YYYYMMDD.

    The last two decimal digits are the day, the two before them the month,
    and everything above that is the year.
    """
    rest, day = divmod(x, 100)
    year, month = divmod(rest, 100)
    return pd.Period(year=year, month=month, day=day, freq='D')

display(
    s.apply(conv),
    s.apply(conv)[2]
)

0   2012-12-31
1   2014-11-30
2   9999-12-31
dtype: object

Period('9999-12-31', 'D')

In [107]:
span = pd.PeriodIndex(s.apply(conv))
span

PeriodIndex(['2012-12-31', '2014-11-30', '9999-12-31'], dtype='period[D]', freq='D')

## 9 时区处理

- python 相关库：
    - `pytz`
    - `dateutil`

#### 9.1 使用时区

In [108]:
rng = pd.date_range('3/6/2012 00:00', periods=15, freq='D')

rng.tz is None   # 默认是无时区的

True

In [109]:
from pytz import common_timezones, all_timezones

# common_timezones 
# 'Asia/Shanghai'

In [110]:
rng_pytz = pd.date_range(
    '1/1/2018 00:00', 
    periods=10, 
    freq='D', 
    tz='Asia/Shanghai',
)

rng_pytz.tz

<DstTzInfo 'Asia/Shanghai' LMT+8:06:00 STD>

In [111]:
# dateutil
rng_dateutil = pd.date_range(
    '3/6/2012 00:00', 
    periods=10, 
    freq='D',
    tz='dateutil/Asia/Shanghai'
)

rng_dateutil.tz

tzfile('PRC')

In [112]:
# dateutil - utc
import dateutil

rng_utc = pd.date_range(
    '1/1/2018 00:00', 
    periods=10, 
    freq='D',
    tz=dateutil.tz.tzutc(),
)

rng_utc.tz

tzutc()

In [113]:
import pytz

tz_pytz = pytz.timezone('Asia/Shanghai')
rng_pytz = pd.date_range(
    '1/1/2018 00:00', 
    periods=10, 
    freq='D', 
    tz=tz_pytz
)

display(rng_pytz.tz == tz_pytz)

True

In [114]:
# dateutil
tz_dateutil = dateutil.tz.gettz('Asia/Shanghai')
rng_dateutil = pd.date_range(
    '1/1/2018 00:00', 
    periods=10, 
    freq='D', 
    tz=tz_dateutil
)

rng_dateutil.tz == tz_dateutil

True

In [115]:
ts = pd.Series(np.random.randn(len(rng)), rng)
ts_utc = ts.tz_localize('UTC')
ts_utc

2012-03-06 00:00:00+00:00    1.012146
2012-03-07 00:00:00+00:00   -0.388405
2012-03-08 00:00:00+00:00    0.912588
2012-03-09 00:00:00+00:00    1.878029
2012-03-10 00:00:00+00:00   -1.075077
2012-03-11 00:00:00+00:00   -1.163911
2012-03-12 00:00:00+00:00   -0.749080
2012-03-13 00:00:00+00:00    0.122951
2012-03-14 00:00:00+00:00   -0.123391
2012-03-15 00:00:00+00:00    0.292034
2012-03-16 00:00:00+00:00    0.798669
2012-03-17 00:00:00+00:00    1.660840
2012-03-18 00:00:00+00:00   -0.033446
2012-03-19 00:00:00+00:00   -0.293585
2012-03-20 00:00:00+00:00   -2.237755
Freq: D, dtype: float64

In [116]:
rng_asia = ts_utc.tz_convert('Asia/Shanghai')
rng_asia

2012-03-06 08:00:00+08:00    1.012146
2012-03-07 08:00:00+08:00   -0.388405
2012-03-08 08:00:00+08:00    0.912588
2012-03-09 08:00:00+08:00    1.878029
2012-03-10 08:00:00+08:00   -1.075077
2012-03-11 08:00:00+08:00   -1.163911
2012-03-12 08:00:00+08:00   -0.749080
2012-03-13 08:00:00+08:00    0.122951
2012-03-14 08:00:00+08:00   -0.123391
2012-03-15 08:00:00+08:00    0.292034
2012-03-16 08:00:00+08:00    0.798669
2012-03-17 08:00:00+08:00    1.660840
2012-03-18 08:00:00+08:00   -0.033446
2012-03-19 08:00:00+08:00   -0.293585
2012-03-20 08:00:00+08:00   -2.237755
Freq: D, dtype: float64

In [117]:
rng_asia[1]

-0.38840490543240985

In [118]:
rng_europe = rng_asia.index[1].tz_convert('Europe/Berlin')
rng_asia = rng_europe.tz_convert('Asia/Shanghai')

display(
    rng_europe,
    rng_asia,
)

Timestamp('2012-03-07 01:00:00+0100', tz='Europe/Berlin')

Timestamp('2012-03-07 08:00:00+0800', tz='Asia/Shanghai')

- 删除时区

In [119]:
# pd.DatetimeIndex(start=..., periods=..., freq=...) is no longer supported by
# pandas; pd.date_range is the supported way to generate a regular range.
# Lowercase 'h' is accepted by old and new pandas; uppercase 'H' is deprecated.
didx = pd.date_range(
    start='2014-08-01 09:00',
    freq='h',
    periods=10,
    tz='US/Eastern')
didx

DatetimeIndex(['2014-08-01 09:00:00-04:00', '2014-08-01 10:00:00-04:00',
               '2014-08-01 11:00:00-04:00', '2014-08-01 12:00:00-04:00',
               '2014-08-01 13:00:00-04:00', '2014-08-01 14:00:00-04:00',
               '2014-08-01 15:00:00-04:00', '2014-08-01 16:00:00-04:00',
               '2014-08-01 17:00:00-04:00', '2014-08-01 18:00:00-04:00'],
              dtype='datetime64[ns, US/Eastern]', freq='H')

In [120]:
# 转换为本地无时区时间
didx.tz_localize(None)

DatetimeIndex(['2014-08-01 09:00:00', '2014-08-01 10:00:00',
               '2014-08-01 11:00:00', '2014-08-01 12:00:00',
               '2014-08-01 13:00:00', '2014-08-01 14:00:00',
               '2014-08-01 15:00:00', '2014-08-01 16:00:00',
               '2014-08-01 17:00:00', '2014-08-01 18:00:00'],
              dtype='datetime64[ns]', freq='H')

In [121]:
didx.tz_convert(None)

DatetimeIndex(['2014-08-01 13:00:00', '2014-08-01 14:00:00',
               '2014-08-01 15:00:00', '2014-08-01 16:00:00',
               '2014-08-01 17:00:00', '2014-08-01 18:00:00',
               '2014-08-01 19:00:00', '2014-08-01 20:00:00',
               '2014-08-01 21:00:00', '2014-08-01 22:00:00'],
              dtype='datetime64[ns]', freq='H')

In [122]:
# Convert to UTC first, then drop the timezone, leaving naive UTC wall times.
# 'UTC' is the canonical zone name; the original 'UCT' is a legacy alias that
# reads like a typo.
didx.tz_convert('UTC').tz_localize(None)

DatetimeIndex(['2014-08-01 13:00:00', '2014-08-01 14:00:00',
               '2014-08-01 15:00:00', '2014-08-01 16:00:00',
               '2014-08-01 17:00:00', '2014-08-01 18:00:00',
               '2014-08-01 19:00:00', '2014-08-01 20:00:00',
               '2014-08-01 21:00:00', '2014-08-01 22:00:00'],
              dtype='datetime64[ns]', freq='H')

#### 9.2 本地化（`tz_localize`）时的歧义时间

In [123]:
rng_hourly = pd.DatetimeIndex(
    ['11/06/2011 00:00', '11/06/2011 01:00',
    '11/06/2011 01:00', '11/06/2011 02:00',
    '11/06/2011 03:00'])
try:
    rng_hourly.tz_localize('US/Eastern')
except Exception as e:
    print(f"Error: {e}")

Error: Cannot infer dst time from '2011-11-06 01:00:00', try using the 'ambiguous' argument


In [124]:
rng_hourly_eastern = rng_hourly.tz_localize('US/Eastern', ambiguous='infer')
rng_hourly_eastern

DatetimeIndex(['2011-11-06 00:00:00-04:00', '2011-11-06 01:00:00-05:00',
               '2011-11-06 01:00:00-05:00', '2011-11-06 02:00:00-05:00',
               '2011-11-06 03:00:00-05:00'],
              dtype='datetime64[ns, US/Eastern]', freq=None)

In [125]:
rng_hourly_dst = np.array([1, 1, 0, 0, 0])
rng_hourly.tz_localize('US/Eastern', ambiguous=rng_hourly_dst).tolist()

[Timestamp('2011-11-06 00:00:00-0400', tz='US/Eastern'),
 Timestamp('2011-11-06 01:00:00-0400', tz='US/Eastern'),
 Timestamp('2011-11-06 01:00:00-0500', tz='US/Eastern'),
 Timestamp('2011-11-06 02:00:00-0500', tz='US/Eastern'),
 Timestamp('2011-11-06 03:00:00-0500', tz='US/Eastern')]

In [126]:
rng_hourly.tz_localize('US/Eastern', ambiguous='NaT').tolist()

[Timestamp('2011-11-06 00:00:00-0400', tz='US/Eastern'),
 NaT,
 NaT,
 Timestamp('2011-11-06 02:00:00-0500', tz='US/Eastern'),
 Timestamp('2011-11-06 03:00:00-0500', tz='US/Eastern')]

In [127]:
# pd.DatetimeIndex(start=..., periods=..., freq=...) is no longer supported by
# pandas; pd.date_range is the supported way to generate a regular range.
# Lowercase 'h' is accepted by old and new pandas; uppercase 'H' is deprecated.
didx = pd.date_range(start='2014-08-01 09:00', freq='h', periods=10, tz='US/Eastern')
didx

DatetimeIndex(['2014-08-01 09:00:00-04:00', '2014-08-01 10:00:00-04:00',
               '2014-08-01 11:00:00-04:00', '2014-08-01 12:00:00-04:00',
               '2014-08-01 13:00:00-04:00', '2014-08-01 14:00:00-04:00',
               '2014-08-01 15:00:00-04:00', '2014-08-01 16:00:00-04:00',
               '2014-08-01 17:00:00-04:00', '2014-08-01 18:00:00-04:00'],
              dtype='datetime64[ns, US/Eastern]', freq='H')

In [128]:
# 转换为之前的时区设置时间
didx.tz_localize(None)

DatetimeIndex(['2014-08-01 09:00:00', '2014-08-01 10:00:00',
               '2014-08-01 11:00:00', '2014-08-01 12:00:00',
               '2014-08-01 13:00:00', '2014-08-01 14:00:00',
               '2014-08-01 15:00:00', '2014-08-01 16:00:00',
               '2014-08-01 17:00:00', '2014-08-01 18:00:00'],
              dtype='datetime64[ns]', freq='H')

In [129]:
# 转换为UTC的无时区时间
didx.tz_convert(None)

DatetimeIndex(['2014-08-01 13:00:00', '2014-08-01 14:00:00',
               '2014-08-01 15:00:00', '2014-08-01 16:00:00',
               '2014-08-01 17:00:00', '2014-08-01 18:00:00',
               '2014-08-01 19:00:00', '2014-08-01 20:00:00',
               '2014-08-01 21:00:00', '2014-08-01 22:00:00'],
              dtype='datetime64[ns]', freq='H')

In [130]:
# Convert to UTC first, then drop the timezone, leaving naive UTC wall times.
# 'UTC' is the canonical zone name; the original 'UCT' is a legacy alias that
# reads like a typo.
didx.tz_convert('UTC').tz_localize(None)

DatetimeIndex(['2014-08-01 13:00:00', '2014-08-01 14:00:00',
               '2014-08-01 15:00:00', '2014-08-01 16:00:00',
               '2014-08-01 17:00:00', '2014-08-01 18:00:00',
               '2014-08-01 19:00:00', '2014-08-01 20:00:00',
               '2014-08-01 21:00:00', '2014-08-01 22:00:00'],
              dtype='datetime64[ns]', freq='H')

#### 9.3 时区感知（TZ-aware）的 Dtype

In [131]:
s_naive = pd.Series(pd.date_range('20130101',periods=3))
s_naive

0   2013-01-01
1   2013-01-02
2   2013-01-03
dtype: datetime64[ns]

In [132]:
s_aware = pd.Series(pd.date_range('20130101',periods=3,tz='US/Eastern'))
s_aware

0   2013-01-01 00:00:00-05:00
1   2013-01-02 00:00:00-05:00
2   2013-01-03 00:00:00-05:00
dtype: datetime64[ns, US/Eastern]

In [133]:
s_naive.dt.tz_localize('UTC').dt.tz_convert('US/Eastern')

0   2012-12-31 19:00:00-05:00
1   2013-01-01 19:00:00-05:00
2   2013-01-02 19:00:00-05:00
dtype: datetime64[ns, US/Eastern]

In [134]:
s_naive.astype('datetime64[ns, US/Eastern]')

0   2012-12-31 19:00:00-05:00
1   2013-01-01 19:00:00-05:00
2   2013-01-02 19:00:00-05:00
dtype: datetime64[ns, US/Eastern]

In [135]:
s_aware.astype('datetime64[ns]')

0   2013-01-01 05:00:00
1   2013-01-02 05:00:00
2   2013-01-03 05:00:00
dtype: datetime64[ns]

In [136]:
s_aware.astype('datetime64[ns, CET]')

0   2013-01-01 06:00:00+01:00
1   2013-01-02 06:00:00+01:00
2   2013-01-03 06:00:00+01:00
dtype: datetime64[ns, CET]

In [137]:
s_naive.values  # naive timestamps: the printed values match s_naive itself, no UTC conversion is applied

array(['2013-01-01T00:00:00.000000000', '2013-01-02T00:00:00.000000000',
       '2013-01-03T00:00:00.000000000'], dtype='datetime64[ns]')

In [138]:
s_aware.values  # 转换为utc

array(['2013-01-01T05:00:00.000000000', '2013-01-02T05:00:00.000000000',
       '2013-01-03T05:00:00.000000000'], dtype='datetime64[ns]')

In [139]:
pd.Series(s_aware.values)

0   2013-01-01 05:00:00
1   2013-01-02 05:00:00
2   2013-01-03 05:00:00
dtype: datetime64[ns]

In [140]:
pd.Series(s_aware.values).dt.tz_localize('UTC').dt.tz_convert('US/Eastern')

0   2013-01-01 00:00:00-05:00
1   2013-01-02 00:00:00-05:00
2   2013-01-03 00:00:00-05:00
dtype: datetime64[ns, US/Eastern]