In [2]:
import numpy as np
import pandas as pd
from pandas import DataFrame
from pandas import Series
from datetime import datetime

In [3]:
# --------------1. 日期和时间数据类型及其工具------------------

In [4]:
# The datetime, time, and calendar modules are the ones most commonly used for dates and times.
# Capture the current date and time as a datetime object.
now = datetime.now()
now

datetime.datetime(2018, 6, 26, 9, 9, 5, 124394)

In [5]:
# The individual components of a datetime are available as plain attributes.
now.year, now.month, now.day, now.hour

(2018, 6, 26, 9)

In [6]:
# A timedelta represents the difference in time between two datetime objects.
from datetime import timedelta
start = datetime(2011, 1, 7)
# The single positional argument is a number of days: 7 Jan + 12 days = 19 Jan.
start + timedelta(12)

datetime.datetime(2011, 1, 19, 0, 0)

In [7]:
#-------------------1.1 字符串与时间的转换----------------------

In [8]:
# Both str() and strftime() render datetime objects (and pandas Timestamp objects) as text.
stamp = datetime(year=2011, month=1, day=3)
str(stamp)

'2011-01-03 00:00:00'

In [9]:
# Format with an explicit pattern: four-digit year, then zero-padded month and day.
stamp.strftime('%Y-%m-%d')

'2011-01-03'

In [10]:
# Converting a string to a date uses datetime.strptime with the same format codes.
value = '2011-01-03'
datetime.strptime(value, '%Y-%m-%d')

datetime.datetime(2011, 1, 3, 0, 0)

In [11]:
# The third-party dateutil package provides parser.parse, which needs no explicit format string.
from dateutil.parser import parse
parse('2011-01-03')

datetime.datetime(2011, 1, 3, 0, 0)

In [12]:
# dateutil can parse many common human-readable date representations:
parse('Jan 31, 1997 10:45 PM')

datetime.datetime(1997, 1, 31, 22, 45)

In [13]:
# Writing the day before the month is common; dayfirst=True makes '6/12' mean 6 December.
parse('6/12/2011', dayfirst=True)

datetime.datetime(2011, 12, 6, 0, 0)

In [14]:
# pandas usually works with whole arrays of dates; to_datetime parses many different
# kinds of date representation and, for a list like this one, returns a DatetimeIndex.
datestrs = ['2011-07-06 12:00:00', '2011-08-06 00:00:00']
pd.to_datetime(datestrs)

DatetimeIndex(['2011-07-06 12:00:00', '2011-08-06 00:00:00'], dtype='datetime64[ns]', freq=None)

In [15]:
# Values that should be treated as missing (None, empty strings, and the like) are handled too.
idx = pd.to_datetime(datestrs + [None])
# The appended None shows up as NaT (Not a Time), the missing value for timestamps.
idx[2]

NaT

In [16]:
# -------------2. 时间序列基础 ----------------------

In [17]:
# Six irregularly spaced days in January 2011 become the index of a series of random values.
dates = [datetime(2011, 1, day) for day in (2, 5, 7, 8, 10, 12)]
ts = pd.Series(np.random.randn(len(dates)), index=dates)
ts

2011-01-02   -1.621248
2011-01-05    0.146320
2011-01-07   -1.530208
2011-01-08   -1.418126
2011-01-10   -0.004386
2011-01-12    1.398968
dtype: float64

In [18]:
# The datetime objects have been placed into a DatetimeIndex.
ts.index

DatetimeIndex(['2011-01-02', '2011-01-05', '2011-01-07', '2011-01-08',
               '2011-01-10', '2011-01-12'],
              dtype='datetime64[ns]', freq=None)

In [19]:
type(ts) # a regular pandas Series; the recorded output shows no separate TimeSeries type

pandas.core.series.Series

In [20]:
# --------2.1 Indexing, Selection, Subsetting（索引，选择，取子集）---------------

In [21]:
# Show the series again for reference before indexing into it.
ts

2011-01-02   -1.621248
2011-01-05    0.146320
2011-01-07   -1.530208
2011-01-08   -1.418126
2011-01-10   -0.004386
2011-01-12    1.398968
dtype: float64

In [22]:
# Note: this rebinds `stamp`, which an earlier cell assigned to a plain datetime.
stamp = ts.index[2] # take the timestamp label at position 2 of the index
ts[stamp]

-1.5302084573099586

In [23]:
# Positional access: .iloc[2] returns the third value regardless of the index labels.
# A bare ts[2] relies on an integer key falling back to a position on a datetime index,
# which newer pandas releases deprecate; .iloc states the intent explicitly.
ts.iloc[2]

-1.5302084573099586

In [24]:
ts['1/10/2011'] # a date string works as a key too; here it resolves to the 2011-01-10 entry

-0.004385757398959138

In [26]:
# With longer time series, passing just a year ('2000') or a year and month ('2000-01')
# is enough to select the matching slice of data. This cell only builds a ten-day daily
# series; the partial-string selection itself is not exercised here.
longer_ts = pd.Series(np.random.randn(10),
                      index=pd.date_range('1/1/2000', periods=10))
longer_ts

2000-01-01    0.022563
2000-01-02    0.186218
2000-01-03   -0.057247
2000-01-04   -0.506013
2000-01-05    1.000048
2000-01-06   -0.976099
2000-01-07    0.611399
2000-01-08   -1.024048
2000-01-09    1.645390
2000-01-10   -0.096786
Freq: D, dtype: float64

In [27]:
# -------------2.2 重复索引的时间序列-----------------

In [28]:
# Several observations can fall on one timestamp: 2 Jan 2000 appears three times here.
dates = pd.DatetimeIndex(['1/1/2000'] + ['1/2/2000'] * 3 + ['1/3/2000'])
dup_ts = pd.Series(np.arange(len(dates)), index=dates)
dup_ts

2000-01-01    0
2000-01-02    1
2000-01-02    2
2000-01-02    3
2000-01-03    4
dtype: int32

In [29]:
# is_unique reports whether every index label occurs only once.
dup_ts.index.is_unique

False

In [30]:
# Indexing into this series yields either a scalar or a slice, depending on whether the timestamp is duplicated.
dup_ts['1/3/2000'] # not duplicated

4

In [31]:
# A duplicated timestamp returns every matching row as a Series.
dup_ts['1/2/2000'] # duplicated

2000-01-02    1
2000-01-02    2
2000-01-02    3
dtype: int32

In [35]:
# Aggregate the observations that share a timestamp by grouping on the index (level 0).
grouped = dup_ts.groupby(level=0)
grouped.mean()

2000-01-01    0
2000-01-02    2
2000-01-03    4
dtype: int32

In [33]:
# Number of observations recorded under each distinct timestamp.
grouped.count()

2000-01-01    1
2000-01-02    3
2000-01-03    1
dtype: int64

In [36]:
# -------------3 Date Ranges, Frequencies, and Shifting（日期范围，频度，和位移）------------------

In [37]:
# Show the irregularly spaced January 2011 series again before resampling it.
ts

2011-01-02   -1.621248
2011-01-05    0.146320
2011-01-07   -1.530208
2011-01-08   -1.418126
2011-01-10   -0.004386
2011-01-12    1.398968
dtype: float64

In [42]:
# Converting the sample series to a fixed daily frequency starts with resample('D').
# On its own the call only returns a Resampler object (see the recorded output); an
# aggregation or fill method has to be applied to it to obtain an actual Series.
ts.resample('D')

DatetimeIndexResampler [freq=<Day>, axis=0, closed=left, label=left, convention=start, base=0]

In [43]:
# ---------------3.1 Generating Date Ranges（生成日期范围）--------------

In [44]:
# pandas.date_range generates a DatetimeIndex; a length can be given instead of an end point.
index = pd.date_range('2012-04-01', '2012-05-01') # daily timestamps, both endpoints included
index

DatetimeIndex(['2012-04-01', '2012-04-02', '2012-04-03', '2012-04-04',
               '2012-04-05', '2012-04-06', '2012-04-07', '2012-04-08',
               '2012-04-09', '2012-04-10', '2012-04-11', '2012-04-12',
               '2012-04-13', '2012-04-14', '2012-04-15', '2012-04-16',
               '2012-04-17', '2012-04-18', '2012-04-19', '2012-04-20',
               '2012-04-21', '2012-04-22', '2012-04-23', '2012-04-24',
               '2012-04-25', '2012-04-26', '2012-04-27', '2012-04-28',
               '2012-04-29', '2012-04-30', '2012-05-01'],
              dtype='datetime64[ns]', freq='D')

In [45]:
pd.date_range(start='2012-04-01', periods=20) # pass only a start or an end date, plus the number of periods to generate

DatetimeIndex(['2012-04-01', '2012-04-02', '2012-04-03', '2012-04-04',
               '2012-04-05', '2012-04-06', '2012-04-07', '2012-04-08',
               '2012-04-09', '2012-04-10', '2012-04-11', '2012-04-12',
               '2012-04-13', '2012-04-14', '2012-04-15', '2012-04-16',
               '2012-04-17', '2012-04-18', '2012-04-19', '2012-04-20'],
              dtype='datetime64[ns]', freq='D')

In [47]:
# The normalize option snaps every generated timestamp to midnight (00:00:00).
nor_date = pd.date_range('2012-05-02 12:56:31', periods=5, normalize=True)
nor_date[0]

Timestamp('2012-05-02 00:00:00', freq='D')

In [48]:
#----------------3.2 Frequencies and Date Offsets（频度和日期偏移）---------------

In [49]:
# A pandas frequency is made up of a base frequency and a multiplier: '4H' means every four hours.
# NOTE(review): recent pandas releases prefer the lowercase alias 'h' — confirm against the target pandas version.
pd.date_range('2000-01-01', '2000-01-03 23:59', freq='4H')

DatetimeIndex(['2000-01-01 00:00:00', '2000-01-01 04:00:00',
               '2000-01-01 08:00:00', '2000-01-01 12:00:00',
               '2000-01-01 16:00:00', '2000-01-01 20:00:00',
               '2000-01-02 00:00:00', '2000-01-02 04:00:00',
               '2000-01-02 08:00:00', '2000-01-02 12:00:00',
               '2000-01-02 16:00:00', '2000-01-02 20:00:00',
               '2000-01-03 00:00:00', '2000-01-03 04:00:00',
               '2000-01-03 08:00:00', '2000-01-03 12:00:00',
               '2000-01-03 16:00:00', '2000-01-03 20:00:00'],
              dtype='datetime64[ns]', freq='4H')

In [50]:
# -------------3.3 Shifting (Leading and Lagging) Data （移动（超前与滞后）数据）------------------

In [51]:
# Four random values indexed by month-end dates (freq='M').
# This rebinds `ts`, replacing the January 2011 series used in the earlier cells.
# NOTE(review): recent pandas releases rename the 'M' alias to 'ME' — confirm against the target pandas version.
ts = pd.Series(np.random.randn(4),
               index=pd.date_range('1/1/2000', periods=4, freq='M'))
ts

2000-01-31    0.460300
2000-02-29    0.795802
2000-03-31   -0.597854
2000-04-30   -1.266302
Freq: M, dtype: float64

In [52]:
# Shifting moves the data forward or backward in time while the index stays as it is.
# A positive count pushes the values later, leaving NaN at the start.
ts.shift(2)

2000-01-31         NaN
2000-02-29         NaN
2000-03-31    0.460300
2000-04-30    0.795802
Freq: M, dtype: float64

In [54]:
# A negative count moves the values earlier, leaving NaN at the end.
ts.shift(-2)

2000-01-31   -0.597854
2000-02-29   -1.266302
2000-03-31         NaN
2000-04-30         NaN
Freq: M, dtype: float64

In [55]:
# A common use of shift is computing the percentage change of a time series.
ts / ts.shift(1) - 1

2000-01-31         NaN
2000-02-29    0.728877
2000-03-31   -1.751259
2000-04-30    1.118080
Freq: M, dtype: float64

In [56]:
# A plain shift leaves the index unmodified, so some of the data is discarded.
# When the frequency is known it can be passed to shift; the timestamps then move and the data stays intact.

ts

2000-01-31    0.460300
2000-02-29    0.795802
2000-03-31   -0.597854
2000-04-30   -1.266302
Freq: M, dtype: float64

In [57]:
ts.shift(3, freq='D')  # only the timestamps move (three days later); the values are untouched

2000-02-03    0.460300
2000-03-03    0.795802
2000-04-03   -0.597854
2000-05-03   -1.266302
dtype: float64

In [58]:
# --------------3.4 Shifting dates with offsets（用偏移量来移动日期）-----------------