In [16]:
import pandas as pd
import numpy as np

## 把日期和时间拆成多个特征

In [2]:
# One timestamp per week; 'W' is the canonical weekly alias (anchored on Sundays),
# whereas the lowercase spelling is not a documented alias.
df = pd.DataFrame()
df['date'] = pd.date_range('1/1/2001', periods=150, freq='W')
df

Unnamed: 0,date
0,2001-01-07
1,2001-01-14
2,2001-01-21
3,2001-01-28
4,2001-02-04
...,...
145,2003-10-19
146,2003-10-26
147,2003-11-02
148,2003-11-09


In [5]:
# The .dt accessor exposes the datetime properties of the column
df['date'].dt

<pandas.core.indexes.accessors.DatetimeProperties object at 0x7fb2d194ae10>

In [9]:
# Break the timestamp column into calendar components via the .dt accessor
date_parts = df['date'].dt
df['year'] = date_parts.year
df['month'] = date_parts.month
df['day'] = date_parts.day
df['weekday'] = date_parts.weekday  # Monday=0 ... Sunday=6
df['hour'] = date_parts.hour

In [11]:
# Preview the first three rows together with the derived columns
df.head(3)

Unnamed: 0,date,year,month,day,weekday,hour
0,2001-01-07,2001,1,7,6,0
1,2001-01-14,2001,1,14,6,0
2,2001-01-21,2001,1,21,6,0


## 计算日期时间之间的差

In [12]:
# Two visits, each with an arrival and a departure timestamp
df = pd.DataFrame({
    'Arrived': [pd.Timestamp('01-01-2017'), pd.Timestamp('01-04-2017')],
    'Left': [pd.Timestamp('01-01-2017'), pd.Timestamp('01-06-2017')],
})
# Elapsed time between the two columns (a timedelta Series)
df['Left'] - df['Arrived']

0   0 days
1   2 days
dtype: timedelta64[ns]

In [14]:
# Whole-day component of each interval, read from the vectorized timedelta
# accessor instead of looping over the Timedelta objects in Python
(df['Left'] - df['Arrived']).dt.days

0    0
1    2
dtype: int64

## 将字符串转换为日期


```
代码	描述	示例
%Y	整年	2001
%m	零填充的月份	04
%d	零填充的日期	09
%I	零填充的小时（12 小时）	02
%p	AM 或 PM	AM
%M	零填充的分钟	05
%S	零填充的秒钟	09
```

如果设置 errors="coerce"，那么转换出错时不会抛出异常（默认行为 errors="raise" 会抛出异常），而是将导致错误的值设置为 NaT（即缺失值）。

In [17]:
# Day-first timestamps written with a 12-hour clock and an AM/PM marker
date_strings = np.array([
    '03-04-2005 11:35 PM',
    '23-05-2010 12:01 AM',
    '04-09-2009 09:09 PM',
])
# Parse every string with an explicit format so day and month are not swapped
datetime_format = "%d-%m-%Y %I:%M %p"
[pd.to_datetime(raw, format=datetime_format) for raw in date_strings]

[Timestamp('2005-04-03 23:35:00'),
 Timestamp('2010-05-23 00:01:00'),
 Timestamp('2009-09-04 21:09:00')]

## 转换 pandas 列的时区

In [18]:
# pytz exposes the time zone names it knows about as a list
from pytz import all_timezones

# First ten names
all_timezones[:10]

['Africa/Abidjan',
 'Africa/Accra',
 'Africa/Addis_Ababa',
 'Africa/Algiers',
 'Africa/Asmara',
 'Africa/Asmera',
 'Africa/Bamako',
 'Africa/Bangui',
 'Africa/Banjul',
 'Africa/Bissau']

In [23]:
# Last ten names
all_timezones[-10:]

['US/Indiana-Starke',
 'US/Michigan',
 'US/Mountain',
 'US/Pacific',
 'US/Samoa',
 'UTC',
 'Universal',
 'W-SU',
 'WET',
 'Zulu']

In [19]:
# Ten month-end timestamps. An explicit offset object is used instead of the
# 'M' string alias, which newer pandas releases deprecate.
dates = pd.Series(pd.date_range('2020/2/2', periods=10, freq=pd.offsets.MonthEnd()))
dates

0   2020-02-29
1   2020-03-31
2   2020-04-30
3   2020-05-31
4   2020-06-30
5   2020-07-31
6   2020-08-31
7   2020-09-30
8   2020-10-31
9   2020-11-30
dtype: datetime64[ns]

In [21]:
# Attach a time zone to the naive timestamps (wall-clock values are kept as-is)
dates_with_Abidjan_time_zone = dates.dt.tz_localize('Africa/Abidjan')
dates_with_Abidjan_time_zone

0   2020-02-29 00:00:00+00:00
1   2020-03-31 00:00:00+00:00
2   2020-04-30 00:00:00+00:00
3   2020-05-31 00:00:00+00:00
4   2020-06-30 00:00:00+00:00
5   2020-07-31 00:00:00+00:00
6   2020-08-31 00:00:00+00:00
7   2020-09-30 00:00:00+00:00
8   2020-10-31 00:00:00+00:00
9   2020-11-30 00:00:00+00:00
dtype: datetime64[ns, Africa/Abidjan]

In [24]:
# Convert the zone-aware timestamps to another time zone
dates_with_Pacific_tz = dates_with_Abidjan_time_zone.dt.tz_convert('US/Pacific')
dates_with_Pacific_tz

0   2020-02-28 16:00:00-08:00
1   2020-03-30 17:00:00-07:00
2   2020-04-29 17:00:00-07:00
3   2020-05-30 17:00:00-07:00
4   2020-06-29 17:00:00-07:00
5   2020-07-30 17:00:00-07:00
6   2020-08-30 17:00:00-07:00
7   2020-09-29 17:00:00-07:00
8   2020-10-30 17:00:00-07:00
9   2020-11-29 16:00:00-08:00
dtype: datetime64[ns, US/Pacific]

## 编码星期

In [25]:
# Build the sample data: three month-end timestamps. An explicit offset object
# is used instead of the 'M' string alias, which newer pandas releases deprecate.
dates = pd.Series(pd.date_range('2/2/2002', periods=3, freq=pd.offsets.MonthEnd()))
dates

0   2002-02-28
1   2002-03-31
2   2002-04-30
dtype: datetime64[ns]

In [32]:
dates.dt.dayofweek  # day of the week as an integer: Monday=0, Sunday=6

0    3
1    6
2    1
dtype: int64

In [33]:
# Number of days in the month each timestamp falls in
dates.dt.days_in_month

0    28
1    31
2    30
dtype: int64

In [None]:
# Day-of-week names (e.g. 'Thursday'), the textual counterpart of dayofweek.
# The previous expression `dates.dt.w` was incomplete and raised AttributeError.
dates.dt.day_name()

## 处理时间序列中的缺失值

In [35]:
# Month-end DatetimeIndex. An explicit offset object is used instead of the
# 'M' string alias, which newer pandas releases deprecate.
time_index = pd.date_range('01/01/2010', periods=5, freq=pd.offsets.MonthEnd())
# Empty frame that carries only the time index; columns are added later
df = pd.DataFrame(index=time_index)
df

2010-01-31
2010-02-28
2010-03-31
2010-04-30
2010-05-31


In [36]:
# Sales series with two consecutive missing values in the middle
df['Sales'] = [1., 2., np.nan, np.nan, 5.]
df

Unnamed: 0,Sales
2010-01-31,1.0
2010-02-28,2.0
2010-03-31,
2010-04-30,
2010-05-31,5.0


In [42]:
# IPython help syntax: show the signature and docstring of DataFrame.interpolate
df.interpolate?

Signature:
df.interpolate(
    method='linear',
    axis=0,
    limit=None,
    inplace=False,
    limit_direction='forward',
    limit_area=None,
    downcast=None,
    **kwargs,
)
Docstring:
Interpolate values according to different methods.

Please note that only ``method='linear'`` is supported for
DataFrame/Series with a MultiIndex.

Parameters
----------
method : str, default 'linear'

In [37]:
# Fill the missing values by interpolation (default method is 'linear')
df.interpolate()

Unnamed: 0,Sales
2010-01-31,1.0
2010-02-28,2.0
2010-03-31,3.0
2010-04-30,4.0
2010-05-31,5.0


In [39]:
# Forward fill: carry the last known value forward into the gaps
df.ffill()  # same as fillna(method='ffill')

Unnamed: 0,Sales
2010-01-31,1.0
2010-02-28,2.0
2010-03-31,2.0
2010-04-30,2.0
2010-05-31,5.0


In [41]:
# Backward fill: pull the next known value back into the gaps
df.bfill()

Unnamed: 0,Sales
2010-01-31,1.0
2010-02-28,2.0
2010-03-31,5.0
2010-04-30,5.0
2010-05-31,5.0


In [43]:
# Interpolate at most one consecutive missing value, working forward from the last known value
df.interpolate(limit=1, limit_direction='forward')

Unnamed: 0,Sales
2010-01-31,1.0
2010-02-28,2.0
2010-03-31,3.0
2010-04-30,
2010-05-31,5.0


In [45]:
# Interpolate at most one consecutive missing value, working backward from the next known value
df.interpolate(limit=1, limit_direction='backward')

Unnamed: 0,Sales
2010-01-31,1.0
2010-02-28,2.0
2010-03-31,
2010-04-30,4.0
2010-05-31,5.0


## 平移时间特征

In [46]:
# Five consecutive daily observations of a stock price
df = pd.DataFrame({
    'dates': pd.date_range('1/1/2001', periods=5, freq='D'),
    'stock_price': [1.1, 2.2, 3.3, 4.4, 5.5],
})
df

Unnamed: 0,dates,stock_price
0,2001-01-01,1.1
1,2001-01-02,2.2
2,2001-01-03,3.3
3,2001-01-04,4.4
4,2001-01-05,5.5


In [48]:
# Lag the price by one row so each row also carries the previous row's value
df['previous_day_stock_price'] = df['stock_price'].shift(1)
df

Unnamed: 0,dates,stock_price,previous_day_stock_price
0,2001-01-01,1.1,
1,2001-01-02,2.2,1.1
2,2001-01-03,3.3,2.2
3,2001-01-04,4.4,3.3
4,2001-01-05,5.5,4.4


In [50]:
# Row-to-row change in price; the first row has no predecessor, hence NaN
df['stock_price'].diff()

0    NaN
1    1.1
2    1.1
3    1.1
4    1.1
Name: stock_price, dtype: float64

## 滑动时间窗口

In [52]:
# Month-end DatetimeIndex. An explicit offset object is used instead of the
# 'M' string alias, which newer pandas releases deprecate.
time_index = pd.date_range('01/01/2020', periods=5, freq=pd.offsets.MonthEnd())

# One price observation per month end
df = pd.DataFrame(index=time_index)
df['stock_price'] = [1, 2, 3, 4, 5]
df

Unnamed: 0,stock_price
2020-01-31,1
2020-02-29,2
2020-03-31,3
2020-04-30,4
2020-05-31,5


In [53]:
# Rolling mean over a window of two rows (the first row has no full window)
df.rolling(window=2).mean()

Unnamed: 0,stock_price
2020-01-31,
2020-02-29,1.5
2020-03-31,2.5
2020-04-30,3.5
2020-05-31,4.5


In [54]:
# Rolling maximum over a window of two rows
df.rolling(window=2).max()

Unnamed: 0,stock_price
2020-01-31,
2020-02-29,2.0
2020-03-31,3.0
2020-04-30,4.0
2020-05-31,5.0


## 选择日期时间范围

In [56]:
df = pd.DataFrame()

# 100,000 hourly timestamps. An explicit offset object is used instead of the
# 'H' string alias, which newer pandas releases deprecate.
df['date'] = pd.date_range('01/01/2001', periods=100000, freq=pd.offsets.Hour())
df

Unnamed: 0,date
0,2001-01-01 00:00:00
1,2001-01-01 01:00:00
2,2001-01-01 02:00:00
3,2001-01-01 03:00:00
4,2001-01-01 04:00:00
...,...
99995,2012-05-29 11:00:00
99996,2012-05-29 12:00:00
99997,2012-05-29 13:00:00
99998,2012-05-29 14:00:00


In [57]:
# Keep the observations strictly between two timestamps (both bounds excluded)
after_start = df['date'] > '2001-01-01 01:00:00'
before_end = df['date'] < '2001-01-05 02:00:00'
df[after_start & before_end]

Unnamed: 0,date
2,2001-01-01 02:00:00
3,2001-01-01 03:00:00
4,2001-01-01 04:00:00
5,2001-01-01 05:00:00
6,2001-01-01 06:00:00
...,...
93,2001-01-04 21:00:00
94,2001-01-04 22:00:00
95,2001-01-04 23:00:00
96,2001-01-05 00:00:00


In [58]:
# Index the frame by its timestamps while keeping 'date' as a regular column
df = df.set_index('date', drop=False)
df

Unnamed: 0_level_0,date
date,Unnamed: 1_level_1
2001-01-01 00:00:00,2001-01-01 00:00:00
2001-01-01 01:00:00,2001-01-01 01:00:00
2001-01-01 02:00:00,2001-01-01 02:00:00
2001-01-01 03:00:00,2001-01-01 03:00:00
2001-01-01 04:00:00,2001-01-01 04:00:00
...,...
2012-05-29 11:00:00,2012-05-29 11:00:00
2012-05-29 12:00:00,2012-05-29 12:00:00
2012-05-29 13:00:00,2012-05-29 13:00:00
2012-05-29 14:00:00,2012-05-29 14:00:00


In [59]:
# Label-based slice on the datetime index; both endpoints are included
df.loc['2001-01-01 01:00:00':'2001-01-01 05:00:00']

Unnamed: 0_level_0,date
date,Unnamed: 1_level_1
2001-01-01 01:00:00,2001-01-01 01:00:00
2001-01-01 02:00:00,2001-01-01 02:00:00
2001-01-01 03:00:00,2001-01-01 03:00:00
2001-01-01 04:00:00,2001-01-01 04:00:00
2001-01-01 05:00:00,2001-01-01 05:00:00
