# Chapter 7: Handling Dates and Times

In [1]:
import numpy as np
import pandas as pd

## 7.1 Converting Strings to Dates

In [2]:
import numpy as np
import pandas as pd

In [3]:
# Sample timestamps written day-month-year with a 12-hour clock and AM/PM suffix.
date_strings = np.array([
    "03-04-2005 11:35 PM",
    "23-05-2010 12:01 AM",
    "04-09-2009 09:09 PM",
])

#### Convert to datetimes

In [4]:
# Parse the whole array in a single vectorized call rather than once per string.
# format: day-month-year with a 12-hour clock and AM/PM marker.
# errors='coerce' turns any string that does not match into NaT instead of raising.
# .tolist() gives back a plain list of Timestamp objects for display.
pd.to_datetime(date_strings, format='%d-%m-%Y %I:%M %p', errors='coerce').tolist()

[Timestamp('2005-04-03 23:35:00'),
 Timestamp('2010-05-23 00:01:00'),
 Timestamp('2009-09-04 21:09:00')]

## 7.2 Handling Time Zones

#### Set timezone when creating datetime

In [5]:
# Build a timezone-aware timestamp by supplying tz at construction time.
pd.Timestamp(
    '2017-05-01 06:00:00',
    tz='Europe/London',
)

Timestamp('2017-05-01 06:00:00+0100', tz='Europe/London')

#### Add a timezone to an existing datetime

In [6]:
# Naive (timezone-unaware) timestamp; a zone is attached to it in the next cell.
date = pd.Timestamp(
    '2017-05-01 06:00:00'
)

Set time zone:

In [7]:
# Attach the London zone to the naive timestamp (wall-clock time is kept as-is).
dateInLondon = date.tz_localize(tz='Europe/London')

In [8]:
# Show the localized timestamp.
dateInLondon

Timestamp('2017-05-01 06:00:00+0100', tz='Europe/London')

#### Convert to a different timezone

In [9]:
# Express the same instant in another zone; the displayed clock time shifts accordingly.
dateInLondon.tz_convert(tz='Africa/Abidjan')

Timestamp('2017-05-01 05:00:00+0000', tz='Africa/Abidjan')

#### Apply `tz_localize` and `tz_convert` to every element

Create dates

In [10]:
# Three consecutive month-end dates, starting with the first month end on or after 2002-02-02.
# An offset object is passed instead of the 'M' string alias, which newer pandas
# releases deprecate in favour of 'ME'; the offset object works on both.
dates = pd.Series(pd.date_range('2/2/2002', periods=3, freq=pd.offsets.MonthEnd()))

In [12]:
# Show the generated series of dates.
dates

0   2002-02-28
1   2002-03-31
2   2002-04-30
dtype: datetime64[ns]

Set time zone

In [11]:
# The .dt accessor applies tz_localize to every element of the series at once.
dates.dt.tz_localize(tz='Africa/Abidjan')

0   2002-02-28 00:00:00+00:00
1   2002-03-31 00:00:00+00:00
2   2002-04-30 00:00:00+00:00
dtype: datetime64[ns, Africa/Abidjan]

#### See all available time zone options

In [13]:
from pytz import all_timezones

In [16]:
# Count the time zone names exposed by pytz.
len(all_timezones)

593

In [17]:
# Sample every 50th time zone name across the whole list.
# The open-ended slice avoids hard-coding the list length (593 in the run shown here),
# which would silently truncate the sample if the list were longer.
all_timezones[::50]

['Africa/Abidjan',
 'Africa/Timbuktu',
 'America/Curacao',
 'America/Managua',
 'America/Shiprock',
 'Asia/Brunei',
 'Asia/Pyongyang',
 'Australia/Currie',
 'Etc/GMT+6',
 'Europe/Ljubljana',
 'Indian/Christmas',
 'Pacific/Niue']

## 7.3 Selecting Dates and Times

In [19]:
import pandas as pd

In [20]:
# Start from an empty frame; the date column is added in the next cell.
df = pd.DataFrame()

#### Create datetimes

In [22]:
# 100,000 hourly timestamps starting at 2001-01-01 00:00.
# An offset object is passed instead of the 'H' string alias, which newer pandas
# releases deprecate in favour of 'h'; the offset object works on both.
df['date'] = pd.date_range('1/1/2001', periods=100000, freq=pd.offsets.Hour())

In [23]:
# (rows, columns) of the frame.
df.shape

(100000, 1)

#### Select observations between 2 datetimes (boolean)

In [24]:
# Keep rows strictly after 01:00 and up to and including 04:00 on 2002-01-01.
df.loc[
    df['date'].gt('2002-1-1 01:00:00')
    & df['date'].le('2002-1-1 04:00:00')
]

Unnamed: 0,date
8762,2002-01-01 02:00:00
8763,2002-01-01 03:00:00
8764,2002-01-01 04:00:00


#### Set the date column as the index and slice using `.loc[]`

In [25]:
# Index the frame by its date column; drop=False keeps 'date' as a regular column too.
df = df.set_index('date', drop=False)

In [26]:
# Label-based slice on the datetime index; both endpoints are included in the result.
df.loc[slice('2002-1-1 01:00:00', '2002-1-1 04:00:00')]

Unnamed: 0_level_0,date
date,Unnamed: 1_level_1
2002-01-01 01:00:00,2002-01-01 01:00:00
2002-01-01 02:00:00,2002-01-01 02:00:00
2002-01-01 03:00:00,2002-01-01 03:00:00
2002-01-01 04:00:00,2002-01-01 04:00:00


## 7.4 Breaking Up Date Data into Multiple Features

In [27]:
import pandas as pd

#### Split your datetimes into year, month, day, hour, minute

In [28]:
# Start from an empty frame; the date column is added in the next cell.
df = pd.DataFrame()

#### Create dates

In [35]:
# 150 weekly timestamps beginning with the first weekly anchor on or after 2001-01-01.
df['date'] = pd.date_range(start='1/1/2001', periods=150, freq='W')

In [36]:
# Preview the first rows of the frame.
df.head()

Unnamed: 0,date,date.1
0,2001-01-07,2001-01-07
1,2001-01-14,2001-01-14
2,2001-01-21,2001-01-21
3,2001-01-28,2001-01-28
4,2001-02-04,2001-02-04


#### Create features for year, month, day, hour, minute

In [37]:
# Derive one column per calendar/clock component through the .dt accessor.
df['year'] = df['date'].dt.year
df['month'] = df['date'].dt.month
df['day'] = df['date'].dt.day
df['hour'] = df['date'].dt.hour
df['minute'] = df['date'].dt.minute

In [38]:
# Preview the first rows, now including the derived date-part columns.
df.head()

Unnamed: 0,date,date.1,year,month,day,hour,minute
0,2001-01-07,2001-01-07,2001,1,7,0,0
1,2001-01-14,2001-01-14,2001,1,14,0,0
2,2001-01-21,2001-01-21,2001,1,21,0,0
3,2001-01-28,2001-01-28,2001,1,28,0,0
4,2001-02-04,2001-02-04,2001,2,4,0,0


## 7.5 Calculating the Difference Between Dates

In [39]:
import pandas as pd

In [40]:
# Start from an empty frame; the two datetime columns are added in the next cell.
df = pd.DataFrame()

#### Create two datetime features

In [44]:
# Arrival and departure timestamps for two stays.
# ISO 8601 (YYYY-MM-DD) literals are used so the month/day order is unambiguous;
# the values are the same dates as before (4 and 6 January 2017 for the second row).
df['Arrived'] = [pd.Timestamp('2017-01-01'), pd.Timestamp('2017-01-04')]
df['Left'] = [pd.Timestamp('2017-01-01'), pd.Timestamp('2017-01-06')]

In [45]:
# Show the frame with both datetime columns.
df

Unnamed: 0,Arrived,Left
0,2017-01-01,2017-01-01
1,2017-01-04,2017-01-06


#### Calculate duration between features

In [46]:
# Subtracting two datetime columns yields a timedelta per row.
df['Left'] - df['Arrived']

0   0 days
1   2 days
dtype: timedelta64[ns]

See also: pandas `Timedelta`

## 7.6 Encoding Days of the Week

In [47]:
import pandas as pd

#### Create dates

In [53]:
# Three consecutive month-end dates, starting with the first month end on or after 2002-02-02.
# An offset object is passed instead of the "M" string alias, which newer pandas
# releases deprecate in favour of "ME"; the offset object works on both.
dates = pd.Series(pd.date_range('2/2/2002', periods=3, freq=pd.offsets.MonthEnd()))

In [52]:
# Show the generated series of dates.
dates

0   2002-02-28
1   2002-03-31
2   2002-04-30
dtype: datetime64[ns]

#### Show days of the week

In [59]:
# Name of the weekday for each date in the series.
dates.dt.day_name()

0    Thursday
1      Sunday
2     Tuesday
dtype: object

#### Encode the day of the week as a number, for machine learning (Monday is 0)

In [60]:
# Integer weekday per date, counting from Monday = 0 through Sunday = 6.
dates.dt.dayofweek

0    3
1    6
2    1
dtype: int64

## 7.7 Creating a Lagged Feature

In [61]:
import pandas as pd

In [62]:
# Start from an empty frame; the columns are added in the next cell.
df = pd.DataFrame()

#### Create data

In [63]:
# Five consecutive daily dates and a matching price for each day.
df['dates'] = pd.date_range(start="1/1/2001", periods=5, freq="D")
df['stock_price'] = [1.1, 2.2, 3.3, 4.4, 5.5]

In [64]:
# Show the frame before the lagged column is added.
df

Unnamed: 0,dates,stock_price
0,2001-01-01,1.1
1,2001-01-02,2.2
2,2001-01-03,3.3
3,2001-01-04,4.4
4,2001-01-05,5.5


#### Lagged values by one row

In [65]:
# Shift prices down by one row so each row carries the previous row's price;
# the first row has no predecessor and becomes NaN.
df['previous_days_stock_price'] = df['stock_price'].shift(periods=1)

In [66]:
# Show the frame including the lagged price column.
df

Unnamed: 0,dates,stock_price,previous_days_stock_price
0,2001-01-01,1.1,
1,2001-01-02,2.2,1.1
2,2001-01-03,3.3,2.2
3,2001-01-04,4.4,3.3
4,2001-01-05,5.5,4.4


## 7.8 Using Rolling Time Windows

In [67]:
import pandas as pd

#### Create datetimes

In [68]:
# Five consecutive month-end timestamps starting with January 2010.
# An offset object is passed instead of the "M" string alias, which newer pandas
# releases deprecate in favour of "ME"; the offset object works on both.
time_index = pd.date_range("01/01/2010", periods=5, freq=pd.offsets.MonthEnd())

In [69]:
# Show the generated DatetimeIndex.
time_index

DatetimeIndex(['2010-01-31', '2010-02-28', '2010-03-31', '2010-04-30',
               '2010-05-31'],
              dtype='datetime64[ns]', freq='M')

#### Create DF, set index

In [70]:
# Frame with no columns yet, indexed by the timestamps in time_index.
df = pd.DataFrame(index=time_index)

In [71]:
# Show the frame: index only, no columns yet.
df

2010-01-31
2010-02-28
2010-03-31
2010-04-30
2010-05-31


#### Create feature

In [72]:
# One integer price per row, counting up from 1 to 5.
df['Stock_Price'] = list(range(1, 6))

In [73]:
# Show the frame with the price column.
df

Unnamed: 0,Stock_Price
2010-01-31,1
2010-02-28,2
2010-03-31,3
2010-04-30,4
2010-05-31,5


#### Calculate rolling mean

In [74]:
# Mean over a sliding window of two rows; the first row has no full window and is NaN.
df.rolling(2).mean()

Unnamed: 0,Stock_Price
2010-01-31,
2010-02-28,1.5
2010-03-31,2.5
2010-04-30,3.5
2010-05-31,4.5


## 7.9 Handling Missing Data in Time Series

In [75]:
import pandas as pd
import numpy as np

#### Create dates

In [76]:
# Five consecutive month-end timestamps starting with January 2010.
# An offset object is passed instead of the 'M' string alias, which newer pandas
# releases deprecate in favour of 'ME'; the offset object works on both.
time_index = pd.date_range('01/01/2010', periods=5, freq=pd.offsets.MonthEnd())

#### Create DF, set index

In [77]:
# Frame with no columns yet, indexed by the timestamps in time_index.
df = pd.DataFrame(index=time_index)

#### Create a feature with a gap of missing values

In [78]:
# Two known values, a two-row gap of missing values, then one more known value.
df['Sales'] = [1.0, 2.0] + [np.nan] * 2 + [5.0]

#### Interpolate missing values

In [79]:
# Fill the gap by drawing a straight line between the surrounding known values.
df.interpolate(method='linear')

Unnamed: 0,Sales
2010-01-31,1.0
2010-02-28,2.0
2010-03-31,3.0
2010-04-30,4.0
2010-05-31,5.0


#### Also: Forward-fill

In [80]:
# Forward-fill: each missing value takes the last known value before it.
df.ffill()

Unnamed: 0,Sales
2010-01-31,1.0
2010-02-28,2.0
2010-03-31,2.0
2010-04-30,2.0
2010-05-31,5.0


#### and Back-fill

In [81]:
# Back-fill: each missing value takes the next known value after it.
df.bfill()

Unnamed: 0,Sales
2010-01-31,1.0
2010-02-28,2.0
2010-03-31,5.0
2010-04-30,5.0
2010-05-31,5.0


#### Interpolate as quadratic

In [82]:
# Fill the gap with a quadratic (non-linear) interpolation instead of a straight line.
# NOTE(review): pandas presumably delegates this method to SciPy, so scipy likely
# needs to be installed for this cell to run — confirm.
df.interpolate(method='quadratic')

Unnamed: 0,Sales
2010-01-31,1.0
2010-02-28,2.0
2010-03-31,3.059808
2010-04-30,4.038069
2010-05-31,5.0


#### Interpolate part of the gap

In [83]:
# Linear interpolation, but fill at most one consecutive missing value, working forward;
# the remainder of the gap stays NaN.
df.interpolate(method='linear', limit=1, limit_direction='forward')

Unnamed: 0,Sales
2010-01-31,1.0
2010-02-28,2.0
2010-03-31,3.0
2010-04-30,
2010-05-31,5.0
