In [1]:
import numpy as np

In [2]:
import pandas as pd

In [3]:
# NOTE(review): the module import below is immediately shadowed by the class
# import on the next line, so `datetime` refers to the class from here on.
import datetime
from datetime import datetime, date
# NOTE(review): this binds the top-level matplotlib package to `plt`; the
# conventional target for that alias is matplotlib.pyplot -- confirm intent.
import matplotlib as plt
%matplotlib inline

# The datetime, day, and time objects

In [6]:
# Build a datetime from three arguments: year, month (12), day (15).
# Hour and minute default to 0.
datetime(2014, 12, 15)

datetime.datetime(2014, 12, 15, 0, 0)

In [7]:
# Year, month, day, hour, minute: 15 December 2014 at 17:30 (5:30 pm).
datetime(2014, 12, 15, 17, 30)

datetime.datetime(2014, 12, 15, 17, 30)

In [8]:
# Get the current local date and time.
# now() can also take a time zone, which is not demonstrated here.
datetime.now()

datetime.datetime(2020, 4, 13, 14, 34, 54, 886034)

In [9]:
# A date without a time component: take the date part of a datetime instance.
datetime(2014, 12, 15).date()

datetime.date(2014, 12, 15)

In [10]:
# Only the date part of the current local date and time.
datetime.now().date()

datetime.date(2020, 4, 13)

In [11]:
# Only the time-of-day part of a datetime instance.
datetime(2014, 12, 15, 17, 30).time()

datetime.time(17, 30)

In [12]:
# Only the time-of-day part of the current local date and time.
datetime.now().time()

datetime.time(14, 45, 54, 233906)

# Timestamp objects

In [19]:
"""
A pandas Timestamp is based on the datetime64 dtype
and has higher precision than the Python datetime object. In pandas, Timestamp objects
are generally interchangeable with datetime objects
"""
# Create a Timestamp for a specific date; the time part defaults to midnight.
pd.Timestamp('2014-12-15')

Timestamp('2014-12-15 00:00:00')

In [20]:
# A Timestamp carrying both a date and a time (seconds default to 0).
pd.Timestamp('2014-12-15 17:30')

Timestamp('2014-12-15 17:30:00')

In [21]:
# Only a time is given, so pandas fills in the current local date.
pd.Timestamp('17:30')

Timestamp('2020-04-13 17:30:00')

In [22]:
# The current date and time ("now") as a pandas Timestamp.
pd.Timestamp("now")

Timestamp('2020-04-13 14:59:18.616899')

# Using a Timedelta to represent a time interval

In [24]:
"""To represent a difference in time we will use the pandas Timedelta object"""

'To represent a difference in time we will use the pandas Timedelta object'

In [25]:
# Use a pandas Timedelta to step a date forward by exactly one day:
# what is one day after 2014-11-30?
one_day = pd.Timedelta(days=1)
today = datetime(2014, 11, 30)
tomorrow = today + one_day
tomorrow

datetime.datetime(2014, 12, 1, 0, 0)

In [26]:
# How many days lie between two dates?
# Subtracting one datetime from another yields a timedelta.
date1 = datetime(2014, 12, 2)
date2 = datetime(2014, 11, 28)
date1 - date2

datetime.timedelta(days=4)

# Indexing using DatetimeIndex

In [28]:
"""time-series functionality in pandas revolves around the use of specialized
indexes that represent measurements of data at one or more timestamps

These indexes in pandas are referred to as DateTimeIndex objects
"""

'time-series functionality in pandas revolves around the use of specialized\nindexes that represent measurements of data at one or more timestamps\n\nThese indexes in pandas are referred to as DateTimeIndex objects\n'

In [29]:
# Create a Series whose index is built from a list of datetime objects.
# pandas turns those datetimes into a DatetimeIndex; each element of that
# index is a Timestamp object.
dates = [datetime(2014, 8, 1), datetime(2014, 8, 2)]
ts = pd.Series(np.random.randn(2), dates)
# `ts` must stay the final expression of the cell: a trailing string literal
# would be displayed as the cell output instead of the Series.
ts

2014-08-01   -0.239395
2014-08-02    1.155256
dtype: float64

In [30]:
    # Check which index class pandas built for the Series.
type(ts.index)

pandas.core.indexes.datetimes.DatetimeIndex

In [31]:
    # Each element of that index is a pandas Timestamp.
type(ts.index[0])

pandas._libs.tslibs.timestamps.Timestamp

In [32]:
# Create the same kind of Series starting from dates written as strings.
# Strings passed directly as an index stay strings (an object-dtype Index),
# so they are converted explicitly to obtain a real DatetimeIndex.
np.random.seed(123456)
dates = ['2014-08-01', '2014-08-02']
ts = pd.Series(np.random.randn(2), pd.to_datetime(dates))
ts

2014-08-01    0.469112
2014-08-02   -0.282863
dtype: float64

In [33]:
"""
pandas provides a utility function in pd.to_datetime() which takes a sequence of
similar- or mixed-type objects which pandas attempts to convert into Timestamp objects
and those into a DatetimeIndex. If an object cannot be converted, the pandas will create a NaT
"""
# Convert a sequence of differently formatted date values to a DatetimeIndex.
# Each item is parsed on its own: when handed the whole list, pandas 2.x infers
# one format from the first element and rejects items written in another format.
# None becomes NaT in the resulting index.
raw_dates = ['Aug 1, 2014',
             '2014-08-02',
             '2014.8.3',
             None]
dti = pd.DatetimeIndex([pd.to_datetime(item) for item in raw_dates])
for stamp in dti:
    print(stamp)

2014-08-01 00:00:00
2014-08-02 00:00:00
2014-08-03 00:00:00
NaT


In [34]:
# pd.to_datetime(['Aug 1, 2014', 'foo'])
# The call above throws an error (noted against pandas 0.20.1).
# errors="coerce" forces the conversion and yields NaT for items that cannot be parsed.
pd.to_datetime(['Aug 1, 2014', 'foo'], errors="coerce")

DatetimeIndex(['2014-08-01', 'NaT'], dtype='datetime64[ns]', freq=None)

In [35]:
# Ten consecutive days starting 2014-08-01, used as the index of a Series
# of seeded random values.
np.random.seed(123456)
periods = pd.date_range(start='8/1/2014', periods=10)
date_series = pd.Series(data=np.random.randn(len(periods)), index=periods)
date_series

2014-08-01    0.469112
2014-08-02   -0.282863
2014-08-03   -1.509059
2014-08-04   -1.135632
2014-08-05    1.212112
2014-08-06   -0.173215
2014-08-07    0.119209
2014-08-08   -1.044236
2014-08-09   -0.861849
2014-08-10   -2.104569
Freq: D, dtype: float64

In [36]:
# Slice by position: rows 3 through 6, i.e. the 4th to the 7th day of the index.
subset = date_series.iloc[3:7]
subset

2014-08-04   -1.135632
2014-08-05    1.212112
2014-08-06   -0.173215
2014-08-07    0.119209
Freq: D, dtype: float64

In [37]:
# A Series to demonstrate alignment: it reuses subset's index, so each value
# in the list is paired with one of the dates in subset.
s2 = pd.Series([10, 100, 1000, 10000], subset.index)
s2

2014-08-04       10
2014-08-05      100
2014-08-06     1000
2014-08-07    10000
Freq: D, dtype: int64

In [38]:
# Alignment by date: only labels present in both Series get a sum;
# every other date in the result is NaN.
date_series + s2

2014-08-01             NaN
2014-08-02             NaN
2014-08-03             NaN
2014-08-04        8.864368
2014-08-05      101.212112
2014-08-06      999.826785
2014-08-07    10000.119209
2014-08-08             NaN
2014-08-09             NaN
2014-08-10             NaN
Freq: D, dtype: float64

In [39]:
# Two years of daily labels (2013-01-01 through 2014-12-31), every value 0.
two_year_index = pd.date_range(start='2013-01-01', end='2014-12-31')
s3 = pd.Series(0, index=two_year_index)
s3

2013-01-01    0
2013-01-02    0
2013-01-03    0
2013-01-04    0
2013-01-05    0
             ..
2014-12-27    0
2014-12-28    0
2014-12-29    0
2014-12-30    0
2014-12-31    0
Freq: D, Length: 730, dtype: int64

In [40]:
# Partial-string selection by label: every entry that falls in 2013.
s3.loc['2013']

2013-01-01    0
2013-01-02    0
2013-01-03    0
2013-01-04    0
2013-01-05    0
             ..
2013-12-27    0
2013-12-28    0
2013-12-29    0
2013-12-30    0
2013-12-31    0
Freq: D, Length: 365, dtype: int64

In [41]:
# Partial-string selection by label: the 31 entries of May 2014.
s3.loc['2014-05']

2014-05-01    0
2014-05-02    0
2014-05-03    0
2014-05-04    0
2014-05-05    0
2014-05-06    0
2014-05-07    0
2014-05-08    0
2014-05-09    0
2014-05-10    0
2014-05-11    0
2014-05-12    0
2014-05-13    0
2014-05-14    0
2014-05-15    0
2014-05-16    0
2014-05-17    0
2014-05-18    0
2014-05-19    0
2014-05-20    0
2014-05-21    0
2014-05-22    0
2014-05-23    0
2014-05-24    0
2014-05-25    0
2014-05-26    0
2014-05-27    0
2014-05-28    0
2014-05-29    0
2014-05-30    0
2014-05-31    0
Freq: D, dtype: int64

In [44]:
# Label slice covering August through September 2014 (both months included).
s3.loc['2014-08':'2014-09']

2014-08-01    0
2014-08-02    0
2014-08-03    0
2014-08-04    0
2014-08-05    0
             ..
2014-09-26    0
2014-09-27    0
2014-09-28    0
2014-09-29    0
2014-09-30    0
Freq: D, Length: 61, dtype: int64

# Creating time-series data with specific frequencies

In [54]:
# Generate a Series at one-minute intervals.
# Offset aliases are listed at:
# https://pandas.pydata.org/pandas-docs/stable/user_guide/timeseries.html#timeseries-offset-aliases
# e.g. B business day, D calendar day (the default), W weekly, min minutely.
np.random.seed(123456)
# 'min' is the one-minute alias; the older spelling 'T' is deprecated in recent pandas.
minute_index = pd.date_range('2014-08-01',
                             '2014-10-29 23:59',
                             freq='min')
# Size the random sample from the index itself so the two can never disagree
# (the range spans 90 days, i.e. 24*60*90 minutes).
bymin = pd.Series(np.random.randn(len(minute_index)), minute_index)
bymin[:5]

2014-08-01 00:00:00    0.469112
2014-08-01 00:01:00   -0.282863
2014-08-01 00:02:00   -1.509059
2014-08-01 00:03:00   -1.135632
2014-08-01 00:04:00    1.212112
Freq: T, dtype: float64

In [52]:
#np.random.seed(123456)
#bymin = pd.Series(np.random.randn(5), 
#                  pd.date_range('2014-08-01', 
#                                '2014-08-10',
#                                freq='2D')) # Frequency strings can have multiples 
#bymin[:5]

2014-08-01    0.469112
2014-08-03   -0.282863
2014-08-05   -1.509059
2014-08-07   -1.135632
2014-08-09    1.212112
Freq: 2D, dtype: float64

In [55]:
# Label slice at minute resolution; both end points are included.
bymin.loc['2014-08-01 00:02':'2014-08-01 00:07']

2014-08-01 00:02:00   -1.509059
2014-08-01 00:03:00   -1.135632
2014-08-01 00:04:00    1.212112
2014-08-01 00:05:00   -0.173215
2014-08-01 00:06:00    0.119209
2014-08-01 00:07:00   -1.044236
Freq: T, dtype: float64

In [56]:
# Generate a range based on business days ('B').
# Two days (2014-08-30 and 2014-08-31) are skipped because they fall on a weekend.
days = pd.date_range('2014-08-29', '2014-09-05', freq='B')
days

DatetimeIndex(['2014-08-29', '2014-09-01', '2014-09-02', '2014-09-03',
               '2014-09-04', '2014-09-05'],
              dtype='datetime64[ns]', freq='B')

In [4]:
"""
A range can be created starting at a particular date and time, with a specific frequency, and
for a specific number of periods using the periods parameter. The following creates a 5-item DatetimeIndex 
starting at 2014-08-01 12:10:01 and at 1-second intervals:
"""
# `periods` counts steps of the given frequency: five stamps, one second apart.
# 's' is the one-second alias; the older spelling 'S' is deprecated in recent pandas.
pd.date_range('2014-08-01 12:10:01', freq='s', periods=5)

DatetimeIndex(['2014-08-01 12:10:01', '2014-08-01 12:10:02',
               '2014-08-01 12:10:03', '2014-08-01 12:10:04',
               '2014-08-01 12:10:05'],
              dtype='datetime64[ns]', freq='S')

# Date offsets

In [6]:
"""
Frequencies in pandas are represented using date offsets
We have touched on this concept at the beginning of the chapter when discussing Timedelta objects. 
pandas extends the capabilities of these using the concept of DateOffset objects. They are objects which
represent knowledge of how to integrate time offsets and frequencies relative to DatetimeIndex objects.

'M', 'D', using in freq parameter in pd.date_range() are translated into an instance of the pandas DateOffset
object.

DateOffset objects can be used in various scenarios:
    They can be added or subtracted to obtain a shifted date
    They can be multiplied by an integer (positive or negative) so that the increment will be applied multiple times
    They have rollforward and rollback methods to move a date forward or backward to the next or previous "offset date".
"""
# Calculate the date one day after 2014-8-29 using a DateOffset.
d = datetime(2014, 8, 29)
# `do` is a one-day offset that can be added to (or subtracted from) a date.
do = pd.DateOffset(days = 1) 
d + do

Timestamp('2014-08-30 00:00:00')

In [8]:
# Import the date offset types.
# NOTE(review): a star import hides which names later cells take from this
# module; consider listing the classes actually used (BusinessDay, BMonthEnd, Week, ...).
from pandas.tseries.offsets import *
# Calculate one business day after d, which was set to 2014-8-29 in an earlier cell.
d + BusinessDay()

Timestamp('2014-09-01 00:00:00')

In [9]:
# Two business days after 2014-8-29, expressed as a single offset with n=2.
d + BusinessDay(2)

Timestamp('2014-09-02 00:00:00')

In [11]:
# Use a BMonthEnd offset to find the next business month end after a date.
# d was set to 2014-08-29 in an earlier cell; the result shown is 2014-09-30,
# i.e. the offset moves on to the September business month end.
d + BMonthEnd()

Timestamp('2014-09-30 00:00:00')

In [12]:
# Roll 2014-09-15 forward to the next business month end.
BMonthEnd().rollforward(datetime(2014, 9, 15))

Timestamp('2014-09-30 00:00:00')

In [13]:
# Calculate the date of the Tuesday before d by subtracting a weekly offset
# anchored on weekday=1 (the result shown, 2014-08-26, is a Tuesday).
d - Week(weekday = 1)

Timestamp('2014-08-26 00:00:00')

# Anchored offsets

In [None]:
"""
Anchored offsets are frequencies that represent a given frequency and begin at a specific
point such as a specific day of the week, month, or year. Anchored offsets use a specific
shorthand nomenclature. As an example, the following strings specify a specific day of the
week: 
W-SUN: weekly on Sunday
W-MON: weekly on Monday
.......................
W-SAT: weekly on Saturday

"""

In [14]:
# Calculate all Wednesdays between 2014-06-01 and 2014-07-31
# (the end date passed below), using the anchored alias W-WED.
wednesdays = pd.date_range('2014-06-01', 
                           '2014-07-31', freq="W-WED")
wednesdays.values

array(['2014-06-04T00:00:00.000000000', '2014-06-11T00:00:00.000000000',
       '2014-06-18T00:00:00.000000000', '2014-06-25T00:00:00.000000000',
       '2014-07-02T00:00:00.000000000', '2014-07-09T00:00:00.000000000',
       '2014-07-16T00:00:00.000000000', '2014-07-23T00:00:00.000000000',
       '2014-07-30T00:00:00.000000000'], dtype='datetime64[ns]')

In [15]:
# Business quarter dates in 2014.
# NOTE: 'BQS-JUN' is a business-quarter *start* frequency, so despite the name
# `qends` the values shown are the first business days of Mar, Jun, Sep and Dec,
# not quarter ends.
qends = pd.date_range('2014-01-01', '2014-12-31', 
                      freq='BQS-JUN')
qends.values

array(['2014-03-03T00:00:00.000000000', '2014-06-02T00:00:00.000000000',
       '2014-09-01T00:00:00.000000000', '2014-12-01T00:00:00.000000000'],
      dtype='datetime64[ns]')

# The Period object

In [5]:
# pandas formalizes the concept of an interval of time with the Period object.
# A Period is created from a timestamp and a frequency: the timestamp is the
# anchor (point of reference) and the frequency is the duration.
# Create a period representing one month of time, August 2014.
aug2014 = pd.Period('2014-08', freq='M')
aug2014

Period('2014-08', 'M')

In [6]:
# Examine the first and last instants covered by this period.
aug2014.start_time, aug2014.end_time

(Timestamp('2014-08-01 00:00:00'), Timestamp('2014-08-31 23:59:59.999999999'))

In [7]:
# Adding 1 to a Period moves it one unit of its own frequency forward in time.
# For the monthly aug2014 period that is September 2014.
sep2014 = aug2014 + 1
sep2014

Period('2014-09', 'M')

In [8]:
# First and last instants covered by the September 2014 period.
sep2014.start_time, sep2014.end_time

(Timestamp('2014-09-01 00:00:00'), Timestamp('2014-09-30 23:59:59.999999999'))

# Indexing using the PeriodIndex

In [9]:
"""
A series of Period objects can be combined into a special form of pandas index known as
PeriodIndex. A PeriodIndex index is useful for being able to associate data to specific
intervals of time and with being able to slice and perform analysis on the events in each
interval.
"""
# Create a PeriodIndex with one monthly period for each month of 2013.
# Each element of the range is a Period object.
mp2013 = pd.period_range('1/1/2013', '12/31/2013', freq='M')
mp2013

PeriodIndex(['2013-01', '2013-02', '2013-03', '2013-04', '2013-05', '2013-06',
             '2013-07', '2013-08', '2013-09', '2013-10', '2013-11', '2013-12'],
            dtype='period[M]', freq='M')

In [10]:
# Walk the PeriodIndex and show where each monthly period starts and ends.
for p in mp2013:
    print(p.start_time, p.end_time)

2013-01-01 00:00:00 2013-01-31 23:59:59.999999999
2013-02-01 00:00:00 2013-02-28 23:59:59.999999999
2013-03-01 00:00:00 2013-03-31 23:59:59.999999999
2013-04-01 00:00:00 2013-04-30 23:59:59.999999999
2013-05-01 00:00:00 2013-05-31 23:59:59.999999999
2013-06-01 00:00:00 2013-06-30 23:59:59.999999999
2013-07-01 00:00:00 2013-07-31 23:59:59.999999999
2013-08-01 00:00:00 2013-08-31 23:59:59.999999999
2013-09-01 00:00:00 2013-09-30 23:59:59.999999999
2013-10-01 00:00:00 2013-10-31 23:59:59.999999999
2013-11-01 00:00:00 2013-11-30 23:59:59.999999999
2013-12-01 00:00:00 2013-12-31 23:59:59.999999999


In [11]:
# A Series indexed by the twelve monthly periods of mp2013,
# filled with seeded random values; show the first five rows.
np.random.seed(123456)
ps = pd.Series(np.random.randn(len(mp2013)), index=mp2013)
ps.head()

2013-01    0.469112
2013-02   -0.282863
2013-03   -1.509059
2013-04   -1.135632
2013-05    1.212112
Freq: M, dtype: float64

In [13]:
# A Series over every calendar-month period of 2013 and 2014 (24 periods),
# filled with seeded random values.
np.random.seed(123456)
months_2013_2014 = pd.period_range('1/1/2013', '12/31/2014', freq='M')
ps = pd.Series(np.random.randn(24), months_2013_2014)
ps

2013-01    0.469112
2013-02   -0.282863
2013-03   -1.509059
2013-04   -1.135632
2013-05    1.212112
2013-06   -0.173215
2013-07    0.119209
2013-08   -1.044236
2013-09   -0.861849
2013-10   -2.104569
2013-11   -0.494929
2013-12    1.071804
2014-01    0.721555
2014-02   -0.706771
2014-03   -1.039575
2014-04    0.271860
2014-05   -0.424972
2014-06    0.567020
2014-07    0.276232
2014-08   -1.087401
2014-09   -0.673690
2014-10    0.113648
2014-11   -1.478427
2014-12    0.524988
Freq: M, dtype: float64

In [14]:
# Get the single value stored for the period 2014-06.
ps['2014-06']

0.567020349793672

In [15]:
# Get the values of all monthly periods that fall in 2014.
ps['2014']

2014-01    0.721555
2014-02   -0.706771
2014-03   -1.039575
2014-04    0.271860
2014-05   -0.424972
2014-06    0.567020
2014-07    0.276232
2014-08   -1.087401
2014-09   -0.673690
2014-10    0.113648
2014-11   -1.478427
2014-12    0.524988
Freq: M, dtype: float64

In [16]:
# All values from March 2014 through June 2014 (both end points included).
ps['2014-03':'2014-06']

2014-03   -1.039575
2014-04    0.271860
2014-05   -0.424972
2014-06    0.567020
Freq: M, dtype: float64

# Handling holidays using calendars

In [19]:
"""Earlier, when we calculated the next business day from August 29, 2014, we were told by
pandas that this date is September 1, 2014. This is actually not correct in the United States:
September 1, 2014 is a US federal holiday and banks and exchanges are closed on this day.
The reason for this is that pandas uses a specific default calendar when calculating the next
business day, and this default pandas calendar does not include September 1, 2014 as a
holiday 

The solution to this issue is to either create a custom calendar (which we will not get into
the details of), or use the one custom calendar provided by pandas for just this situation,
USFederalHolidayCalendar. This custom calendar can then be passed to a
CustomBusinessDay object that will be used instead of a BusinessDay object. The
calculation using this CustomBusinessDay object will then use the new calendar and take
into account the US federal holidays.

"""
# demonstrate using the US federal holiday calendar
# first need to import it
from pandas.tseries.holiday import *
# create it and show what it considers holidays
cal = USFederalHolidayCalendar()
for d in cal.holidays(start='2014-01-01', end='2014-12-31'):
    print (d)

2014-01-01 00:00:00
2014-01-20 00:00:00
2014-02-17 00:00:00
2014-05-26 00:00:00
2014-07-04 00:00:00
2014-09-01 00:00:00
2014-10-13 00:00:00
2014-11-11 00:00:00
2014-11-27 00:00:00
2014-12-25 00:00:00


In [20]:
# Use the federal calendar to find the next business day after August 29, 2014.
from pandas.tseries.offsets import CustomBusinessDay
# The holiday dates known to the calendar become the offset's non-working days.
us_holidays = cal.holidays()
cbd = CustomBusinessDay(holidays=us_holidays)

# Next business day after 2014-8-29, with US federal holidays skipped as well.
datetime(2014, 8, 29) + cbd

Timestamp('2014-09-02 00:00:00')

# Normalizing timestamps using time zones

In [21]:
"""
pandas provides rich support for working with timestamps in different time
zones. Under the covers, pandas utilizes the pytz (python time zone) and dateutil (date utilities) libraries to manage the
time zone operations
"""
# Get the current local time and show that it carries no time zone
# information by default.
now = pd.Timestamp('now')
# Time-zone-aware pandas objects expose the zone through .tz; here it is None.
now, now.tz is None

(Timestamp('2020-04-14 08:35:58.096118'), True)

In [22]:
# By default neither a DatetimeIndex nor the Timestamps inside it
# have time zone information.
rng = pd.date_range('3/6/2012 00:00', periods=15, freq='D')
rng.tz is None, rng[0].tz is None

(True, True)

In [23]:
# Import the list of common time zone names from pytz.
from pytz import common_timezones
# Show the first five entries.
common_timezones[:5]

['Africa/Abidjan',
 'Africa/Accra',
 'Africa/Addis_Ababa',
 'Africa/Algiers',
 'Africa/Asmara']

In [24]:
# Take the current (time-zone-naive) local time, then attach the UTC zone to it.
# tz_localize() labels the existing wall-clock value with a zone; the clock
# reading itself is unchanged.
# pd.Timestamp is used explicitly so the cell does not depend on names leaked
# by an earlier star import.
now = pd.Timestamp("now")
local_now = now.tz_localize('UTC')
now, local_now

(Timestamp('2020-04-14 08:48:09.656637'),
 Timestamp('2020-04-14 08:48:09.656637+0000', tz='UTC'))

In [27]:
""" A time zone can be attached to a naive Timestamp with its .tz_localize() method by passing
the zone name; the cell below passes Asia/Tokyo instead of UTC:"""
# Take the current (time-zone-naive) local time and label it as Asia/Tokyo time.
# The wall-clock value is unchanged; only the zone (+09:00) is attached.
# pd.Timestamp is used explicitly so the cell does not depend on names leaked
# by an earlier star import.
now = pd.Timestamp("now")
local_now = now.tz_localize('Asia/Tokyo')
now, local_now

(Timestamp('2020-04-14 08:54:46.314538'),
 Timestamp('2020-04-14 08:54:46.314538+0900', tz='Asia/Tokyo'))

In [31]:
# Create a Timestamp that is time-zone aware from the start (US/Mountain).
# pd.Timestamp is used explicitly so the cell does not depend on names leaked
# by an earlier star import.
tstamp = pd.Timestamp('2014-08-01 12:00:00', tz='US/Mountain')
tstamp

Timestamp('2014-08-01 12:00:00-0600', tz='US/Mountain')

In [30]:
"""
A DatetimeIndex can be created with a specific time zone by using the tz parameter of
the pd.date_range method:
"""
# Create a DatetimeIndex that is time-zone aware via the tz parameter.
rng = pd.date_range('3/6/2012 00:00:00', 
                    periods=10, freq='D', tz='US/Mountain')
# Both the index and its individual Timestamps expose the zone through .tz.
rng.tz, rng[0].tz

(<DstTzInfo 'US/Mountain' LMT-1 day, 17:00:00 STD>,
 <DstTzInfo 'US/Mountain' MST-1 day, 17:00:00 STD>)

In [32]:
"""
It is also possible to construct other time zones explicitly. This model can give you more
control over which time zone is used in .tz_localize(). The following creates two
different timezone objects and localizes a Timestamp to each:
"""
# Show the use of explicit timezone objects; these come from pytz.
import pytz
# Create one timezone object for each of two different zones.
mountain_tz = pytz.timezone("US/Mountain")
eastern_tz = pytz.timezone("US/Eastern")
# Localize the `now` Timestamp from the earlier cell to each zone.
mountain_tz.localize(now), eastern_tz.localize(now)

(Timestamp('2020-04-14 08:54:46.314538-0600', tz='US/Mountain'),
 Timestamp('2020-04-14 08:54:46.314538-0400', tz='US/Eastern'))

In [33]:
# Create two Series with the same start, number of periods and frequency,
# each in a different time zone.
# pd.Series is used explicitly (the bare name is only in scope via a star
# import), and 'h' replaces the hourly alias 'H', deprecated in recent pandas.
s_mountain = pd.Series(np.arange(0, 5),
                       index=pd.date_range('2014-08-01',
                                           periods=5, freq="h",
                                           tz='US/Mountain'))
s_eastern = pd.Series(np.arange(0, 5),
                      index=pd.date_range('2014-08-01',
                                          periods=5, freq="h",
                                          tz='US/Eastern'))
# US/Mountain is at UTC-6 on these dates.
s_mountain

2014-08-01 00:00:00-06:00    0
2014-08-01 01:00:00-06:00    1
2014-08-01 02:00:00-06:00    2
2014-08-01 03:00:00-06:00    3
2014-08-01 04:00:00-06:00    4
Freq: H, dtype: int32

In [34]:
# Same wall-clock hours as s_mountain, but in US/Eastern (UTC-4 on these dates).
s_eastern

2014-08-01 00:00:00-04:00    0
2014-08-01 01:00:00-04:00    1
2014-08-01 02:00:00-04:00    2
2014-08-01 03:00:00-04:00    3
2014-08-01 04:00:00-04:00    4
Freq: H, dtype: int32

In [36]:
# Add the two Series. The result is indexed by UTC timestamps, and only the
# three instants present in both Series are aligned and summed; the rest are NaN.
s = s_eastern + s_mountain
s

2014-08-01 04:00:00+00:00    NaN
2014-08-01 05:00:00+00:00    NaN
2014-08-01 06:00:00+00:00    2.0
2014-08-01 07:00:00+00:00    4.0
2014-08-01 08:00:00+00:00    6.0
2014-08-01 09:00:00+00:00    NaN
2014-08-01 10:00:00+00:00    NaN
Freq: H, dtype: float64

In [39]:
# The index of the sum is a DatetimeIndex in UTC.
s.index

DatetimeIndex(['2014-08-01 04:00:00+00:00', '2014-08-01 05:00:00+00:00',
               '2014-08-01 06:00:00+00:00', '2014-08-01 07:00:00+00:00',
               '2014-08-01 08:00:00+00:00', '2014-08-01 09:00:00+00:00',
               '2014-08-01 10:00:00+00:00'],
              dtype='datetime64[ns, UTC]', freq='H')

In [40]:
""" Once a time zone is assigned to an object, that object can be converted to another time zone
using the .tz_convert() method"""
# Convert s_eastern from US/Eastern to US/Pacific; the values stay the same
# and the index labels are re-expressed in the new zone.
s_pacific = s_eastern.tz_convert("US/Pacific")
s_pacific

2014-07-31 21:00:00-07:00    0
2014-07-31 22:00:00-07:00    1
2014-07-31 23:00:00-07:00    2
2014-08-01 00:00:00-07:00    3
2014-08-01 01:00:00-07:00    4
Freq: H, dtype: int32

# Manipulating time-series data

In [None]:
"""
We will now examine several common operations that are performed on time-series data.
These operations entail realigning data, changing the frequency of the samples and their
values, and calculating aggregate results on continuously moving subsets of the data to
determine the behavior of the values in the data as time changes
"""

# Shifting and lagging

In [41]:
"""
to shift the values backward and forward in
time. The pandas method for this is .shift() which will shift values in a Series or
DataFrame a specified number of units of the frequency in the index
"""
# Create a small daily Series to demonstrate shifting.
# No random values are drawn here; the seed call is kept only so the global
# RNG state seen by later cells is unchanged.
np.random.seed(123456)
# pd.Series is used explicitly so the cell does not depend on names leaked
# by an earlier star import.
ts = pd.Series([1, 2, 2.5, 1.5, 0.5],
               pd.date_range('2014-08-01', periods=5))
ts

2014-08-01    1.0
2014-08-02    2.0
2014-08-03    2.5
2014-08-04    1.5
2014-08-05    0.5
Freq: D, dtype: float64

In [42]:
# Shift the values forward one day.
# The index itself remains unchanged; there is no replacement value for
# 2014-08-01, so it is filled with NaN.
ts.shift(1)

2014-08-01    NaN
2014-08-02    1.0
2014-08-03    2.0
2014-08-04    2.5
2014-08-05    1.5
Freq: D, dtype: float64

In [43]:
# Shift in the negative direction: values move back two days,
# leaving NaN in the last two positions.
ts.shift(-2)

2014-08-01    2.5
2014-08-02    1.5
2014-08-03    0.5
2014-08-04    NaN
2014-08-05    NaN
Freq: D, dtype: float64

In [45]:
# Divide the Series by a copy of itself shifted by one day.
# NOTE: this yields the ratio of each value to the previous day's value
# (e.g. 2.0 for a doubling), not a percentage change.
ts / ts.shift(1)

2014-08-01         NaN
2014-08-02    2.000000
2014-08-03    1.250000
2014-08-04    0.600000
2014-08-05    0.333333
Freq: D, dtype: float64

In [47]:
# Shifts can be performed on a different frequency than that of the index.
# In that case the index is modified and the values remain the same.
# Shift forward by one business day: the first three labels all end up
# on 2014-08-04, as the output shows.
ts.shift(1, freq="B")

2014-08-04    1.0
2014-08-04    2.0
2014-08-04    2.5
2014-08-05    1.5
2014-08-06    0.5
dtype: float64

In [48]:
# Shift the index labels forward five hours; the values stay the same.
# shift(freq=...) replaces Series.tshift, which was deprecated and later
# removed from pandas; 'h' replaces the deprecated hourly alias 'H'.
ts.shift(5, freq="h")

2014-08-01 05:00:00    1.0
2014-08-02 05:00:00    2.0
2014-08-03 05:00:00    2.5
2014-08-04 05:00:00    1.5
2014-08-05 05:00:00    0.5
Freq: D, dtype: float64

In [49]:
# Shift the index by a DateOffset: half a minute moves every label forward 30 seconds.
half_minute = DateOffset(minutes=0.5)
ts.shift(1, freq=half_minute)

2014-08-01 00:00:30    1.0
2014-08-02 00:00:30    2.0
2014-08-03 00:00:30    2.5
2014-08-04 00:00:30    1.5
2014-08-05 00:00:30    0.5
Freq: D, dtype: float64

In [51]:
# Shifting with a freq moves the index labels by the given number of units of
# that frequency and leaves the values untouched.
# Here every label is moved back by two hours.
# shift(freq=...) replaces Series.tshift, which was deprecated and later
# removed from pandas; 'h' replaces the deprecated hourly alias 'H'.
ts.shift(-2, freq='h')

2014-07-31 22:00:00    1.0
2014-08-01 22:00:00    2.0
2014-08-02 22:00:00    2.5
2014-08-03 22:00:00    1.5
2014-08-04 22:00:00    0.5
Freq: D, dtype: float64

# Frequency Conversion

In [52]:
"""
Frequency data can be converted in pandas using the .asfreq() method of a time-series
object. When converting frequency, a new Series object with a new DatetimeIndex
object will be created. The DatetimeIndex of the new Series object starts at the first
Timestamp of the original and progresses at the given frequency until the last Timestamp
of the original. Values will then be aligned into the new Series
"""
# A time series of consecutive integers at two-hour intervals starting 2014-08-01.
# NOTE: 31 * 24 samples spaced two hours apart span 62 days, so the index runs
# past August and ends on 2014-10-01 22:00.
periods = 31 * 24
# pd.Series is used explicitly (the bare name is only in scope via a star
# import); "2h" replaces the deprecated alias spelling "2H".
hourly = pd.Series(np.arange(0, periods),
                   pd.date_range('08-01-2014', freq="2h",
                                 periods=periods))
hourly[:5]

2014-08-01 00:00:00    0
2014-08-01 02:00:00    1
2014-08-01 04:00:00    2
2014-08-01 06:00:00    3
2014-08-01 08:00:00    4
Freq: 2H, dtype: int32

In [53]:
# Convert the time series to a daily frequency using .asfreq('D').
# Many items are dropped due to alignment: only the values whose labels
# match the new daily index are kept.
daily = hourly.asfreq('D')
daily[:5]

2014-08-01     0
2014-08-02    12
2014-08-03    24
2014-08-04    36
2014-08-05    48
Freq: D, dtype: int32

In [54]:
# Convert back to an hourly frequency. This results in many NaNs, because the
# new index has many labels that do not align with the daily source.
# 'h' replaces the hourly alias 'H', deprecated in recent pandas.
daily.asfreq('h')

2014-08-01 00:00:00      0.0
2014-08-01 01:00:00      NaN
2014-08-01 02:00:00      NaN
2014-08-01 03:00:00      NaN
2014-08-01 04:00:00      NaN
                       ...  
2014-09-30 20:00:00      NaN
2014-09-30 21:00:00      NaN
2014-09-30 22:00:00      NaN
2014-09-30 23:00:00      NaN
2014-10-01 00:00:00    732.0
Freq: H, Length: 1465, dtype: float64

In [55]:
# The method parameter of .asfreq() replaces the default NaN filling.
# 'ffill' forward-fills: every hour takes the most recent known daily value,
# so the hours of 2014-08-01 are filled with 0 and those of 2014-09-30 with 720.
# 'h' replaces the hourly alias 'H', deprecated in recent pandas.
daily.asfreq('h', method='ffill')

2014-08-01 00:00:00      0
2014-08-01 01:00:00      0
2014-08-01 02:00:00      0
2014-08-01 03:00:00      0
2014-08-01 04:00:00      0
                      ... 
2014-09-30 20:00:00    720
2014-09-30 21:00:00    720
2014-09-30 22:00:00    720
2014-09-30 23:00:00    720
2014-10-01 00:00:00    732
Freq: H, Length: 1465, dtype: int32

In [56]:
# 'bfill' back-fills from the next known value: the value for 2014-08-02 is 12,
# so the hours after midnight on 2014-08-01 are filled with 12.
# 'h' replaces the hourly alias 'H', deprecated in recent pandas.
daily.asfreq('h', method='bfill')

2014-08-01 00:00:00      0
2014-08-01 01:00:00     12
2014-08-01 02:00:00     12
2014-08-01 03:00:00     12
2014-08-01 04:00:00     12
                      ... 
2014-09-30 20:00:00    732
2014-09-30 21:00:00    732
2014-09-30 22:00:00    732
2014-09-30 23:00:00    732
2014-10-01 00:00:00    732
Freq: H, Length: 1465, dtype: int32

# Up and down resampling

In [59]:
"""
Frequency conversion provides a basic way to convert the index in a time series to another
frequency. Data in the new time series is aligned with the old data and can result in many
NaN values. This can be partially solved using a fill method, but that is limited in its
capabilities to fill with appropriate information.


'Resampling' differs (compare with asfreq) in that it does not perform a pure alignment. The values placed in the
new series can use the same forward and reverse fill options, but they can also be specified
using other pandas-provided algorithms or with your own functions
"""

In [60]:
# Build a random walk that covers five days at one-second resolution.
SECONDS_PER_DAY = 24 * 60 * 60
N_DAYS = 5
count = SECONDS_PER_DAY * N_DAYS  # number of one-second steps needed

# Seed the legacy global NumPy generator so the walk is reproducible.
np.random.seed(123456)
values = np.random.randn(count)  # one standard-normal step per second
ws = pd.Series(values)

# The walk is the running total of the individual steps.
walk = ws.cumsum()

# Replace the default integer index with one-second timestamps. The lowercase
# 's' alias is used because the uppercase 'S' spelling is deprecated since
# pandas 2.2.
walk.index = pd.date_range('2014-08-01', periods=count, freq='s')

walk

2014-08-01 00:00:00      0.469112
2014-08-01 00:00:01      0.186249
2014-08-01 00:00:02     -1.322810
2014-08-01 00:00:03     -2.458442
2014-08-01 00:00:04     -1.246330
                          ...    
2014-08-05 23:59:55    456.529763
2014-08-05 23:59:56    456.052131
2014-08-05 23:59:57    455.202981
2014-08-05 23:59:58    454.947362
2014-08-05 23:59:59    456.191430
Freq: S, Length: 432000, dtype: float64

In [61]:
                                          """ DOWNSAMPLING """
"""
Resampling in pandas is accomplished using the .resample() method and by passing it a
new frequency. To demonstrate this, the following resamples the by-the-second data to
by-the-minute data. This is a downsampling, as the result has a lower frequency and
therefore fewer values:
"""
# Resample to one-minute intervals.
# A resampling splits the data into buckets based on the new periods and then
# applies an operation to each bucket -- here the per-second values that fall
# in each minute are averaged to give the value at that minute's index label.
# Defaults in effect for this call: closed='left', label='left'.
#
# These notes are comments rather than a trailing string literal so that the
# resampled series remains the last expression of the cell, which is what the
# notebook displays as the cell output.
walk.resample("1Min").mean()

2014-08-01 00:00:00     -8.718220
2014-08-01 00:01:00    -15.239213
2014-08-01 00:02:00     -9.179315
2014-08-01 00:03:00     -8.338307
2014-08-01 00:04:00     -8.129554
                          ...    
2014-08-05 23:55:00    453.773467
2014-08-05 23:56:00    450.857039
2014-08-05 23:57:00    450.078149
2014-08-05 23:58:00    444.637806
2014-08-05 23:59:00    453.837417
Freq: T, Length: 7200, dtype: float64

In [62]:
# Cross-check the first bucket by hand: select every sample whose timestamp
# falls within the minute 2014-08-01 00:00 (partial-string indexing) and
# average those values.
walk.loc['2014-08-01 00:00'].mean()

-8.718220052832644

In [70]:
"""
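In downsampling, as the existing data is put into buckets based on the new intervals, there
can often be a question of which values sit at each end of a bucket: a value that falls exactly
on a boundary can belong either to the bucket that starts there (the left end is closed) or to
the bucket that ends there (the right end is closed).

The default used here is the former, and it is referred to as a left close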
"""

In [68]:
# Same per-minute mean, but with buckets closed on their right edge.
(
    walk
    .resample("1Min", closed="right")
    .mean()
)

2014-07-31 23:59:00      0.469112
2014-08-01 00:00:00     -8.907477
2014-08-01 00:01:00    -15.280685
2014-08-01 00:02:00     -9.083865
2014-08-01 00:03:00     -8.285550
                          ...    
2014-08-05 23:55:00    453.726168
2014-08-05 23:56:00    450.849039
2014-08-05 23:57:00    450.039159
2014-08-05 23:58:00    444.631719
2014-08-05 23:59:00    453.955377
Freq: T, Length: 7201, dtype: float64

In [71]:
"""Note: more detail about parameter closed: left/ right; label: left/ right. 
See OneNote: /numpy and matplotlib/concept/pandas/time series model"""

In [72]:
# Keep only the first sample of every one-minute bucket.
(
    walk
    .resample("1Min")
    .first()
)

2014-08-01 00:00:00      0.469112
2014-08-01 00:01:00    -10.886314
2014-08-01 00:02:00    -13.374656
2014-08-01 00:03:00     -7.647693
2014-08-01 00:04:00     -4.482292
                          ...    
2014-08-05 23:55:00    452.900335
2014-08-05 23:56:00    450.062374
2014-08-05 23:57:00    449.582419
2014-08-05 23:58:00    447.243014
2014-08-05 23:59:00    446.877810
Freq: T, Length: 7200, dtype: float64

In [73]:
                                           """ UPSAMPLING """
# To demonstrate upsampling, first downsample the walk to one-minute means ...
bymin = walk.resample("1Min").mean()

# ... then resample that per-minute series back to one-second intervals.
# Upsampling creates the index values for the second-by-second data but, by
# default, leaves every newly created slot as NaN.
# The lowercase 's' alias is used because the uppercase 'S' spelling is
# deprecated since pandas 2.2.
bymin.resample('s').mean()

2014-08-01 00:00:00     -8.718220
2014-08-01 00:00:01           NaN
2014-08-01 00:00:02           NaN
2014-08-01 00:00:03           NaN
2014-08-01 00:00:04           NaN
                          ...    
2014-08-05 23:58:56           NaN
2014-08-05 23:58:57           NaN
2014-08-05 23:58:58           NaN
2014-08-05 23:58:59           NaN
2014-08-05 23:59:00    453.837417
Freq: S, Length: 431941, dtype: float64

In [74]:
"""
The upsampling created the index values for the second-by-second data but inserted NaN
values by default. This default behavior can be changed by calling a fill method such as
.ffill() or .bfill() on the resampler. We saw the same options when changing frequency with
forward and backward filling. The following demonstrates how to use the backward fill
(.bfill()); a forward fill works the same way with .ffill():
"""

In [75]:
# Resample to one-second intervals using a backward fill: each newly created
# slot takes the value of the next known minute. (A forward fill would use
# .ffill() instead.)
# The lowercase 's' alias is used because the uppercase 'S' spelling is
# deprecated since pandas 2.2.
bymin.resample("s").bfill()

2014-08-01 00:00:00     -8.718220
2014-08-01 00:00:01    -15.239213
2014-08-01 00:00:02    -15.239213
2014-08-01 00:00:03    -15.239213
2014-08-01 00:00:04    -15.239213
                          ...    
2014-08-05 23:58:56    453.837417
2014-08-05 23:58:57    453.837417
2014-08-05 23:58:58    453.837417
2014-08-05 23:58:59    453.837417
2014-08-05 23:59:00    453.837417
Freq: S, Length: 431941, dtype: float64

In [76]:
"""
It is also possible to interpolate the missing values using the .interpolate() method on
the result. This will calculate a linear interpolation between the values existing in the result
for all of the NaN values created during the resampling:
"""
# Demonstrate interpolating the NaN values created by upsampling to
# one-second intervals.
# The lowercase 's' alias is used because the uppercase 'S' spelling is
# deprecated since pandas 2.2.
interpolated = bymin.resample("s").interpolate()
interpolated

2014-08-01 00:00:00     -8.718220
2014-08-01 00:00:01     -8.826903
2014-08-01 00:00:02     -8.935586
2014-08-01 00:00:03     -9.044270
2014-08-01 00:00:04     -9.152953
                          ...    
2014-08-05 23:58:56    453.224110
2014-08-05 23:58:57    453.377437
2014-08-05 23:58:58    453.530764
2014-08-05 23:58:59    453.684090
2014-08-05 23:59:00    453.837417
Freq: S, Length: 431941, dtype: float64

In [None]:
"""
pandas also provides a very convenient resampling method referred to as open, high, low,
and close, by using the .ohlc() method. 
The following example takes our second-by-second data and calculates hour-by-hour ohlc values:
"""