In [2]:
from IPython.core.interactiveshell import InteractiveShell
InteractiveShell.ast_node_interactivity = "all"

In [3]:
"""
Anything that is observed or measured at many points in time forms a time series. Many time series are fixed frequency
• Timestamps, specific instants in time
• Fixed periods, such as the month January 2007 or the full year 2010
• Intervals of time, indicated by a start and end timestamp. Periods can be thought
of as special cases of intervals
• Experiment or elapsed time; each timestamp is a measure of time relative to a
particular start time (e.g., the diameter of a cookie baking each second since
being placed in the oven)
"""
# Date and Time data types and tools
from datetime import datetime, timedelta
now = datetime.now()
now
now.year, now.month, now.day
delta = datetime(2011, 1, 7) - datetime(2008, 6, 24, 8, 15)
delta
delta.days, delta.seconds
start = datetime(2011, 1, 7)
start + timedelta(12)   # days = 12

'\nAnything that is observed or measured at many points in time forms a time series. Many time series are fixed frequency\n• Timestamps, specific instants in time\n• Fixed periods, such as the month January 2007 or the full year 2010\n• Intervals of time, indicated by a start and end timestamp. Periods can be thought\nof as special cases of intervals\n• Experiment or elapsed time; each timestamp is a measure of time relative to a\nparticular start time (e.g., the diameter of a cookie baking each second since\nbeing placed in the oven)\n'

datetime.datetime(2020, 2, 21, 14, 51, 32, 673468)

(2020, 2, 21)

datetime.timedelta(926, 56700)

(926, 56700)

datetime.datetime(2011, 1, 19, 0, 0)

In [4]:
# converting between string and datetime
stamp = datetime(2011, 1, 3)
str(stamp)
stamp.strftime('%Y-%m-%d')
stamp.strftime('%y-%m-%d-%w')
stamp.strftime('%F')
"""
%Y Four-digit year
%y Two-digit year
%m Two-digit month [01, 12]
%d Two-digit day [01, 31]
%H Hour (24-hour clock) [00, 23]
%I Hour (12-hour clock) [01, 12]
%M Two-digit minute [00, 59]
%S Second [00, 61] (seconds 60, 61 account for leap seconds)
%w Weekday as integer [0 (Sunday), 6]
%U Week number of the year [00, 53]; Sunday is considered the first day of the week, and days before the first Sunday of
the year are “week 0”
%W Week number of the year [00, 53]; Monday is considered the first day of the week, and days before the first Monday of
the year are “week 0”
%z UTC time zone offset as +HHMM or -HHMM; empty if time zone naive
%F Shortcut for %Y-%m-%d (e.g., 2012-4-18)
%D Shortcut for %m/%d/%y (e.g., 04/18/12)
"""
# from str to datetime
value = '2011-01-03'
datetime.strptime(value, '%F')
# parser.parse method in the third-party dateutil package
from dateutil.parser import parse
parse('2011-01-03')
parse('Jan 31, 1997 10:45 PM')
parse('6/12/2011', dayfirst=True)   # day appearing before month
# dateutil.parser is a useful but imperfect tool. it will recognize some strings as dates that you might prefer that it didn't
# '42' will be parsed as the year 2042

'2011-01-03 00:00:00'

'2011-01-03'

'11-01-03-1'

'2011-01-03'

'\n%Y Four-digit year\n%y Two-digit year\n%m Two-digit month [01, 12]\n%d Two-digit day [01, 31]\n%H Hour (24-hour clock) [00, 23]\n%I Hour (12-hour clock) [01, 12]\n%M Two-digit minute [00, 59]\n%S Second [00, 61] (seconds 60, 61 account for leap seconds)\n%w Weekday as integer [0 (Sunday), 6]\n%U Week number of the year [00, 53]; Sunday is considered the first day of the week, and days before the first Sunday of\nthe year are “week 0”\n%W Week number of the year [00, 53]; Monday is considered the first day of the week, and days before the first Monday of\nthe year are “week 0”\n%z UTC time zone offset as +HHMM or -HHMM; empty if time zone naive\n%F Shortcut for %Y-%m-%d (e.g., 2012-4-18)\n%D Shortcut for %m/%d/%y (e.g., 04/18/12)\n'

ValueError: 'F' is a bad directive in format '%F'

In [5]:
import pandas as pd

datestrs = ['2011-07-06 12:00:00', '2011-08-06 00:00:00']
pd.to_datetime(datestrs)
idx = pd.to_datetime(datestrs + [None])
idx
pd.isnull(idx)

DatetimeIndex(['2011-07-06 12:00:00', '2011-08-06 00:00:00'], dtype='datetime64[ns]', freq=None)

DatetimeIndex(['2011-07-06 12:00:00', '2011-08-06 00:00:00', 'NaT'], dtype='datetime64[ns]', freq=None)

array([False, False,  True])

In [6]:
# time series basics
import numpy as np
from datetime import datetime

dates = [datetime(2011, 1, 2), datetime(2011, 1, 5),
         datetime(2011, 1, 7), datetime(2011, 1, 8),
         datetime(2011, 1, 10), datetime(2011, 1, 12)]
ts = pd.Series(np.random.randn(3), index=idx)
ts
ts = pd.Series(np.random.randn(6), index=dates)
ts
ts.index
ts + ts[::2]
stamp = ts.index[0]
stamp

2011-07-06 12:00:00    0.193873
2011-08-06 00:00:00    0.380666
NaT                   -1.076728
dtype: float64

2011-01-02   -0.958501
2011-01-05    0.199016
2011-01-07    0.855768
2011-01-08    0.742549
2011-01-10   -1.156979
2011-01-12   -0.735470
dtype: float64

DatetimeIndex(['2011-01-02', '2011-01-05', '2011-01-07', '2011-01-08',
               '2011-01-10', '2011-01-12'],
              dtype='datetime64[ns]', freq=None)

2011-01-02   -1.917002
2011-01-05         NaN
2011-01-07    1.711535
2011-01-08         NaN
2011-01-10   -2.313958
2011-01-12         NaN
dtype: float64

Timestamp('2011-01-02 00:00:00')

In [7]:
# indexing, selection, subsetting
stamp = ts.index[2]
stamp
ts[stamp]
# As a convenience, you can also pass a string that is interpretable as a date
ts['1/10/2011']
ts['20110110']

# for longer time series
longer_ts = pd.Series(np.random.randn(1000),
                      index=pd.date_range('1/1/2000', periods=1000))
longer_ts
longer_ts['2001']   # selects time period of whole 2001 year
# slicing with datetime object
ts[datetime(2011, 1, 7):]

Timestamp('2011-01-07 00:00:00')

0.8557677266619443

-1.156979157178613

-1.156979157178613

2000-01-01   -1.304095
2000-01-02    1.510582
2000-01-03    0.921778
2000-01-04    2.207405
2000-01-05   -0.003735
2000-01-06   -0.723550
2000-01-07    0.391881
2000-01-08   -0.281380
2000-01-09    1.671399
2000-01-10    1.008381
2000-01-11    0.547649
2000-01-12   -0.862524
2000-01-13    0.068820
2000-01-14   -1.516917
2000-01-15   -0.359408
2000-01-16   -0.157749
2000-01-17    0.342236
2000-01-18   -1.625766
2000-01-19   -0.596872
2000-01-20    0.088273
2000-01-21    0.805745
2000-01-22   -1.412287
2000-01-23   -0.134489
2000-01-24    1.107318
2000-01-25    1.280419
2000-01-26    1.031569
2000-01-27    1.837641
2000-01-28   -1.049971
2000-01-29    1.118074
2000-01-30   -0.574398
                ...   
2002-08-28   -0.378513
2002-08-29    1.737399
2002-08-30   -1.075227
2002-08-31    0.734124
2002-09-01   -1.091274
2002-09-02    1.676625
2002-09-03   -0.244950
2002-09-04    1.931451
2002-09-05   -0.043874
2002-09-06    1.468413
2002-09-07   -0.887333
2002-09-08   -0.917146
2002-09-09 

2001-01-01    0.339515
2001-01-02   -0.825326
2001-01-03    0.871114
2001-01-04   -0.465813
2001-01-05    0.561507
2001-01-06    1.324620
2001-01-07   -0.806158
2001-01-08    0.138970
2001-01-09   -1.336205
2001-01-10   -0.650474
2001-01-11    1.560583
2001-01-12   -0.033600
2001-01-13   -1.057811
2001-01-14    1.126320
2001-01-15    1.190146
2001-01-16   -0.084565
2001-01-17   -0.016721
2001-01-18   -0.023745
2001-01-19    0.008763
2001-01-20    0.984169
2001-01-21   -2.333292
2001-01-22    1.598626
2001-01-23   -0.377719
2001-01-24    0.790732
2001-01-25   -2.414359
2001-01-26   -0.560914
2001-01-27   -0.460512
2001-01-28    0.005430
2001-01-29    0.239768
2001-01-30   -0.022224
                ...   
2001-12-02    1.029771
2001-12-03   -2.602516
2001-12-04    1.184904
2001-12-05   -0.458430
2001-12-06    0.703795
2001-12-07   -0.805285
2001-12-08   -0.383959
2001-12-09    1.740931
2001-12-10   -0.237682
2001-12-11   -1.472899
2001-12-12    2.091441
2001-12-13   -1.402459
2001-12-14 

2011-01-07    0.855768
2011-01-08    0.742549
2011-01-10   -1.156979
2011-01-12   -0.735470
dtype: float64

In [8]:
# slice with timestamps not contained in a time series to perform a range query, slicing produces views
ts
ts['1/6/2011': '1/11/2011']
ts.truncate(after='1/9/2011')


2011-01-02   -0.958501
2011-01-05    0.199016
2011-01-07    0.855768
2011-01-08    0.742549
2011-01-10   -1.156979
2011-01-12   -0.735470
dtype: float64

2011-01-07    0.855768
2011-01-08    0.742549
2011-01-10   -1.156979
dtype: float64

2011-01-02   -0.958501
2011-01-05    0.199016
2011-01-07    0.855768
2011-01-08    0.742549
dtype: float64

In [9]:
# holds true for DataFrame as well, indexing on its rows
dates = pd.date_range('1/1/2000', periods=100, freq='W-WED')
dates
long_df = pd.DataFrame(np.random.randn(100, 4),
                       index=dates,
                       columns=['Colorado', 'Texas',
                                'New York', 'Ohio'])
long_df.loc['5-2001']

DatetimeIndex(['2000-01-05', '2000-01-12', '2000-01-19', '2000-01-26',
               '2000-02-02', '2000-02-09', '2000-02-16', '2000-02-23',
               '2000-03-01', '2000-03-08', '2000-03-15', '2000-03-22',
               '2000-03-29', '2000-04-05', '2000-04-12', '2000-04-19',
               '2000-04-26', '2000-05-03', '2000-05-10', '2000-05-17',
               '2000-05-24', '2000-05-31', '2000-06-07', '2000-06-14',
               '2000-06-21', '2000-06-28', '2000-07-05', '2000-07-12',
               '2000-07-19', '2000-07-26', '2000-08-02', '2000-08-09',
               '2000-08-16', '2000-08-23', '2000-08-30', '2000-09-06',
               '2000-09-13', '2000-09-20', '2000-09-27', '2000-10-04',
               '2000-10-11', '2000-10-18', '2000-10-25', '2000-11-01',
               '2000-11-08', '2000-11-15', '2000-11-22', '2000-11-29',
               '2000-12-06', '2000-12-13', '2000-12-20', '2000-12-27',
               '2001-01-03', '2001-01-10', '2001-01-17', '2001-01-24',
      

Unnamed: 0,Colorado,Texas,New York,Ohio
2001-05-02,0.963791,0.586147,0.704035,-0.069861
2001-05-09,0.171788,1.748889,0.07026,1.36071
2001-05-16,2.772809,-0.25969,-0.357622,-1.615786
2001-05-23,0.786354,0.510945,0.262279,0.688761
2001-05-30,0.235521,0.451498,0.935616,0.414587


In [10]:
# time series with duplicate indices
dates = pd.DatetimeIndex(['1/1/2000', '1/2/2000', '1/2/2000',
                          '1/2/2000', '1/3/2000'])
dup_ts = pd.Series(np.arange(5), index=dates)
dup_ts
dup_ts['1/2/2000']
grouped = dup_ts.groupby(level=0)
grouped.mean()


2000-01-01    0
2000-01-02    1
2000-01-02    2
2000-01-02    3
2000-01-03    4
dtype: int32

2000-01-02    1
2000-01-02    2
2000-01-02    3
dtype: int32

2000-01-01    0
2000-01-02    2
2000-01-03    4
dtype: int32

In [11]:
# date ranges, frequencies, and shifting
# pandas has a
# full suite of standard time series frequencies and tools for resampling, inferring frequencies,
# and generating fixed-frequency date ranges. For example, you can convert
# the sample time series to be fixed daily frequency by calling resample:
ts
resampler = ts.resample('D')    # 'D' is interpreted as daily frequency
resampler


2011-01-02   -0.958501
2011-01-05    0.199016
2011-01-07    0.855768
2011-01-08    0.742549
2011-01-10   -1.156979
2011-01-12   -0.735470
dtype: float64

DatetimeIndexResampler [freq=<Day>, axis=0, closed=left, label=left, convention=start, base=0]

In [12]:
# generating date ranges
# date_range is responsible for generating a DatetimeIndex with an indicated length according to a particular frequency
index = pd.date_range('2012-04-01', '2012-06-01')
index
pd.date_range(start='2012-04-01', periods=20)
pd.date_range(end='2012-04-01', periods=20)
# if you wanted a date index containing the last business day of each month, you would pass the 'BM' frequency
pd.date_range('2000-01-01', '2000-12-01', freq='BM')
pd.date_range('2000-01-01', '2000-02-01', freq='B')  # only businessday
pd.date_range('2000-01-01', '2000-02-01', freq='H')
# by default preserves the time
pd.date_range('2012-05-02 12:56:31', periods=5)
pd.date_range('2012-05-02 12:56:31', periods=5, normalize=True)


DatetimeIndex(['2012-04-01', '2012-04-02', '2012-04-03', '2012-04-04',
               '2012-04-05', '2012-04-06', '2012-04-07', '2012-04-08',
               '2012-04-09', '2012-04-10', '2012-04-11', '2012-04-12',
               '2012-04-13', '2012-04-14', '2012-04-15', '2012-04-16',
               '2012-04-17', '2012-04-18', '2012-04-19', '2012-04-20',
               '2012-04-21', '2012-04-22', '2012-04-23', '2012-04-24',
               '2012-04-25', '2012-04-26', '2012-04-27', '2012-04-28',
               '2012-04-29', '2012-04-30', '2012-05-01', '2012-05-02',
               '2012-05-03', '2012-05-04', '2012-05-05', '2012-05-06',
               '2012-05-07', '2012-05-08', '2012-05-09', '2012-05-10',
               '2012-05-11', '2012-05-12', '2012-05-13', '2012-05-14',
               '2012-05-15', '2012-05-16', '2012-05-17', '2012-05-18',
               '2012-05-19', '2012-05-20', '2012-05-21', '2012-05-22',
               '2012-05-23', '2012-05-24', '2012-05-25', '2012-05-26',
      

DatetimeIndex(['2012-04-01', '2012-04-02', '2012-04-03', '2012-04-04',
               '2012-04-05', '2012-04-06', '2012-04-07', '2012-04-08',
               '2012-04-09', '2012-04-10', '2012-04-11', '2012-04-12',
               '2012-04-13', '2012-04-14', '2012-04-15', '2012-04-16',
               '2012-04-17', '2012-04-18', '2012-04-19', '2012-04-20'],
              dtype='datetime64[ns]', freq='D')

DatetimeIndex(['2012-03-13', '2012-03-14', '2012-03-15', '2012-03-16',
               '2012-03-17', '2012-03-18', '2012-03-19', '2012-03-20',
               '2012-03-21', '2012-03-22', '2012-03-23', '2012-03-24',
               '2012-03-25', '2012-03-26', '2012-03-27', '2012-03-28',
               '2012-03-29', '2012-03-30', '2012-03-31', '2012-04-01'],
              dtype='datetime64[ns]', freq='D')

DatetimeIndex(['2000-01-31', '2000-02-29', '2000-03-31', '2000-04-28',
               '2000-05-31', '2000-06-30', '2000-07-31', '2000-08-31',
               '2000-09-29', '2000-10-31', '2000-11-30'],
              dtype='datetime64[ns]', freq='BM')

DatetimeIndex(['2000-01-03', '2000-01-04', '2000-01-05', '2000-01-06',
               '2000-01-07', '2000-01-10', '2000-01-11', '2000-01-12',
               '2000-01-13', '2000-01-14', '2000-01-17', '2000-01-18',
               '2000-01-19', '2000-01-20', '2000-01-21', '2000-01-24',
               '2000-01-25', '2000-01-26', '2000-01-27', '2000-01-28',
               '2000-01-31', '2000-02-01'],
              dtype='datetime64[ns]', freq='B')

DatetimeIndex(['2000-01-01 00:00:00', '2000-01-01 01:00:00',
               '2000-01-01 02:00:00', '2000-01-01 03:00:00',
               '2000-01-01 04:00:00', '2000-01-01 05:00:00',
               '2000-01-01 06:00:00', '2000-01-01 07:00:00',
               '2000-01-01 08:00:00', '2000-01-01 09:00:00',
               ...
               '2000-01-31 15:00:00', '2000-01-31 16:00:00',
               '2000-01-31 17:00:00', '2000-01-31 18:00:00',
               '2000-01-31 19:00:00', '2000-01-31 20:00:00',
               '2000-01-31 21:00:00', '2000-01-31 22:00:00',
               '2000-01-31 23:00:00', '2000-02-01 00:00:00'],
              dtype='datetime64[ns]', length=745, freq='H')

DatetimeIndex(['2012-05-02 12:56:31', '2012-05-03 12:56:31',
               '2012-05-04 12:56:31', '2012-05-05 12:56:31',
               '2012-05-06 12:56:31'],
              dtype='datetime64[ns]', freq='D')

DatetimeIndex(['2012-05-02', '2012-05-03', '2012-05-04', '2012-05-05',
               '2012-05-06'],
              dtype='datetime64[ns]', freq='D')

In [13]:
# frequencies and date offsets
# date offset
from pandas.tseries.offsets import Hour, Minute
hour = Hour()
hour
Hour(4)
# mostly you would never need to explicitly create one of these objects
pd.date_range('2000-01-01', '2000-01-03', freq='4h')
pd.date_range('2000-01-01', '2000-01-03', freq='1h30min')
# week of month dates
# one useful frequency class is 'week of month' starting with WOM which enables you to get dates like the third Friday or so
rng = pd.date_range('2012-01-01', '2012-09-01', freq='WOM-3FRI')
list(rng)

<Hour>

<4 * Hours>

DatetimeIndex(['2000-01-01 00:00:00', '2000-01-01 04:00:00',
               '2000-01-01 08:00:00', '2000-01-01 12:00:00',
               '2000-01-01 16:00:00', '2000-01-01 20:00:00',
               '2000-01-02 00:00:00', '2000-01-02 04:00:00',
               '2000-01-02 08:00:00', '2000-01-02 12:00:00',
               '2000-01-02 16:00:00', '2000-01-02 20:00:00',
               '2000-01-03 00:00:00'],
              dtype='datetime64[ns]', freq='4H')

DatetimeIndex(['2000-01-01 00:00:00', '2000-01-01 01:30:00',
               '2000-01-01 03:00:00', '2000-01-01 04:30:00',
               '2000-01-01 06:00:00', '2000-01-01 07:30:00',
               '2000-01-01 09:00:00', '2000-01-01 10:30:00',
               '2000-01-01 12:00:00', '2000-01-01 13:30:00',
               '2000-01-01 15:00:00', '2000-01-01 16:30:00',
               '2000-01-01 18:00:00', '2000-01-01 19:30:00',
               '2000-01-01 21:00:00', '2000-01-01 22:30:00',
               '2000-01-02 00:00:00', '2000-01-02 01:30:00',
               '2000-01-02 03:00:00', '2000-01-02 04:30:00',
               '2000-01-02 06:00:00', '2000-01-02 07:30:00',
               '2000-01-02 09:00:00', '2000-01-02 10:30:00',
               '2000-01-02 12:00:00', '2000-01-02 13:30:00',
               '2000-01-02 15:00:00', '2000-01-02 16:30:00',
               '2000-01-02 18:00:00', '2000-01-02 19:30:00',
               '2000-01-02 21:00:00', '2000-01-02 22:30:00',
               '2000-01-

[Timestamp('2012-01-20 00:00:00', freq='WOM-3FRI'),
 Timestamp('2012-02-17 00:00:00', freq='WOM-3FRI'),
 Timestamp('2012-03-16 00:00:00', freq='WOM-3FRI'),
 Timestamp('2012-04-20 00:00:00', freq='WOM-3FRI'),
 Timestamp('2012-05-18 00:00:00', freq='WOM-3FRI'),
 Timestamp('2012-06-15 00:00:00', freq='WOM-3FRI'),
 Timestamp('2012-07-20 00:00:00', freq='WOM-3FRI'),
 Timestamp('2012-08-17 00:00:00', freq='WOM-3FRI')]

In [14]:
# shifting(leading and lagging) data
ts = pd.Series(np.random.randn(4),
               index=pd.date_range('1/1/2000', periods=4, freq='M'))
ts
ts.shift(2)
ts.shift(-2)
# a common use of shift is computing percent changes in a time series or multiple time series
ts / ts.shift(1) - 1
ts.shift(2, freq='M')
ts.shift(2, freq='D')
ts.shift(1, freq='90T')

2000-01-31    0.375660
2000-02-29   -0.762255
2000-03-31    1.521064
2000-04-30    0.293275
Freq: M, dtype: float64

2000-01-31         NaN
2000-02-29         NaN
2000-03-31    0.375660
2000-04-30   -0.762255
Freq: M, dtype: float64

2000-01-31    1.521064
2000-02-29    0.293275
2000-03-31         NaN
2000-04-30         NaN
Freq: M, dtype: float64

2000-01-31         NaN
2000-02-29   -3.029106
2000-03-31   -2.995479
2000-04-30   -0.807191
Freq: M, dtype: float64

2000-03-31    0.375660
2000-04-30   -0.762255
2000-05-31    1.521064
2000-06-30    0.293275
Freq: M, dtype: float64

2000-02-02    0.375660
2000-03-02   -0.762255
2000-04-02    1.521064
2000-05-02    0.293275
dtype: float64

2000-01-31 01:30:00    0.375660
2000-02-29 01:30:00   -0.762255
2000-03-31 01:30:00    1.521064
2000-04-30 01:30:00    0.293275
Freq: M, dtype: float64

In [15]:
# shifting dates with offsets
from pandas.tseries.offsets import Day, MonthEnd

now = datetime(2011, 11, 17)
now + 3 * Day()
now + MonthEnd()
now + MonthEnd(2)
offset = MonthEnd()
offset.rollforward(now)
offset.rollback(now)
# a creative use of date offsets is to use these methods with groupby
ts = pd.Series(np.random.randn(20),
               index=pd.date_range('1/15/2000', periods=20, freq='4d'))
ts
ts.groupby(offset.rollforward).mean()
# an easier way is to use resample
ts.resample('M').mean()

Timestamp('2011-11-20 00:00:00')

Timestamp('2011-11-30 00:00:00')

Timestamp('2011-12-31 00:00:00')

Timestamp('2011-11-30 00:00:00')

Timestamp('2011-10-31 00:00:00')

2000-01-15   -1.203146
2000-01-19   -1.374458
2000-01-23   -0.182893
2000-01-27   -1.048195
2000-01-31   -0.407904
2000-02-04    0.094914
2000-02-08   -0.510310
2000-02-12   -1.257177
2000-02-16   -0.374134
2000-02-20   -0.315777
2000-02-24   -0.141309
2000-02-28   -0.850939
2000-03-03   -1.292738
2000-03-07    1.999342
2000-03-11    0.179423
2000-03-15    0.707790
2000-03-19   -0.533459
2000-03-23   -0.883477
2000-03-27    1.766139
2000-03-31    0.075253
Freq: 4D, dtype: float64

2000-01-31   -0.843319
2000-02-29   -0.479248
2000-03-31    0.252284
dtype: float64

2000-01-31   -0.843319
2000-02-29   -0.479248
2000-03-31    0.252284
Freq: M, dtype: float64

In [16]:
# time zone handling
# timezone are expressed as offsets from UTC
import pytz

pytz.common_timezones[-5:]
pytz.common_timezones[8]
tz = pytz.timezone('America/Los_Angeles')   # create a time zone object from pytz, use pytz.timezone
tz

['US/Eastern', 'US/Hawaii', 'US/Mountain', 'US/Pacific', 'UTC']

'Africa/Bissau'

<DstTzInfo 'America/Los_Angeles' LMT-1 day, 16:07:00 STD>

In [17]:
# time zone localization and conversion
rng = pd.date_range('3/3/2012 9:30', periods=6, freq='D')
ts = pd.Series(np.random.randn(len(rng)), index=rng)
ts
print(ts.index.tz)  # tz field is None
pd.date_range('3/9/2012 9:30', periods=10, freq='D', tz='UTC')
ts
ts_utc = ts.tz_localize('UTC')
ts_utc
ts_utc.tz_convert('America/New_York')
ts_eastern = ts.tz_localize('America/New_York')
ts_western = ts.tz_localize('America/Los_Angeles')
ts_eastern.tz_convert('UTC')
ts_eastern.tz_convert('Europe/Berlin')
ts.index.tz_localize('Asia/Shanghai')

2012-03-03 09:30:00   -0.686502
2012-03-04 09:30:00    0.429116
2012-03-05 09:30:00    0.290947
2012-03-06 09:30:00   -0.482403
2012-03-07 09:30:00    0.126723
2012-03-08 09:30:00   -1.408667
Freq: D, dtype: float64

None


DatetimeIndex(['2012-03-09 09:30:00+00:00', '2012-03-10 09:30:00+00:00',
               '2012-03-11 09:30:00+00:00', '2012-03-12 09:30:00+00:00',
               '2012-03-13 09:30:00+00:00', '2012-03-14 09:30:00+00:00',
               '2012-03-15 09:30:00+00:00', '2012-03-16 09:30:00+00:00',
               '2012-03-17 09:30:00+00:00', '2012-03-18 09:30:00+00:00'],
              dtype='datetime64[ns, UTC]', freq='D')

2012-03-03 09:30:00   -0.686502
2012-03-04 09:30:00    0.429116
2012-03-05 09:30:00    0.290947
2012-03-06 09:30:00   -0.482403
2012-03-07 09:30:00    0.126723
2012-03-08 09:30:00   -1.408667
Freq: D, dtype: float64

2012-03-03 09:30:00+00:00   -0.686502
2012-03-04 09:30:00+00:00    0.429116
2012-03-05 09:30:00+00:00    0.290947
2012-03-06 09:30:00+00:00   -0.482403
2012-03-07 09:30:00+00:00    0.126723
2012-03-08 09:30:00+00:00   -1.408667
Freq: D, dtype: float64

2012-03-03 04:30:00-05:00   -0.686502
2012-03-04 04:30:00-05:00    0.429116
2012-03-05 04:30:00-05:00    0.290947
2012-03-06 04:30:00-05:00   -0.482403
2012-03-07 04:30:00-05:00    0.126723
2012-03-08 04:30:00-05:00   -1.408667
Freq: D, dtype: float64

2012-03-03 14:30:00+00:00   -0.686502
2012-03-04 14:30:00+00:00    0.429116
2012-03-05 14:30:00+00:00    0.290947
2012-03-06 14:30:00+00:00   -0.482403
2012-03-07 14:30:00+00:00    0.126723
2012-03-08 14:30:00+00:00   -1.408667
Freq: D, dtype: float64

2012-03-03 15:30:00+01:00   -0.686502
2012-03-04 15:30:00+01:00    0.429116
2012-03-05 15:30:00+01:00    0.290947
2012-03-06 15:30:00+01:00   -0.482403
2012-03-07 15:30:00+01:00    0.126723
2012-03-08 15:30:00+01:00   -1.408667
Freq: D, dtype: float64

DatetimeIndex(['2012-03-03 09:30:00+08:00', '2012-03-04 09:30:00+08:00',
               '2012-03-05 09:30:00+08:00', '2012-03-06 09:30:00+08:00',
               '2012-03-07 09:30:00+08:00', '2012-03-08 09:30:00+08:00'],
              dtype='datetime64[ns, Asia/Shanghai]', freq='D')

In [18]:
# operations with timezone-aware timestamp objects
stamp = pd.Timestamp('2011-03-12 04:00')
stamp_utc = stamp.tz_localize('utc')
stamp_utc.tz_convert('America/New_York')
stamp_utc.value  # store a UTC timestamp value as nanoseconds since the Unix epoch(Jan 1, 1970) invariant between time zone conversions

Timestamp('2011-03-11 23:00:00-0500', tz='America/New_York')

1299902400000000000

In [19]:
# operations between different time zones
# If two time series with different time zones are combined, the result will be UTC.
rng = pd.date_range('3/7/2012 9:30', periods=10, freq='B')
ts = pd.Series(np.random.randn(len(rng)), index=rng)
ts
ts1 = ts[:7].tz_localize('Europe/London')
ts1.index
ts2 = ts1[2:].tz_convert('Europe/Moscow')
ts2.index
result = ts1 + ts2
result.index

2012-03-07 09:30:00    0.016466
2012-03-08 09:30:00   -1.724070
2012-03-09 09:30:00   -0.961687
2012-03-12 09:30:00   -1.180559
2012-03-13 09:30:00   -1.265076
2012-03-14 09:30:00   -0.290390
2012-03-15 09:30:00    1.459000
2012-03-16 09:30:00    0.958900
2012-03-19 09:30:00   -1.104767
2012-03-20 09:30:00    0.854493
Freq: B, dtype: float64

DatetimeIndex(['2012-03-07 09:30:00+00:00', '2012-03-08 09:30:00+00:00',
               '2012-03-09 09:30:00+00:00', '2012-03-12 09:30:00+00:00',
               '2012-03-13 09:30:00+00:00', '2012-03-14 09:30:00+00:00',
               '2012-03-15 09:30:00+00:00'],
              dtype='datetime64[ns, Europe/London]', freq='B')

DatetimeIndex(['2012-03-09 13:30:00+04:00', '2012-03-12 13:30:00+04:00',
               '2012-03-13 13:30:00+04:00', '2012-03-14 13:30:00+04:00',
               '2012-03-15 13:30:00+04:00'],
              dtype='datetime64[ns, Europe/Moscow]', freq='B')

DatetimeIndex(['2012-03-07 09:30:00+00:00', '2012-03-08 09:30:00+00:00',
               '2012-03-09 09:30:00+00:00', '2012-03-12 09:30:00+00:00',
               '2012-03-13 09:30:00+00:00', '2012-03-14 09:30:00+00:00',
               '2012-03-15 09:30:00+00:00'],
              dtype='datetime64[ns, UTC]', freq='B')

In [20]:
# periods and period arithmetic
# periods represent timespans, like days, quarters, or years
p = pd.Period(2007, freq='A-DEC')   # 表示以12月作为结束的一整年 annual?
p   # in this case, the period object represents the full timespan from Jan/1/2007
p + 5
pd.Period('2014', freq='A-DEC') - p
rng = pd.period_range('2000-01-01', '2000-06-30', freq='M')
rng
values = ['2001Q3', '2002Q2', '2003Q1']
index = pd.PeriodIndex(values, freq='Q-DEC')
index

Period('2007', 'A-DEC')

Period('2012', 'A-DEC')

7

PeriodIndex(['2000-01', '2000-02', '2000-03', '2000-04', '2000-05', '2000-06'], dtype='period[M]', freq='M')

PeriodIndex(['2001Q3', '2002Q2', '2003Q1'], dtype='period[Q-DEC]', freq='Q-DEC')

In [21]:
# period frequency conversion
# asfreq
p = pd.Period('2007', freq='A-DEC')
p
p.asfreq('M', how='start')
p.asfreq('M', how='end')
p = pd.Period('2007', freq='A-JUN')  # 2006.7-2007.6
p
p.asfreq('M', 'start')
p.asfreq('M', 'end')
# when you are converting from high to low frequency, pandas determines the super-period depending on where
# the subperiod "belongs". For example, in A-JUN frequency, the month Aug-2007 is actually part of th e2008 period
p = pd.Period('Aug-2007', 'M')
p.asfreq('A-JUN')
p.asfreq('A-SEP')
# whole PeriodIndex objects or time series can be similarly converted with the same semantics
rng = pd.period_range('2006', '2009', freq='A-DEC')
ts = pd.Series(np.random.randn(len(rng)), index=rng)
ts
ts.asfreq('M', how='start')
ts.asfreq('B', how='end')   # last business day of each year


Period('2007', 'A-DEC')

Period('2007-01', 'M')

Period('2007-12', 'M')

Period('2007', 'A-JUN')

Period('2006-07', 'M')

Period('2007-06', 'M')

Period('2008', 'A-JUN')

Period('2007', 'A-SEP')

2006    0.016176
2007    1.961273
2008   -1.389327
2009    0.346411
Freq: A-DEC, dtype: float64

2006-01    0.016176
2007-01    1.961273
2008-01   -1.389327
2009-01    0.346411
Freq: M, dtype: float64

2006-12-29    0.016176
2007-12-31    1.961273
2008-12-31   -1.389327
2009-12-31    0.346411
Freq: B, dtype: float64

In [22]:
# quarterly data is standard in accounting, finance...
p = pd.Period('2012Q4', freq='Q-JAN')
p
# in the case of fiscal year ending in Jan, 2012Q4 runs from November through January, which you can check by converting to daily frequency
p.asfreq('D', 'start')
p.asfreq('D', 'end')
p2 = pd.Period('2012Q3', freq='Q-JAN')  # 一月作为一年的最后一个quarter的最后一个月...
p2.asfreq('D', 'start')
p2.asfreq('D', 'end')
p4pm = (p.asfreq('B', 'e') - 1).asfreq('T', 's') + 16 * 60
p4pm
p4pm.to_timestamp()

Period('2012Q4', 'Q-JAN')

Period('2011-11-01', 'D')

Period('2012-01-31', 'D')

Period('2011-08-01', 'D')

Period('2011-10-31', 'D')

Period('2012-01-30 16:00', 'T')

Timestamp('2012-01-30 16:00:00')

In [23]:
# converting timestamps to periods(and back)
rng = pd.date_range('2000-01-01', periods=3, freq='M')
ts = pd.Series(np.random.randn(3), index=rng)
ts
pts = ts.to_period()
pts
rng = pd.date_range('1/29/2000', periods=6, freq='D')
ts2 = pd.Series(np.random.randn(6), index=rng)
ts2
ts2.to_period('M')
pts = ts2.to_period()
pts
pts.to_timestamp(how='e')

2000-01-31   -0.781683
2000-02-29   -0.544524
2000-03-31   -0.682452
Freq: M, dtype: float64

2000-01   -0.781683
2000-02   -0.544524
2000-03   -0.682452
Freq: M, dtype: float64

2000-01-29   -0.245001
2000-01-30   -0.997027
2000-01-31    0.072092
2000-02-01   -0.498126
2000-02-02   -1.583324
2000-02-03    0.596303
Freq: D, dtype: float64

2000-01   -0.245001
2000-01   -0.997027
2000-01    0.072092
2000-02   -0.498126
2000-02   -1.583324
2000-02    0.596303
Freq: M, dtype: float64

2000-01-29   -0.245001
2000-01-30   -0.997027
2000-01-31    0.072092
2000-02-01   -0.498126
2000-02-02   -1.583324
2000-02-03    0.596303
Freq: D, dtype: float64

2000-01-29   -0.245001
2000-01-30   -0.997027
2000-01-31    0.072092
2000-02-01   -0.498126
2000-02-02   -1.583324
2000-02-03    0.596303
Freq: D, dtype: float64

In [26]:
"""
resampling refers to the process of converting a time series from one frequency to another
resample has a similar API to groupby, resample to group then call an aggregation function
"""
rng = pd.date_range('2000-01-01', periods=100, freq='D')
ts = pd.Series(np.random.randn(len(rng)), index=rng)
ts
ts.resample('M').mean()
ts.resample('M', kind='period').mean()

'\nresampling refers to the process of converting a time series from one frequency to another\nresample has a similar API to groupby, resample to group then call an aggregation function\n'

2000-01-01   -0.386173
2000-01-02    0.042016
2000-01-03   -0.136730
2000-01-04    0.155579
2000-01-05    1.693893
2000-01-06    0.854094
2000-01-07    0.143522
2000-01-08    1.354197
2000-01-09   -0.035503
2000-01-10    0.840876
2000-01-11   -0.594492
2000-01-12   -0.408520
2000-01-13   -1.759381
2000-01-14   -0.917538
2000-01-15   -0.440053
2000-01-16    0.793924
2000-01-17    0.558771
2000-01-18    0.464805
2000-01-19   -0.497090
2000-01-20    0.990978
2000-01-21   -1.846803
2000-01-22   -1.230020
2000-01-23   -0.363767
2000-01-24   -1.407067
2000-01-25   -0.196524
2000-01-26    0.230735
2000-01-27    2.473405
2000-01-28   -1.423594
2000-01-29    2.371089
2000-01-30   -0.247778
                ...   
2000-03-11   -0.842516
2000-03-12   -0.560131
2000-03-13    2.058008
2000-03-14    0.399040
2000-03-15    1.039716
2000-03-16   -0.789480
2000-03-17   -0.780044
2000-03-18    1.000631
2000-03-19   -0.798537
2000-03-20   -1.042686
2000-03-21    0.459716
2000-03-22   -0.540368
2000-03-23 

2000-01-31    0.076349
2000-02-29    0.115857
2000-03-31   -0.197922
2000-04-30    0.485648
Freq: M, dtype: float64

2000-01    0.076349
2000-02    0.115857
2000-03   -0.197922
2000-04    0.485648
Freq: M, dtype: float64

In [32]:
# downsampling: from higher frequency to lower
rng = pd.date_range('2000-01-01', periods=12, freq='T')
ts = pd.Series(np.arange(12), index=rng)
ts
ts.resample('5min', closed='right').sum()
ts.resample('5min').sum()
# by default, the left bin edge is inclusive, so the 00:00 value is included in the 00:00 to 00:05
ts.resample('5min', closed='right', label='right').sum()
# shift the result index by some amount
ts.resample('5min', closed='right', label='right', loffset='-1s').sum()
# Open-High-Low-Close(OHLC) resampling
ts.resample('5min').ohlc()  # first, last, maximum, minimum

2000-01-01 00:00:00     0
2000-01-01 00:01:00     1
2000-01-01 00:02:00     2
2000-01-01 00:03:00     3
2000-01-01 00:04:00     4
2000-01-01 00:05:00     5
2000-01-01 00:06:00     6
2000-01-01 00:07:00     7
2000-01-01 00:08:00     8
2000-01-01 00:09:00     9
2000-01-01 00:10:00    10
2000-01-01 00:11:00    11
Freq: T, dtype: int32

1999-12-31 23:55:00     0
2000-01-01 00:00:00    15
2000-01-01 00:05:00    40
2000-01-01 00:10:00    11
Freq: 5T, dtype: int32

2000-01-01 00:00:00    10
2000-01-01 00:05:00    35
2000-01-01 00:10:00    21
Freq: 5T, dtype: int32

2000-01-01 00:00:00     0
2000-01-01 00:05:00    15
2000-01-01 00:10:00    40
2000-01-01 00:15:00    11
Freq: 5T, dtype: int32

1999-12-31 23:59:59     0
2000-01-01 00:04:59    15
2000-01-01 00:09:59    40
2000-01-01 00:14:59    11
Freq: 5T, dtype: int32

Unnamed: 0,open,high,low,close
2000-01-01 00:00:00,0,4,0,4
2000-01-01 00:05:00,5,9,5,9
2000-01-01 00:10:00,10,11,10,11


In [36]:
# upsampling and interpolation
frame = pd.DataFrame(np.random.randn(2, 4),
                     index=pd.date_range('1/1/2000', periods=2,
                                         freq='W-WED'),
                     columns=['Colorado', 'Texas', 'New York', 'Ohio'])
frame
df_daily = frame.resample('D').asfreq()
df_daily
frame.resample('D').ffill()
frame.resample('D').ffill(limit=2)
frame.resample('W-THU').ffill()

Unnamed: 0,Colorado,Texas,New York,Ohio
2000-01-05,-0.438141,-0.524943,-0.642989,-0.30367
2000-01-12,0.989555,1.343975,-1.506564,-1.744829


Unnamed: 0,Colorado,Texas,New York,Ohio
2000-01-05,-0.438141,-0.524943,-0.642989,-0.30367
2000-01-06,,,,
2000-01-07,,,,
2000-01-08,,,,
2000-01-09,,,,
2000-01-10,,,,
2000-01-11,,,,
2000-01-12,0.989555,1.343975,-1.506564,-1.744829


Unnamed: 0,Colorado,Texas,New York,Ohio
2000-01-05,-0.438141,-0.524943,-0.642989,-0.30367
2000-01-06,-0.438141,-0.524943,-0.642989,-0.30367
2000-01-07,-0.438141,-0.524943,-0.642989,-0.30367
2000-01-08,-0.438141,-0.524943,-0.642989,-0.30367
2000-01-09,-0.438141,-0.524943,-0.642989,-0.30367
2000-01-10,-0.438141,-0.524943,-0.642989,-0.30367
2000-01-11,-0.438141,-0.524943,-0.642989,-0.30367
2000-01-12,0.989555,1.343975,-1.506564,-1.744829


Unnamed: 0,Colorado,Texas,New York,Ohio
2000-01-05,-0.438141,-0.524943,-0.642989,-0.30367
2000-01-06,-0.438141,-0.524943,-0.642989,-0.30367
2000-01-07,-0.438141,-0.524943,-0.642989,-0.30367
2000-01-08,,,,
2000-01-09,,,,
2000-01-10,,,,
2000-01-11,,,,
2000-01-12,0.989555,1.343975,-1.506564,-1.744829


Unnamed: 0,Colorado,Texas,New York,Ohio
2000-01-06,-0.438141,-0.524943,-0.642989,-0.30367
2000-01-13,0.989555,1.343975,-1.506564,-1.744829


In [38]:
# resampling with periods
frame = pd.DataFrame(np.random.randn(24, 4),
                     index=pd.period_range('1-2000', '12-2001',
                                           freq='M'),
                     columns=['Colorado', 'Texas', 'New York', 'Ohio'])
frame[:5]
annual_frame = frame.resample('A-DEC').mean()
annual_frame
annual_frame.resample('Q-DEC').ffill()
annual_frame.resample('Q-DEC', convention='end').ffill()

Unnamed: 0,Colorado,Texas,New York,Ohio
2000-01,-0.204921,0.062865,-0.317342,-2.554881
2000-02,-1.438312,0.507934,-0.418419,-1.143274
2000-03,0.415318,-0.141017,-1.406202,-1.172406
2000-04,-1.199744,-0.515944,1.454003,-0.972013
2000-05,1.273769,0.377305,-0.154437,-0.039328


Unnamed: 0,Colorado,Texas,New York,Ohio
2000,0.314708,-0.020274,-0.318318,-0.678566
2001,0.140117,-0.17659,0.026317,0.405509


Unnamed: 0,Colorado,Texas,New York,Ohio
2000Q1,0.314708,-0.020274,-0.318318,-0.678566
2000Q2,0.314708,-0.020274,-0.318318,-0.678566
2000Q3,0.314708,-0.020274,-0.318318,-0.678566
2000Q4,0.314708,-0.020274,-0.318318,-0.678566
2001Q1,0.140117,-0.17659,0.026317,0.405509
2001Q2,0.140117,-0.17659,0.026317,0.405509
2001Q3,0.140117,-0.17659,0.026317,0.405509
2001Q4,0.140117,-0.17659,0.026317,0.405509


Unnamed: 0,Colorado,Texas,New York,Ohio
2000Q4,0.314708,-0.020274,-0.318318,-0.678566
2001Q1,0.314708,-0.020274,-0.318318,-0.678566
2001Q2,0.314708,-0.020274,-0.318318,-0.678566
2001Q3,0.314708,-0.020274,-0.318318,-0.678566
2001Q4,0.140117,-0.17659,0.026317,0.405509
