In [1]:
from IPython.core.interactiveshell import InteractiveShell
InteractiveShell.ast_node_interactivity = "all"

In [2]:
"""
Anything that is observed or measured at many points in time forms a time series. Many time series are fixed frequency
• Timestamps, specific instants in time
• Fixed periods, such as the month January 2007 or the full year 2010
• Intervals of time, indicated by a start and end timestamp. Periods can be thought
of as special cases of intervals
• Experiment or elapsed time; each timestamp is a measure of time relative to a
particular start time (e.g., the diameter of a cookie baking each second since
being placed in the oven)
"""
# Date and Time data types and tools
from datetime import datetime, timedelta
now = datetime.now()
now
now.year, now.month, now.day
delta = datetime(2011, 1, 7) - datetime(2008, 6, 24, 8, 15)
delta
delta.days, delta.seconds
start = datetime(2011, 1, 7)
start + timedelta(12)   # days = 12

'\nAnything that is observed or measured at many points in time forms a time series. Many time series are fixed frequency\n• Timestamps, specific instants in time\n• Fixed periods, such as the month January 2007 or the full year 2010\n• Intervals of time, indicated by a start and end timestamp. Periods can be thought\nof as special cases of intervals\n• Experiment or elapsed time; each timestamp is a measure of time relative to a\nparticular start time (e.g., the diameter of a cookie baking each second since\nbeing placed in the oven)\n'

datetime.datetime(2020, 2, 18, 15, 8, 16, 347267)

(2020, 2, 18)

datetime.timedelta(926, 56700)

(926, 56700)

datetime.datetime(2011, 1, 19, 0, 0)

In [3]:
# converting between string and datetime
stamp = datetime(2011, 1, 3)
str(stamp)
stamp.strftime('%Y-%m-%d')
stamp.strftime('%y-%m-%d-%w')
stamp.strftime('%F')
"""
%Y Four-digit year
%y Two-digit year
%m Two-digit month [01, 12]
%d Two-digit day [01, 31]
%H Hour (24-hour clock) [00, 23]
%I Hour (12-hour clock) [01, 12]
%M Two-digit minute [00, 59]
%S Second [00, 61] (seconds 60, 61 account for leap seconds)
%w Weekday as integer [0 (Sunday), 6]
%U Week number of the year [00, 53]; Sunday is considered the first day of the week, and days before the first Sunday of
the year are “week 0”
%W Week number of the year [00, 53]; Monday is considered the first day of the week, and days before the first Monday of
the year are “week 0”
%z UTC time zone offset as +HHMM or -HHMM; empty if time zone naive
%F Shortcut for %Y-%m-%d (e.g., 2012-4-18)
%D Shortcut for %m/%d/%y (e.g., 04/18/12)
"""
# from str to datetime
value = '2011-01-03'
datetime.strptime(value, '%F')
# parser.parse method in the third-party dateutil package
from dateutil.parser import parse
parse('2011-01-03')
parse('Jan 31, 1997 10:45 PM')
parse('6/12/2011', dayfirst=True)   # day appearing before month
# dateutil.parser is a useful but imperfect tool. it will recognize some strings as dates that you might prefer that it didn't
# '42' will be parsed as the year 2042

'2011-01-03 00:00:00'

'2011-01-03'

'11-01-03-1'

'2011-01-03'

'\n%Y Four-digit year\n%y Two-digit year\n%m Two-digit month [01, 12]\n%d Two-digit day [01, 31]\n%H Hour (24-hour clock) [00, 23]\n%I Hour (12-hour clock) [01, 12]\n%M Two-digit minute [00, 59]\n%S Second [00, 61] (seconds 60, 61 account for leap seconds)\n%w Weekday as integer [0 (Sunday), 6]\n%U Week number of the year [00, 53]; Sunday is considered the first day of the week, and days before the first Sunday of\nthe year are “week 0”\n%W Week number of the year [00, 53]; Monday is considered the first day of the week, and days before the first Monday of\nthe year are “week 0”\n%z UTC time zone offset as +HHMM or -HHMM; empty if time zone naive\n%F Shortcut for %Y-%m-%d (e.g., 2012-4-18)\n%D Shortcut for %m/%d/%y (e.g., 04/18/12)\n'

ValueError: 'F' is a bad directive in format '%F'

In [4]:
import pandas as pd

datestrs = ['2011-07-06 12:00:00', '2011-08-06 00:00:00']
pd.to_datetime(datestrs)
idx = pd.to_datetime(datestrs + [None])
idx
pd.isnull(idx)

DatetimeIndex(['2011-07-06 12:00:00', '2011-08-06 00:00:00'], dtype='datetime64[ns]', freq=None)

DatetimeIndex(['2011-07-06 12:00:00', '2011-08-06 00:00:00', 'NaT'], dtype='datetime64[ns]', freq=None)

array([False, False,  True])

In [5]:
# time series basics
import numpy as np
from datetime import datetime

dates = [datetime(2011, 1, 2), datetime(2011, 1, 5),
         datetime(2011, 1, 7), datetime(2011, 1, 8),
         datetime(2011, 1, 10), datetime(2011, 1, 12)]
ts = pd.Series(np.random.randn(3), index=idx)
ts
ts = pd.Series(np.random.randn(6), index=dates)
ts
ts.index
ts + ts[::2]
stamp = ts.index[0]
stamp

2011-07-06 12:00:00    1.649798
2011-08-06 00:00:00    0.456721
NaT                    0.111264
dtype: float64

2011-01-02    0.715327
2011-01-05    0.756917
2011-01-07    0.649669
2011-01-08   -1.250535
2011-01-10    1.262826
2011-01-12    1.720687
dtype: float64

DatetimeIndex(['2011-01-02', '2011-01-05', '2011-01-07', '2011-01-08',
               '2011-01-10', '2011-01-12'],
              dtype='datetime64[ns]', freq=None)

2011-01-02    1.430654
2011-01-05         NaN
2011-01-07    1.299338
2011-01-08         NaN
2011-01-10    2.525651
2011-01-12         NaN
dtype: float64

Timestamp('2011-01-02 00:00:00')

In [6]:
# indexing, selection, subsetting
stamp = ts.index[2]
stamp
ts[stamp]
# As a convenience, you can also pass a string that is interpretable as a date
ts['1/10/2011']
ts['20110110']

# for longer time series
longer_ts = pd.Series(np.random.randn(1000),
                      index=pd.date_range('1/1/2000', periods=1000))
longer_ts
longer_ts['2001']   # selects time period of whole 2001 year
# slicing with datetime object
ts[datetime(2011, 1, 7):]

Timestamp('2011-01-07 00:00:00')

0.6496690446138357

1.262825603258606

1.262825603258606

2000-01-01    2.226228
2000-01-02   -0.372992
2000-01-03    1.291018
2000-01-04   -0.326265
2000-01-05   -0.276326
2000-01-06   -0.355569
2000-01-07    0.194608
2000-01-08   -0.890136
2000-01-09    0.379868
2000-01-10    0.734949
2000-01-11   -0.550603
2000-01-12   -0.228024
2000-01-13   -1.153591
2000-01-14    0.558844
2000-01-15   -0.800265
2000-01-16    0.137146
2000-01-17    0.210986
2000-01-18    0.066799
2000-01-19    0.129609
2000-01-20   -0.053827
2000-01-21   -1.245547
2000-01-22    0.681900
2000-01-23    0.659660
2000-01-24    0.781386
2000-01-25   -1.390673
2000-01-26   -0.995112
2000-01-27    0.212307
2000-01-28    0.461557
2000-01-29    1.670926
2000-01-30   -0.615635
                ...   
2002-08-28    1.312226
2002-08-29   -1.079289
2002-08-30   -0.400025
2002-08-31    0.219371
2002-09-01    0.426936
2002-09-02    0.801212
2002-09-03   -0.338356
2002-09-04    0.821953
2002-09-05    0.418181
2002-09-06   -0.758751
2002-09-07   -0.311996
2002-09-08   -0.561633
2002-09-09 

2001-01-01    0.375831
2001-01-02   -0.932422
2001-01-03   -2.045705
2001-01-04   -2.193407
2001-01-05    0.073083
2001-01-06   -0.188490
2001-01-07    0.097755
2001-01-08   -0.456239
2001-01-09   -0.937280
2001-01-10   -0.946649
2001-01-11   -0.619414
2001-01-12    0.262100
2001-01-13   -0.342497
2001-01-14    2.005820
2001-01-15   -0.907864
2001-01-16    0.145399
2001-01-17    0.454555
2001-01-18   -1.020103
2001-01-19   -0.359284
2001-01-20   -0.407811
2001-01-21    0.638532
2001-01-22   -0.056133
2001-01-23   -0.621321
2001-01-24    1.035227
2001-01-25   -0.001365
2001-01-26   -0.612778
2001-01-27    2.096310
2001-01-28   -1.197693
2001-01-29    2.041561
2001-01-30    0.498086
                ...   
2001-12-02    0.569753
2001-12-03   -0.928173
2001-12-04    1.429428
2001-12-05    0.904111
2001-12-06    0.072477
2001-12-07    0.162621
2001-12-08   -0.797783
2001-12-09    0.330374
2001-12-10    1.492985
2001-12-11   -0.142104
2001-12-12    1.382928
2001-12-13    0.827944
2001-12-14 

2011-01-07    0.649669
2011-01-08   -1.250535
2011-01-10    1.262826
2011-01-12    1.720687
dtype: float64

In [7]:
# slice with timestamps not contained in a time series to perform a range query, slicing produces views
ts
ts['1/6/2011': '1/11/2011']
ts.truncate(after='1/9/2011')


2011-01-02    0.715327
2011-01-05    0.756917
2011-01-07    0.649669
2011-01-08   -1.250535
2011-01-10    1.262826
2011-01-12    1.720687
dtype: float64

2011-01-07    0.649669
2011-01-08   -1.250535
2011-01-10    1.262826
dtype: float64

2011-01-02    0.715327
2011-01-05    0.756917
2011-01-07    0.649669
2011-01-08   -1.250535
dtype: float64

In [8]:
# holds true for DataFrame as well, indexing on its rows
dates = pd.date_range('1/1/2000', periods=100, freq='W-WED')
dates
long_df = pd.DataFrame(np.random.randn(100, 4),
                       index=dates,
                       columns=['Colorado', 'Texas',
                                'New York', 'Ohio'])
long_df.loc['5-2001']

DatetimeIndex(['2000-01-05', '2000-01-12', '2000-01-19', '2000-01-26',
               '2000-02-02', '2000-02-09', '2000-02-16', '2000-02-23',
               '2000-03-01', '2000-03-08', '2000-03-15', '2000-03-22',
               '2000-03-29', '2000-04-05', '2000-04-12', '2000-04-19',
               '2000-04-26', '2000-05-03', '2000-05-10', '2000-05-17',
               '2000-05-24', '2000-05-31', '2000-06-07', '2000-06-14',
               '2000-06-21', '2000-06-28', '2000-07-05', '2000-07-12',
               '2000-07-19', '2000-07-26', '2000-08-02', '2000-08-09',
               '2000-08-16', '2000-08-23', '2000-08-30', '2000-09-06',
               '2000-09-13', '2000-09-20', '2000-09-27', '2000-10-04',
               '2000-10-11', '2000-10-18', '2000-10-25', '2000-11-01',
               '2000-11-08', '2000-11-15', '2000-11-22', '2000-11-29',
               '2000-12-06', '2000-12-13', '2000-12-20', '2000-12-27',
               '2001-01-03', '2001-01-10', '2001-01-17', '2001-01-24',
      

Unnamed: 0,Colorado,Texas,New York,Ohio
2001-05-02,-1.634552,-0.459202,0.184533,0.705621
2001-05-09,0.678235,-0.612991,0.382822,-0.474643
2001-05-16,0.017702,0.243652,-1.414015,1.362887
2001-05-23,-2.340715,-0.522536,1.321039,0.783216
2001-05-30,-0.446368,0.530365,0.475904,0.091022


In [9]:
# time series with duplicate indices
dates = pd.DatetimeIndex(['1/1/2000', '1/2/2000', '1/2/2000',
                          '1/2/2000', '1/3/2000'])
dup_ts = pd.Series(np.arange(5), index=dates)
dup_ts
dup_ts['1/2/2000']
grouped = dup_ts.groupby(level=0)
grouped.mean()


2000-01-01    0
2000-01-02    1
2000-01-02    2
2000-01-02    3
2000-01-03    4
dtype: int32

2000-01-02    1
2000-01-02    2
2000-01-02    3
dtype: int32

2000-01-01    0
2000-01-02    2
2000-01-03    4
dtype: int32

In [10]:
# date ranges, frequencies, and shifting
# pandas has a
# full suite of standard time series frequencies and tools for resampling, inferring frequencies,
# and generating fixed-frequency date ranges. For example, you can convert
# the sample time series to be fixed daily frequency by calling resample:
ts
resampler = ts.resample('D')    # 'D' is interpreted as daily frequency
resampler


2011-01-02    0.715327
2011-01-05    0.756917
2011-01-07    0.649669
2011-01-08   -1.250535
2011-01-10    1.262826
2011-01-12    1.720687
dtype: float64

DatetimeIndexResampler [freq=<Day>, axis=0, closed=left, label=left, convention=start, base=0]

In [14]:
# generating date ranges
# date_range is responsible for generating a DatetimeIndex with an indicated length according to a particular frequency
index = pd.date_range('2012-04-01', '2012-06-01')
index
pd.date_range(start='2012-04-01', periods=20)
pd.date_range(end='2012-04-01', periods=20)
# if you wanted a date index containing the last business day of each month, you would pass the 'BM' frequency
pd.date_range('2000-01-01', '2000-12-01', freq='BM')
pd.date_range('2000-01-01', '2000-02-01', freq='B')  # only businessday
pd.date_range('2000-01-01', '2000-02-01', freq='H')
# by default preserves the time
pd.date_range('2012-05-02 12:56:31', periods=5)
pd.date_range('2012-05-02 12:56:31', periods=5, normalize=True)


DatetimeIndex(['2012-04-01', '2012-04-02', '2012-04-03', '2012-04-04',
               '2012-04-05', '2012-04-06', '2012-04-07', '2012-04-08',
               '2012-04-09', '2012-04-10', '2012-04-11', '2012-04-12',
               '2012-04-13', '2012-04-14', '2012-04-15', '2012-04-16',
               '2012-04-17', '2012-04-18', '2012-04-19', '2012-04-20',
               '2012-04-21', '2012-04-22', '2012-04-23', '2012-04-24',
               '2012-04-25', '2012-04-26', '2012-04-27', '2012-04-28',
               '2012-04-29', '2012-04-30', '2012-05-01', '2012-05-02',
               '2012-05-03', '2012-05-04', '2012-05-05', '2012-05-06',
               '2012-05-07', '2012-05-08', '2012-05-09', '2012-05-10',
               '2012-05-11', '2012-05-12', '2012-05-13', '2012-05-14',
               '2012-05-15', '2012-05-16', '2012-05-17', '2012-05-18',
               '2012-05-19', '2012-05-20', '2012-05-21', '2012-05-22',
               '2012-05-23', '2012-05-24', '2012-05-25', '2012-05-26',
      

DatetimeIndex(['2012-04-01', '2012-04-02', '2012-04-03', '2012-04-04',
               '2012-04-05', '2012-04-06', '2012-04-07', '2012-04-08',
               '2012-04-09', '2012-04-10', '2012-04-11', '2012-04-12',
               '2012-04-13', '2012-04-14', '2012-04-15', '2012-04-16',
               '2012-04-17', '2012-04-18', '2012-04-19', '2012-04-20'],
              dtype='datetime64[ns]', freq='D')

DatetimeIndex(['2012-03-13', '2012-03-14', '2012-03-15', '2012-03-16',
               '2012-03-17', '2012-03-18', '2012-03-19', '2012-03-20',
               '2012-03-21', '2012-03-22', '2012-03-23', '2012-03-24',
               '2012-03-25', '2012-03-26', '2012-03-27', '2012-03-28',
               '2012-03-29', '2012-03-30', '2012-03-31', '2012-04-01'],
              dtype='datetime64[ns]', freq='D')

DatetimeIndex(['2000-01-31', '2000-02-29', '2000-03-31', '2000-04-28',
               '2000-05-31', '2000-06-30', '2000-07-31', '2000-08-31',
               '2000-09-29', '2000-10-31', '2000-11-30'],
              dtype='datetime64[ns]', freq='BM')

DatetimeIndex(['2000-01-03', '2000-01-04', '2000-01-05', '2000-01-06',
               '2000-01-07', '2000-01-10', '2000-01-11', '2000-01-12',
               '2000-01-13', '2000-01-14', '2000-01-17', '2000-01-18',
               '2000-01-19', '2000-01-20', '2000-01-21', '2000-01-24',
               '2000-01-25', '2000-01-26', '2000-01-27', '2000-01-28',
               '2000-01-31', '2000-02-01'],
              dtype='datetime64[ns]', freq='B')

DatetimeIndex(['2000-01-01 00:00:00', '2000-01-01 01:00:00',
               '2000-01-01 02:00:00', '2000-01-01 03:00:00',
               '2000-01-01 04:00:00', '2000-01-01 05:00:00',
               '2000-01-01 06:00:00', '2000-01-01 07:00:00',
               '2000-01-01 08:00:00', '2000-01-01 09:00:00',
               ...
               '2000-01-31 15:00:00', '2000-01-31 16:00:00',
               '2000-01-31 17:00:00', '2000-01-31 18:00:00',
               '2000-01-31 19:00:00', '2000-01-31 20:00:00',
               '2000-01-31 21:00:00', '2000-01-31 22:00:00',
               '2000-01-31 23:00:00', '2000-02-01 00:00:00'],
              dtype='datetime64[ns]', length=745, freq='H')

In [20]:
# frequencies and date offsets
# date offset
from pandas.tseries.offsets import Hour, Minute
hour = Hour()
hour
Hour(4)
# mostly you would never need to explicitly create one of these objects
pd.date_range('2000-01-01', '2000-01-03', freq='4h')
pd.date_range('2000-01-01', '2000-01-03', freq='1h30min')
# week of month dates
# one useful frequency class is 'week of month' starting with WOM which enables you to get dates like the third Friday or so
rng = pd.date_range('2012-01-01', '2012-09-01', freq='WOM-3FRI')
list(rng)

<Hour>

<4 * Hours>

DatetimeIndex(['2000-01-01 00:00:00', '2000-01-01 04:00:00',
               '2000-01-01 08:00:00', '2000-01-01 12:00:00',
               '2000-01-01 16:00:00', '2000-01-01 20:00:00',
               '2000-01-02 00:00:00', '2000-01-02 04:00:00',
               '2000-01-02 08:00:00', '2000-01-02 12:00:00',
               '2000-01-02 16:00:00', '2000-01-02 20:00:00',
               '2000-01-03 00:00:00'],
              dtype='datetime64[ns]', freq='4H')

DatetimeIndex(['2000-01-01 00:00:00', '2000-01-01 01:30:00',
               '2000-01-01 03:00:00', '2000-01-01 04:30:00',
               '2000-01-01 06:00:00', '2000-01-01 07:30:00',
               '2000-01-01 09:00:00', '2000-01-01 10:30:00',
               '2000-01-01 12:00:00', '2000-01-01 13:30:00',
               '2000-01-01 15:00:00', '2000-01-01 16:30:00',
               '2000-01-01 18:00:00', '2000-01-01 19:30:00',
               '2000-01-01 21:00:00', '2000-01-01 22:30:00',
               '2000-01-02 00:00:00', '2000-01-02 01:30:00',
               '2000-01-02 03:00:00', '2000-01-02 04:30:00',
               '2000-01-02 06:00:00', '2000-01-02 07:30:00',
               '2000-01-02 09:00:00', '2000-01-02 10:30:00',
               '2000-01-02 12:00:00', '2000-01-02 13:30:00',
               '2000-01-02 15:00:00', '2000-01-02 16:30:00',
               '2000-01-02 18:00:00', '2000-01-02 19:30:00',
               '2000-01-02 21:00:00', '2000-01-02 22:30:00',
               '2000-01-

[Timestamp('2012-01-20 00:00:00', freq='WOM-3FRI'),
 Timestamp('2012-02-17 00:00:00', freq='WOM-3FRI'),
 Timestamp('2012-03-16 00:00:00', freq='WOM-3FRI'),
 Timestamp('2012-04-20 00:00:00', freq='WOM-3FRI'),
 Timestamp('2012-05-18 00:00:00', freq='WOM-3FRI'),
 Timestamp('2012-06-15 00:00:00', freq='WOM-3FRI'),
 Timestamp('2012-07-20 00:00:00', freq='WOM-3FRI'),
 Timestamp('2012-08-17 00:00:00', freq='WOM-3FRI')]

In [25]:
# shifting(leading and lagging) data
ts = pd.Series(np.random.randn(4),
               index=pd.date_range('1/1/2000', periods=4, freq='M'))
ts
ts.shift(2)
ts.shift(-2)
# a common use of shift is computing percent changes in a time series or multiple time series
ts / ts.shift(1) - 1
ts.shift(2, freq='M')
ts.shift(2, freq='D')
ts.shift(1, freq='90T')

2000-01-31   -0.610653
2000-02-29   -1.521783
2000-03-31    1.498375
2000-04-30   -0.150941
Freq: M, dtype: float64

2000-01-31         NaN
2000-02-29         NaN
2000-03-31   -0.610653
2000-04-30   -1.521783
Freq: M, dtype: float64

2000-01-31    1.498375
2000-02-29   -0.150941
2000-03-31         NaN
2000-04-30         NaN
Freq: M, dtype: float64

2000-01-31         NaN
2000-02-29    1.492060
2000-03-31   -1.984618
2000-04-30   -1.100737
Freq: M, dtype: float64

2000-03-31   -0.610653
2000-04-30   -1.521783
2000-05-31    1.498375
2000-06-30   -0.150941
Freq: M, dtype: float64

2000-02-02   -0.610653
2000-03-02   -1.521783
2000-04-02    1.498375
2000-05-02   -0.150941
dtype: float64

2000-01-31 01:30:00   -0.610653
2000-02-29 01:30:00   -1.521783
2000-03-31 01:30:00    1.498375
2000-04-30 01:30:00   -0.150941
Freq: M, dtype: float64

In [30]:
# shifting dates with offsets
from pandas.tseries.offsets import Day, MonthEnd

now = datetime(2011, 11, 17)
now + 3 * Day()
now + MonthEnd()
now + MonthEnd(2)
offset = MonthEnd()
offset.rollforward(now)
offset.rollback(now)
# a creative use of date offsets is to use these methods with groupby
ts = pd.Series(np.random.randn(20),
               index=pd.date_range('1/15/2000', periods=20, freq='4d'))
ts
ts.groupby(offset.rollforward).mean()
# an easier way is to use resample
ts.resample('M').mean()

Timestamp('2011-11-20 00:00:00')

Timestamp('2011-11-30 00:00:00')

Timestamp('2011-12-31 00:00:00')

Timestamp('2011-11-30 00:00:00')

Timestamp('2011-10-31 00:00:00')

2000-01-15   -0.976085
2000-01-19    1.779162
2000-01-23   -0.271855
2000-01-27   -0.081055
2000-01-31   -0.825880
2000-02-04   -1.035571
2000-02-08   -0.244828
2000-02-12   -1.096827
2000-02-16   -0.426689
2000-02-20   -0.411702
2000-02-24   -0.439954
2000-02-28    0.425226
2000-03-03   -0.771999
2000-03-07   -0.074223
2000-03-11    0.095599
2000-03-15    1.555791
2000-03-19   -0.946401
2000-03-23   -0.351056
2000-03-27    0.270578
2000-03-31    0.690277
Freq: 4D, dtype: float64