# Trouble with time series data 

Time series data has a special place in data analysis, and is increasingly common in today’s real-time world.

In [64]:
import pandas as pd
import numpy as np

## Fixing messy time series data with DatetimeIndex


In [60]:
# Twelve consecutive hourly timestamps starting at midnight on 2019-01-01.
# Lowercase 'h' is used because the uppercase 'H' alias is deprecated in pandas >= 2.2.
pd.date_range('1/1/2019', periods=12, freq='h')

DatetimeIndex(['2019-01-01 00:00:00', '2019-01-01 01:00:00',
               '2019-01-01 02:00:00', '2019-01-01 03:00:00',
               '2019-01-01 04:00:00', '2019-01-01 05:00:00',
               '2019-01-01 06:00:00', '2019-01-01 07:00:00',
               '2019-01-01 08:00:00', '2019-01-01 09:00:00',
               '2019-01-01 10:00:00', '2019-01-01 11:00:00'],
              dtype='datetime64[ns]', freq='H')

In [61]:
# Same start and length as the hourly example, but stepping one calendar day at a time
pd.date_range(start='1/1/2019', periods=12, freq='D')

DatetimeIndex(['2019-01-01', '2019-01-02', '2019-01-03', '2019-01-04',
               '2019-01-05', '2019-01-06', '2019-01-07', '2019-01-08',
               '2019-01-09', '2019-01-10', '2019-01-11', '2019-01-12'],
              dtype='datetime64[ns]', freq='D')

In [65]:
# Turn 12 random draws into a time series by indexing them with a daily DatetimeIndex
pd.Series(
    data=np.random.randn(12),
    index=pd.date_range(start='1/1/2019', periods=12, freq='D'),
)

2019-01-01    0.304601
2019-01-02   -1.301215
2019-01-03   -0.918825
2019-01-04    0.353027
2019-01-05    1.087069
2019-01-06    0.322118
2019-01-07    0.941573
2019-01-08    0.223489
2019-01-09   -0.729176
2019-01-10    1.360778
2019-01-11   -0.981381
2019-01-12    1.820091
Freq: D, dtype: float64

In [67]:
# Change frequency: re-sample an hourly series onto a 45-minute grid.
# method='pad' fills each new timestamp with the last observed hourly value.
# Lowercase aliases ('h', 'min') are used because 'H' is deprecated in pandas >= 2.2.
pd.Series(
    np.random.randn(12),
    index=pd.date_range('1/1/2019', periods=12, freq='h'),
).asfreq('45min', method='pad')

2019-01-01 00:00:00   -1.318194
2019-01-01 00:45:00   -1.318194
2019-01-01 01:30:00    0.557019
2019-01-01 02:15:00    2.109572
2019-01-01 03:00:00   -1.824286
2019-01-01 03:45:00   -1.824286
2019-01-01 04:30:00    1.899163
2019-01-01 05:15:00    1.901835
2019-01-01 06:00:00    0.226411
2019-01-01 06:45:00    0.226411
2019-01-01 07:30:00    0.385678
2019-01-01 08:15:00   -0.178140
2019-01-01 09:00:00   -0.974349
2019-01-01 09:45:00   -0.974349
2019-01-01 10:30:00   -1.476653
Freq: 45T, dtype: float64

In [68]:
# Resample: aggregate 72 hourly observations (three full days) into daily means.
# Lowercase 'h' is used because the uppercase 'H' alias is deprecated in pandas >= 2.2.
pd.Series(
    np.random.randn(72),
    index=pd.date_range('1/1/2019', periods=72, freq='h'),
).resample('D').mean()

2019-01-01   -0.075579
2019-01-02    0.090393
2019-01-03    0.286684
Freq: D, dtype: float64

In [69]:
# A single point in time, built from explicit calendar components (midnight is implied)
pd.Timestamp(year=2019, month=1, day=1)

Timestamp('2019-01-01 00:00:00')

In [56]:
# reading data from disk with timeseries information
# Build the demo frame explicitly: pandas.util.testing (and makeTimeDataFrame)
# was deprecated in pandas 1.0 and is no longer available in pandas 2.x.
# 30 rows x 4 random columns on a millisecond-spaced index starting 2000-01-01.
df = pd.DataFrame(
    np.random.randn(30, 4),
    columns=list("ABCD"),
    index=pd.date_range("2000-01-01", periods=30, freq="ms"),
)
# Copy the index into a regular column so it survives a records-oriented round trip
df["t"] = df.index
df.head()

Unnamed: 0,A,B,C,D,t
2000-01-01 00:00:00.000,-0.944513,1.315922,1.922571,0.601529,2000-01-01 00:00:00.000
2000-01-01 00:00:00.001,0.390411,1.805532,0.459364,0.309276,2000-01-01 00:00:00.001
2000-01-01 00:00:00.002,-1.040051,0.418895,0.725516,-1.858552,2000-01-01 00:00:00.002
2000-01-01 00:00:00.003,0.400832,0.529163,0.00256,2.663544,2000-01-01 00:00:00.003
2000-01-01 00:00:00.004,0.865504,0.956609,0.5579,-1.18528,2000-01-01 00:00:00.004


In [57]:
# Write the frame as a JSON list of row records. As the read-back below shows,
# the datetime column t ends up as integer epoch milliseconds and the
# DatetimeIndex is not kept.
df.to_json("events.json", orient="records")

In [58]:
# if you read t, it comes out as an integer
# (epoch milliseconds, e.g. 946684800000 for 2000-01-01), not as a datetime
df2 = pd.read_json("events.json", orient="records")
df2.head()

Unnamed: 0,A,B,C,D,t
0,-0.944513,1.315922,1.922571,0.601529,946684800000
1,0.390411,1.805532,0.459364,0.309276,946684800001
2,-1.040051,0.418895,0.725516,-1.858552,946684800002
3,0.400832,0.529163,0.00256,2.663544,946684800003
4,0.865504,0.956609,0.5579,-1.18528,946684800004


In [59]:
# how do we recover t?
# Interpret the integers as milliseconds since the Unix epoch and overwrite the column.
df2["t"] = pd.to_datetime(df2["t"], unit="ms")
df2.head()

Unnamed: 0,A,B,C,D,t
0,-0.944513,1.315922,1.922571,0.601529,2000-01-01 00:00:00.000
1,0.390411,1.805532,0.459364,0.309276,2000-01-01 00:00:00.001
2,-1.040051,0.418895,0.725516,-1.858552,2000-01-01 00:00:00.002
3,0.400832,0.529163,0.00256,2.663544,2000-01-01 00:00:00.003
4,0.865504,0.956609,0.5579,-1.18528,2000-01-01 00:00:00.004


## Segmenting and offsetting time series data to find the right subset

In [71]:
    # Date components stored in separate columns, one row per timestamp
    pd.DataFrame(
        {'year': [2018, 2019],
         'month': [1, 2],
         'day': [4, 5],
         'hour': [2, 3]}
    )

Unnamed: 0,year,month,day,hour
0,2018,1,4,2
1,2019,2,5,3


In [70]:
# Assemble one timestamp per row from a frame whose columns are the date components
pd.to_datetime(pd.DataFrame({
    'year': [2018, 2019],
    'month': [1, 2],
    'day': [4, 5],
    'hour': [2, 3],
}))

0   2018-01-04 02:00:00
1   2019-02-05 03:00:00
dtype: datetime64[ns]

In [75]:
import datetime

# Daily range covering calendar year 2019; both endpoints are included
pd.date_range(
    start=datetime.datetime(2019, 1, 1),
    end=datetime.datetime(2019, 12, 31),
)

DatetimeIndex(['2019-01-01', '2019-01-02', '2019-01-03', '2019-01-04',
               '2019-01-05', '2019-01-06', '2019-01-07', '2019-01-08',
               '2019-01-09', '2019-01-10',
               ...
               '2019-12-22', '2019-12-23', '2019-12-24', '2019-12-25',
               '2019-12-26', '2019-12-27', '2019-12-28', '2019-12-29',
               '2019-12-30', '2019-12-31'],
              dtype='datetime64[ns]', length=365, freq='D')

In [74]:
# Same span as the daily range, restricted to business days (Monday-Friday)
pd.bdate_range(
    start=datetime.datetime(2019, 1, 1),
    end=datetime.datetime(2019, 12, 31),
)

DatetimeIndex(['2019-01-01', '2019-01-02', '2019-01-03', '2019-01-04',
               '2019-01-07', '2019-01-08', '2019-01-09', '2019-01-10',
               '2019-01-11', '2019-01-14',
               ...
               '2019-12-18', '2019-12-19', '2019-12-20', '2019-12-23',
               '2019-12-24', '2019-12-25', '2019-12-26', '2019-12-27',
               '2019-12-30', '2019-12-31'],
              dtype='datetime64[ns]', length=261, freq='B')

In [76]:
# 250 business-quarter starts: the first business day of each quarter from 2019 onward
pd.bdate_range(start=datetime.datetime(2019, 1, 1), periods=250, freq='BQS')

DatetimeIndex(['2019-01-01', '2019-04-01', '2019-07-01', '2019-10-01',
               '2020-01-01', '2020-04-01', '2020-07-01', '2020-10-01',
               '2021-01-01', '2021-04-01',
               ...
               '2079-01-02', '2079-04-03', '2079-07-03', '2079-10-02',
               '2080-01-01', '2080-04-01', '2080-07-01', '2080-10-01',
               '2081-01-01', '2081-04-01'],
              dtype='datetime64[ns]', length=250, freq='BQS-JAN')

In [93]:
# Reference series for the indexing and shifting examples: 12 random values on a daily index
ts = pd.Series(
    data=np.random.randn(12),
    index=pd.date_range(start='1/1/2019', periods=12, freq='D'),
)
ts

2019-01-01   -2.010902
2019-01-02    0.811922
2019-01-03   -0.161604
2019-01-04    1.224144
2019-01-05   -1.695676
2019-01-06   -0.912748
2019-01-07    1.404750
2019-01-08    0.658572
2019-01-09    0.778649
2019-01-10   -1.697634
2019-01-11    0.764896
2019-01-12   -1.066550
Freq: D, dtype: float64

In [79]:
# Positional slicing works as on any Series: take the first five observations
ts.iloc[:5]

2019-01-01    1.020896
2019-01-02   -0.040989
2019-01-03    1.330753
2019-01-04   -0.521787
2019-01-05   -1.198657
Freq: D, dtype: float64

In [80]:
# Look up a single observation by its date label
ts.loc["2019-01-01"]

1.020895591275134

In [81]:
# Everything from 3 January 2019 onward, using a datetime.date as the lower bound
ts.loc[datetime.date(2019, 1, 3):]

2019-01-03    1.330753
2019-01-04   -0.521787
2019-01-05   -1.198657
2019-01-06    2.724483
2019-01-07   -0.131282
2019-01-08   -0.919027
2019-01-09   -1.872955
2019-01-10    0.621986
2019-01-11   -0.750116
2019-01-12   -1.256701
Freq: D, dtype: float64

In [82]:
# Label-based slice between two date strings; the end label is included
ts.loc["2019-01-01":"2019-01-05"]

2019-01-01    1.020896
2019-01-02   -0.040989
2019-01-03    1.330753
2019-01-04   -0.521787
2019-01-05   -1.198657
Freq: D, dtype: float64

## Repairing misaligned data with shifting and filling operators


In [83]:
# offset objects for shifting
from pandas.tseries.offsets import *

In [84]:
# An offset object representing a step of one day (displayed as <Day>)
Day()  # a day forward

<Day>

In [85]:
# Move a timestamp one day forward by adding the offset.
# (DateOffset.apply was deprecated in pandas 1.1 and removed in 2.0; addition is the supported form.)
pd.Timestamp('2019-01-01 09:00') + Day()

Timestamp('2019-01-02 09:00:00')

In [86]:
# d is 09:00 on the day after 2019-01-01. Offsets are applied with `+`
# because DateOffset.apply was deprecated in pandas 1.1 and removed in 2.0.
d = pd.Timestamp('2019-01-01 09:00') + Day()
# Offsets compose: step one further hour forward
d + Hour()

Timestamp('2019-01-02 10:00:00')

In [87]:
# weekday=4 anchors the weekly offset on Friday (Monday is 0), so adding it rolls d forward to a Friday
d + Week(weekday=4)  # coming Friday

Timestamp('2019-01-04 09:00:00')

In [88]:
# Subtracting the same Friday-anchored offset rolls d back to the most recent Friday before it
d - Week(weekday=4)  # previous Friday

Timestamp('2018-12-28 09:00:00')

In [89]:
# Offsets also apply element-wise: move every date in the range one week forward
pd.date_range(start='2019-01-01', end='2019-01-03') + Week()

DatetimeIndex(['2019-01-08', '2019-01-09', '2019-01-10'], dtype='datetime64[ns]', freq='D')

In [97]:
# Slide the values three rows down while the index stays put (leading rows become NaN);
# useful e.g. for computing date-to-date changes in value
ts.shift(periods=3)

2019-01-01         NaN
2019-01-02         NaN
2019-01-03         NaN
2019-01-04   -2.010902
2019-01-05    0.811922
2019-01-06   -0.161604
2019-01-07    1.224144
2019-01-08   -1.695676
2019-01-09   -0.912748
2019-01-10    1.404750
2019-01-11    0.658572
2019-01-12    0.778649
Freq: D, dtype: float64

In [96]:
# With freq given, the index itself moves five days forward and no values are lost
ts.shift(periods=5, freq='D')

2019-01-06   -2.010902
2019-01-07    0.811922
2019-01-08   -0.161604
2019-01-09    1.224144
2019-01-10   -1.695676
2019-01-11   -0.912748
2019-01-12    1.404750
2019-01-13    0.658572
2019-01-14    0.778649
2019-01-15   -1.697634
2019-01-16    0.764896
2019-01-17   -1.066550
Freq: D, dtype: float64

In [100]:
# Shifting down by three rows leaves NaN at the start; fill those gaps backward
# from the next valid value. .bfill() replaces fillna(method='backfill'),
# whose `method` argument is deprecated in pandas >= 2.1.
ts.shift(3).bfill()

2019-01-01   -2.010902
2019-01-02   -2.010902
2019-01-03   -2.010902
2019-01-04   -2.010902
2019-01-05    0.811922
2019-01-06   -0.161604
2019-01-07    1.224144
2019-01-08   -1.695676
2019-01-09   -0.912748
2019-01-10    1.404750
2019-01-11    0.658572
2019-01-12    0.778649
Freq: D, dtype: float64

In [101]:
# Shifting up by three rows leaves NaN at the end; carry the last valid value forward.
# .ffill() replaces fillna(method='pad'), whose `method` argument is deprecated in pandas >= 2.1.
ts.shift(-3).ffill()

2019-01-01    1.224144
2019-01-02   -1.695676
2019-01-03   -0.912748
2019-01-04    1.404750
2019-01-05    0.658572
2019-01-06    0.778649
2019-01-07   -1.697634
2019-01-08    0.764896
2019-01-09   -1.066550
2019-01-10   -1.066550
2019-01-11   -1.066550
2019-01-12   -1.066550
Freq: D, dtype: float64