# Chapter 9: Time Series Analysis

## Technical requirements

In [1]:
from datetime import date, datetime

import polars as pl

In [2]:
# Build a lazy query over the Toronto weather CSV.
lf = pl.scan_csv(source='../data/toronto_weather.csv')

In [3]:
# Peek at a handful of rows to check the raw schema.
lf.fetch(n_rows=5)

datetime,temperature,wind_speed,pressure,humidity
str,f64,f64,f64,f64
"""2012-10-01 12:…",,,,
"""2012-10-01 13:…",286.26,3.0,1012.0,82.0
"""2012-10-01 14:…",286.262541,3.0,1011.0,81.0
"""2012-10-01 15:…",286.269518,3.0,1011.0,79.0
"""2012-10-01 16:…",286.276496,3.0,1010.0,77.0


In [4]:
# Shift `temperature` by 273.15 (the Kelvin -> Celsius offset); the column keeps its name.
# NOTE: this rebinds `lf`, so re-running the cell subtracts the offset a second time.
lf = lf.with_columns(
    pl.col('temperature') - 273.15
)

In [5]:
# Confirm the shifted temperature values.
lf.fetch(n_rows=5)

datetime,temperature,wind_speed,pressure,humidity
str,f64,f64,f64,f64
"""2012-10-01 12:…",,,,
"""2012-10-01 13:…",13.11,3.0,1012.0,82.0
"""2012-10-01 14:…",13.112541,3.0,1011.0,81.0
"""2012-10-01 15:…",13.119518,3.0,1011.0,79.0
"""2012-10-01 16:…",13.126496,3.0,1010.0,77.0


## Working with date and time

### How to do it...

In [87]:
# Same file, but ask the CSV reader to parse date-like columns while scanning.
lf_date_parsed = pl.scan_csv(
    source='../data/toronto_weather.csv',
    try_parse_dates=True,
)
lf_date_parsed.fetch(n_rows=5)

datetime,temperature,wind_speed,pressure,humidity
datetime[μs],f64,f64,f64,f64
2012-10-01 12:00:00,,,,
2012-10-01 13:00:00,286.26,3.0,1012.0,82.0
2012-10-01 14:00:00,286.262541,3.0,1011.0,81.0
2012-10-01 15:00:00,286.269518,3.0,1011.0,79.0
2012-10-01 16:00:00,286.276496,3.0,1010.0,77.0


In [88]:
# Show the column-name -> dtype mapping next to the bare dtype list.
(lf_date_parsed.schema, lf_date_parsed.dtypes)

(OrderedDict([('datetime', Datetime(time_unit='us', time_zone=None)),
              ('temperature', Float64),
              ('wind_speed', Float64),
              ('pressure', Float64),
              ('humidity', Float64)]),
 [Datetime(time_unit='us', time_zone=None),
  Float64,
  Float64,
  Float64,
  Float64])

In [89]:
# Parse the string `datetime` column into a Datetime, keeping the column name.
# NOTE: `lf` is rebound here, so this cell is one-shot; running it again on the
# already-converted frame is expected to fail because `.str` needs a string column.
lf = lf.with_columns(pl.col('datetime').str.to_datetime())
lf.fetch(n_rows=5)

datetime,temperature,wind_speed,pressure,humidity
datetime[μs],f64,f64,f64,f64
2012-10-01 12:00:00,,,,
2012-10-01 13:00:00,13.11,3.0,1012.0,82.0
2012-10-01 14:00:00,13.112541,3.0,1011.0,81.0
2012-10-01 15:00:00,13.119518,3.0,1011.0,79.0
2012-10-01 16:00:00,13.126496,3.0,1010.0,77.0


In [153]:
# Break the timestamp into calendar parts; each keyword becomes the output column name.
(
    lf
    .select(
        'datetime',
        year=pl.col('datetime').dt.year(),
        month=pl.col('datetime').dt.month(),
        day=pl.col('datetime').dt.day(),
        time=pl.col('datetime').dt.time(),
    )
    .fetch(n_rows=5)
)

datetime,year,month,day,time
datetime[μs],i32,i8,i8,time
2012-10-01 12:00:00,2012,10,1,12:00:00
2012-10-01 13:00:00,2012,10,1,13:00:00
2012-10-01 14:00:00,2012,10,1,14:00:00
2012-10-01 15:00:00,2012,10,1,15:00:00
2012-10-01 16:00:00,2012,10,1,16:00:00


In [187]:
# Keep 2017 rows recorded before noon.
# The bounds are `date` objects so they match the Date produced by `.dt.date()`
# rather than relying on an implicit Date -> Datetime cast. `is_between`
# includes both bounds by default.
filtered_lf = (
    lf
    .filter(
        pl.col('datetime').dt.date().is_between(
            date(2017, 1, 1), date(2017, 12, 31)
        ),
        # Several predicates passed to `filter` are combined with AND.
        pl.col('datetime').dt.hour() < 12
    )
)
filtered_lf.head().collect()

datetime,temperature,wind_speed,pressure,humidity
datetime[μs],f64,f64,f64,f64
2017-01-01 00:00:00,2.44,3.0,1001.0,92.0
2017-01-01 01:00:00,2.19,9.0,1003.0,86.0
2017-01-01 02:00:00,2.41,7.0,1003.0,86.0
2017-01-01 03:00:00,2.42,5.0,1003.0,74.0
2017-01-01 04:00:00,1.77,6.0,1006.0,69.0


In [188]:
# Sanity-check the filter: count the distinct years and hours that survived it.
(
    filtered_lf
    .select(
        year_cnt=pl.col('datetime').dt.year().n_unique(),
        hour_cnt=pl.col('datetime').dt.hour().n_unique(),
    )
    .head()
    .collect()
)

year_cnt,hour_cnt
u32,u32
1,12


In [217]:
# Two ways to attach a zone to the naive timestamps:
#   replace_time_zone keeps the wall-clock reading and only labels it;
#   convert_time_zone shifts the reading into the target zone.
time_zones_lf = lf.select(
    'datetime',
    replaced_time_zone_toronto=(
        pl.col('datetime').dt.replace_time_zone('America/Toronto')
    ),
    converted_time_zone_toronto=(
        pl.col('datetime').dt.convert_time_zone('America/Toronto')
    ),
)
time_zones_lf.head().collect()

datetime,replaced_time_zone_toronto,converted_time_zone_toronto
datetime[μs],"datetime[μs, America/Toronto]","datetime[μs, America/Toronto]"
2012-10-01 12:00:00,2012-10-01 12:00:00 EDT,2012-10-01 08:00:00 EDT
2012-10-01 13:00:00,2012-10-01 13:00:00 EDT,2012-10-01 09:00:00 EDT
2012-10-01 14:00:00,2012-10-01 14:00:00 EDT,2012-10-01 10:00:00 EDT
2012-10-01 15:00:00,2012-10-01 15:00:00 EDT,2012-10-01 11:00:00 EDT
2012-10-01 16:00:00,2012-10-01 16:00:00 EDT,2012-10-01 12:00:00 EDT


### There is more...

In [331]:
# Datetime arithmetic with `pl.duration`: step back five weeks, step forward 5 ms.
(
    lf
    .select(
        'datetime',
        minus_5weeks=pl.col('datetime') - pl.duration(weeks=5),
        plus_5ms=pl.col('datetime') + pl.duration(milliseconds=5),
    )
    .head()
    .collect()
)

datetime,minus_5weeks,plus_5ms
datetime[μs],datetime[μs],datetime[μs]
2012-10-01 12:00:00,2012-08-27 12:00:00,2012-10-01 12:00:00.005
2012-10-01 13:00:00,2012-08-27 13:00:00,2012-10-01 13:00:00.005
2012-10-01 14:00:00,2012-08-27 14:00:00,2012-10-01 14:00:00.005
2012-10-01 15:00:00,2012-08-27 15:00:00,2012-10-01 15:00:00.005
2012-10-01 16:00:00,2012-08-27 16:00:00,2012-10-01 16:00:00.005


## Applying rolling window calculations

### How to do it...

In [275]:
# The readings are hourly, so a 3-row window is a trailing 3-hour mean.
# Rows stay null until the window holds three non-null values.
(
    lf
    .select(
        'datetime',
        'temperature',
        # Alias spelling corrected (previously '3hr_rollign_avg').
        pl.col('temperature').rolling_mean(window_size=3).alias('3hr_rolling_avg')
    )
    .head()
    .collect()
)

datetime,temperature,3hr_rollign_avg
datetime[μs],f64,f64
2012-10-01 12:00:00,,
2012-10-01 13:00:00,13.11,
2012-10-01 14:00:00,13.112541,
2012-10-01 15:00:00,13.119518,13.11402
2012-10-01 16:00:00,13.126496,13.119518


In [276]:
# Collapse the hourly readings to one mean temperature per calendar day.
# maintain_order=True keeps the days in their order of first appearance.
daily_avg_temperature_lf = (
    lf
    .select(
        date=pl.col('datetime').dt.date(),
        temperature=pl.col('temperature'),
    )
    .group_by('date', maintain_order=True)
    .agg(daily_avg_temp=pl.col('temperature').mean())
)
daily_avg_temperature_lf.head().collect()

date,daily_avg_temp
date,f64
2012-10-01,13.140854
2012-10-02,14.24739
2012-10-03,14.176875
2012-10-04,15.067917
2012-10-05,16.216458


In [562]:
# Trailing 3-row (3-day) mean, minimum and maximum of the daily averages.
(
    daily_avg_temperature_lf
    .select(
        'date',
        'daily_avg_temp',
        pl.col('daily_avg_temp')
        .rolling_mean(window_size=3)
        .alias('3day_rolling_avg'),
        pl.col('daily_avg_temp')
        .rolling_min(window_size=3)
        .alias('3day_rolling_min'),
        pl.col('daily_avg_temp')
        .rolling_max(window_size=3)
        .alias('3day_rolling_max'),
    )
    .head()
    .collect()
)

date,daily_avg_temp,3day_rolling_avg,3day_rolling_min,3day_rolling_max
date,f64,f64,f64,f64
2012-10-01,13.140854,,,
2012-10-02,14.24739,,,
2012-10-03,14.176875,13.85504,13.140854,14.24739
2012-10-04,15.067917,14.497394,14.176875,15.067917
2012-10-05,16.216458,15.15375,14.176875,16.216458


In [570]:
# Three spellings of a 3-day rolling mean:
#   1. fixed 3-row window (null until three rows are available)
#   2. same window, with min_periods=1 so it emits from the first row
#   3. time-based window over `date`; set_sorted flags that column as sorted
(
    daily_avg_temperature_lf
    .set_sorted('date')
    .select(
        'date',
        'daily_avg_temp',
        pl.col('daily_avg_temp')
        .rolling_mean(window_size=3)
        .alias('3day_rolling_avg'),
        pl.col('daily_avg_temp')
        .rolling_mean(window_size=3, min_periods=1)
        .alias('3day_rolling_avg2'),
        pl.col('daily_avg_temp')
        .mean()
        .rolling(index_column='date', period='3d', closed='right')
        .alias('3day_rolling_avg3'),
    )
    .head(10)
    .collect()
)

date,daily_avg_temp,3day_rolling_avg,3day_rolling_avg2,3day_rolling_avg3
date,f64,f64,f64,f64
2012-10-01,13.140854,,13.140854,13.140854
2012-10-02,14.24739,,13.694122,13.694122
2012-10-03,14.176875,13.85504,13.85504,13.85504
2012-10-04,15.067917,14.497394,14.497394,14.497394
2012-10-05,16.216458,15.15375,15.15375,15.15375
2012-10-06,15.725417,15.669931,15.669931,15.669931
2012-10-07,10.197083,14.046319,14.046319,14.046319
2012-10-08,6.79625,10.90625,10.90625,10.90625
2012-10-09,6.735,7.909444,7.909444,7.909444
2012-10-10,9.305417,7.612222,7.612222,7.612222


In [580]:
# Frame-level rolling: every aggregation in `agg` sees the same 3-day window.
# The bare column is returned as a list, which exposes the window's members.
(
    daily_avg_temperature_lf
    .set_sorted('date')
    .rolling(index_column='date', period='3d')
    .agg(
        pl.col('daily_avg_temp'),
        pl.col('daily_avg_temp').mean().alias('3day_rolling_avg'),
        pl.col('daily_avg_temp').min().alias('3day_rolling_min'),
        pl.col('daily_avg_temp').max().alias('3day_rolling_max'),
    )
    .head(10)
    .collect()
)

date,daily_avg_temp,3day_rolling_avg,3day_rolling_min,3day_rolling_max
date,list[f64],f64,f64,f64
2012-10-01,[13.140854],13.140854,13.140854,13.140854
2012-10-02,"[13.140854, 14.24739]",13.694122,13.140854,14.24739
2012-10-03,"[13.140854, 14.24739, 14.176875]",13.85504,13.140854,14.24739
2012-10-04,"[14.24739, 14.176875, 15.067917]",14.497394,14.176875,15.067917
2012-10-05,"[14.176875, 15.067917, 16.216458]",15.15375,14.176875,16.216458
2012-10-06,"[15.067917, 16.216458, 15.725417]",15.669931,15.067917,16.216458
2012-10-07,"[16.216458, 15.725417, 10.197083]",14.046319,10.197083,16.216458
2012-10-08,"[15.725417, 10.197083, 6.79625]",10.90625,6.79625,15.725417
2012-10-09,"[10.197083, 6.79625, 6.735]",7.909444,6.735,10.197083
2012-10-10,"[6.79625, 6.735, 9.305417]",7.612222,6.735,9.305417


In [581]:
# Plot the daily average against its 60-row rolling mean.
# The frame is collected first because `.plot` is called on the eager result.
(
    daily_avg_temperature_lf
    .select(
        'date',
        'daily_avg_temp',
        pl.col('daily_avg_temp').rolling_mean(60).alias('60day_rolling_avg')
    )
    .collect()
    .plot.line(
        x='date', 
        y=['daily_avg_temp', '60day_rolling_avg'],
        color=['skyblue', 'gray'],
        width=800,
        height=400
    )
    .opts(legend_position='bottom_right')
)

### There is more...

In [339]:
def get_range(nums):
    """Return the spread of ``nums``: its largest value minus its smallest.

    ``nums`` is scanned twice (once by ``min``, once by ``max``), so it must be
    a re-iterable, non-empty collection of mutually comparable values.
    """
    lowest, highest = min(nums), max(nums)
    return highest - lowest

# Apply the custom Python function to each 3-row window via rolling_map.
(
    daily_avg_temperature_lf
    .with_columns(
        pl.col('daily_avg_temp')
        .rolling_map(get_range, window_size=3)
        .alias('3day_rolling_range')
    )
    .head()
    .collect()
)


date,daily_avg_temp,3day_rolling_range
date,f64,f64
2012-10-01,13.140854,
2012-10-02,14.24739,
2012-10-03,14.176875,1.106536
2012-10-04,15.067917,0.891042
2012-10-05,16.216458,2.039583


In [582]:
# Plot the daily average next to its 3-row rolling range (max - min via get_range).
(
    daily_avg_temperature_lf
    .with_columns(
        pl.col('daily_avg_temp').rolling_map(get_range, window_size=3).alias('3day_rolling_range')
    )
    .collect()
    .plot.line(
        x='date', 
        y=['daily_avg_temp', '3day_rolling_range'],
        color=['skyblue', 'gray'],
        width=800,
        height=400
    )
    .opts(legend_position='bottom_right')
)    

## Resampling techniques

### How to do it...

In [451]:
# Downsample to weekly buckets and average humidity within each one.
(
    lf
    .set_sorted('datetime')
    .group_by_dynamic(index_column='datetime', every='1w')
    .agg(pl.col('humidity').mean().round(decimals=1))
    .head(10)
    .collect()
)

datetime,humidity
datetime[μs],f64
2012-10-01 00:00:00,63.1
2012-10-08 00:00:00,62.1
2012-10-15 00:00:00,76.1
2012-10-22 00:00:00,70.0
2012-10-29 00:00:00,80.0
2012-11-05 00:00:00,68.7
2012-11-12 00:00:00,68.5
2012-11-19 00:00:00,81.0
2012-11-26 00:00:00,69.0
2012-12-03 00:00:00,82.4


In [673]:
# Weekly mean humidity for buckets starting in October 2012, drawn as a line chart.
# The bounds are `date` objects so they match the Date produced by `.dt.date()`
# rather than relying on an implicit Date -> Datetime cast.
(
    lf
    .set_sorted('datetime')
    .group_by_dynamic(
        'datetime', every='1w'
    )
    .agg(pl.col('humidity').mean().round(1))
    .filter(
        pl.col('datetime').dt.date().is_between(
            date(2012, 10, 1),
            date(2012, 10, 31)
        )
    )
    .collect()
    .plot.line(
        x='datetime',
        y='humidity',
        color=['skyblue'],
        width=1000,
        height=400
    )
)


In [472]:
# Collect first, then upsample the eager frame to half-hour steps.
# The inserted rows carry nulls for the non-time columns.
upsampled_df = (
    lf
    .set_sorted('datetime')
    .collect()
    .upsample(time_column='datetime', every='30m', maintain_order=True)
    .select('datetime', 'humidity')
)
upsampled_df.head(n=10)

datetime,humidity
datetime[μs],f64
2012-10-01 12:00:00,
2012-10-01 12:30:00,
2012-10-01 13:00:00,82.0
2012-10-01 13:30:00,
2012-10-01 14:00:00,81.0
2012-10-01 14:30:00,
2012-10-01 15:00:00,79.0
2012-10-01 15:30:00,
2012-10-01 16:00:00,77.0
2012-10-01 16:30:00,


In [628]:
# Fill the nulls created by upsampling by interpolating between neighbouring readings.
# Leading nulls have no earlier reading to interpolate from, so they stay null.
(
    upsampled_df
    .with_columns(humidity=pl.col('humidity').interpolate())
    .head(n=10)
)

datetime,humidity
datetime[μs],f64
2012-10-01 12:00:00,
2012-10-01 12:30:00,
2012-10-01 13:00:00,82.0
2012-10-01 13:30:00,81.5
2012-10-01 14:00:00,81.0
2012-10-01 14:30:00,80.0
2012-10-01 15:00:00,79.0
2012-10-01 15:30:00,78.0
2012-10-01 16:00:00,77.0
2012-10-01 16:30:00,76.5


In [671]:
# Interpolated half-hourly humidity for October 2012, drawn as an area chart.
# The bounds are `date` objects so they match the Date produced by `.dt.date()`
# rather than relying on an implicit Date -> Datetime cast.
(
    upsampled_df
    .with_columns(
        pl.col('humidity').interpolate()
    )
    .filter(
        pl.col('datetime').dt.date().is_between(
            date(2012, 10, 1),
            date(2012, 10, 31)
        )
    )
    .plot.area(
        x='datetime',
        y='humidity',
        color=['skyblue'],
        width=1000,
        height=400,
        alpha=0.5
    )
)


In [504]:
# Punch holes in the series by dropping every reading taken at 13h, 15h, 16h or 19h.
datetime_with_gaps_lf = lf.filter(
    ~pl.col('datetime').dt.hour().is_in([13, 15, 16, 19])
)

# Upsampling at the original 1-hour step re-creates the missing rows, with nulls.
(
    datetime_with_gaps_lf
    .set_sorted('datetime')
    .collect()
    .upsample(time_column='datetime', every='1h', maintain_order=True)
    .select('datetime', 'humidity')
    .head(n=10)
)

datetime,humidity
datetime[μs],f64
2012-10-01 12:00:00,
2012-10-01 13:00:00,
2012-10-01 14:00:00,81.0
2012-10-01 15:00:00,
2012-10-01 16:00:00,
2012-10-01 17:00:00,76.0
2012-10-01 18:00:00,74.0
2012-10-01 19:00:00,
2012-10-01 20:00:00,70.0
2012-10-01 21:00:00,69.0


In [646]:
# Fetch both ends of the observed time span with a single collect, instead of
# executing the lazy query once for the minimum and once more for the maximum.
datetime_bounds = (
    lf
    .select(
        pl.col('datetime').min().alias('start'),
        pl.col('datetime').max().alias('end')
    )
    .collect()
)

# A gap-free half-hourly timeline spanning the whole dataset.
datetime_range_lf = pl.LazyFrame({
    'datetime': pl.datetime_range(
        start=datetime_bounds[0, 0],
        end=datetime_bounds[0, 1],
        interval='30m',
        eager=True
    )
})

# Left-joining the readings onto the timeline leaves nulls wherever no
# observation exists for a slot.
(
    datetime_range_lf
    .join(lf, on='datetime', how='left')
    .select(
        'datetime',
        pl.col('humidity')
    )
    .collect()
    .head(10)
)


datetime,humidity
datetime[μs],f64
2012-10-01 12:00:00,
2012-10-01 12:30:00,
2012-10-01 13:00:00,82.0
2012-10-01 13:30:00,
2012-10-01 14:00:00,81.0
2012-10-01 14:30:00,
2012-10-01 15:00:00,79.0
2012-10-01 15:30:00,
2012-10-01 16:00:00,77.0
2012-10-01 16:30:00,


## Time series forecasting with the functime library

### Getting ready

In [81]:
import polars as pl

In [82]:
# NOTE: `lf` is rebound to a different dataset (multi-city temperatures) from here on.
lf = pl.scan_csv(
    source='../data/historical_temperatures.csv',
    try_parse_dates=True,
)

In [83]:
# Peek at the first rows of the multi-city dataset.
lf.fetch(n_rows=5)

datetime,city,temperature
datetime[μs],str,f64
2012-10-01 12:00:00,"""Toronto""",
2012-10-01 13:00:00,"""Toronto""",286.26
2012-10-01 14:00:00,"""Toronto""",286.262541
2012-10-01 15:00:00,"""Toronto""",286.269518
2012-10-01 16:00:00,"""Toronto""",286.276496


In [84]:
# Which cities does the dataset cover?
(
    lf
    .select('city')
    .unique()
    .collect()
)

city
str
"""Toronto"""
"""New York"""
"""Vancouver"""
"""San Francisco"""
"""Las Vegas"""
"""Seattle"""


In [85]:
# First three rows of every city's series.
(
    lf
    .group_by('city')
    .head(n=3)
    .collect()
)

city,datetime,temperature
str,datetime[μs],f64
"""Las Vegas""",2012-10-01 12:00:00,
"""Las Vegas""",2012-10-01 13:00:00,293.41
"""Las Vegas""",2012-10-01 14:00:00,293.403141
"""Seattle""",2012-10-01 12:00:00,
"""Seattle""",2012-10-01 13:00:00,281.8
"""Seattle""",2012-10-01 14:00:00,281.797217
"""New York""",2012-10-01 12:00:00,
"""New York""",2012-10-01 13:00:00,288.22
"""New York""",2012-10-01 14:00:00,288.247676
"""Vancouver""",2012-10-01 12:00:00,


### How to do it...

In [200]:
# Column roles are taken positionally, so this depends on the file's column
# order (datetime, city, temperature in the preview of this dataset).
time_col, entity_col, value_col = lf.columns

# One row per city per calendar month: the mean temperature, shifted by
# -273.15 and rounded to one decimal place.
y = (
    lf
    .group_by_dynamic(index_column=time_col, every='1mo', by=entity_col)
    .agg((pl.col('temperature').mean() - 273.15).round(decimals=1))
)

In [201]:
# First three monthly rows of every city's series.
(
    y
    .group_by('city')
    .head(n=3)
    .collect()
)

city,datetime,temperature
str,datetime[μs],f64
"""San Francisco""",2012-10-01 00:00:00,16.5
"""San Francisco""",2012-11-01 00:00:00,13.4
"""San Francisco""",2012-12-01 00:00:00,10.3
"""Las Vegas""",2012-10-01 00:00:00,20.9
"""Las Vegas""",2012-11-01 00:00:00,14.4
"""Las Vegas""",2012-12-01 00:00:00,9.0
"""Vancouver""",2012-10-01 00:00:00,10.1
"""Vancouver""",2012-11-01 00:00:00,7.2
"""Vancouver""",2012-12-01 00:00:00,4.4
"""Seattle""",2012-10-01 00:00:00,11.2


In [239]:
def create_train_test_sets(
    y, 
    entity_col, 
    time_col, 
    test_size, 
    freq,
    *extracted_features,
    target_col=None
):
    """Split a panel frame into train/test targets and matching feature frames.

    Parameters
    ----------
    y
        Frame holding the entity, time and target columns, plus any feature
        columns named in ``extracted_features``.
    entity_col
        Name of the column identifying each series.
    time_col
        Name of the time column.
    test_size
        Forwarded unchanged to functime's ``train_test_split``.
    freq
        Not used by this function; kept so existing positional calls keep working.
    *extracted_features
        Names of feature columns to carry into ``X``.
    target_col
        Keyword-only name of the target column. When omitted, the notebook-level
        ``value_col`` variable is used, which is what this function previously
        relied on implicitly.

    Returns
    -------
    tuple
        ``(X_train, X_test, y_train, y_test)``.
    """
    from functime.cross_validation import train_test_split

    if target_col is None:
        # Backward-compatible fallback to the notebook-level variable; looked up
        # only when the caller did not name the target explicitly.
        target_col = value_col

    X = y.select(entity_col, time_col, *extracted_features)
    y_train, y_test = (
        y
        .select(entity_col, time_col, target_col)
        .pipe(train_test_split(test_size))
    )
    X_train, X_test = X.pipe(train_test_split(test_size))

    return X_train, X_test, y_train, y_test

In [240]:
# The same `test_size` is reused below as the forecast horizon.
test_size = 3
freq = '1mo'
X_train, X_test, y_train, y_test = create_train_test_sets(
    y, entity_col, time_col, test_size, freq
)

In [241]:
def predict_with_linear_model(
    lags, 
    freq, 
    y_train,
    fh
):
    """Fit functime's ``linear_model`` forecaster on ``y_train`` and forecast.

    Parameters
    ----------
    lags
        Passed to ``linear_model`` as ``lags``.
    freq
        Passed to ``linear_model`` as ``freq`` (e.g. ``'1mo'``).
    y_train
        Training target frame handed to ``forecaster.fit``.
    fh
        Forecast horizon passed to ``forecaster.predict``.

    Returns
    -------
    The forecaster's predictions for the requested horizon.
    """
    from functime.forecasting import linear_model

    # Use the arguments instead of the previously hard-coded lags=24,
    # freq='1mo' and fh=3, which silently ignored whatever the caller passed.
    forecaster = linear_model(lags=lags, freq=freq)
    forecaster.fit(y=y_train)
    return forecaster.predict(fh=fh)

In [242]:
# Forecast as many steps ahead as were held out for testing.
y_pred = predict_with_linear_model(
    lags=24,
    freq=freq,
    y_train=y_train,
    fh=test_size,
)

In [243]:
# `mase` is not otherwise defined at notebook level, so import it here;
# without this the cell raises NameError on a fresh kernel.
from functime.metrics import mase

scores = mase(y_true=y_test, y_pred=y_pred, y_train=y_train)

In [244]:
# Render the forecasts and the per-city MASE scores one after the other.
display(y_pred, scores)

city,datetime,temperature
str,datetime[μs],f64
"""San Francisco""",2017-09-01 00:00:00,19.970327
"""San Francisco""",2017-10-01 00:00:00,18.315983
"""San Francisco""",2017-11-01 00:00:00,16.368908
"""Vancouver""",2017-09-01 00:00:00,16.471989
"""Vancouver""",2017-10-01 00:00:00,14.267763
"""Vancouver""",2017-11-01 00:00:00,11.530046
"""Las Vegas""",2017-09-01 00:00:00,33.356968
"""Las Vegas""",2017-10-01 00:00:00,26.222851
"""Las Vegas""",2017-11-01 00:00:00,19.330387
"""Seattle""",2017-09-01 00:00:00,16.692818


city,mase
str,f64
"""Toronto""",1.341055
"""Seattle""",1.286418
"""San Francisco""",0.542017
"""Las Vegas""",1.151459
"""New York""",1.082172
"""Vancouver""",0.737569


In [279]:
# One line chart per city for the monthly actuals, laid out in two columns.
actual_viz = (
    y
    .collect()
    .plot.line(
        x='datetime', y='temperature', by='city', subplots=True

    )
    .cols(2)
) 

# Same layout for the forecasts (y_pred is used without collecting).
pred_viz = (
    y_pred
    .plot.line(
        x='datetime', y='temperature', by='city', subplots=True
    )
    .cols(2)
)

# NOTE(review): `*` presumably overlays the two layouts panel by panel
# (holoviews overlay operator) -- confirm against the rendered figure.
actual_viz * pred_viz


### There is more...

In [307]:
from functime.seasonality import add_calendar_effects

# Monthly per-city aggregation as before, plus extra per-month features
# computed from the raw readings through the `.ts` expression namespace.
# NOTE(review): `.ts` is presumably registered by importing functime -- confirm.
y_features = (
    lf
    .group_by_dynamic(
        time_col,
        every='1mo',
        by=entity_col,
    )
    .agg(
        # Monthly mean, shifted by -273.15 and rounded to one decimal.
        (pl.col('temperature').mean()-273.15).round(1),
        pl.col(value_col).ts.binned_entropy(bin_count=10)
        .alias('binned_entropy'),
        pl.col(value_col).ts.lempel_ziv_complexity(threshold=3)
        .alias('lempel_ziv_complexity'),
        pl.col(value_col).ts.longest_streak_above_mean()
        .alias('longest_streak_above_mean')
    )
    # Appends a calendar-derived `month` column (visible in the output below).
    .pipe(add_calendar_effects(['month']))
)
y_features.head().collect()

city,datetime,temperature,binned_entropy,lempel_ziv_complexity,longest_streak_above_mean,month
str,datetime[μs],f64,f64,f64,u64,cat
"""Toronto""",2012-10-01 00:00:00,10.3,2.02934,0.051913,94,"""10"""
"""Toronto""",2012-11-01 00:00:00,4.3,1.969688,0.051389,87,"""11"""
"""Toronto""",2012-12-01 00:00:00,1.1,2.10535,0.051075,87,"""12"""
"""Toronto""",2013-01-01 00:00:00,-2.1,2.082969,0.051075,171,"""1"""
"""Toronto""",2013-02-01 00:00:00,-3.4,1.946257,0.053571,130,"""2"""
