In [46]:
import pandas as pd
from mlforecast import MLForecast
from lightgbm import LGBMRegressor
from xgboost import XGBRegressor
from mlforecast.lag_transforms import ExpandingMean, RollingMean

def aggregate_across(df, aggregate_id='SMS_H0'):
    """Collapse all series in ``df`` into a single series by summing per timestamp.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame with a ``ds`` column to group on. Every column whose name
        contains ``'unique_id'`` is discarded; all remaining columns are summed
        within each ``ds`` value.
    aggregate_id : str, default 'SMS_H0'
        Value written to the ``unique_id`` column of the result.

    Returns
    -------
    pandas.DataFrame
        One row per distinct ``ds`` (sorted by the groupby), with ``ds`` first,
        the summed columns next and a constant ``unique_id`` column last.
        The input frame is left untouched.
    """
    # Drop the id column(s) *before* grouping. Summing them first is wasted
    # work that is thrown away right after, and it can fail outright for id
    # dtypes that do not support ``sum`` (e.g. category).
    value_columns = df.loc[:, ~df.columns.str.contains('unique_id')]
    summed = value_columns.groupby(['ds']).sum()

    # assign/reset_index return new frames, so nothing is mutated in place.
    return summed.assign(unique_id=aggregate_id).reset_index()
 
 


In [53]:
# Preview the input table; the frame is only displayed, not bound to a name.
# NOTE(review): absolute, user-specific path, and the same file is read again
# in the next cell — consider loading it once from a configurable location.
pd.read_parquet('/home/users/sbelegu/forecastingpipeline/tutorials/05_Spark/Cluster/Data/500_ho_with_transformation.parquet')

Unnamed: 0,ds,unique_id,y,temperature,windSpeed,temperature_lag1,temperature_lag48,temperature_expanding_mean_lag1,temperature_expanding_mean_lag24,windSpeed_lag1,windSpeed_lag48,windSpeed_expanding_mean_lag1,windSpeed_expanding_mean_lag24
0,2013-01-01 00:30:00,MAC001546,0.145,7.01,5.46,,,,,,,,
1,2013-01-01 01:00:00,MAC001546,0.142,7.49,5.51,7.01,,7.010000,,5.46,,5.460000,
2,2013-01-01 01:30:00,MAC001546,0.075,7.49,5.51,7.49,,7.250000,,5.51,,5.485000,
3,2013-01-01 02:00:00,MAC001546,0.133,7.16,5.74,7.49,,7.330000,,5.51,,5.493333,
4,2013-01-01 02:30:00,MAC001546,0.147,7.16,5.74,7.16,,7.287500,,5.74,,5.555000,
...,...,...,...,...,...,...,...,...,...,...,...,...,...
8735995,2013-12-30 22:00:00,MAC003374,0.198,6.40,2.98,6.90,6.28,10.808659,10.811556,3.14,4.95,3.835671,3.831750
8735996,2013-12-30 22:30:00,MAC003374,0.144,6.40,2.98,6.40,6.28,10.808407,10.811512,2.98,4.95,3.835622,3.832126
8735997,2013-12-30 23:00:00,MAC003374,0.131,6.82,3.10,6.40,7.56,10.808154,10.811448,2.98,5.63,3.835573,3.832497
8735998,2013-12-30 23:30:00,MAC003374,0.181,6.82,3.10,6.82,7.56,10.807926,10.811384,3.10,5.63,3.835531,3.832868


In [55]:
# Number of steps to forecast; at the 30-minute frequency used for the
# forecaster this is one day.
horizon = 48

# Full input table: one row per (unique_id, ds) with the target `y`, weather
# columns and pre-computed lag / expanding-mean columns.
Y_df = pd.read_parquet('/home/users/sbelegu/forecastingpipeline/tutorials/05_Spark/Cluster/Data/500_ho_with_transformation.parquet')

# First 128 distinct series ids.
ids = list(Y_df['unique_id'].unique())[:128]

# Single series obtained by summing over all ids at each timestamp.
Y_df_aggregated = aggregate_across(Y_df)[['ds', 'unique_id', 'y']]

# Subset restricted to the 128 ids. Only id / time / target are kept, like the
# aggregated frame: this frame is later fitted without `static_features`, and
# mlforecast then treats every extra column as static, which would be wrong
# for the time-varying weather columns.
Y_df_128_ids = Y_df.loc[Y_df['unique_id'].isin(ids), ['ds', 'unique_id', 'y']]

In [65]:
from mlforecast.target_transforms import Differences
from mlforecast.utils import PredictionIntervals
# Candidate regressors: one XGBoost and one LightGBM model, default settings.
models = [XGBRegressor(), LGBMRegressor()]

# Forecaster shared by the following cells. Features are derived from the
# target: four plain lags, an expanding mean of lag 1 and two rolling means
# of lag 24.
fcst = MLForecast(
    models,
    freq='30min',  # the series are sampled every 30 minutes
    target_transforms=[Differences([48])],  # Differences transform with lag 48 on the target
    lags=[1, 12, 24, 48],
    lag_transforms={
        1: [ExpandingMean()],
        24: [RollingMean(window_size=48), RollingMean(window_size=12)],
    },
)

In [67]:
import sys
# Size in bytes that sys.getsizeof reports for the full Y_df frame.
sys.getsizeof(Y_df)

1415232144

In [66]:
%%time
# Fit both models on every series in Y_df and time the cell.
# static_features=[] is passed explicitly; presumably so that the extra
# (weather / pre-computed lag) columns are used as time-varying features
# rather than static ones — confirm against the mlforecast docs.
fcst.fit(Y_df,static_features=[])

[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.032128 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 4335
[LightGBM] [Info] Number of data points in the train set: 8676500, number of used features: 17
[LightGBM] [Info] Start training from score 0.000019
CPU times: user 30min 38s, sys: 34.5 s, total: 31min 13s
Wall time: 36 s


MLForecast(models=[XGBRegressor, LGBMRegressor], freq=30min, lag_features=['lag1', 'lag12', 'lag24', 'lag48', 'expanding_mean_lag1', 'rolling_mean_lag24_window_size48', 'rolling_mean_lag24_window_size12'], date_features=[], num_threads=1)

In [21]:
%%time
fcst.fit(Y_df_aggregated,prediction_intervals=PredictionIntervals(n_windows=10, h=48), fitted=True)


[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.000642 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 1020
[LightGBM] [Info] Number of data points in the train set: 16896, number of used features: 4
[LightGBM] [Info] Start training from score -0.004197
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.000443 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 1020
[LightGBM] [Info] Number of data points in the train set: 17376, number of used features: 4
[LightGBM] [Info] Start training from score 0.009057
CPU times: user 2min 25s, sys: 4.92 s, total: 2min 30s
Wall time: 4.47 s


MLForecast(models=[LGBMRegressor], freq=30min, lag_features=['lag1', 'lag12', 'lag24', 'lag48'], date_features=[], num_threads=1)

In [22]:
# Display the in-sample values for the aggregated series; the preceding fit
# was called with fitted=True.
fcst.forecast_fitted_values()

Unnamed: 0,unique_id,ds,y,LGBMRegressor
0,SMS_H0,2013-01-03 00:30:00,104.472,102.038217
1,SMS_H0,2013-01-03 01:00:00,90.353,99.324596
2,SMS_H0,2013-01-03 01:30:00,82.572,85.370126
3,SMS_H0,2013-01-03 02:00:00,72.667,73.909358
4,SMS_H0,2013-01-03 02:30:00,68.110,69.408448
...,...,...,...,...
17371,SMS_H0,2013-12-30 22:00:00,174.447,172.979770
17372,SMS_H0,2013-12-30 22:30:00,162.569,160.263019
17373,SMS_H0,2013-12-30 23:00:00,150.055,151.166414
17374,SMS_H0,2013-12-30 23:30:00,129.839,132.968573


In [23]:
# Forecast `horizon` steps ahead for the aggregated series, with a 90%
# prediction interval.
forecasts_ml = fcst.predict(h=horizon, level=[90])

In [24]:
%%time
fcst.fit(Y_df_128_ids,prediction_intervals=PredictionIntervals(n_windows=10, h=48))


[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.008852 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 1020
[LightGBM] [Info] Number of data points in the train set: 2162688, number of used features: 4
[LightGBM] [Info] Start training from score -0.000048
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.004005 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 1020
[LightGBM] [Info] Number of data points in the train set: 2224128, number of used features: 4
[LightGBM] [Info] Start training from score -0.000012
CPU times: user 11min 49s, sys: 23.1 s, total: 12min 12s
Wall time: 22.4 s


MLForecast(models=[LGBMRegressor], freq=30min, lag_features=['lag1', 'lag12', 'lag24', 'lag48'], date_features=[], num_threads=1)

In [None]:
# Forecast `horizon` steps ahead for the 128 series fitted in the previous
# cell, with a 90% prediction interval. The fitted forecaster is `fcst`.
forecasts_ml = fcst.predict(h=horizon, level=[90])