In [13]:
import cupy as cp
import cudf as cudf
import cuml
import numpy as np

from tqdm.notebook import tqdm

from cuml.metrics import mean_squared_error
from cuml.model_selection import train_test_split
from cuml.tsa.arima import ARIMA

In [14]:
# Read the sales time series CSV into a cuDF DataFrame and preview the first rows.
data = cudf.read_csv("timeseries_updated_WG_data.csv")
data.head()

Unnamed: 0.2,Unnamed: 0,Unnamed: 0.1,GPI,NDC,dates,disc_unitsold,holiday,unitsold
0,0,0,58300040107520,115681108,2014-01-01,0.0,1.0,0.0
1,1,1,58300040107520,115681108,2014-01-02,81.0,0.0,245.0
2,2,2,58300040107520,115681108,2014-01-03,22.0,0.0,120.0
3,3,3,58300040107520,115681108,2014-01-04,115.0,0.0,186.0
4,4,4,58300040107520,115681108,2014-01-05,80.0,0.0,151.0


In [15]:
# Discard the two 'Unnamed' columns (presumably index columns left over from
# earlier CSV round-trips -- confirm against the data source).
data = data.drop(columns=['Unnamed: 0', 'Unnamed: 0.1'])
data.head()

Unnamed: 0,GPI,NDC,dates,disc_unitsold,holiday,unitsold
0,58300040107520,115681108,2014-01-01,0.0,1.0,0.0
1,58300040107520,115681108,2014-01-02,81.0,0.0,245.0
2,58300040107520,115681108,2014-01-03,22.0,0.0,120.0
3,58300040107520,115681108,2014-01-04,115.0,0.0,186.0
4,58300040107520,115681108,2014-01-05,80.0,0.0,151.0


In [16]:
# Give the two identifier columns the names used by the rest of the notebook.
data = data.rename(columns={'GPI': 'Store_id', 'NDC': 'Product SKU'})
data.head()

Unnamed: 0,Store_id,Product SKU,dates,disc_unitsold,holiday,unitsold
0,58300040107520,115681108,2014-01-01,0.0,1.0,0.0
1,58300040107520,115681108,2014-01-02,81.0,0.0,245.0
2,58300040107520,115681108,2014-01-03,22.0,0.0,120.0
3,58300040107520,115681108,2014-01-04,115.0,0.0,186.0
4,58300040107520,115681108,2014-01-05,80.0,0.0,151.0


In [17]:
# Convert the 'dates' column to a datetime dtype so the .dt accessor can be used
# later, then print the column dtypes to confirm the conversion.
data['dates'] = cudf.to_datetime(data['dates'])
print(data.dtypes)

Store_id                  int64
Product SKU               int64
dates            datetime64[ns]
disc_unitsold           float64
holiday                 float64
unitsold                float64
dtype: object


In [18]:
%%time 
#feature engineering
data['dates'] = cudf.to_datetime(data['dates'])
data["day"] = data["dates"].dt.day
data["weekday"] = data["dates"].dt.weekday
data["month"] = data["dates"].dt.month
data["year"] = data["dates"].dt.year
data["quarter"] = data["dates"].dt.month -1 
data["quarter"] = data["quarter"] // 3
data["quarter"] = data["quarter"] + 1 
data['Yearfull'] = data["dates"].dt.year.astype(str)+ '-01-01'
data['Week']= (((data['dates'] - data['Yearfull'].astype('datetime64[ns]')).dt.days)/7).astype('int16') +1
data['Week']= data.Week.where(data['Week']!=53, 52)

CPU times: user 21.2 ms, sys: 23.5 ms, total: 44.7 ms
Wall time: 56.3 ms


In [19]:
# Preview the frame with the newly added calendar feature columns.
data.head()

Unnamed: 0,Store_id,Product SKU,dates,disc_unitsold,holiday,unitsold,day,weekday,month,year,quarter,Yearfull,Week
0,58300040107520,115681108,2014-01-01,0.0,1.0,0.0,1,2,1,2014,1,2014-01-01,1
1,58300040107520,115681108,2014-01-02,81.0,0.0,245.0,2,3,1,2014,1,2014-01-01,1
2,58300040107520,115681108,2014-01-03,22.0,0.0,120.0,3,4,1,2014,1,2014-01-01,1
3,58300040107520,115681108,2014-01-04,115.0,0.0,186.0,4,5,1,2014,1,2014-01-01,1
4,58300040107520,115681108,2014-01-05,80.0,0.0,151.0,5,6,1,2014,1,2014-01-01,1


In [20]:
# 'Yearfull' was only a helper for the week computation; remove it and
# print the resulting column dtypes.
data = data.drop(columns=['Yearfull'])
print(data.dtypes)

Store_id                  int64
Product SKU               int64
dates            datetime64[ns]
disc_unitsold           float64
holiday                 float64
unitsold                float64
day                       int16
weekday                   int16
month                     int16
year                      int16
quarter                   int16
Week                      int16
dtype: object


In [26]:
# Number of distinct store ids. The variable name still reflects the column's
# original name ('GPI') from before it was renamed to 'Store_id'.
gpi_un = data['Store_id'].nunique()
print(gpi_un)

12


In [27]:
# Number of distinct product SKUs. The variable name still reflects the column's
# original name ('NDC') from before it was renamed to 'Product SKU'.
ndc_un = data['Product SKU'].nunique()
print(ndc_un)

531


In [28]:
# Display the Store_id column (one entry per row of the frame).
data['Store_id']

0         58300040107520
1         58300040107520
2         58300040107520
3         58300040107520
4         58300040107520
               ...      
959512    27250050000320
959513    27250050000320
959514    27250050000320
959515    27250050000320
959516    27250050000320
Name: Store_id, Length: 959517, dtype: int64

In [32]:
# Distinct store ids and product SKUs as plain Python ints.
# De-duplicate with cuDF first so only the distinct values, rather than every
# row of the column, are copied to the host. The order of the ids in the
# resulting lists is not significant.
storeid = data['Store_id'].astype('int').unique().to_arrow().to_pylist()
productsku = data['Product SKU'].astype('int').unique().to_arrow().to_pylist()

In [33]:
# Display the list of distinct store ids.
storeid

[58300040107520,
 58160070100320,
 58160020100320,
 27250050000320,
 27250050000350,
 58180025106730,
 58160034100330,
 58160020100340,
 27250050000340,
 58160070100310,
 58160020100310,
 58180025106750]

In [25]:
%%time
grouped = data.groupby(['Store_id', 'Product SKU'])

name_list = [name for name, _ in grouped]
df_list = [df["unitsold"].rename(name).reset_index(drop=True) for name, df in grouped]

# All series have the same length so we can conveniently pack them
# Note: if not, we could pad with missing observations at the start
packed_df = cudf.concat(df_list, axis=1)

CPU times: user 1.54 s, sys: 0 ns, total: 1.54 s
Wall time: 1.53 s


In [12]:
# Inspect the column labels of the packed frame (one (Store_id, Product SKU) pair per series).
packed_df.columns


MultiIndex([(27250050000320,    87606005),
            (27250050000320,    93104801),
            (27250050000320,    93104810),
            (27250050000320,    93104898),
            (27250050000320,   378718505),
            (27250050000320,   904632661),
            (27250050000320,   904668961),
            (27250050000320, 23155010201),
            (27250050000320, 23155010205),
            (27250050000320, 23155010206),
            ...
            (58300040107520, 68180031902),
            (58300040107520, 68180031906),
            (58300040107520, 68180031909),
            (58300040107520, 68382035316),
            (58300040107520, 69097087502),
            (58300040107520, 69097087505),
            (58300040107520, 69097087512),
            (58300040107520, 70436001002),
            (58300040107520, 70436001004),
            (58300040107520, 70436001006)],
           length=531)

In [56]:
def run_arima_parallel(df, train_frac=0.8, order=(1,1,1), seasonal_order=(1,1,1,7)):
    """Fit one seasonal ARIMA per column of ``df`` in a single batch and score each on a hold-out tail.

    The frame is split without shuffling: the leading ``train_frac`` share of
    the rows is used for fitting and the remaining rows form the test window.
    All columns are fitted together by one batched cuML ``ARIMA`` model, and the
    test window is then forecast and compared with the held-out values.

    Parameters
    ----------
    df : cuDF DataFrame
        One time series per column; rows are assumed to be in time order
        (not checked here -- confirm against the caller).
    train_frac : float, default 0.8
        Share of the rows used for training; the row count is truncated to an int.
    order : tuple, default (1, 1, 1)
        Non-seasonal order passed to ``ARIMA``.
    seasonal_order : tuple, default (1, 1, 1, 7)
        Seasonal order passed to ``ARIMA``.

    Returns
    -------
    list of float
        Mean squared error on the test window, one value per column of ``df``,
        in column order.
    """
    # Cast up front and use the cast frame for everything below, so that
    # integer-valued inputs are handled too.
    # NOTE(review): cuML's ARIMA is understood to require float64 input -- confirm.
    X = df.astype('float')

    n_train = int(train_frac * X.shape[0])
    train, test = train_test_split(X, train_size=n_train, shuffle=False)

    model = ARIMA(endog=train, order=order, seasonal_order=seasonal_order, fit_intercept=False)
    result = model.fit()

    # Forecast exactly the held-out rows: from the first index after the
    # training data up to the end of the full series.
    start = train.shape[0]
    end = X.shape[0]
    predictions = result.predict(start, end)

    # Columns are paired by position because the prediction frame's column
    # labels are not guaranteed to match those of the input.
    return [
        float(mean_squared_error(test[test.columns[i]], predictions[predictions.columns[i]]))
        for i in range(test.shape[1])
    ]

In [57]:
%%time

# Batched ("parallel") implementation: a single call fits every column of
# packed_df and returns one test-window MSE per series.

errors = run_arima_parallel(packed_df)


[W] [14:49:18.521730] fit: Some batch members had optimizer problems
(1445, 531) (362, 531) (27250050000320, 87606005)
CPU times: user 16.1 s, sys: 6.65 s, total: 22.8 s
Wall time: 23.2 s
