### How to Predict Multiple Time Series At Once With Scikit-Learn in 10 Simple Steps


In [3]:
import pandas as pd
import numpy as np
%matplotlib inline

from sklearn.impute import SimpleImputer
from sklearn.ensemble import RandomForestRegressor

C:\Users\Zigron\anaconda3\lib\site-packages\numpy\.libs\libopenblas.PYQHXLVVQ7VESDPUVUADXEVJOBGHJPAY.gfortran-win_amd64.dll
C:\Users\Zigron\anaconda3\lib\site-packages\numpy\.libs\libopenblas.WCDJNK7YVMPZQ2ME2ZZHJJRJ3JIKNDB7.gfortran-win_amd64.dll


### Melt the data, stack the series


In [4]:
data = pd.read_csv(r'C:\Users\Zigron\Downloads\Sales_Transactions_Dataset_Weekly.csv')


In [5]:
data.head(2)

Unnamed: 0,Product_Code,W0,W1,W2,W3,W4,W5,W6,W7,W8,...,Normalized 42,Normalized 43,Normalized 44,Normalized 45,Normalized 46,Normalized 47,Normalized 48,Normalized 49,Normalized 50,Normalized 51
0,P1,11,12,10,8,13,12,14,21,6,...,0.06,0.22,0.28,0.39,0.5,0.0,0.22,0.17,0.11,0.39
1,P2,7,6,3,2,7,1,6,3,3,...,0.2,0.4,0.5,0.1,0.1,0.4,0.5,0.1,0.6,0.0


In [6]:
data=data.filter(regex=r'Product|W')


In [7]:
data.columns

Index(['Product_Code', 'W0', 'W1', 'W2', 'W3', 'W4', 'W5', 'W6', 'W7', 'W8',
       'W9', 'W10', 'W11', 'W12', 'W13', 'W14', 'W15', 'W16', 'W17', 'W18',
       'W19', 'W20', 'W21', 'W22', 'W23', 'W24', 'W25', 'W26', 'W27', 'W28',
       'W29', 'W30', 'W31', 'W32', 'W33', 'W34', 'W35', 'W36', 'W37', 'W38',
       'W39', 'W40', 'W41', 'W42', 'W43', 'W44', 'W45', 'W46', 'W47', 'W48',
       'W49', 'W50', 'W51'],
      dtype='object')

### melt function is used to convert the wide format into long format

In [8]:
melt =data.melt(id_vars='Product_Code', var_name='Week', value_name='Sales')

In [9]:
melt.head(2)

Unnamed: 0,Product_Code,Week,Sales
0,P1,W0,11
1,P2,W0,7


In [10]:
melt.dtypes

Product_Code    object
Week            object
Sales            int64
dtype: object

In [11]:
melt['Product_Code'] = melt['Product_Code'].str.extract('(\d+)', expand=False).astype(int)
melt['Week'] = melt['Week'].str.extract('(\d+)', expand=False).astype(int)


In [12]:
melt.head()

Unnamed: 0,Product_Code,Week,Sales
0,1,0,11
1,2,0,7
2,3,0,7
3,4,0,12
4,5,0,8


In [13]:
melt = melt.sort_values(['Week', 'Product_Code'])
melt.head()

Unnamed: 0,Product_Code,Week,Sales
0,1,0,11
1,2,0,7
2,3,0,7
3,4,0,12
4,5,0,8


### 2. Split the data


In [14]:
split_point = 40
melt_train = melt[melt['Week'] < split_point].copy()
melt_valid = melt[melt['Week'] >= split_point].copy()

### 3. Set up a 1-step target


In [37]:
melt_train['sales_next_week'] = melt_train.groupby("Product_Code")['Sales'].shift(-1)


In [41]:
melt_train.head(100)

Unnamed: 0,Product_Code,Week,Sales,sales_next_week
0,1,0,11,12.0
1,2,0,7,6.0
2,3,0,7,11.0
3,4,0,12,8.0
4,5,0,8,5.0
...,...,...,...,...
95,96,0,31,35.0
96,97,0,29,37.0
97,98,0,4,2.0
98,99,0,16,11.0


In [42]:
melt_train[melt_train['Product_Code'] == 1].head()


Unnamed: 0,Product_Code,Week,Sales,sales_next_week
0,1,0,11,12.0
811,1,1,12,10.0
1622,1,2,10,8.0
2433,1,3,8,13.0
3244,1,4,13,12.0


In [43]:
melt_valid['sales_next_week'] = melt_valid.groupby("Product_Code")['Sales'].shift(-1)


In [44]:
melt_train.tail()


Unnamed: 0,Product_Code,Week,Sales,sales_next_week
32435,815,39,2,
32436,816,39,6,
32437,817,39,0,
32438,818,39,0,
32439,819,39,0,


In [45]:
melt_train = melt_train.dropna()

In [46]:
melt_train.tail()


Unnamed: 0,Product_Code,Week,Sales,sales_next_week
31624,815,38,1,2.0
31625,816,38,4,6.0
31626,817,38,0,0.0
31627,818,38,0,0.0
31628,819,38,1,0.0


### Create 4 Fundamental Features
4.1 Lag
what if I don't have the last period? Get the closest available

In [50]:
melt_train["lag_sales_1"] = melt_train.groupby("Product_Code")['Sales'].shift(1)
melt_valid["lag_sales_1"] = melt_valid.groupby("Product_Code")['Sales'].shift(1)


In [51]:
melt_valid["lag_sales_1"]

32440    NaN
32441    NaN
32442    NaN
32443    NaN
32444    NaN
        ... 
42167    2.0
42168    6.0
42169    4.0
42170    2.0
42171    0.0
Name: lag_sales_1, Length: 9732, dtype: float64

### 4.2 Difference


In [52]:
melt_train["diff_sales_1"] = melt_train.groupby("Product_Code")['Sales'].diff(1)


In [53]:
melt_train[melt_train['Product_Code'] == 1].head()

Unnamed: 0,Product_Code,Week,Sales,sales_next_week,lag_sales_1,diff_sales_1
0,1,0,11,12.0,,
811,1,1,12,10.0,11.0,1.0
1622,1,2,10,8.0,12.0,-2.0
2433,1,3,8,13.0,10.0,-2.0
3244,1,4,13,12.0,8.0,5.0


In [54]:
melt_valid["diff_sales_1"] = melt_valid.groupby("Product_Code")['Sales'].diff(1)


### 4.3 Rolling statistics


### Mean
### Max
### Min
### Std

In [55]:
melt_train.groupby("Product_Code")['Sales'].rolling(4).mean()


Product_Code       
1             0          NaN
              811        NaN
              1622       NaN
              2433     10.25
              3244     10.75
                       ...  
819           28384     0.25
              29195     1.00
              30006     1.00
              30817     1.25
              31628     1.50
Name: Sales, Length: 31629, dtype: float64

In [56]:
melt_train.groupby("Product_Code")['Sales'].rolling(4).mean().reset_index(level=0, drop=True)


0          NaN
811        NaN
1622       NaN
2433     10.25
3244     10.75
         ...  
28384     0.25
29195     1.00
30006     1.00
30817     1.25
31628     1.50
Name: Sales, Length: 31629, dtype: float64

In [71]:
melt_train["mean_sales_4"] = melt_train.groupby("Product_Code")['Sales'].rolling(4).mean().reset_index(level=0, drop=True)
melt_valid["mean_sales_4"] = melt_train.groupby("Product_Code")['Sales'].rolling(4).mean().reset_index(level=0, drop=True)


In [59]:
melt_train[melt_train['Product_Code'] == 1].head()


Unnamed: 0,Product_Code,Week,Sales,sales_next_week,lag_sales_1,diff_sales_1,mean_sales_4
0,1,0,11,12.0,,,
811,1,1,12,10.0,11.0,1.0,
1622,1,2,10,8.0,12.0,-2.0,
2433,1,3,8,13.0,10.0,-2.0,10.25
3244,1,4,13,12.0,8.0,5.0,10.75


In [60]:
def mape(y_true, y_pred):
    ape = np.abs((y_true - y_pred) / y_true)
    #ape[~np.isfinite(ape)] = 0. # VERY questionable
    ape[~np.isfinite(ape)] = 1. # pessimist estimate
    return np.mean(ape)

def wmape(y_true, y_pred):
    return np.sum(np.abs(y_true - y_pred)) / np.sum(np.abs(y_true))


In [61]:
melt_train.head(5)


Unnamed: 0,Product_Code,Week,Sales,sales_next_week,lag_sales_1,diff_sales_1,mean_sales_4
0,1,0,11,12.0,,,
1,2,0,7,6.0,,,
2,3,0,7,11.0,,,
3,4,0,12,8.0,,,
4,5,0,8,5.0,,,


In [65]:
features = ['Sales', 'lag_sales_1', 'diff_sales_1', 'mean_sales_4']


In [72]:
imputer = SimpleImputer()
Xtr = imputer.fit_transform(melt_train[features])
ytr = melt_train['sales_next_week']


mdl = RandomForestRegressor(n_estimators=100, random_state=0, n_jobs=6)
mdl.fit(Xtr, ytr)

RandomForestRegressor(n_jobs=6, random_state=0)

In [73]:
Xval = imputer.transform(melt_valid[features])
yval = melt_valid['sales_next_week']

p = mdl.predict(Xval)

In [74]:
mape(yval, p)


0.6789502899719647

In [75]:
wmape(yval, p)


0.331011282307137

In [76]:
##Extend the model to predict n-steps
melt_train['sales_next_next_week'] = melt_train.groupby("Product_Code")['Sales'].shift(-2)
melt_valid['sales_next_next_week'] = melt_valid.groupby("Product_Code")['Sales'].shift(-2)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  self._set_item(key, value)


In [77]:
melt_train[melt_train['Product_Code'] == 1].head()


Unnamed: 0,Product_Code,Week,Sales,sales_next_week,lag_sales_1,diff_sales_1,mean_sales_4,sales_next_next_week
0,1,0,11,12.0,,,,10.0
811,1,1,12,10.0,11.0,1.0,,8.0
1622,1,2,10,8.0,12.0,-2.0,,13.0
2433,1,3,8,13.0,10.0,-2.0,10.25,12.0
3244,1,4,13,12.0,8.0,5.0,10.75,14.0


In [78]:
melt_train = melt_train.dropna(subset=['sales_next_week','sales_next_next_week'])


In [79]:
melt_train.head(2)

Unnamed: 0,Product_Code,Week,Sales,sales_next_week,lag_sales_1,diff_sales_1,mean_sales_4,sales_next_next_week
0,1,0,11,12.0,,,,10.0
1,2,0,7,6.0,,,,3.0


In [80]:
imputer = SimpleImputer()
Xtr = imputer.fit_transform(melt_train[features])
ytr = melt_train[['sales_next_week', 'sales_next_next_week']]

mdl = RandomForestRegressor(n_estimators=100, random_state=0, n_jobs=6)
mdl.fit(Xtr, ytr)

RandomForestRegressor(n_jobs=6, random_state=0)

In [81]:
Xval = imputer.transform(melt_valid[features])
yval = melt_valid[['sales_next_week', 'sales_next_next_week']]

p = mdl.predict(Xval)

In [82]:
mape(yval, p)


sales_next_week         0.678918
sales_next_next_week    0.716169
dtype: float64

In [83]:
wmape(yval, p)


sales_next_week         0.331832
sales_next_next_week    0.346699
dtype: float64

### multioutput modeling when  we have more then one varaible to predict about the sales

### 10. Predicting new examples
As long as you have the same features you used to train, you can predict for any period


In [84]:
melt_valid.tail()


Unnamed: 0,Product_Code,Week,Sales,sales_next_week,lag_sales_1,diff_sales_1,mean_sales_4,sales_next_next_week
42167,815,51,0,,2.0,-2.0,,
42168,816,51,5,,6.0,-1.0,,
42169,817,51,3,,4.0,-1.0,,
42170,818,51,0,,2.0,-2.0,,
42171,819,51,1,,0.0,1.0,,


In [85]:
new_examples = melt_valid[melt_valid['Week'] == 51].copy()
new_examples.head()

Unnamed: 0,Product_Code,Week,Sales,sales_next_week,lag_sales_1,diff_sales_1,mean_sales_4,sales_next_next_week
41361,1,51,10,,5.0,5.0,,
41362,2,51,0,,6.0,-6.0,,
41363,3,51,7,,8.0,-1.0,,
41364,4,51,8,,7.0,1.0,,
41365,5,51,9,,8.0,1.0,,


In [89]:
Xpred = imputer.transform(new_examples[features])

p = mdl.predict(Xpred)

In [91]:
new_examples['p_sales_next_week'] = p[:, 0]
new_examples['p_sales_next_next_week'] = p[:, 1]

In [92]:
new_examples.head()


Unnamed: 0,Product_Code,Week,Sales,sales_next_week,lag_sales_1,diff_sales_1,mean_sales_4,sales_next_next_week,p_sales_next_week,p_sales_next_next_week
41361,1,51,10,,5.0,5.0,,,10.376667,9.042
41362,2,51,0,,6.0,-6.0,,,5.2775,2.745
41363,3,51,7,,8.0,-1.0,,,8.72875,9.18594
41364,4,51,8,,7.0,1.0,,,9.530952,6.718857
41365,5,51,9,,8.0,1.0,,,7.544948,11.912801
