In [1]:
import numpy as np
import pandas as pd
import sklearn

from tqdm.notebook import tqdm

from sklearn.metrics import mean_squared_error
from sklearn.model_selection import train_test_split
from statsmodels.tsa.arima.model import ARIMA

In [2]:
# Read the sales time series from the CSV next to the notebook and preview
# the first five rows.
data = pd.read_csv(filepath_or_buffer="timeseries_updated_WG_data.csv")
data.head(n=5)

Unnamed: 0.2,Unnamed: 0,Unnamed: 0.1,GPI,NDC,dates,disc_unitsold,holiday,unitsold
0,0,0,58300040107520,115681108,2014-01-01,0.0,1.0,0.0
1,1,1,58300040107520,115681108,2014-01-02,81.0,0.0,245.0
2,2,2,58300040107520,115681108,2014-01-03,22.0,0.0,120.0
3,3,3,58300040107520,115681108,2014-01-04,115.0,0.0,186.0
4,4,4,58300040107520,115681108,2014-01-05,80.0,0.0,151.0


In [3]:
# Remove the two 'Unnamed' columns that came in with the CSV.
# `data` is rebound by this cell, so on a second run the columns are already
# gone; errors='ignore' keeps the cell re-runnable instead of raising KeyError.
data = data.drop(columns=['Unnamed: 0', 'Unnamed: 0.1'], errors='ignore')
data.head()

Unnamed: 0,GPI,NDC,dates,disc_unitsold,holiday,unitsold
0,58300040107520,115681108,2014-01-01,0.0,1.0,0.0
1,58300040107520,115681108,2014-01-02,81.0,0.0,245.0
2,58300040107520,115681108,2014-01-03,22.0,0.0,120.0
3,58300040107520,115681108,2014-01-04,115.0,0.0,186.0
4,58300040107520,115681108,2014-01-05,80.0,0.0,151.0


In [4]:
# Convert the `dates` column to datetime64 so the `.dt` accessors used further
# down are available, then print every column's dtype as a check.
data['dates'] = data['dates'].pipe(pd.to_datetime)
print(data.dtypes)

GPI                       int64
NDC                       int64
dates            datetime64[ns]
disc_unitsold           float64
holiday                 float64
unitsold                float64
dtype: object


In [5]:
%%time 
# Feature engineering: calendar fields derived from the `dates` column.
data['dates'] = data['dates'].pipe(pd.to_datetime)
data["day"] = data["dates"].dt.day
data["weekday"] = data["dates"].dt.weekday
data["month"] = data["dates"].dt.month
data["year"] = data["dates"].dt.year
# Quarter 1..4 computed from the month number (months 1-3 -> 1, 4-6 -> 2, ...).
data["quarter"] = (data["dates"].dt.month - 1) // 3 + 1
# January 1st of each row's year as a 'YYYY-01-01' string; used only to
# measure how many days into the year each date falls.
data['Yearfull'] = data["year"].astype(str) + '-01-01'
# 1-based index of the 7-day block since January 1st (days 0-6 -> 1, 7-13 -> 2, ...).
data['Week'] = (
    (data['dates'] - data['Yearfull'].astype('datetime64[ns]')).dt.days // 7
).astype('int16') + 1
# The last one or two days of a year land in block 53; fold them into 52.
data['Week'] = data['Week'].mask(data['Week'] == 53, 52)

CPU times: user 1.27 s, sys: 49.6 ms, total: 1.32 s
Wall time: 1.32 s


In [6]:
# Preview the frame with the calendar feature columns added.
data.head(n=5)

Unnamed: 0,GPI,NDC,dates,disc_unitsold,holiday,unitsold,day,weekday,month,year,quarter,Yearfull,Week
0,58300040107520,115681108,2014-01-01,0.0,1.0,0.0,1,2,1,2014,1,2014-01-01,1
1,58300040107520,115681108,2014-01-02,81.0,0.0,245.0,2,3,1,2014,1,2014-01-01,1
2,58300040107520,115681108,2014-01-03,22.0,0.0,120.0,3,4,1,2014,1,2014-01-01,1
3,58300040107520,115681108,2014-01-04,115.0,0.0,186.0,4,5,1,2014,1,2014-01-01,1
4,58300040107520,115681108,2014-01-05,80.0,0.0,151.0,5,6,1,2014,1,2014-01-01,1


In [7]:
# `Yearfull` was only a helper for computing `Week`; drop it.
# errors='ignore' keeps the cell re-runnable: `data` is rebound here, so on a
# second run the column no longer exists and a plain drop would raise KeyError.
data = data.drop(columns='Yearfull', errors='ignore')
print(data.dtypes)

GPI                       int64
NDC                       int64
dates            datetime64[ns]
disc_unitsold           float64
holiday                 float64
unitsold                float64
day                       int64
weekday                   int64
month                     int64
year                      int64
quarter                   int64
Week                      int16
dtype: object


In [8]:
# Count of distinct values in the GPI column (missing values are not counted).
gpi_un = data['GPI'].nunique(dropna=True)
print(gpi_un)

12


In [9]:
# Count of distinct values in the NDC column (missing values are not counted).
ndc_un = data['NDC'].nunique(dropna=True)
print(ndc_un)

531


In [10]:
# Distinct GPI and NDC codes as plain Python lists. The lists come from sets,
# so their order is whatever the set yields, not sorted.
gpi, ndc = (list(set(data[col].astype('int'))) for col in ('GPI', 'NDC'))

# Both lengths should agree with the nunique() counts of the same columns.
print(len(gpi), len(ndc))

12 531


In [11]:
def run_arima(df, order=(1, 1, 1), seasonal_order=(1, 1, 1, 12), train_frac=0.8):
    """Fit a seasonal ARIMA on the head of ``df['unitsold']`` and score it on the tail.

    The series is split by position: the first ``train_frac`` share of the rows
    is used for fitting and the remaining rows are held out. Rows are used in
    the order they arrive; nothing here sorts by date, so the caller is
    responsible for passing them in chronological order.

    Parameters
    ----------
    df : pandas.DataFrame
        Must contain a ``unitsold`` column that can be cast to float.
    order : tuple, default (1, 1, 1)
        Passed to ``ARIMA`` as ``order``.
    seasonal_order : tuple, default (1, 1, 1, 12)
        Passed to ``ARIMA`` as ``seasonal_order``.
        NOTE(review): the data previewed in this notebook is daily, so a
        seasonal period of 12 may not be what is intended (7 would model a
        weekly cycle) -- confirm before relying on the default.
    train_frac : float, default 0.8
        Share of the rows, taken from the start, used for fitting.

    Returns
    -------
    pandas.Series
        One-element Series holding the mean squared error on the held-out
        rows. It stays a Series so that ``groupby(...).apply(run_arima)``
        produces the same shape as before.
    """
    X = df['unitsold'].astype('float')
    train_size = int(len(X) * train_frac)
    train, test = X.iloc[:train_size], X.iloc[train_size:]
    model = ARIMA(train, order=order, seasonal_order=seasonal_order)
    result = model.fit()
    # In-sample rows occupy positions 0 .. len(train)-1, so the held-out rows
    # are positions len(train) .. len(train)+len(test)-1. `end` is inclusive.
    # The previous code asked for one position too many and then dropped the
    # FIRST forecast, which scored every prediction against the previous
    # row's actual value.
    start = len(train)
    end = start + len(test) - 1
    predictions = result.predict(start=start, end=end)
    return pd.Series(mean_squared_error(test, predictions))

In [12]:
%%time
import numpy as np
# Hold-out error per GPI: fit one ARIMA per NDC series inside each GPI group
# and average the per-NDC errors. errorlist[k] is the average for gpi[k].
errorlist = []
gpi = list(set(data['GPI'].astype('int')))
for i in tqdm(gpi):
    df_chunked = data.loc[data['GPI'] == i]
    errors = df_chunked.groupby(['NDC']).apply(run_arima)
    # run_arima returns a one-element Series, so `errors` is a one-column
    # DataFrame. np.mean() on a DataFrame yields a Series on pandas < 2.0 and
    # a scalar on pandas >= 2.0; reducing the raw values always gives a float.
    error = float(np.nanmean(errors.to_numpy(dtype=float)))
    errorlist.append(error)

  0%|          | 0/12 [00:00<?, ?it/s]



CPU times: user 1h 56min 42s, sys: 2h 47min 8s, total: 4h 43min 50s
Wall time: 20min 1s


