In [1]:
from datetime import datetime
import pandas as pd
import numpy as np

import matplotlib.pyplot as plt
%matplotlib inline

from sklearn.metrics import mean_squared_error
from sklearn.preprocessing import OrdinalEncoder

from statsmodels.graphics.tsaplots import plot_acf, plot_pacf

from sklearn.linear_model import Ridge
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_squared_error
from sklearn.metrics import mean_absolute_error
from sklearn.preprocessing import StandardScaler

import skforecast
from skforecast.ForecasterAutoreg import ForecasterAutoreg
from skforecast.ForecasterAutoregCustom import ForecasterAutoregCustom
from skforecast.ForecasterAutoregDirect import ForecasterAutoregDirect
from skforecast.model_selection import grid_search_forecaster
from skforecast.model_selection import backtesting_forecaster
from skforecast.utils import save_forecaster
from skforecast.utils import load_forecaster
import shap


  from .autonotebook import tqdm as notebook_tqdm


In [2]:
# Load the per-store daily records that every later cell works from.
ts = pd.read_csv(filepath_or_buffer='../../data/ts_top100_prediction.csv')

In [3]:
# Quick sanity check: frame dimensions, then one randomly drawn row.
print(ts.shape)
ts.sample(n=1)

(1739323, 6)


Unnamed: 0,date,id,item,sales,income,event
1030432,2014-05-13,HOME_&_GARDEN_1_440_PHI_2,HOME_&_GARDEN_1_440,0,0.0,0


In [4]:
# Parse the ISO-formatted date strings into datetime64 values.
ts = ts.assign(date=pd.to_datetime(ts['date'], format='%Y-%m-%d'))

In [5]:
# Append the weekday name of each date as a new last column.
ts = ts.assign(weekday=ts['date'].dt.day_name())

In [6]:
# Show five random rows to eyeball the new weekday column.
ts.sample(n=5)

Unnamed: 0,date,id,item,sales,income,event,weekday
212144,2011-11-24,SUPERMARKET_3_555_PHI_3,SUPERMARKET_3_555,71,126.096,1,Thursday
494583,2012-10-26,SUPERMARKET_2_183_NYC_3,SUPERMARKET_2_183,2,20.808,0,Friday
1188046,2014-10-20,SUPERMARKET_1_096_NYC_4,SUPERMARKET_1_096,2,13.2,0,Monday
276461,2012-02-14,SUPERMARKET_3_804_PHI_2,SUPERMARKET_3_804,9,20.304,0,Tuesday
713846,2013-06-20,SUPERMARKET_3_080_BOS_1,SUPERMARKET_3_080,5,9.48,0,Thursday


In [7]:
# Distinct item identifiers, in order of first appearance in the data.
lista_productos100 = ts['item'].unique().tolist()

In [8]:
# Forecast horizon. The last N_TEST_DAYS observed days are held out for
# evaluation and N_FUTURE_DAYS further days are forecast beyond the data.
N_TEST_DAYS = 30
N_FUTURE_DAYS = 30

# Fixed, alphabetically sorted weekday categories: every item gets the same
# seven dummy columns in the same order, even if a weekday never occurs.
WEEKDAY_NAMES = sorted(['Monday', 'Tuesday', 'Wednesday', 'Thursday',
                        'Friday', 'Saturday', 'Sunday'])


def _weekday_dummies(index):
    """Return one-hot weekday columns (``weekday_<Name>``) for a DatetimeIndex.

    The result is indexed by ``index`` and always has one integer column per
    entry of ``WEEKDAY_NAMES``.
    """
    weekday = pd.Categorical(index.day_name(), categories=WEEKDAY_NAMES)
    return pd.get_dummies(pd.Series(weekday, index=index), prefix='weekday', dtype=int)


def _daily_frame(item_rows):
    """Collapse one item's rows into a daily frame.

    Parameters
    ----------
    item_rows : DataFrame with at least the columns 'date', 'income', 'event'.

    Returns
    -------
    DataFrame indexed by day ('D' frequency) with the columns 'income'
    (sum over all rows of that day), 'event' and the weekday dummies.
    """
    item_rows = item_rows.assign(date=pd.to_datetime(item_rows['date']))
    # Aggregating 'event' with max guarantees exactly one row per date; a
    # drop_duplicates + merge would duplicate dates whenever rows of the same
    # day disagree on the flag, which breaks asfreq().
    daily = item_rows.groupby('date').agg(income=('income', 'sum'),
                                          event=('event', 'max'))
    daily = pd.concat([daily, _weekday_dummies(daily.index)], axis=1)
    return daily.asfreq('D')


def _future_exog(last_date, periods):
    """Build exogenous features for the ``periods`` days following ``last_date``.

    Future days are assumed to have no event (event = 0), since nothing in
    the data says otherwise.
    """
    future_index = pd.date_range(start=last_date + pd.Timedelta(days=1),
                                 periods=periods, freq='D', name='date')
    future = _weekday_dummies(future_index)
    future.insert(0, 'event', 0)  # same column order as the training exog
    return future.asfreq('D')


resultados = []

# Project the needed columns once and group by item, so each product is
# looked up directly instead of re-scanning the whole frame.
rows_by_item = ts[['item', 'date', 'income', 'event']].groupby('item', sort=False)

for i in lista_productos100:
    daily = _daily_frame(rows_by_item.get_group(i))
    y, exog = daily['income'], daily.drop(columns=['income'])

    # Hold out the last N_TEST_DAYS days for evaluation.
    y_train, y_test = y.iloc[:-N_TEST_DAYS], y.iloc[-N_TEST_DAYS:]
    exog_train, exog_test = exog.iloc[:-N_TEST_DAYS], exog.iloc[-N_TEST_DAYS:]

    # Exogenous features for the whole prediction window: the held-out test
    # days followed by the next month. The future part starts right after the
    # last observed day so the index stays contiguous.
    exog_future = _future_exog(y.index[-1], N_FUTURE_DAYS)
    exog_pred = pd.concat([exog_test, exog_future], ignore_index=False)

    forecaster = ForecasterAutoreg(
        regressor=RandomForestRegressor(random_state=123,
                                        max_depth=10,
                                        min_samples_leaf=2,
                                        min_samples_split=5,
                                        n_estimators=50),
        lags=7
    )

    # Fit on the training span and predict test + future in one pass; the
    # first N_TEST_DAYS predictions line up with y_test.
    forecaster.fit(y=y_train, exog=exog_train)
    predictions = forecaster.predict(steps=N_TEST_DAYS + N_FUTURE_DAYS,
                                     exog=exog_pred).to_list()
    resultados.append({'i': i, 'prediction': predictions, 'test': y_test.values})



In [9]:
# Per-item summary: actual income over the held-out days versus the rounded
# predictions for those days (first 30 values) and for the following month
# (remaining values).
tiendas_inventario = pd.DataFrame(resultados).assign(
    test_suma=lambda df: df['test'].map(sum),
    prediction=lambda df: df['prediction'].map(lambda dias: np.round(dias, decimals=0)),
    tiendas_inventario_test=lambda df: df['prediction'].map(lambda dias: sum(dias[:30])),
    tiendas_inventario_prediction=lambda df: df['prediction'].map(lambda dias: sum(dias[30:])),
)
suma = tiendas_inventario['test_suma'].sum()

columnas_resumen = ['i', 'test_suma', 'tiendas_inventario_test', 'tiendas_inventario_prediction']
tiendas_inventario[columnas_resumen]

Unnamed: 0,i,test_suma,tiendas_inventario_test,tiendas_inventario_prediction
0,ACCESORIES_1_108,4592.0644,5245.0,4673.0
1,HOME_&_GARDEN_1_027,4656.6000,5546.0,5781.0
2,HOME_&_GARDEN_1_053,18937.0500,15807.0,15319.0
3,HOME_&_GARDEN_1_140,9075.5625,7454.0,7064.0
4,HOME_&_GARDEN_1_177,4153.3625,7384.0,7008.0
...,...,...,...,...
95,SUPERMARKET_3_499,10380.2880,8727.0,9226.0
96,ACCESORIES_1_158,49910.0322,45690.0,44603.0
97,SUPERMARKET_3_282,18462.7200,17146.0,16787.0
98,ACCESORIES_1_354,48962.5668,53037.0,53145.0


In [10]:
# Totals over all items for the held-out window: actual, predicted, and the
# gap between them (actual minus predicted), one value per line.
total_inventario_productos_real = tiendas_inventario['test_suma'].sum()
total_inventario_productos_predicho = tiendas_inventario['tiendas_inventario_test'].sum()
print(
    total_inventario_productos_real,
    total_inventario_productos_predicho,
    total_inventario_productos_real - total_inventario_productos_predicho,
    sep='\n',
)

1086598.199
1057787.0
28811.199000000022


In [11]:
# Raw (unrounded) per-item results as a frame: item id, predictions, actuals.
income_global = pd.DataFrame(data=resultados)

In [12]:
# Write the per-item results to an Excel workbook in the working directory.
file_name = "income_global.xlsx"
income_global.to_excel(excel_writer=file_name)