In [1]:
# DOWNLOADING DATA

import pandas as pd
import numpy as np
import statsmodels.api as sm
import statsmodels.formula.api as smf
import multiprocessing
from joblib import Parallel, delayed
from tqdm import tqdm
from sklearn import metrics
import pprint
import copy
import plotly.graph_objects as go
from plotly.subplots import make_subplots
from plotly import offline
import statsmodels.graphics.tsaplots as tsa
import warnings
warnings.filterwarnings("ignore")

tomato_discount = pd.read_csv('Dataframes/tomato_discount.csv')
tomato_7 = pd.read_csv('Dataframes/tomato_7.csv')

# tomato_7

C:\Users\Владислав\AppData\Local\Programs\Python\Python310\lib\site-packages\numpy\.libs\libopenblas.EL2C6PLE4ZYW3ECEVIV3OXXGRN2NRFM2.gfortran-win_amd64.dll
C:\Users\Владислав\AppData\Local\Programs\Python\Python310\lib\site-packages\numpy\.libs\libopenblas.XWYDX2IKJW2NMTWSFYNGFUWKQU3LYTCZ.gfortran-win_amd64.dll


In [2]:
# CALCULATING CLEAR DEMAND

IDs = [col.strip('is_discount_') for col in tomato_discount.columns if 'is_discount_' in col]
Poisson_non_price_models =[]
MSE = []

for id in IDs:
    endog = tomato_7['salesvolume_' + id]
    exog = tomato_7[['av_price_' + id, 'is_supplied_' + id]]
    Poisson_non_price_models.append(
        sm.Poisson(endog = endog, exog = exog).fit(maxiter = 120))
    non_price_demand = Poisson_non_price_models[IDs.index(id)].predict(exog)

    tomato_7['non_price_demand_' + id] = tomato_7['salesvolume_' + id] - non_price_demand

    MSE.append(metrics.mean_squared_error(tomato_7['salesvolume_' + id], non_price_demand))

print(f'TOTAL MSE: {sum(MSE)}')

Optimization terminated successfully.
         Current function value: 1.938064
         Iterations 7
Optimization terminated successfully.
         Current function value: 1.195171
         Iterations 7
Optimization terminated successfully.
         Current function value: 1.225983
         Iterations 8
Optimization terminated successfully.
         Current function value: 1.048035
         Iterations 7
Optimization terminated successfully.
         Current function value: 2.887375
         Iterations 8
Optimization terminated successfully.
         Current function value: 1.247311
         Iterations 7
Optimization terminated successfully.
         Current function value: 1.597966
         Iterations 7
Optimization terminated successfully.
         Current function value: 1.255490
         Iterations 7
Optimization terminated successfully.
         Current function value: 1.664140
         Iterations 7
Optimization terminated successfully.
         Current function value: 1.793526
  

In [3]:
# SETTING Xs, ys, IDs

lags_on_Xs = []
Xs = []; ys = []

def sum_of_last(series, n = 7):
    ser_list = []

    for i in range(0, n):
        ser_list.append(0)

    for i in range(n, len(series)):
        ser_list.append(np.sum([j for j in series[(i - n):i]]))

    return ser_list

for id in IDs:
    tomato_7_process = tomato_7.copy()

    # adding num_other_discounts_
    tomato_7_process = tomato_7_process.merge(tomato_discount[['date', 'num_other_discounts_' + id]], on = 'date')

    # rename target columns
    tomato_7_process.rename(columns = {'salesvolume_' + id: 'target',
                                        'non_price_demand_' + id: 'tar_non_price',
                                        'av_price_' + id:'tar_price',
                                        'num_other_discounts_' + id: 'tar_other_discounts',
                                        'is_supplied_' + id: 'tar_is_supp'}, inplace = True)

    # drop salesvolume_
    to_drop = [col for col in tomato_7_process.columns if 'salesvolume_' in col]   # 'non_price_demand_'
    tomato_7_process.drop(to_drop, axis = 1, inplace = True)

    # shifting back of non_price_demand_
    work_with = [col for col in tomato_7_process.columns if 'non_price_demand_' in col]    # 'salesvolume_'
    tomato_7_process.loc[:, work_with] = tomato_7_process.loc[:, work_with].shift(-1)
    tomato_7_process = tomato_7_process.iloc[:-1, :].reset_index(drop = True)

    for col in work_with:
        tomato_7_process.rename(columns = {col: col + '_lag'}, inplace = True)

    # prices only when is supplied
    to_supp = [col for col in tomato_7_process.columns if 'av_price_' in col]
    for col in to_supp:
        tomato_7_process[col] = tomato_7_process[col] * tomato_7_process['is_supplied_' + col.replace('av_price_', '')]

    # adding lags on target variables
    lags_on_X = [[*tsa.pacf(tomato_7_process['target'])].index(sorted([*tsa.pacf(tomato_7_process['target'])])[::-1][lag]) for lag in [1, 2, 3]]
    lags_on_Xs.append(lags_on_X.copy())

    for i in lags_on_X:
        for col in ['tar_non_price', 'tar_price']:
            tomato_7_process[col + '_' + str(i)] = tomato_7_process[col].shift(i)

    tomato_7_process = tomato_7_process.iloc[10:, :].reset_index(drop = True)

    # adding week statistics
    tomato_7_process['tar_non_price_week'] = sum_of_last(tomato_7_process['tar_non_price'], n = 7)
    tomato_7_process['non_price_demand_week'] = \
        sum_of_last(tomato_7_process[[col for col in tomato_7_process.columns if 'non_price_demand_' in col]].sum(axis = 1), n = 7)
    tomato_7_process = tomato_7_process.iloc[7:, :].reset_index(drop = True)

    # changing the order
    tomato_7_process = tomato_7_process\
                            .loc[:, [c for c in tomato_7_process.columns if 'tar' in c] +
                                    [c for c in tomato_7_process.columns if 'tar' not in c]]

    # setting X and y
    Xs.append(tomato_7_process.drop(['target', 'tar_non_price', 'date'], axis = 1).copy())
    ys.append(tomato_7_process['target'].copy())

    # setting date
    if id == IDs[0]:
        date = tomato_7_process['date']

In [4]:
def timesplit(X, y, train_size = 0.85):
    n_train = int(len(y) * train_size)
    return X[:n_train], X[n_train:], y[:n_train], y[n_train:]

X_trains = []; X_tests = []; y_trains = []; y_tests = []

for i in range(0, len(Xs)):
    X_train, X_test, y_train, y_test = timesplit(Xs[i], ys[i])
    X_trains.append(X_train.copy()); X_tests.append(X_test.copy()); y_trains.append(y_train.copy()); y_tests.append(y_test.copy())

In [5]:
pd.DataFrame(y_tests).T\
    .to_csv('Results/Vector models/y_tests.csv', index = False)
pd.Series(date[y_tests[1].index])\
    .to_csv('Results/Vector models/date_test.csv', index = False)

In [8]:
# FITTING POISSON MODELS

# https://towardsdatascience.com/negative-binomial-regression-f99031bb25b4

def NB2_fitting_i(X_trains, y_trains, i):
    X_in_for = pd.DataFrame()

    def block(y, X):
        Poisson_reg = sm.GLM(y, X, family = sm.families.Poisson())\
            .fit()

        df_train = pd.concat([y, X], join = 'outer', axis = 1)
        df_train['BB_LAMBDA'] = Poisson_reg.mu
        df_train['AUX_OLS_DEP'] = df_train.apply(lambda x: ((x['target'] - x['BB_LAMBDA']) ** 2 - x['BB_LAMBDA']) / x['BB_LAMBDA'], axis = 1)

        aux_olsr_results = smf.ols("""AUX_OLS_DEP ~ BB_LAMBDA - 1""", df_train)\
            .fit()
        alpha = aux_olsr_results.params[0] if aux_olsr_results.params[0] > 0 else 1 / 10 ** 4

        return alpha

    for col_num in range(0, X_trains[i].shape[1]):
        X_in_for = pd.concat([X_in_for, X_trains[i].iloc[:, col_num]], axis = 1)

        for p_value in [0.5, 0.2, 0.1]:
            NB2_reg = sm.GLM(y_trains[i], X_in_for, family = sm.families.NegativeBinomial(alpha = block(y = y_trains[i], X = X_in_for)))\
            .fit()

            signif_var = pd.DataFrame([x for x in NB2_reg.summary().tables[1].data[1:] if float(x[4]) < p_value],
                                  columns = NB2_reg.summary().tables[1].data[0])

            if len([*signif_var.iloc[:, 0]]) > 0:
                X_in_for = X_in_for.loc[:, [*signif_var.iloc[:, 0]]]

    k = 1
    while k == 1:
        p_values_of_NB2 = [float(x) for x in [*signif_var.iloc[:, 4]]]
        max_p_value = max(p_values_of_NB2)

        if max_p_value > 0.00001:
            signif_vars_for_X = [*signif_var.iloc[:, 0]]
            signif_vars_for_X.pop(p_values_of_NB2.index(max_p_value))

            X_in_for = X_in_for.loc[:, signif_vars_for_X]

            NB2_reg = sm.GLM(y_trains[i], X_in_for, family = sm.families.NegativeBinomial(alpha = block(y = y_trains[i], X = X_in_for)))\
                .fit()

            signif_var = pd.DataFrame([x for x in NB2_reg.summary().tables[1].data[1:]],
                                  columns = NB2_reg.summary().tables[1].data[0])
        else: k = 0

    return [*signif_var.iloc[:, 0]]

NB2_signif_vars = Parallel(n_jobs = multiprocessing.cpu_count() - 3)\
    (delayed(NB2_fitting_i)(X_trains = X_trains, y_trains = y_trains, i = i) for i in tqdm(range(0, len(X_trains))))
for j in [14, 15]:
    NB2_signif_vars[j].remove('is_supplied_461504')
    NB2_signif_vars[j].append('tar_is_supp')

Poissonmodels = []
for i in range(0, len(Xs)):
    Poissonmodels.append(
        sm.Poisson(endog = y_trains[i], exog = X_trains[i][NB2_signif_vars[i]]).fit(maxiter = 120))

Poisson_train_preds = [Poissonmodels[i].predict(X_trains[i][NB2_signif_vars[i]])  for i  in range(0, len(Xs))]
Poisson_test_preds = [Poissonmodels[i].predict(X_tests[i][NB2_signif_vars[i]])  for i  in range(0, len(Xs))]

pd.DataFrame([np.array(j) for j in Poisson_train_preds]).T\
    .to_csv('Results/Vector models/Vector Poisson/Poisson_train_preds.csv', index = False)
pd.DataFrame([np.array(j) for j in Poisson_test_preds]).T\
    .to_csv('Results/Vector models/Vector Poisson/Poisson_test_preds.csv', index = False)

MSE_trains = [metrics.mean_squared_error(y_trains[i], Poisson_train_preds[i]) for i in range(0, len(Xs))]
print(f'MSE_trains_sum: {sum(MSE_trains)}')
MSE_tests = [metrics.mean_squared_error(y_tests[i], Poisson_test_preds[i]) for i in range(0, len(Xs))]
print(f'MSE_tests_sum: {sum(MSE_tests)}')

100%|██████████| 20/20 [00:58<00:00,  2.95s/it]


Optimization terminated successfully.
         Current function value: 1.580996
         Iterations 85


LinAlgError: Singular matrix

In [7]:
print(Poissonmodels[14].summary())

                          Poisson Regression Results                          
Dep. Variable:                 target   No. Observations:                 1847
Model:                        Poisson   Df Residuals:                     1836
Method:                           MLE   Df Model:                           10
Date:                Tue, 17 May 2022   Pseudo R-squ.:                  0.6026
Time:                        11:47:15   Log-Likelihood:                -1432.8
converged:                       True   LL-Null:                       -3605.4
Covariance Type:            nonrobust   LLR p-value:                     0.000
                                  coef    std err          z      P>|z|      [0.025      0.975]
-----------------------------------------------------------------------------------------------
tar_price                      -0.0718      0.002    -37.944      0.000      -0.075      -0.068
tar_non_price_1                 0.0363      0.006      5.637      0.000       0.

In [115]:
# FORECASTING WITH RANDOM FORESTS

def vector_Poisson_forecast(Poissonmodels, Xs, ys, lags:list):
    global IDs, vector_Poisson_forecast_row
    Xs = copy.deepcopy(Xs)
    ys = copy.deepcopy(ys)
    forecast_lists = [[] for l in range(0, len(Xs))]

    def vector_Poisson_forecast_row(i):
        nonlocal Xs, ys, lags, forecast_lists, row
        global Poissonmodels, IDs

        # block of code without lags
        for id in [id for id in IDs if id != IDs[i]]:
            if row >= 1:
                Xs[i].loc[Xs[i].index[0] + row, 'non_price_demand_' + str(id) + '_lag'] = \
                    forecast_lists[IDs.index(id)][row - 1] - \
                    Poisson_non_price_models[IDs.index(id)].predict(
                        pd.DataFrame([Xs[IDs.index(id)].loc[Xs[IDs.index(id)].index[0] + row - 1, ['tar_price', 'tar_is_supp']]], columns=['tar_price', 'tar_is_supp'])\
                      .rename(columns={'tar_price': 'av_price_' + id, 'tar_is_supp': 'is_supplied_' + id})\
                            .reset_index(drop = True))[0]

        #!!!!!!!!!!!!!!!!!!!!!!!
        if row >= 7:
            Xs[i].loc[Xs[i].index[0] + row, 'tar_non_price_week'] = \
                np.sum([t for t in Xs[i].loc[(Xs[i].index[0] + row - 6):(Xs[i].index[0] + row), 'tar_non_price_1']])

            Xs[i].loc[Xs[i].index[0] + row, 'non_price_demand_week'] = \
                sum(Xs[i].loc[
                (Xs[i].index[0] + row - 6):(Xs[i].index[0] + row),
                [c for c in Xs[i].columns if 'non_price_demand' in c and 'week' not in c]].sum(axis = 1))

        # block of code with lags
        for lag in lags[i]:
            if row >= lag:
                Xs[i].loc[Xs[i].index[0] + row, 'tar_non_price_' + str(lag)] = \
                    forecast_lists[i][row - lag] - \
                    Poisson_non_price_models[i].predict(
                        pd.DataFrame([Xs[i].loc[Xs[i].index[0] + row - lag, ['tar_price', 'tar_is_supp']]], columns=['tar_price', 'tar_is_supp'])\
                      .rename(columns={'tar_price': 'av_price_' + IDs[i], 'tar_is_supp': 'is_supplied_' + IDs[i]})\
                            .reset_index(drop = True))[0]

        forecast_lists[i].append(Poissonmodels[i].predict(
            pd.DataFrame([Xs[i].loc[Xs[i].index[0] + row, :][NB2_signif_vars[i]]], columns = NB2_signif_vars[i]).reset_index(drop = True)
        )[0])

        return

    for row in tqdm(range(0, len(ys[0]))):
        Parallel(n_jobs = 1)\
            (delayed(vector_Poisson_forecast_row)(i = i) for i in range(0, len(Xs))) # n_jobs = multiprocessing.cpu_count() - 1

    result = pd.DataFrame([np.array(j) for j in forecast_lists]).T
    result.columns = ['salesvolume_' + id for id in IDs]
    return result, Xs

Poisson_test_forecasts, Xs_forecast = vector_Poisson_forecast(Poissonmodels = Poissonmodels, Xs = X_tests, ys = y_tests, lags = lags_on_Xs)

Poisson_test_forecasts.to_csv('Results/Vector models/Vector Poisson/Poisson_test_forecasts.csv', index = False)

MSE_forecast = [metrics.mean_squared_error(y_tests[i], Poisson_test_forecasts.iloc[:, i]) for i in range(0, len(Xs))]
print(f'MSE_tests_sum: {sum(MSE_forecast)}')

100%|██████████| 326/326 [04:13<00:00,  1.29it/s]


MSE_tests_sum: 127.43335201722341


[10.781441196942916,
 0.0001684893726391296,
 0.0002911987273430036,
 5.218087787838176,
 25.726581604314756,
 0.00018684363631704567,
 10.043435533453758,
 1.5220196882586878,
 13.828055747326468,
 2.981728419982079e-05,
 2.384926868531582e-06,
 0.25308258849597853,
 0.00016401978744651,
 0.8370647418304119,
 2.2159509133466475,
 9.462943914104502,
 11.870636637519219,
 0.0007079191206298792,
 32.44453636235894,
 3.2279646285774826]

In [35]:
train_preds = pd.read_csv('Results/Vector models/Vector Poisson/Poisson_train_preds.csv')
test_preds = pd.read_csv('Results/Vector models/Vector Poisson/Poisson_test_preds.csv')
test_forecasts = pd.read_csv('Results/Vector models/Vector Poisson/Poisson_test_forecasts.csv')

In [38]:
# GRAPH GBDT of ID

def graph_juice(number = None, id = None):
    number = number if number is not None else IDs.index(id)

    fig = make_subplots(rows = 3, cols = 1)

    for row, what in zip([1, 2, 3], ['train_pred', 'test_pred', 'test_forecast']):
        y_actual = y_tests[number] if what in ['test_pred', 'test_forecast'] else y_trains[number]

        if what == 'train_pred':
            y_pred = train_preds.iloc[:, number]
        elif what == 'test_pred':
            y_pred = test_preds.iloc[:, number]
        elif what == 'test_forecast':
            y_pred = test_forecasts.iloc[:, number]

        fig.add_trace(go.Scatter(x = date[y_actual.index],
                                 y = y_actual,
                                 mode = 'lines+markers',
                                 name = 'Actual counts',
                                 marker = dict(color = '#00A383', size = 3.5),
                                 line = dict(color = '#00A383', width = 1.5)),
                      row = row, col = 1)

        fig.add_trace(go.Scatter(x = date[y_actual.index],
                                 y =  y_pred,
                                 # y =  [*map(round, y_pred)],
                                 mode = 'lines',
                                 name = 'Predicted counts',
                                 line = dict(color = '#F53D65', width = 2.5)),
                      row = row, col = 1)

        fig.add_trace(go.Scatter(x = date[y_actual.index],
                                 y = tomato_discount[tomato_discount['date'].isin(date[y_actual.index])]\
                                 .loc[:, 'is_discount_' + IDs[number]].replace(0, np.NaN) *\
                                 (y_actual.max() / tomato_discount[tomato_discount['date'].isin(date[y_actual.index])]\
                                 .loc[:, 'is_discount_' + IDs[number]].max() / 1.5),
                                 mode = 'lines',
                                 name = 'Presence of discount',
                                 line = dict(color = '#B1F100', width = 5)),
                      row = row, col = 1)

    fig.update_layout(height = 1000, width = 1100,
                      title_text = f'Gradient boost of decision trees - train, test predictions, test forecast - {IDs[number]} sku',
                      showlegend = False)
    # offline.plot(fig, filename='file.html')
    fig.show()
# 6, 8, 14 !, 15 !
graph_juice(number = 14)