In [5]:
# DOWNLOADING DATA

import pandas as pd
import numpy as np
import statsmodels.api as sm
import statsmodels.formula.api as smf
import multiprocessing
from joblib import Parallel, delayed
from tqdm import tqdm
from sklearn import metrics
import pprint
import copy
import plotly.graph_objects as go
from plotly.subplots import make_subplots
from plotly import offline
import statsmodels.graphics.tsaplots as tsa
import warnings
warnings.filterwarnings("ignore")

tomato_discount = pd.read_csv('Dataframes/tomato_discount.csv')
tomato_7 = pd.read_csv('Dataframes/tomato_7.csv')

# tomato_7

In [6]:
# CALCULATING CLEAR DEMAND

IDs = [col.strip('is_discount_') for col in tomato_discount.columns if 'is_discount_' in col]
ZINB_non_price_models =[]
MSE = []

for id in IDs:
    endog = tomato_7['salesvolume_' + id]
    exog = tomato_7[['av_price_' + id, 'is_supplied_' + id]]
    ZINB_non_price_models.append(
        sm.ZeroInflatedNegativeBinomialP(endog = endog, exog = exog, exog_infl = exog, inflation = 'logit').fit(maxiter = 100))
    non_price_demand = ZINB_non_price_models[IDs.index(id)].predict(exog, exog_infl = exog)

    tomato_7['non_price_demand_' + id] = tomato_7['salesvolume_' + id] - non_price_demand

    MSE.append(metrics.mean_squared_error(tomato_7['salesvolume_' + id], non_price_demand))

print(f'TOTAL MSE: {sum(MSE)}')

Optimization terminated successfully.
         Current function value: 1.760910
         Iterations: 41
         Function evaluations: 44
         Gradient evaluations: 44
Optimization terminated successfully.
         Current function value: 1.122793
         Iterations: 38
         Function evaluations: 41
         Gradient evaluations: 41
Optimization terminated successfully.
         Current function value: 1.151392
         Iterations: 39
         Function evaluations: 42
         Gradient evaluations: 42
Optimization terminated successfully.
         Current function value: 1.024207
         Iterations: 32
         Function evaluations: 35
         Gradient evaluations: 35
Optimization terminated successfully.
         Current function value: 2.227683
         Iterations: 54
         Function evaluations: 69
         Gradient evaluations: 69
Optimization terminated successfully.
         Current function value: 1.186321
         Iterations: 52
         Function evaluations: 64
  

In [7]:
# SETTING Xs, ys, IDs

lags_on_Xs = []
Xs = []; ys = []

def sum_of_last(series, n = 7):
    ser_list = []

    for i in range(0, n):
        ser_list.append(0)

    for i in range(n, len(series)):
        ser_list.append(np.sum([j for j in series[(i - n):i]]))

    return ser_list

for id in IDs:
    tomato_7_process = tomato_7.copy()

    # adding num_other_discounts_
    tomato_7_process = tomato_7_process.merge(tomato_discount[['date', 'num_other_discounts_' + id]], on = 'date')

    # rename target columns
    tomato_7_process.rename(columns = {'salesvolume_' + id: 'target',
                                        'non_price_demand_' + id: 'tar_non_price',
                                        'av_price_' + id:'tar_price',
                                        'num_other_discounts_' + id: 'tar_other_discounts',
                                        'is_supplied_' + id: 'tar_is_supp'}, inplace = True)

    # drop salesvolume_
    to_drop = [col for col in tomato_7_process.columns if 'salesvolume_' in col]   # 'non_price_demand_'
    tomato_7_process.drop(to_drop, axis = 1, inplace = True)

    # shifting back of non_price_demand_
    work_with = [col for col in tomato_7_process.columns if 'non_price_demand_' in col]    # 'salesvolume_'
    tomato_7_process.loc[:, work_with] = tomato_7_process.loc[:, work_with].shift(-1)
    tomato_7_process = tomato_7_process.iloc[:-1, :].reset_index(drop = True)

    for col in work_with:
        tomato_7_process.rename(columns = {col: col + '_lag'}, inplace = True)

    # prices only when is supplied
    to_supp = [col for col in tomato_7_process.columns if 'av_price_' in col]
    for col in to_supp:
        tomato_7_process[col] = tomato_7_process[col] * tomato_7_process['is_supplied_' + col.replace('av_price_', '')]

    # adding lags on target variables
    lags_on_X = [[*tsa.pacf(tomato_7_process['target'])].index(sorted([*tsa.pacf(tomato_7_process['target'])])[::-1][lag]) for lag in [1, 2, 3]]
    lags_on_Xs.append(lags_on_X.copy())

    for i in lags_on_X:
        for col in ['tar_non_price', 'tar_price']:
            tomato_7_process[col + '_' + str(i)] = tomato_7_process[col].shift(i)

    tomato_7_process = tomato_7_process.iloc[10:, :].reset_index(drop = True)

    # adding week statistics
    tomato_7_process['tar_non_price_week'] = sum_of_last(tomato_7_process['tar_non_price'], n = 7)
    tomato_7_process['non_price_demand_week'] = \
        sum_of_last(tomato_7_process[[col for col in tomato_7_process.columns if 'non_price_demand_' in col]].sum(axis = 1), n = 7)
    tomato_7_process = tomato_7_process.iloc[7:, :].reset_index(drop = True)

    # changing the order
    tomato_7_process = tomato_7_process\
                            .loc[:, [c for c in tomato_7_process.columns if 'tar' in c] +
                                    [c for c in tomato_7_process.columns if 'tar' not in c]]

    # setting X and y
    Xs.append(tomato_7_process.drop(['target', 'tar_non_price', 'date'], axis = 1).copy())
    ys.append(tomato_7_process['target'].copy())

    # setting date
    if id == IDs[0]:
        date = tomato_7_process['date']

In [8]:
def timesplit(X, y, train_size = 0.85):
    n_train = int(len(y) * train_size)
    return X[:n_train], X[n_train:], y[:n_train], y[n_train:]

X_trains = []; X_tests = []; y_trains = []; y_tests = []

for i in range(0, len(Xs)):
    X_train, X_test, y_train, y_test = timesplit(Xs[i], ys[i])
    X_trains.append(X_train.copy()); X_tests.append(X_test.copy()); y_trains.append(y_train.copy()); y_tests.append(y_test.copy())

In [9]:
# FITTING ZERO INFLATED NEGATIVE BINOMIAL MODELS

# https://towardsdatascience.com/negative-binomial-regression-f99031bb25b4

def NB2_fitting_i(X_trains, y_trains, i):
    X_in_for = pd.DataFrame()

    def block(y, X):
        Poisson_reg = sm.GLM(y, X, family = sm.families.Poisson())\
            .fit()

        df_train = pd.concat([y, X], join = 'outer', axis = 1)
        df_train['BB_LAMBDA'] = Poisson_reg.mu
        df_train['AUX_OLS_DEP'] = df_train.apply(lambda x: ((x['target'] - x['BB_LAMBDA']) ** 2 - x['BB_LAMBDA']) / x['BB_LAMBDA'], axis = 1)

        aux_olsr_results = smf.ols("""AUX_OLS_DEP ~ BB_LAMBDA - 1""", df_train)\
            .fit()
        alpha = aux_olsr_results.params[0] if aux_olsr_results.params[0] > 0 else 1 / 10 ** 4

        return alpha

    for col_num in range(0, X_trains[i].shape[1]):
        X_in_for = pd.concat([X_in_for, X_trains[i].iloc[:, col_num]], axis = 1)

        for p_value in [0.5, 0.2, 0.1]:
            NB2_reg = sm.GLM(y_trains[i], X_in_for, family = sm.families.NegativeBinomial(alpha = block(y = y_trains[i], X = X_in_for)))\
            .fit()

            signif_var = pd.DataFrame([x for x in NB2_reg.summary().tables[1].data[1:] if float(x[4]) < p_value],
                                  columns = NB2_reg.summary().tables[1].data[0])

            if len([*signif_var.iloc[:, 0]]) > 0:
                X_in_for = X_in_for.loc[:, [*signif_var.iloc[:, 0]]]

    k = 1
    while k == 1:
        p_values_of_NB2 = [float(x) for x in [*signif_var.iloc[:, 4]]]
        max_p_value = max(p_values_of_NB2)

        if max_p_value > 0.00001:
            signif_vars_for_X = [*signif_var.iloc[:, 0]]
            signif_vars_for_X.pop(p_values_of_NB2.index(max_p_value))

            X_in_for = X_in_for.loc[:, signif_vars_for_X]

            NB2_reg = sm.GLM(y_trains[i], X_in_for, family = sm.families.NegativeBinomial(alpha = block(y = y_trains[i], X = X_in_for)))\
                .fit()

            signif_var = pd.DataFrame([x for x in NB2_reg.summary().tables[1].data[1:]],
                                  columns = NB2_reg.summary().tables[1].data[0])
        else: k = 0

    return [*signif_var.iloc[:, 0]]

NB2_signif_vars = Parallel(n_jobs = multiprocessing.cpu_count() - 3)\
    (delayed(NB2_fitting_i)(X_trains = X_trains, y_trains = y_trains, i = i) for i in tqdm(range(0, len(X_trains))))

for j in [14, 15]:
    NB2_signif_vars[j].remove('is_supplied_461504')
    NB2_signif_vars[j].append('tar_is_supp')

ZINBmodels = []
for i in range(0, len(Xs)):
    ZINBmodels.append(
        sm.ZeroInflatedNegativeBinomialP(endog = y_trains[i],
                               exog = X_trains[i][NB2_signif_vars[i]],
                               exog_infl = X_trains[i][NB2_signif_vars[i]],
                               inflation = 'logit').fit(maxiter = 100))

ZINB_train_preds = [ZINBmodels[i].predict(X_trains[i][NB2_signif_vars[i]],
                                          exog_infl = X_trains[i][NB2_signif_vars[i]])  for i  in range(0, len(Xs))]
ZINB_test_preds = [ZINBmodels[i].predict(X_tests[i][NB2_signif_vars[i]],
                                         exog_infl = X_tests[i][NB2_signif_vars[i]])  for i  in range(0, len(Xs))]

MSE_tests = [metrics.mean_squared_error(y_tests[i], ZINB_test_preds[i]) for i in range(0, len(Xs))]
print(f'MSE_tests_sum: {sum(MSE_tests)}')

100%|██████████| 20/20 [01:30<00:00,  4.54s/it]


         Current function value: 1.557453
         Iterations: 100
         Function evaluations: 104
         Gradient evaluations: 104
         Current function value: 1.171817
         Iterations: 100
         Function evaluations: 105
         Gradient evaluations: 105
         Current function value: nan
         Iterations: 7
         Function evaluations: 121
         Gradient evaluations: 121
         Current function value: nan
         Iterations: 4
         Function evaluations: 116
         Gradient evaluations: 116
Optimization terminated successfully.
         Current function value: 11.499844
         Iterations: 22
         Function evaluations: 23
         Gradient evaluations: 23
         Current function value: nan
         Iterations: 3
         Function evaluations: 115
         Gradient evaluations: 115
         Current function value: 1.233199
         Iterations: 100
         Function evaluations: 106
         Gradient evaluations: 106
         Current function 


overflow encountered in exp


overflow encountered in exp


invalid value encountered in multiply


overflow encountered in exp



         Current function value: nan
         Iterations: 0
         Function evaluations: 2
         Gradient evaluations: 2
         Current function value: 0.530756
         Iterations: 100
         Function evaluations: 105
         Gradient evaluations: 105
         Current function value: nan
         Iterations: 3
         Function evaluations: 115
         Gradient evaluations: 115
         Current function value: 1.166698
         Iterations: 100
         Function evaluations: 104
         Gradient evaluations: 104
         Current function value: nan
         Iterations: 8
         Function evaluations: 121
         Gradient evaluations: 121
         Current function value: nan
         Iterations: 3
         Function evaluations: 115
         Gradient evaluations: 115
         Current function value: 1.007034
         Iterations: 100
         Function evaluations: 116
         Gradient evaluations: 116
Optimization terminated successfully.
         Current function value: 0.

ValueError: Input contains NaN, infinity or a value too large for dtype('float64').