https://www.statsmodels.org/stable/generated/statsmodels.genmod.families.family.NegativeBinomial.html?highlight=negative#statsmodels.genmod.families.family.NegativeBinomial

In [3]:
# DOWNLOADING DATA

import warnings
warnings.filterwarnings("ignore")

import pandas as pd
import numpy as np

tomato_discount = pd.read_csv('Dataframes/tomato_discount.csv')
tomato_12 = pd.read_csv('Dataframes/tomato_12.csv') # or index_col = 0 instead of index = False in 'qjuices'

# tomato_12.drop([col for col in tomato_12.columns if
#                 'holidays_' in col or
#                 'month_' in col or
#                 'salesvolume_' in col and 'salesvolume_201676' not in col], axis = 1, inplace = True)

# tomato_12

In [5]:
# SETTING DATA TO TARGET

target_sku = '201676'

sales_names_to_shift = [col for col in tomato_12.columns if 'salesvolume_' in col and target_sku not in col]

tomato_12.loc[:, sales_names_to_shift] = tomato_12.loc[:, sales_names_to_shift].shift(1)
tomato_12 = tomato_12.iloc[1:, :]

for colname in sales_names_to_shift:
    tomato_12.rename(columns = {colname: colname + '_lag'}, inplace = True)

tomato_12.rename(columns = {'salesvolume_' + target_sku: 'target'}, inplace = True) # 'av_price_' + target_sku: 'price_of_target'
tomato_12.reset_index(drop = True, inplace = True)

print('Corr between target and it\'s price: {}'.format(round(tomato_12.corr().loc['target', 'av_price_' + target_sku], 4)))

# tomato_12

Corr between target and it's price: -0.4642


In [6]:
# ADDING more LAGS

names_to_shift = [col for col in tomato_12.columns if 'salesvolume_' in col] + \
                 [col for col in tomato_12.columns if 'av_price_' in col] + \
                 [col for col in tomato_12.columns if 'sales_week_' in col] + \
                 ['target']
#                 [col for col in tomato_12.columns if 'is_supplied_' in col] + \
for i in [1, 2, 3, 7, 14]:
    for col in names_to_shift:
        tomato_12[col + '_' + str(i)] = tomato_12[col].shift(i)

tomato_12 = tomato_12[14:]

tomato_12.reset_index(drop = True, inplace = True)

# tomato_12

In [26]:
# # GETTING RID OF DEPENDENT COLUMNS
#
# # https://stackoverflow.com/questions/44555763/is-there-a-way-to-check-for-linearly-dependent-columns-in-a-dataframe
#
# import sympy
#
# reduced_form, inds = sympy.Matrix(tomato_12.drop(['target', 'date'], axis = 1)).rref()
#
# tomato_12.drop(['target', 'date'], axis = 1).iloc[:, [*inds]]\
#     .to_csv('Dataframes/X for Poisson-based regressions.csv', index = False)

In [7]:
set(tomato_12.drop(['target', 'date'], axis = 1).columns) - set(pd.read_csv('Dataframes/X for Poisson-based regressions.csv'))

set()

In [4]:
# GETTING RID OF HIGHLY CORRELATED COLUMNS

# https://www.projectpro.io/recipes/drop-out-highly-correlated-features-in-python

X_preprocessing = pd.read_csv('Dataframes/X for Poisson-based regressions.csv')

cor_matrix = X_preprocessing.corr().abs()
upper_tri = cor_matrix.where(np.triu(np.ones(cor_matrix.shape), k = 1).astype(np.bool))
to_drop = [column for column in upper_tri.columns if any(upper_tri[column] > 0.95)]

X = X_preprocessing.drop(to_drop, axis = 1)

In [5]:
X

Unnamed: 0,salesvolume_24089_lag,salesvolume_46135_lag,salesvolume_46902_lag,salesvolume_59042_lag,salesvolume_75320_lag,salesvolume_83524_lag,salesvolume_362058_lag,salesvolume_415514_lag,salesvolume_415824_lag,salesvolume_419020_lag,...,sales_week_471503_14,sales_week_472150_14,sales_week_481677_14,sales_week_483692_14,sales_week_485033_14,sales_week_487776_14,sales_week_489135_14,sales_week_495154_14,sales_week_600761_14,target_14
0,0.0,0.0,1.0,1.0,1.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,0.0,0.0,0.0,2.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,0.0,0.0,1.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2163,0.0,1.0,0.0,0.0,2.0,0.0,0.0,6.0,0.0,0.0,...,2.0,0.0,0.0,0.0,0.0,0.0,3.0,0.0,0.0,16.0
2164,0.0,0.0,0.0,0.0,2.0,0.0,0.0,8.0,0.0,0.0,...,2.0,0.0,0.0,0.0,0.0,0.0,3.0,0.0,0.0,13.0
2165,1.0,0.0,0.0,0.0,3.0,0.0,0.0,3.0,0.0,0.0,...,2.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,1.0,15.0
2166,0.0,0.0,0.0,0.0,2.0,0.0,0.0,8.0,0.0,0.0,...,1.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,2.0,7.0


In [9]:
# SETTING X, y

date = tomato_12['date']
y = tomato_12['target']

def timesplit(X, y, train_size = 0.8):
    n = len(y)
    n_train = int(n * train_size)
    X_train, X_test = X[:n_train], X[n_train:]
    y_train, y_test = y[:n_train], y[n_train:]
    return X_train, X_test, y_train, y_test

X_train, X_test, y_train, y_test = timesplit(X, y)

In [10]:
print('variance = ' + str(y_train.var()))
print('mean = ' + str(y_train.mean()))

variance = 6.814139319060391
mean = 1.4956772334293948


In [27]:
# https://towardsdatascience.com/negative-binomial-regression-f99031bb25b4

import statsmodels.api as sm
import statsmodels.formula.api as smf

plus = 1
range_col_num = [[d for d in range(r, r + plus)] for r in range(0, len(X.columns) - plus, plus)]
range_col_num.append([r for r in range([*range(0, len(X.columns) - 1, plus)][-1], len(X.columns))])

X_in_for = pd.DataFrame()

for col_num in range_col_num:
    X_in_for = pd.concat([X_in_for, X.iloc[:, col_num]], axis = 1)
    X_train, X_test, y_train, y_test = timesplit(X_in_for, y)

    for p_value in [0.0001]:   # [0.02, 0.01, 0.005, 0.002, 0.001, 0.0001]
        Poisson_reg = sm.GLM(y_train, X_train, family = sm.families.Poisson())\
            .fit()

        df_train = pd.concat([y_train, X_train], join = 'outer', axis = 1)

        df_train['BB_LAMBDA'] = Poisson_reg.mu
        df_train['AUX_OLS_DEP'] = df_train.apply(lambda x: ((x['target'] - x['BB_LAMBDA']) ** 2 - x['BB_LAMBDA']) / x['BB_LAMBDA'], axis=1)

        aux_olsr_results = smf.ols("""AUX_OLS_DEP ~ BB_LAMBDA - 1""", df_train)\
            .fit()

        NB2_reg = sm.GLM(y_train, X_train, family = sm.families.NegativeBinomial(alpha = aux_olsr_results.params[0]))\
            .fit()

        signif_var = pd.DataFrame([x for x in NB2_reg.summary().tables[1].data[1:] if float(x[4]) < p_value],
                                  columns = NB2_reg.summary().tables[1].data[0])

        if len([*signif_var.iloc[:, 0]]) > 0:
            X_in_for = X_in_for.loc[:, [*signif_var.iloc[:, 0]]]
            X_train, X_test, y_train, y_test = timesplit(X_in_for, y)

        if p_value == 0.0001:
            print(str(p_value) +' : ' + NB2_reg.summary().tables[0].data[4][2:][0] +
                  NB2_reg.summary().tables[0].data[4][2:][1] + '; len of X: ' + str(len(X_train.columns)))

NB2_reg = sm.GLM(y_train, X_train, family = sm.families.NegativeBinomial(alpha = aux_olsr_results.params[0]))\
            .fit()
print(NB2_reg.summary())

NB2_train_pred = NB2_reg.get_prediction(X_train).summary_frame()['mean']
NB2_test_pred = NB2_reg.get_prediction(X_test).summary_frame()['mean']

0.0001 :   Log-Likelihood:     -3123.3; len of X: 1
0.0001 :   Log-Likelihood:     -3067.7; len of X: 2
0.0001 :   Log-Likelihood:     -3057.1; len of X: 2
0.0001 :   Log-Likelihood:     -3062.2; len of X: 2
0.0001 :   Log-Likelihood:     -3001.2; len of X: 2
0.0001 :   Log-Likelihood:     -3016.2; len of X: 2
0.0001 :   Log-Likelihood:     -3033.3; len of X: 2
0.0001 :   Log-Likelihood:     -3013.6; len of X: 2
0.0001 :   Log-Likelihood:     -3000.0; len of X: 2
0.0001 :   Log-Likelihood:     -3021.5; len of X: 2
0.0001 :   Log-Likelihood:     -3013.1; len of X: 2
0.0001 :   Log-Likelihood:     -2948.1; len of X: 3
0.0001 :   Log-Likelihood:     -2934.2; len of X: 3
0.0001 :   Log-Likelihood:     -2947.7; len of X: 3
0.0001 :   Log-Likelihood:     -2939.3; len of X: 3
0.0001 :   Log-Likelihood:     -2930.5; len of X: 4
0.0001 :   Log-Likelihood:     -2918.2; len of X: 2
0.0001 :   Log-Likelihood:     -2976.4; len of X: 2
0.0001 :   Log-Likelihood:     -2984.2; len of X: 2
0.0001 :   L

In [29]:
# NEGATIVE BINOMIAL 2

import plotly.graph_objects as go

def graph_juice(x, y_actual, y_pred):
    fig = go.Figure()
    fig.add_trace(go.Scatter(x = x,
                             y = y_actual,
                             mode = 'lines+markers',
                             name = 'Actual counts',
                             marker = dict(color = '#00A383', size = 3.5),
                             line = dict(color = '#00A383', width = 1.5)))

    fig.add_trace(go.Scatter(x = x,
                             y =  y_pred,
                             # y =  [*map(round, y_pred)],
                             mode = 'lines',
                             name = 'Predicted counts',
                             line = dict(color = '#F53D65', width = 2.5)
                             ))

    fig.add_trace(go.Scatter(x = x,
                             y = tomato_discount[tomato_discount['date'].isin(x)]\
                             .loc[:, 'is_discount_201676'] * 50,
                             mode = 'lines',
                             name = 'Presence of discount',
                             line = dict(color = '#B1F100', width = 2.5, dash = 'dot')  # 'dash'
                             ))

    fig.show()

graph_juice(x = tomato_12.loc[X_test.index, 'date'], y_actual = y_test, y_pred = NB2_test_pred)

In [97]:
tomato_12[['av_price_201676', 'date']]

Unnamed: 0,av_price_201676,date
0,22.940697,2009-01-23
1,22.940697,2009-01-24
2,22.940697,2009-01-25
3,22.940697,2009-01-26
4,22.940697,2009-01-27
...,...,...
2163,20.213802,2014-12-26
2164,20.213802,2014-12-27
2165,20.213802,2014-12-28
2166,20.213802,2014-12-29


In [11]:
pd.DataFrame({'date': date[y_train.index], 'predicted': round(NB2_train_pred.summary_frame()['mean']), 'real': y_train})\
    .to_csv('Results/NegBin 2/NegBin2 train.csv', index = False)

pd.DataFrame({'date': date[y_test.index], 'predicted': round(NB2_test_pred.summary_frame()['mean']), 'real': y_test})\
    .to_csv('Results/NegBin 2/NegBin2 test.csv', index = False)