In [46]:
import pandas as pd
import numpy as np
import plotly.graph_objs as go
import plotly.express as px
import matplotlib.pyplot as plt
from sklearn.feature_selection import RFE
from sklearn.linear_model import LinearRegression
import statsmodels.formula.api as smf
import statsmodels.api as sm
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import MinMaxScaler
from sklearn.metrics import r2_score, mean_squared_error, mean_absolute_percentage_error
import seaborn as sns

In [47]:
df = pd.read_csv("ALL_DATA_COMBINED.csv")

In [48]:
df['week_start'].head(35)

0     02-01-2017
1     09-01-2017
2     16-01-2017
3     23-01-2017
4     30-01-2017
5     06-02-2017
6     13-02-2017
7     20-02-2017
8     27-02-2017
9     06-03-2017
10    13-03-2017
11    20-03-2017
12    27-03-2017
13    03-04-2017
14    10-04-2017
15    17-04-2017
16    24-04-2017
17    01-05-2017
18    08-05-2017
19    15-05-2017
20    22-05-2017
21    29-05-2017
22    05-06-2017
23    12-06-2017
24    19-06-2017
25    26-06-2017
26    03-07-2017
27    10-07-2017
28    17-07-2017
29    24-07-2017
30    31-07-2017
31    07-08-2017
32    14-08-2017
33    21-08-2017
34    28-08-2017
Name: week_start, dtype: object

In [49]:
df['week_start'] = pd.to_datetime(df['week_start'], format='%d-%m-%Y')

In [50]:
df['week_start'].head(35)

0    2017-01-02
1    2017-01-09
2    2017-01-16
3    2017-01-23
4    2017-01-30
5    2017-02-06
6    2017-02-13
7    2017-02-20
8    2017-02-27
9    2017-03-06
10   2017-03-13
11   2017-03-20
12   2017-03-27
13   2017-04-03
14   2017-04-10
15   2017-04-17
16   2017-04-24
17   2017-05-01
18   2017-05-08
19   2017-05-15
20   2017-05-22
21   2017-05-29
22   2017-06-05
23   2017-06-12
24   2017-06-19
25   2017-06-26
26   2017-07-03
27   2017-07-10
28   2017-07-17
29   2017-07-24
30   2017-07-31
31   2017-08-07
32   2017-08-14
33   2017-08-21
34   2017-08-28
Name: week_start, dtype: datetime64[ns]

# Month Level Flag

In [51]:
df['month'] = df['week_start'].dt.month

In [52]:
df_months = pd.get_dummies(df['month'],drop_first=True)

In [53]:
df = pd.concat([df,df_months],axis = 1, join='inner', ignore_index=False, keys=None)

# Week Level Flag

In [54]:
df['week_flag'] = df['week_start'].dt.strftime('%Y-W%U')

In [55]:
df_weeks = pd.get_dummies(df['week_flag'],drop_first=True)

In [57]:
df = pd.concat([df,df_weeks],axis = 1, join='inner', ignore_index=False, keys=None)

In [62]:
df['week_flag']

0      2017-W01
1      2017-W02
2      2017-W03
3      2017-W04
4      2017-W05
         ...   
177    2020-W21
178    2020-W22
179    2020-W23
180    2020-W24
181    2020-W25
Name: week_flag, Length: 182, dtype: object

# Dropping

In [12]:
df=df.drop(['week_flag','month'],axis=1)

# Columns as String

In [13]:
df.columns = df.columns.astype(str)

In [14]:
df.columns

Index(['week_start', 'digital_catchuptv_bledina_brand-equity_spends',
       'digital_dataretailers_bledina_brand-equity_spends',
       'digital_directbuying_bledina_brand-equity_spends',
       'digital_keywordtargeting_bledina_brand-equity_spends',
       'digital_nativeads_bledina_brand-equity_spends',
       'digital_programmatic_bledina_brand-equity_spends',
       'digital_sea_bledina_brand-equity_spends',
       'digital_social_bledina_brand-equity_spends',
       'press_equity_bledina_brand-equity_spends',
       ...
       '2020-W16', '2020-W17', '2020-W18', '2020-W19', '2020-W20', '2020-W21',
       '2020-W22', '2020-W23', '2020-W24', '2020-W25'],
      dtype='object', length=244)

In [15]:
df.tail()

Unnamed: 0,week_start,digital_catchuptv_bledina_brand-equity_spends,digital_dataretailers_bledina_brand-equity_spends,digital_directbuying_bledina_brand-equity_spends,digital_keywordtargeting_bledina_brand-equity_spends,digital_nativeads_bledina_brand-equity_spends,digital_programmatic_bledina_brand-equity_spends,digital_sea_bledina_brand-equity_spends,digital_social_bledina_brand-equity_spends,press_equity_bledina_brand-equity_spends,...,2020-W16,2020-W17,2020-W18,2020-W19,2020-W20,2020-W21,2020-W22,2020-W23,2020-W24,2020-W25
177,2020-05-25,0.0,0.0,0.0,6562.1435,0.0,5501.142043,2537.86,8923.720012,0.0,...,0,0,0,0,0,1,0,0,0,0
178,2020-06-01,0.0,0.0,0.0,0.031994,0.0,0.0,2739.21,1075.980003,0.0,...,0,0,0,0,0,0,1,0,0,0
179,2020-06-08,0.0,0.0,0.0,0.0,0.0,0.0,2782.0,3742.977999,0.0,...,0,0,0,0,0,0,0,1,0,0
180,2020-06-15,0.0,0.0,0.0,0.0,0.0,0.0,2622.45,4007.615998,0.0,...,0,0,0,0,0,0,0,0,1,0
181,2020-06-22,0.0,0.0,0.0,0.031994,0.0,0.0,2422.14,3086.460004,0.0,...,0,0,0,0,0,0,0,0,0,1


# X and Y initialization

In [16]:
X=df.drop(['week_start','Sales','retail_total_bledina_product_volume','retail_total_bledina_product_price','macroeconomic_total_total_total_population','macroeconomic_total_total_total_weeklycases','macroeconomic_total_total_total_unemploymentrate','crm_automatic_bledina_brand_emails'],axis=1)
y=df['Sales']

In [17]:
O = df.drop(['week_start','Sales','retail_total_bledina_product_volume','retail_total_bledina_product_price','macroeconomic_total_total_total_population'],axis=1)

In [61]:
O.head()

Unnamed: 0,digital_catchuptv_bledina_brand-equity_spends,digital_dataretailers_bledina_brand-equity_spends,digital_directbuying_bledina_brand-equity_spends,digital_keywordtargeting_bledina_brand-equity_spends,digital_nativeads_bledina_brand-equity_spends,digital_programmatic_bledina_brand-equity_spends,digital_sea_bledina_brand-equity_spends,digital_social_bledina_brand-equity_spends,press_equity_bledina_brand-equity_spends,tv_equity_bledina_brand-equity_spends,...,2020-W16,2020-W17,2020-W18,2020-W19,2020-W20,2020-W21,2020-W22,2020-W23,2020-W24,2020-W25
0,0.01,0.0,0.01,0.0,2367.4,0.06,16089.49,0.01,0.0,2369.39,...,0,0,0,0,0,0,0,0,0,0
1,0.0,0.0,0.01,0.0,2785.65,0.05,11618.44,10034.99,0.0,2470.6,...,0,0,0,0,0,0,0,0,0,0
2,0.0,0.0,0.01,0.0,2708.31,0.08,10388.53,6955.56,0.0,2470.6,...,0,0,0,0,0,0,0,0,0,0
3,0.0,0.0,0.0,0.0,2473.94,0.08,9699.73,4630.01,0.0,2470.6,...,0,0,0,0,0,0,0,0,0,0
4,0.0,0.0,0.01,0.0,618.99,0.01,6182.01,2911.88,0.0,2463.25,...,0,0,0,0,0,0,0,0,0,0


# Scaling - MINMAX transfrom

In [18]:
X.columns = X.columns.astype(str)

In [19]:
X

Unnamed: 0,digital_catchuptv_bledina_brand-equity_spends,digital_dataretailers_bledina_brand-equity_spends,digital_directbuying_bledina_brand-equity_spends,digital_keywordtargeting_bledina_brand-equity_spends,digital_nativeads_bledina_brand-equity_spends,digital_programmatic_bledina_brand-equity_spends,digital_sea_bledina_brand-equity_spends,digital_social_bledina_brand-equity_spends,press_equity_bledina_brand-equity_spends,tv_equity_bledina_brand-equity_spends,...,2020-W16,2020-W17,2020-W18,2020-W19,2020-W20,2020-W21,2020-W22,2020-W23,2020-W24,2020-W25
0,0.01,0.0,0.01,0.000000,2367.40,0.060000,16089.49,0.010000,0.0,2369.390000,...,0,0,0,0,0,0,0,0,0,0
1,0.00,0.0,0.01,0.000000,2785.65,0.050000,11618.44,10034.990000,0.0,2470.600000,...,0,0,0,0,0,0,0,0,0,0
2,0.00,0.0,0.01,0.000000,2708.31,0.080000,10388.53,6955.560000,0.0,2470.600000,...,0,0,0,0,0,0,0,0,0,0
3,0.00,0.0,0.00,0.000000,2473.94,0.080000,9699.73,4630.010000,0.0,2470.600000,...,0,0,0,0,0,0,0,0,0,0
4,0.00,0.0,0.01,0.000000,618.99,0.010000,6182.01,2911.880000,0.0,2463.250000,...,0,0,0,0,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
177,0.00,0.0,0.00,6562.143500,0.00,5501.142043,2537.86,8923.720012,0.0,2777.054278,...,0,0,0,0,0,1,0,0,0,0
178,0.00,0.0,0.00,0.031994,0.00,0.000000,2739.21,1075.980003,0.0,2347.751201,...,0,0,0,0,0,0,1,0,0,0
179,0.00,0.0,0.00,0.000000,0.00,0.000000,2782.00,3742.977999,0.0,3132.570888,...,0,0,0,0,0,0,0,1,0,0
180,0.00,0.0,0.00,0.000000,0.00,0.000000,2622.45,4007.615998,0.0,3132.570888,...,0,0,0,0,0,0,0,0,1,0


In [20]:
X.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 182 entries, 0 to 181
Columns: 236 entries, digital_catchuptv_bledina_brand-equity_spends to 2020-W25
dtypes: float64(38), int64(6), uint8(192)
memory usage: 96.8 KB


In [21]:
scaler = MinMaxScaler()

In [22]:
scaler.fit(X)

In [23]:
X=scaler.transform(X)

In [24]:
X = pd.DataFrame(X)

In [25]:
X

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,226,227,228,229,230,231,232,233,234,235
0,8.915386e-08,0.0,5.752158e-07,0.000000,0.150990,6.946401e-07,1.000000,2.314267e-07,0.0,0.008477,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,0.000000e+00,0.0,5.752158e-07,0.000000,0.177665,5.788667e-07,0.722114,2.322364e-01,0.0,0.008839,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,0.000000e+00,0.0,5.752158e-07,0.000000,0.172732,9.261868e-07,0.645672,1.609702e-01,0.0,0.008839,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,0.000000e+00,0.0,0.000000e+00,0.000000,0.157785,9.261868e-07,0.602861,1.071508e-01,0.0,0.008839,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,0.000000e+00,0.0,5.752158e-07,0.000000,0.039478,1.157733e-07,0.384227,6.738867e-02,0.0,0.008813,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
177,0.000000e+00,0.0,0.000000e+00,0.385262,0.000000,6.368856e-02,0.157734,2.065187e-01,0.0,0.009935,...,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0
178,0.000000e+00,0.0,0.000000e+00,0.000002,0.000000,0.000000e+00,0.170248,2.490105e-02,0.0,0.008400,...,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0
179,0.000000e+00,0.0,0.000000e+00,0.000000,0.000000,0.000000e+00,0.172908,8.662250e-02,0.0,0.011207,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0
180,0.000000e+00,0.0,0.000000e+00,0.000000,0.000000,0.000000e+00,0.162991,9.274693e-02,0.0,0.011207,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0


# Test-Train Split

In [26]:
X_train,X_test,y_train,y_test=train_test_split(X,y,test_size=0.3, random_state=42)

In [27]:
# Create a linear regression model to use for feature selection
lr = LinearRegression()
lr.fit(X_train,y_train)

# RFE features

In [28]:
rfe = RFE(estimator=lr,n_features_to_select=17, step=1)

rfe.fit(X_train,y_train)

# Get the list of selected features
selected_features = X.columns[rfe.support_]
selected_df = pd.DataFrame({'Selected Features': selected_features})

# Print the dataframe as a table
selected_df

Unnamed: 0,Selected Features
0,20
1,40
2,41
3,42
4,57
5,94
6,106
7,108
8,113
9,115


In [29]:
selected_features

Int64Index([20, 40, 41, 42, 57, 94, 106, 108, 113, 115, 153, 158, 167, 195,
            205, 220, 221],
           dtype='int64')

In [30]:
n = selected_features.tolist()

In [31]:
n

[20, 40, 41, 42, 57, 94, 106, 108, 113, 115, 153, 158, 167, 195, 205, 220, 221]

df[n].corr()

sns.heatmap(df[n].corr())

In [32]:
list(zip(X_train.columns,rfe.support_,rfe.ranking_))

[(0, False, 47),
 (1, False, 116),
 (2, False, 150),
 (3, False, 148),
 (4, False, 30),
 (5, False, 151),
 (6, False, 2),
 (7, False, 20),
 (8, False, 69),
 (9, False, 92),
 (10, False, 102),
 (11, False, 145),
 (12, False, 77),
 (13, False, 127),
 (14, False, 121),
 (15, False, 115),
 (16, False, 117),
 (17, False, 126),
 (18, False, 164),
 (19, False, 41),
 (20, True, 1),
 (21, False, 105),
 (22, False, 144),
 (23, False, 153),
 (24, False, 124),
 (25, False, 100),
 (26, False, 75),
 (27, False, 36),
 (28, False, 125),
 (29, False, 140),
 (30, False, 9),
 (31, False, 26),
 (32, False, 122),
 (33, False, 85),
 (34, False, 158),
 (35, False, 147),
 (36, False, 40),
 (37, False, 157),
 (38, False, 152),
 (39, False, 98),
 (40, True, 1),
 (41, True, 1),
 (42, True, 1),
 (43, False, 6),
 (44, False, 95),
 (45, False, 62),
 (46, False, 161),
 (47, False, 86),
 (48, False, 38),
 (49, False, 146),
 (50, False, 136),
 (51, False, 53),
 (52, False, 49),
 (53, False, 129),
 (54, False, 96),
 (5

In [33]:
n = selected_features.tolist()

In [34]:
n

[20, 40, 41, 42, 57, 94, 106, 108, 113, 115, 153, 158, 167, 195, 205, 220, 221]

In [35]:
X_train.columns

RangeIndex(start=0, stop=236, step=1)

In [36]:
X_train = X_train[selected_features]

# Adding Values

In [37]:
X_train['constant']=1

In [38]:
X_train

Unnamed: 0,20,40,41,42,57,94,106,108,113,115,153,158,167,195,205,220,221,constant
162,0.032040,0.010259,0.217879,0.377507,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1
153,0.081744,0.089406,0.344762,0.471900,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1
109,0.000000,0.015221,0.449354,0.314327,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1
148,0.009855,0.108650,0.375487,0.516700,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1
26,0.039510,0.065596,0.713173,0.355736,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
106,0.000000,0.213791,0.268399,0.437180,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1
14,0.436421,0.044335,0.882260,0.343450,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1
92,0.000000,0.032033,0.356330,0.242820,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1
179,0.046458,0.008841,0.284303,0.631634,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1


In [39]:
y_train = pd.DataFrame(y_train)

In [40]:
y_train

Unnamed: 0,Sales
162,4677875.0
153,4913048.0
109,5263122.5
148,5511442.5
26,6100135.0
...,...
106,5316150.0
14,5428030.0
92,5004340.0
179,5183900.0


In [41]:
model = sm.OLS(y_train, X_train).fit()

In [42]:
print(model.summary())

                            OLS Regression Results                            
Dep. Variable:                  Sales   R-squared:                       0.728
Model:                            OLS   Adj. R-squared:                  0.685
Method:                 Least Squares   F-statistic:                     17.13
Date:                Fri, 21 Apr 2023   Prob (F-statistic):           2.09e-23
Time:                        18:48:06   Log-Likelihood:                -1762.5
No. Observations:                 127   AIC:                             3561.
Df Residuals:                     109   BIC:                             3612.
Df Model:                          17                                         
Covariance Type:            nonrobust                                         
                 coef    std err          t      P>|t|      [0.025      0.975]
------------------------------------------------------------------------------
20         -6.933e+05   1.72e+05     -4.025      0.0

In [43]:
X_train

Unnamed: 0,20,40,41,42,57,94,106,108,113,115,153,158,167,195,205,220,221,constant
162,0.032040,0.010259,0.217879,0.377507,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1
153,0.081744,0.089406,0.344762,0.471900,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1
109,0.000000,0.015221,0.449354,0.314327,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1
148,0.009855,0.108650,0.375487,0.516700,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1
26,0.039510,0.065596,0.713173,0.355736,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
106,0.000000,0.213791,0.268399,0.437180,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1
14,0.436421,0.044335,0.882260,0.343450,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1
92,0.000000,0.032033,0.356330,0.242820,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1
179,0.046458,0.008841,0.284303,0.631634,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1


In [44]:
O.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 182 entries, 0 to 181
Columns: 239 entries, digital_catchuptv_bledina_brand-equity_spends to 2020-W25
dtypes: float64(40), int64(7), uint8(192)
memory usage: 101.1 KB


In [45]:
import statsmodels.api as sm

# Get the selected features from the previous step
X_selected = X_train
# Calculate the VIF for each feature
vif = pd.DataFrame()
vif["Features"] = X_selected.columns
vif["VIF Factor"] = [sm.OLS(X_selected[col], X_selected.drop(col, axis=1)).fit().rsquared for col in X_selected.columns]

# Print the VIF for each feature
print(vif)

    Features  VIF Factor
0         20    0.316305
1         40    0.363466
2         41    0.333146
3         42    0.298724
4         57    0.064327
5         94    0.104071
6        106    0.018152
7        108    0.131954
8        113    0.169422
9        115    0.054370
10       153    0.015920
11       158    0.021366
12       167    0.040244
13       195    0.025114
14       205    0.020590
15       220    0.136185
16       221    0.049438
17  constant    0.953071
