In [1]:
import pandas as pd
import numpy as np
import plotly.graph_objs as go
import plotly.express as px
import matplotlib.pyplot as plt
from sklearn.feature_selection import RFE
from sklearn.linear_model import LinearRegression
import statsmodels.formula.api as smf
import statsmodels.api as sm
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import MinMaxScaler
from sklearn.metrics import r2_score, mean_squared_error, mean_absolute_percentage_error
import seaborn as sns

In [2]:
df = pd.read_csv("ALL_DATA_COMBINED.csv")

# Month Level Flag

In [3]:
df['week_start'] = pd.to_datetime(df['week_start'], format='%d-%m-%Y')

In [4]:
df['month'] = df['week_start'].dt.month

In [5]:
df_months = pd.get_dummies(df['month'],drop_first=True)

In [6]:
df = pd.concat([df,df_months],axis = 1, join='inner', ignore_index=False, keys=None)

In [7]:
df.columns = df.columns.astype(str)

In [8]:
df.columns

Index(['week_start', 'digital_catchuptv_bledina_brand-equity_spends',
       'digital_dataretailers_bledina_brand-equity_spends',
       'digital_directbuying_bledina_brand-equity_spends',
       'digital_keywordtargeting_bledina_brand-equity_spends',
       'digital_nativeads_bledina_brand-equity_spends',
       'digital_programmatic_bledina_brand-equity_spends',
       'digital_sea_bledina_brand-equity_spends',
       'digital_social_bledina_brand-equity_spends',
       'press_equity_bledina_brand-equity_spends',
       'tv_equity_bledina_brand-equity_spends',
       'digital_dataretailers_bledina_brand-range_spends',
       'digital_directbuying_bledina_brand-range_spends',
       'digital_keywordtargeting_bledina_brand-range_spends',
       'digital_programmatic_bledina_brand-range_spends',
       'digital_sea_bledina_brand-range_spends',
       'digital_social_bledina_brand-range_spends',
       'press_product_bledina_brand-range_spends',
       'app_total_bledina_brand_visits', '

In [9]:
df.head(6)

Unnamed: 0,week_start,digital_catchuptv_bledina_brand-equity_spends,digital_dataretailers_bledina_brand-equity_spends,digital_directbuying_bledina_brand-equity_spends,digital_keywordtargeting_bledina_brand-equity_spends,digital_nativeads_bledina_brand-equity_spends,digital_programmatic_bledina_brand-equity_spends,digital_sea_bledina_brand-equity_spends,digital_social_bledina_brand-equity_spends,press_equity_bledina_brand-equity_spends,...,3,4,5,6,7,8,9,10,11,12
0,2017-01-02,0.01,0.0,0.01,0.0,2367.4,0.06,16089.49,0.01,0.0,...,0,0,0,0,0,0,0,0,0,0
1,2017-01-09,0.0,0.0,0.01,0.0,2785.65,0.05,11618.44,10034.99,0.0,...,0,0,0,0,0,0,0,0,0,0
2,2017-01-16,0.0,0.0,0.01,0.0,2708.31,0.08,10388.53,6955.56,0.0,...,0,0,0,0,0,0,0,0,0,0
3,2017-01-23,0.0,0.0,0.0,0.0,2473.94,0.08,9699.73,4630.01,0.0,...,0,0,0,0,0,0,0,0,0,0
4,2017-01-30,0.0,0.0,0.01,0.0,618.99,0.01,6182.01,2911.88,0.0,...,0,0,0,0,0,0,0,0,0,0
5,2017-02-06,319.05,0.0,0.01,0.0,4.19,0.01,3034.42,605.48,0.0,...,0,0,0,0,0,0,0,0,0,0


# X and Y initialization

In [10]:
X=df.drop(['week_start','Sales','retail_total_bledina_product_volume','retail_total_bledina_product_price','month','macroeconomic_total_total_total_population','crm_automatic_bledina_brand_emails'],axis=1)
y=df['Sales']

In [11]:
O = df.drop(['week_start','Sales','retail_total_bledina_product_volume','retail_total_bledina_product_price','month','macroeconomic_total_total_total_population'],axis=1)

# Scaling - MINMAX transfrom

In [12]:
X.columns = X.columns.astype(str)

In [13]:
X

Unnamed: 0,digital_catchuptv_bledina_brand-equity_spends,digital_dataretailers_bledina_brand-equity_spends,digital_directbuying_bledina_brand-equity_spends,digital_keywordtargeting_bledina_brand-equity_spends,digital_nativeads_bledina_brand-equity_spends,digital_programmatic_bledina_brand-equity_spends,digital_sea_bledina_brand-equity_spends,digital_social_bledina_brand-equity_spends,press_equity_bledina_brand-equity_spends,tv_equity_bledina_brand-equity_spends,...,3,4,5,6,7,8,9,10,11,12
0,0.01,0.0,0.01,0.000000,2367.40,0.060000,16089.49,0.010000,0.0,2369.390000,...,0,0,0,0,0,0,0,0,0,0
1,0.00,0.0,0.01,0.000000,2785.65,0.050000,11618.44,10034.990000,0.0,2470.600000,...,0,0,0,0,0,0,0,0,0,0
2,0.00,0.0,0.01,0.000000,2708.31,0.080000,10388.53,6955.560000,0.0,2470.600000,...,0,0,0,0,0,0,0,0,0,0
3,0.00,0.0,0.00,0.000000,2473.94,0.080000,9699.73,4630.010000,0.0,2470.600000,...,0,0,0,0,0,0,0,0,0,0
4,0.00,0.0,0.01,0.000000,618.99,0.010000,6182.01,2911.880000,0.0,2463.250000,...,0,0,0,0,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
177,0.00,0.0,0.00,6562.143500,0.00,5501.142043,2537.86,8923.720012,0.0,2777.054278,...,0,0,1,0,0,0,0,0,0,0
178,0.00,0.0,0.00,0.031994,0.00,0.000000,2739.21,1075.980003,0.0,2347.751201,...,0,0,0,1,0,0,0,0,0,0
179,0.00,0.0,0.00,0.000000,0.00,0.000000,2782.00,3742.977999,0.0,3132.570888,...,0,0,0,1,0,0,0,0,0,0
180,0.00,0.0,0.00,0.000000,0.00,0.000000,2622.45,4007.615998,0.0,3132.570888,...,0,0,0,1,0,0,0,0,0,0


In [14]:
X.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 182 entries, 0 to 181
Data columns (total 57 columns):
 #   Column                                                Non-Null Count  Dtype  
---  ------                                                --------------  -----  
 0   digital_catchuptv_bledina_brand-equity_spends         182 non-null    float64
 1   digital_dataretailers_bledina_brand-equity_spends     182 non-null    float64
 2   digital_directbuying_bledina_brand-equity_spends      182 non-null    float64
 3   digital_keywordtargeting_bledina_brand-equity_spends  182 non-null    float64
 4   digital_nativeads_bledina_brand-equity_spends         182 non-null    float64
 5   digital_programmatic_bledina_brand-equity_spends      182 non-null    float64
 6   digital_sea_bledina_brand-equity_spends               182 non-null    float64
 7   digital_social_bledina_brand-equity_spends            182 non-null    float64
 8   press_equity_bledina_brand-equity_spends              182 no

In [15]:
scaler = MinMaxScaler()

In [16]:
scaler.fit(X)

In [17]:
X=scaler.transform(X)

In [18]:
X = pd.DataFrame(X)

In [19]:
X

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,47,48,49,50,51,52,53,54,55,56
0,8.915386e-08,0.0,5.752158e-07,0.000000,0.150990,6.946401e-07,1.000000,2.314267e-07,0.0,0.008477,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,0.000000e+00,0.0,5.752158e-07,0.000000,0.177665,5.788667e-07,0.722114,2.322364e-01,0.0,0.008839,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,0.000000e+00,0.0,5.752158e-07,0.000000,0.172732,9.261868e-07,0.645672,1.609702e-01,0.0,0.008839,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,0.000000e+00,0.0,0.000000e+00,0.000000,0.157785,9.261868e-07,0.602861,1.071508e-01,0.0,0.008839,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,0.000000e+00,0.0,5.752158e-07,0.000000,0.039478,1.157733e-07,0.384227,6.738867e-02,0.0,0.008813,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
177,0.000000e+00,0.0,0.000000e+00,0.385262,0.000000,6.368856e-02,0.157734,2.065187e-01,0.0,0.009935,...,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
178,0.000000e+00,0.0,0.000000e+00,0.000002,0.000000,0.000000e+00,0.170248,2.490105e-02,0.0,0.008400,...,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0
179,0.000000e+00,0.0,0.000000e+00,0.000000,0.000000,0.000000e+00,0.172908,8.662250e-02,0.0,0.011207,...,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0
180,0.000000e+00,0.0,0.000000e+00,0.000000,0.000000,0.000000e+00,0.162991,9.274693e-02,0.0,0.011207,...,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0


# Test-Train Split

In [20]:
X_train,X_test,y_train,y_test=train_test_split(X,y,test_size=0.3, random_state=42)

In [21]:
# Create a linear regression model to use for feature selection
lr = LinearRegression()
lr.fit(X_train,y_train)

# RFE features

In [22]:
rfe = RFE(estimator=lr,n_features_to_select=17, step=1)

rfe.fit(X_train,y_train)

# Get the list of selected features
selected_features = X.columns[rfe.support_]
selected_df = pd.DataFrame({'Selected Features': selected_features})

# Print the dataframe as a table
selected_df

Unnamed: 0,Selected Features
0,0
1,8
2,15
3,20
4,24
5,30
6,31
7,32
8,34
9,37


In [23]:
selected_features

Int64Index([0, 8, 15, 20, 24, 30, 31, 32, 34, 37, 42, 43, 44, 45, 47, 48, 54], dtype='int64')

In [24]:
n = selected_features.tolist()

In [25]:
n

[0, 8, 15, 20, 24, 30, 31, 32, 34, 37, 42, 43, 44, 45, 47, 48, 54]

df[n].corr()

sns.heatmap(df[n].corr())

In [26]:
list(zip(X_train.columns,rfe.support_,rfe.ranking_))

[(0, True, 1),
 (1, False, 22),
 (2, False, 33),
 (3, False, 16),
 (4, False, 23),
 (5, False, 32),
 (6, False, 18),
 (7, False, 29),
 (8, True, 1),
 (9, False, 6),
 (10, False, 11),
 (11, False, 26),
 (12, False, 28),
 (13, False, 30),
 (14, False, 39),
 (15, True, 1),
 (16, False, 40),
 (17, False, 35),
 (18, False, 36),
 (19, False, 2),
 (20, True, 1),
 (21, False, 10),
 (22, False, 41),
 (23, False, 25),
 (24, True, 1),
 (25, False, 12),
 (26, False, 8),
 (27, False, 34),
 (28, False, 21),
 (29, False, 37),
 (30, True, 1),
 (31, True, 1),
 (32, True, 1),
 (33, False, 31),
 (34, True, 1),
 (35, False, 20),
 (36, False, 4),
 (37, True, 1),
 (38, False, 19),
 (39, False, 24),
 (40, False, 38),
 (41, False, 9),
 (42, True, 1),
 (43, True, 1),
 (44, True, 1),
 (45, True, 1),
 (46, False, 13),
 (47, True, 1),
 (48, True, 1),
 (49, False, 17),
 (50, False, 27),
 (51, False, 15),
 (52, False, 14),
 (53, False, 7),
 (54, True, 1),
 (55, False, 5),
 (56, False, 3)]

In [27]:
n = selected_features.tolist()

In [28]:
#n.remove('retail_total_bledina_product_price')

In [29]:
#n.remove('retail_total_bledina_product_dvm')

In [30]:
#n.remove('competition_retail_competition_nonorganic_dvm')

In [31]:
#n.remove('competition_retail_competition_nonorganic_price')

In [32]:
# n.remove('macroeconomic_total_total_total_cci')

In [33]:
#n.remove('macroeconomic_total_total_total_unemploymentrate')

In [34]:
n

[0, 8, 15, 20, 24, 30, 31, 32, 34, 37, 42, 43, 44, 45, 47, 48, 54]

In [35]:
X_train = X_train[selected_features]

# Adding Values

In [36]:
X_train['constant']=1

In [37]:
X_train

Unnamed: 0,0,8,15,20,24,30,31,32,34,37,42,43,44,45,47,48,54,constant
162,0.000000,0.000000,0.0,0.032040,0.000000,0.120573,0.0,0.000000,0.000160,0.000000,0.010259,0.217879,0.377507,0.653051,0.0,0.0,0.0,1
153,0.000000,0.000000,0.0,0.081744,0.000000,0.595328,0.0,0.117647,0.000000,0.000000,0.089406,0.344762,0.471900,0.827224,0.0,0.0,0.0,1
109,0.021377,0.000000,0.0,0.000000,0.000000,0.000000,0.0,0.470588,0.000000,0.791349,0.015221,0.449354,0.314327,0.627596,0.0,0.0,0.0,1
148,0.000000,0.000000,0.0,0.009855,0.000000,0.512434,0.0,0.117647,0.000000,0.000000,0.108650,0.375487,0.516700,0.844931,0.0,0.0,0.0,1
26,0.000000,0.393939,0.0,0.039510,0.000000,1.000000,0.0,0.941176,0.000000,0.000000,0.065596,0.713173,0.355736,0.668084,0.0,0.0,0.0,1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
106,0.021315,0.000000,0.0,0.000000,0.000000,0.602864,0.0,0.470588,0.000000,0.801337,0.213791,0.268399,0.437180,0.496982,0.0,0.0,0.0,1
14,0.066228,0.000000,0.0,0.436421,0.000000,0.399020,0.0,0.941176,0.000000,0.000000,0.044335,0.882260,0.343450,0.677316,0.0,1.0,0.0,1
92,0.000000,0.000000,0.0,0.000000,0.000000,0.880784,0.0,0.470588,0.000000,0.000000,0.032033,0.356330,0.242820,0.593920,0.0,0.0,1.0,1
179,0.000000,0.000000,0.0,0.046458,0.000000,0.567898,1.0,0.000000,0.040045,0.000000,0.008841,0.284303,0.631634,0.571320,0.0,0.0,0.0,1


In [38]:
y_train = pd.DataFrame(y_train)

In [39]:
y_train

Unnamed: 0,Sales
162,4677875.0
153,4913048.0
109,5263122.5
148,5511442.5
26,6100135.0
...,...
106,5316150.0
14,5428030.0
92,5004340.0
179,5183900.0


In [40]:
model = sm.OLS(y_train, X_train).fit()

In [41]:
print(model.summary())

                            OLS Regression Results                            
Dep. Variable:                  Sales   R-squared:                       0.613
Model:                            OLS   Adj. R-squared:                  0.552
Method:                 Least Squares   F-statistic:                     10.14
Date:                Thu, 20 Apr 2023   Prob (F-statistic):           1.30e-15
Time:                        18:01:07   Log-Likelihood:                -1784.9
No. Observations:                 127   AIC:                             3606.
Df Residuals:                     109   BIC:                             3657.
Df Model:                          17                                         
Covariance Type:            nonrobust                                         
                 coef    std err          t      P>|t|      [0.025      0.975]
------------------------------------------------------------------------------
0          -5.069e+05   3.64e+05     -1.393      0.1

In [42]:
O.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 182 entries, 0 to 181
Data columns (total 58 columns):
 #   Column                                                Non-Null Count  Dtype  
---  ------                                                --------------  -----  
 0   digital_catchuptv_bledina_brand-equity_spends         182 non-null    float64
 1   digital_dataretailers_bledina_brand-equity_spends     182 non-null    float64
 2   digital_directbuying_bledina_brand-equity_spends      182 non-null    float64
 3   digital_keywordtargeting_bledina_brand-equity_spends  182 non-null    float64
 4   digital_nativeads_bledina_brand-equity_spends         182 non-null    float64
 5   digital_programmatic_bledina_brand-equity_spends      182 non-null    float64
 6   digital_sea_bledina_brand-equity_spends               182 non-null    float64
 7   digital_social_bledina_brand-equity_spends            182 non-null    float64
 8   press_equity_bledina_brand-equity_spends              182 no

In [43]:
import statsmodels.api as sm

# Get the selected features from the previous step
X_selected = X_train
# Calculate the VIF for each feature
vif = pd.DataFrame()
vif["Features"] = X_selected.columns
vif["VIF Factor"] = [sm.OLS(X_selected[col], X_selected.drop(col, axis=1)).fit().rsquared for col in X_selected.columns]

# Print the VIF for each feature
print(vif)

    Features  VIF Factor
0          0    0.246863
1          8    0.220362
2         15    0.147360
3         20    0.343027
4         24    0.845491
5         30    0.417461
6         31    0.687179
7         32    0.837121
8         34    0.871802
9         37    0.235624
10        42    0.218313
11        43    0.828874
12        44    0.571527
13        45    0.410098
14        47    0.264271
15        48    0.528623
16        54    0.212767
17  constant    0.985768


In [44]:
model2 = sm.OLS(y_test, X_test).fit()

In [45]:
print(model2.summary())

                            OLS Regression Results                            
Dep. Variable:                  Sales   R-squared:                       1.000
Model:                            OLS   Adj. R-squared:                    nan
Method:                 Least Squares   F-statistic:                       nan
Date:                Thu, 20 Apr 2023   Prob (F-statistic):                nan
Time:                        18:01:07   Log-Likelihood:                 616.80
No. Observations:                  55   AIC:                            -1124.
Df Residuals:                       0   BIC:                            -1013.
Df Model:                          54                                         
Covariance Type:            nonrobust                                         
                 coef    std err          t      P>|t|      [0.025      0.975]
------------------------------------------------------------------------------
0           5.501e+06        inf          0        n

  return 1 - (np.divide(self.nobs - self.k_constant, self.df_resid)
  return 1 - (np.divide(self.nobs - self.k_constant, self.df_resid)
  return np.dot(wresid, wresid) / self.df_resid
