In [29]:
import pandas as pd
import numpy as np
import plotly.graph_objs as go
import plotly.express as px
import matplotlib.pyplot as plt
from sklearn.feature_selection import RFE
from sklearn.linear_model import LinearRegression
import statsmodels.formula.api as smf
import statsmodels.api as sm
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import MinMaxScaler
from sklearn.metrics import r2_score, mean_squared_error, mean_absolute_percentage_error
import seaborn as sns

In [30]:
df = pd.read_csv("ALL_DATA_COMBINED.csv")

# Month Level Flag

In [31]:
df['week_start'] = pd.to_datetime(df['week_start'], format='%d-%m-%Y')

In [32]:
df['month'] = df['week_start'].dt.month

In [33]:
df_months = pd.get_dummies(df['month'])

In [34]:
df = pd.concat([df,df_months],axis = 1, join='inner', ignore_index=False, keys=None)

In [35]:
df.columns = df.columns.astype(str)

# X and Y initialization

In [36]:
X=df.drop(['week_start','Sales','retail_total_bledina_product_volume','retail_total_bledina_product_price','month'],axis=1)
y=df['Sales']

In [71]:
O = df.drop(['week_start','Sales','retail_total_bledina_product_volume','retail_total_bledina_product_price','month'],axis=1)

# Scaling - MINMAX transfrom

In [37]:
X.columns = X.columns.astype(str)

In [38]:
X

Unnamed: 0,digital_catchuptv_bledina_brand-equity_spends,digital_dataretailers_bledina_brand-equity_spends,digital_directbuying_bledina_brand-equity_spends,digital_keywordtargeting_bledina_brand-equity_spends,digital_nativeads_bledina_brand-equity_spends,digital_programmatic_bledina_brand-equity_spends,digital_sea_bledina_brand-equity_spends,digital_social_bledina_brand-equity_spends,press_equity_bledina_brand-equity_spends,tv_equity_bledina_brand-equity_spends,...,3,4,5,6,7,8,9,10,11,12
0,0.01,0.0,0.01,0.000000,2367.40,0.060000,16089.49,0.010000,0.0,2369.390000,...,0,0,0,0,0,0,0,0,0,0
1,0.00,0.0,0.01,0.000000,2785.65,0.050000,11618.44,10034.990000,0.0,2470.600000,...,0,0,0,0,0,0,0,0,0,0
2,0.00,0.0,0.01,0.000000,2708.31,0.080000,10388.53,6955.560000,0.0,2470.600000,...,0,0,0,0,0,0,0,0,0,0
3,0.00,0.0,0.00,0.000000,2473.94,0.080000,9699.73,4630.010000,0.0,2470.600000,...,0,0,0,0,0,0,0,0,0,0
4,0.00,0.0,0.01,0.000000,618.99,0.010000,6182.01,2911.880000,0.0,2463.250000,...,0,0,0,0,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
177,0.00,0.0,0.00,6562.143500,0.00,5501.142043,2537.86,8923.720012,0.0,2777.054278,...,0,0,1,0,0,0,0,0,0,0
178,0.00,0.0,0.00,0.031994,0.00,0.000000,2739.21,1075.980003,0.0,2347.751201,...,0,0,0,1,0,0,0,0,0,0
179,0.00,0.0,0.00,0.000000,0.00,0.000000,2782.00,3742.977999,0.0,3132.570888,...,0,0,0,1,0,0,0,0,0,0
180,0.00,0.0,0.00,0.000000,0.00,0.000000,2622.45,4007.615998,0.0,3132.570888,...,0,0,0,1,0,0,0,0,0,0


In [39]:
X.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 182 entries, 0 to 181
Data columns (total 60 columns):
 #   Column                                                Non-Null Count  Dtype  
---  ------                                                --------------  -----  
 0   digital_catchuptv_bledina_brand-equity_spends         182 non-null    float64
 1   digital_dataretailers_bledina_brand-equity_spends     182 non-null    float64
 2   digital_directbuying_bledina_brand-equity_spends      182 non-null    float64
 3   digital_keywordtargeting_bledina_brand-equity_spends  182 non-null    float64
 4   digital_nativeads_bledina_brand-equity_spends         182 non-null    float64
 5   digital_programmatic_bledina_brand-equity_spends      182 non-null    float64
 6   digital_sea_bledina_brand-equity_spends               182 non-null    float64
 7   digital_social_bledina_brand-equity_spends            182 non-null    float64
 8   press_equity_bledina_brand-equity_spends              182 no

In [40]:
scaler = MinMaxScaler()

In [41]:
scaler.fit(X)

In [42]:
X=scaler.transform(X)

In [46]:
X = pd.DataFrame(X)

# Test-Train Split

In [47]:
X_train,X_test,y_train,y_test=train_test_split(X,y,test_size=0.3, random_state=42)

In [48]:
# Create a linear regression model to use for feature selection
lr = LinearRegression()
lr.fit(X_train,y_train)

# RFE features

In [49]:
rfe = RFE(estimator=lr,n_features_to_select=17, step=1)

rfe.fit(X_train,y_train)

# Get the list of selected features
selected_features = X.columns[rfe.support_]
selected_df = pd.DataFrame({'Selected Features': selected_features})

# Print the dataframe as a table
selected_df

Unnamed: 0,Selected Features
0,0
1,6
2,8
3,15
4,25
5,31
6,32
7,33
8,34
9,35


In [50]:
selected_features

Int64Index([0, 6, 8, 15, 25, 31, 32, 33, 34, 35, 44, 45, 46, 47, 51, 53, 56], dtype='int64')

In [51]:
n = selected_features.tolist()

In [52]:
n

[0, 6, 8, 15, 25, 31, 32, 33, 34, 35, 44, 45, 46, 47, 51, 53, 56]

df[n].corr()

sns.heatmap(df[n].corr())

In [55]:
list(zip(X_train.columns,rfe.support_,rfe.ranking_))

[(0, True, 1),
 (1, False, 38),
 (2, False, 7),
 (3, False, 8),
 (4, False, 42),
 (5, False, 32),
 (6, True, 1),
 (7, False, 44),
 (8, True, 1),
 (9, False, 23),
 (10, False, 19),
 (11, False, 27),
 (12, False, 28),
 (13, False, 4),
 (14, False, 26),
 (15, True, 1),
 (16, False, 39),
 (17, False, 24),
 (18, False, 2),
 (19, False, 3),
 (20, False, 6),
 (21, False, 10),
 (22, False, 25),
 (23, False, 30),
 (24, False, 41),
 (25, True, 1),
 (26, False, 37),
 (27, False, 22),
 (28, False, 35),
 (29, False, 13),
 (30, False, 36),
 (31, True, 1),
 (32, True, 1),
 (33, True, 1),
 (34, True, 1),
 (35, True, 1),
 (36, False, 14),
 (37, False, 33),
 (38, False, 20),
 (39, False, 5),
 (40, False, 34),
 (41, False, 16),
 (42, False, 40),
 (43, False, 29),
 (44, True, 1),
 (45, True, 1),
 (46, True, 1),
 (47, True, 1),
 (48, False, 31),
 (49, False, 12),
 (50, False, 11),
 (51, True, 1),
 (52, False, 43),
 (53, True, 1),
 (54, False, 18),
 (55, False, 17),
 (56, True, 1),
 (57, False, 15),
 (58, F

In [56]:
n = selected_features.tolist()

In [57]:
#n.remove('retail_total_bledina_product_price')

In [58]:
#n.remove('retail_total_bledina_product_dvm')

In [59]:
#n.remove('competition_retail_competition_nonorganic_dvm')

In [60]:
#n.remove('competition_retail_competition_nonorganic_price')

In [61]:
# n.remove('macroeconomic_total_total_total_cci')

In [62]:
#n.remove('macroeconomic_total_total_total_unemploymentrate')

In [63]:
n

[0, 6, 8, 15, 25, 31, 32, 33, 34, 35, 44, 45, 46, 47, 51, 53, 56]

In [64]:
X_train = X_train[selected_features]

# Adding Values

In [65]:
X_train['constant']=1

In [66]:
X_train

Unnamed: 0,0,6,8,15,25,31,32,33,34,35,44,45,46,47,51,53,56,constant
162,0.000000,0.256678,0.000000,0.0,0.000000,0.120573,0.0,0.982079,0.000000,0.000033,0.010259,0.217879,0.377507,0.653051,0.0,0.0,0.0,1
153,0.000000,0.132210,0.000000,0.0,0.000000,0.595328,0.0,0.953405,0.117647,0.000000,0.089406,0.344762,0.471900,0.827224,0.0,0.0,0.0,1
109,0.021377,0.056263,0.000000,0.0,0.000000,0.000000,0.0,0.681004,0.470588,0.000000,0.015221,0.449354,0.314327,0.627596,0.0,0.0,0.0,1
148,0.000000,0.278765,0.000000,0.0,0.000000,0.512434,0.0,0.935484,0.117647,0.000000,0.108650,0.375487,0.516700,0.844931,0.0,0.0,0.0,1
26,0.000000,0.272473,0.393939,0.0,0.000000,1.000000,0.0,0.143369,0.941176,0.000000,0.065596,0.713173,0.355736,0.668084,0.0,0.0,0.0,1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
106,0.021315,0.000000,0.000000,0.0,0.000000,0.602864,0.0,0.691756,0.470588,0.000000,0.213791,0.268399,0.437180,0.496982,0.0,0.0,0.0,1
14,0.066228,0.316233,0.000000,0.0,0.000000,0.399020,0.0,0.025090,0.941176,0.000000,0.044335,0.882260,0.343450,0.677316,1.0,0.0,0.0,1
92,0.000000,0.000000,0.000000,0.0,0.000000,0.880784,0.0,0.605735,0.470588,0.000000,0.032033,0.356330,0.242820,0.593920,0.0,0.0,0.0,1
179,0.000000,0.172908,0.000000,0.0,0.000000,0.567898,1.0,0.964158,0.000000,0.079642,0.008841,0.284303,0.631634,0.571320,0.0,1.0,0.0,1


In [67]:
y_train = pd.DataFrame(y_train)

In [68]:
y_train

Unnamed: 0,Sales
162,4677875.0
153,4913048.0
109,5263122.5
148,5511442.5
26,6100135.0
...,...
106,5316150.0
14,5428030.0
92,5004340.0
179,5183900.0


In [69]:
model = sm.OLS(y_train, X_train).fit()

In [70]:
print(model.summary())

                            OLS Regression Results                            
Dep. Variable:                  Sales   R-squared:                       0.624
Model:                            OLS   Adj. R-squared:                  0.565
Method:                 Least Squares   F-statistic:                     10.63
Date:                Thu, 20 Apr 2023   Prob (F-statistic):           3.05e-16
Time:                        17:24:51   Log-Likelihood:                -1783.0
No. Observations:                 127   AIC:                             3602.
Df Residuals:                     109   BIC:                             3653.
Df Model:                          17                                         
Covariance Type:            nonrobust                                         
                 coef    std err          t      P>|t|      [0.025      0.975]
------------------------------------------------------------------------------
0          -4.675e+05   3.53e+05     -1.323      0.1

In [72]:
O.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 182 entries, 0 to 181
Data columns (total 60 columns):
 #   Column                                                Non-Null Count  Dtype  
---  ------                                                --------------  -----  
 0   digital_catchuptv_bledina_brand-equity_spends         182 non-null    float64
 1   digital_dataretailers_bledina_brand-equity_spends     182 non-null    float64
 2   digital_directbuying_bledina_brand-equity_spends      182 non-null    float64
 3   digital_keywordtargeting_bledina_brand-equity_spends  182 non-null    float64
 4   digital_nativeads_bledina_brand-equity_spends         182 non-null    float64
 5   digital_programmatic_bledina_brand-equity_spends      182 non-null    float64
 6   digital_sea_bledina_brand-equity_spends               182 non-null    float64
 7   digital_social_bledina_brand-equity_spends            182 non-null    float64
 8   press_equity_bledina_brand-equity_spends              182 no

In [None]:
model2 = sm.OLS(y_test, X_test).fit()

In [None]:
print(model2.summary())