In [61]:
import pandas as pd
import numpy as np
import plotly.graph_objs as go
import plotly.express as px
import matplotlib.pyplot as plt
from sklearn.feature_selection import RFE
from sklearn.linear_model import LinearRegression
import statsmodels.formula.api as smf
import statsmodels.api as sm
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import MinMaxScaler
from sklearn.metrics import r2_score, mean_squared_error, mean_absolute_percentage_error
import seaborn as sns

In [62]:
df = pd.read_csv("ALL_DATA_COMBINED.csv")

In [63]:
df['week_start'].head(35)

0     02-01-2017
1     09-01-2017
2     16-01-2017
3     23-01-2017
4     30-01-2017
5     06-02-2017
6     13-02-2017
7     20-02-2017
8     27-02-2017
9     06-03-2017
10    13-03-2017
11    20-03-2017
12    27-03-2017
13    03-04-2017
14    10-04-2017
15    17-04-2017
16    24-04-2017
17    01-05-2017
18    08-05-2017
19    15-05-2017
20    22-05-2017
21    29-05-2017
22    05-06-2017
23    12-06-2017
24    19-06-2017
25    26-06-2017
26    03-07-2017
27    10-07-2017
28    17-07-2017
29    24-07-2017
30    31-07-2017
31    07-08-2017
32    14-08-2017
33    21-08-2017
34    28-08-2017
Name: week_start, dtype: object

In [64]:
df['week_start'] = pd.to_datetime(df['week_start'], format='%d-%m-%Y')

In [65]:
df['week_start'].head(35)

0    2017-01-02
1    2017-01-09
2    2017-01-16
3    2017-01-23
4    2017-01-30
5    2017-02-06
6    2017-02-13
7    2017-02-20
8    2017-02-27
9    2017-03-06
10   2017-03-13
11   2017-03-20
12   2017-03-27
13   2017-04-03
14   2017-04-10
15   2017-04-17
16   2017-04-24
17   2017-05-01
18   2017-05-08
19   2017-05-15
20   2017-05-22
21   2017-05-29
22   2017-06-05
23   2017-06-12
24   2017-06-19
25   2017-06-26
26   2017-07-03
27   2017-07-10
28   2017-07-17
29   2017-07-24
30   2017-07-31
31   2017-08-07
32   2017-08-14
33   2017-08-21
34   2017-08-28
Name: week_start, dtype: datetime64[ns]

In [66]:
df.shape

(182, 52)

# Month Level Flag

In [67]:
df['month'] = df['week_start'].dt.month

In [68]:
df_months = pd.get_dummies(df['month'],drop_first=True)

In [69]:
df = pd.concat([df,df_months],axis = 1, join='inner', ignore_index=False, keys=None)

# Week Level Flag

In [70]:
df['week_start'] = pd.to_datetime(df['week_start'])

In [71]:
def assign_week_numbers(group):
    group = group.sort_values('week_start')
    group['week_number'] = range(1, len(group) + 1)
    month_starts = group['week_start'].dt.to_period('M').unique().to_timestamp()
    for month_start in month_starts:
        mask = group['week_start'] >= month_start
        group.loc[mask, 'week_number'] -= group.loc[mask, 'week_number'].min() - 1
    return group

In [72]:
df = df.groupby(df['week_start'].dt.to_period('M')).apply(assign_week_numbers)

In [73]:
l=['week_start','week_number']

In [74]:
df[l].head(30)

Unnamed: 0,week_start,week_number
0,2017-01-02,1
1,2017-01-09,2
2,2017-01-16,3
3,2017-01-23,4
4,2017-01-30,5
5,2017-02-06,1
6,2017-02-13,2
7,2017-02-20,3
8,2017-02-27,4
9,2017-03-06,1


In [75]:
df_weeks = pd.get_dummies(df['week_number'],drop_first=True)

In [76]:
df = pd.concat([df,df_weeks],axis = 1, join='inner', ignore_index=False, keys=None)

In [77]:
df.head(10)

Unnamed: 0,week_start,digital_catchuptv_bledina_brand-equity_spends,digital_dataretailers_bledina_brand-equity_spends,digital_directbuying_bledina_brand-equity_spends,digital_keywordtargeting_bledina_brand-equity_spends,digital_nativeads_bledina_brand-equity_spends,digital_programmatic_bledina_brand-equity_spends,digital_sea_bledina_brand-equity_spends,digital_social_bledina_brand-equity_spends,press_equity_bledina_brand-equity_spends,...,8,9,10,11,12,week_number,2,3,4,5
0,2017-01-02,0.01,0.0,0.01,0.0,2367.4,0.06,16089.49,0.01,0.0,...,0,0,0,0,0,1,0,0,0,0
1,2017-01-09,0.0,0.0,0.01,0.0,2785.65,0.05,11618.44,10034.99,0.0,...,0,0,0,0,0,2,1,0,0,0
2,2017-01-16,0.0,0.0,0.01,0.0,2708.31,0.08,10388.53,6955.56,0.0,...,0,0,0,0,0,3,0,1,0,0
3,2017-01-23,0.0,0.0,0.0,0.0,2473.94,0.08,9699.73,4630.01,0.0,...,0,0,0,0,0,4,0,0,1,0
4,2017-01-30,0.0,0.0,0.01,0.0,618.99,0.01,6182.01,2911.88,0.0,...,0,0,0,0,0,5,0,0,0,1
5,2017-02-06,319.05,0.0,0.01,0.0,4.19,0.01,3034.42,605.48,0.0,...,0,0,0,0,0,1,0,0,0,0
6,2017-02-13,569.36,0.0,0.01,0.0,5.41,0.0,3856.28,0.0,0.0,...,0,0,0,0,0,2,1,0,0,0
7,2017-02-20,9025.68,0.0,142.3,0.0,2.03,22.43,4816.22,0.0,0.0,...,0,0,0,0,0,3,0,1,0,0
8,2017-02-27,8005.92,0.0,2529.44,0.0,10.86,25.83,5907.24,0.0,9000.0,...,0,0,0,0,0,4,0,0,1,0
9,2017-03-06,1026.42,0.0,857.08,0.0,0.35,0.0,3991.23,0.0,2500.0,...,0,0,0,0,0,1,0,0,0,0


# Dropping

In [78]:
df=df.drop(['week_number','month'],axis=1)

In [79]:
df.shape

(182, 67)

# Columns as String

In [80]:
df.columns = df.columns.astype(str)

In [81]:
df.columns

Index(['week_start', 'digital_catchuptv_bledina_brand-equity_spends',
       'digital_dataretailers_bledina_brand-equity_spends',
       'digital_directbuying_bledina_brand-equity_spends',
       'digital_keywordtargeting_bledina_brand-equity_spends',
       'digital_nativeads_bledina_brand-equity_spends',
       'digital_programmatic_bledina_brand-equity_spends',
       'digital_sea_bledina_brand-equity_spends',
       'digital_social_bledina_brand-equity_spends',
       'press_equity_bledina_brand-equity_spends',
       'tv_equity_bledina_brand-equity_spends',
       'digital_dataretailers_bledina_brand-range_spends',
       'digital_directbuying_bledina_brand-range_spends',
       'digital_keywordtargeting_bledina_brand-range_spends',
       'digital_programmatic_bledina_brand-range_spends',
       'digital_sea_bledina_brand-range_spends',
       'digital_social_bledina_brand-range_spends',
       'press_product_bledina_brand-range_spends',
       'app_total_bledina_brand_visits', '

In [82]:
df.tail()

Unnamed: 0,week_start,digital_catchuptv_bledina_brand-equity_spends,digital_dataretailers_bledina_brand-equity_spends,digital_directbuying_bledina_brand-equity_spends,digital_keywordtargeting_bledina_brand-equity_spends,digital_nativeads_bledina_brand-equity_spends,digital_programmatic_bledina_brand-equity_spends,digital_sea_bledina_brand-equity_spends,digital_social_bledina_brand-equity_spends,press_equity_bledina_brand-equity_spends,...,7,8,9,10,11,12,2,3,4,5
177,2020-05-25,0.0,0.0,0.0,6562.1435,0.0,5501.142043,2537.86,8923.720012,0.0,...,0,0,0,0,0,0,0,0,1,0
178,2020-06-01,0.0,0.0,0.0,0.031994,0.0,0.0,2739.21,1075.980003,0.0,...,0,0,0,0,0,0,0,0,0,0
179,2020-06-08,0.0,0.0,0.0,0.0,0.0,0.0,2782.0,3742.977999,0.0,...,0,0,0,0,0,0,1,0,0,0
180,2020-06-15,0.0,0.0,0.0,0.0,0.0,0.0,2622.45,4007.615998,0.0,...,0,0,0,0,0,0,0,1,0,0
181,2020-06-22,0.0,0.0,0.0,0.031994,0.0,0.0,2422.14,3086.460004,0.0,...,0,0,0,0,0,0,0,0,1,0


# X and Y initialization

In [83]:
X=df.drop(['week_start','Sales','retail_total_bledina_product_volume','retail_total_bledina_product_price','macroeconomic_total_total_total_population','macroeconomic_total_total_total_weeklycases','macroeconomic_total_total_total_unemploymentrate','crm_automatic_bledina_brand_emails'],axis=1)
y=df['Sales']

In [84]:
O = df.drop(['week_start','Sales','retail_total_bledina_product_volume','retail_total_bledina_product_price','macroeconomic_total_total_total_population'],axis=1)

In [85]:
O.head()

Unnamed: 0,digital_catchuptv_bledina_brand-equity_spends,digital_dataretailers_bledina_brand-equity_spends,digital_directbuying_bledina_brand-equity_spends,digital_keywordtargeting_bledina_brand-equity_spends,digital_nativeads_bledina_brand-equity_spends,digital_programmatic_bledina_brand-equity_spends,digital_sea_bledina_brand-equity_spends,digital_social_bledina_brand-equity_spends,press_equity_bledina_brand-equity_spends,tv_equity_bledina_brand-equity_spends,...,7,8,9,10,11,12,2,3,4,5
0,0.01,0.0,0.01,0.0,2367.4,0.06,16089.49,0.01,0.0,2369.39,...,0,0,0,0,0,0,0,0,0,0
1,0.0,0.0,0.01,0.0,2785.65,0.05,11618.44,10034.99,0.0,2470.6,...,0,0,0,0,0,0,1,0,0,0
2,0.0,0.0,0.01,0.0,2708.31,0.08,10388.53,6955.56,0.0,2470.6,...,0,0,0,0,0,0,0,1,0,0
3,0.0,0.0,0.0,0.0,2473.94,0.08,9699.73,4630.01,0.0,2470.6,...,0,0,0,0,0,0,0,0,1,0
4,0.0,0.0,0.01,0.0,618.99,0.01,6182.01,2911.88,0.0,2463.25,...,0,0,0,0,0,0,0,0,0,1


In [86]:
Columns_names = X.columns.tolist()

# Scaling - MINMAX transfrom

In [87]:
X.columns = X.columns.astype(str)

In [88]:
scaler = MinMaxScaler()

In [89]:
scaler.fit(X)

In [90]:
X=scaler.transform(X)

In [91]:
X = pd.DataFrame(X)

In [92]:
X

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,49,50,51,52,53,54,55,56,57,58
0,8.915386e-08,0.0,5.752158e-07,0.000000,0.150990,6.946401e-07,1.000000,2.314267e-07,0.0,0.008477,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,0.000000e+00,0.0,5.752158e-07,0.000000,0.177665,5.788667e-07,0.722114,2.322364e-01,0.0,0.008839,...,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0
2,0.000000e+00,0.0,5.752158e-07,0.000000,0.172732,9.261868e-07,0.645672,1.609702e-01,0.0,0.008839,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0
3,0.000000e+00,0.0,0.000000e+00,0.000000,0.157785,9.261868e-07,0.602861,1.071508e-01,0.0,0.008839,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0
4,0.000000e+00,0.0,5.752158e-07,0.000000,0.039478,1.157733e-07,0.384227,6.738867e-02,0.0,0.008813,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
177,0.000000e+00,0.0,0.000000e+00,0.385262,0.000000,6.368856e-02,0.157734,2.065187e-01,0.0,0.009935,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0
178,0.000000e+00,0.0,0.000000e+00,0.000002,0.000000,0.000000e+00,0.170248,2.490105e-02,0.0,0.008400,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
179,0.000000e+00,0.0,0.000000e+00,0.000000,0.000000,0.000000e+00,0.172908,8.662250e-02,0.0,0.011207,...,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0
180,0.000000e+00,0.0,0.000000e+00,0.000000,0.000000,0.000000e+00,0.162991,9.274693e-02,0.0,0.011207,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0


In [93]:
X = X.rename(columns=dict(enumerate(Columns_names)))

In [94]:
X

Unnamed: 0,digital_catchuptv_bledina_brand-equity_spends,digital_dataretailers_bledina_brand-equity_spends,digital_directbuying_bledina_brand-equity_spends,digital_keywordtargeting_bledina_brand-equity_spends,digital_nativeads_bledina_brand-equity_spends,digital_programmatic_bledina_brand-equity_spends,digital_sea_bledina_brand-equity_spends,digital_social_bledina_brand-equity_spends,press_equity_bledina_brand-equity_spends,tv_equity_bledina_brand-equity_spends,...,7,8,9,10,11,12,2,3,4,5
0,8.915386e-08,0.0,5.752158e-07,0.000000,0.150990,6.946401e-07,1.000000,2.314267e-07,0.0,0.008477,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,0.000000e+00,0.0,5.752158e-07,0.000000,0.177665,5.788667e-07,0.722114,2.322364e-01,0.0,0.008839,...,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0
2,0.000000e+00,0.0,5.752158e-07,0.000000,0.172732,9.261868e-07,0.645672,1.609702e-01,0.0,0.008839,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0
3,0.000000e+00,0.0,0.000000e+00,0.000000,0.157785,9.261868e-07,0.602861,1.071508e-01,0.0,0.008839,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0
4,0.000000e+00,0.0,5.752158e-07,0.000000,0.039478,1.157733e-07,0.384227,6.738867e-02,0.0,0.008813,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
177,0.000000e+00,0.0,0.000000e+00,0.385262,0.000000,6.368856e-02,0.157734,2.065187e-01,0.0,0.009935,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0
178,0.000000e+00,0.0,0.000000e+00,0.000002,0.000000,0.000000e+00,0.170248,2.490105e-02,0.0,0.008400,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
179,0.000000e+00,0.0,0.000000e+00,0.000000,0.000000,0.000000e+00,0.172908,8.662250e-02,0.0,0.011207,...,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0
180,0.000000e+00,0.0,0.000000e+00,0.000000,0.000000,0.000000e+00,0.162991,9.274693e-02,0.0,0.011207,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0


# Test-Train Split

In [95]:
X_train,X_test,y_train,y_test=train_test_split(X,y,test_size=0.3, random_state=42,shuffle=False)

In [96]:
# Create a linear regression model to use for feature selection
lr = LinearRegression()
#lr.fit(X_train,y_train)

# RFE features

In [97]:
rfe = RFE(estimator=lr,n_features_to_select=20, step=1)

rfe.fit(X_train,y_train)

# Get the list of selected features
selected_features = X.columns[rfe.support_]
selected_df = pd.DataFrame({'Selected Features': selected_features})

# Print the dataframe as a table
selected_df

Unnamed: 0,Selected Features
0,digital_programmatic_bledina_brand-equity_spends
1,digital_sea_bledina_brand-equity_spends
2,digital_social_bledina_brand-equity_spends
3,digital_programmatic_bledina_brand-range_spends
4,digital_sea_bledina_brand-range_spends
5,crm_event_bledina_brand_emails
6,competition_ooh_competition-others_total_spends
7,competition_press_competition-nestle_total_spends
8,competition_press_competition-others_total_spends
9,macroeconomic_total_total_total_livebirths


In [98]:
selected_features

Index(['digital_programmatic_bledina_brand-equity_spends',
       'digital_sea_bledina_brand-equity_spends',
       'digital_social_bledina_brand-equity_spends',
       'digital_programmatic_bledina_brand-range_spends',
       'digital_sea_bledina_brand-range_spends',
       'crm_event_bledina_brand_emails',
       'competition_ooh_competition-others_total_spends',
       'competition_press_competition-nestle_total_spends',
       'competition_press_competition-others_total_spends',
       'macroeconomic_total_total_total_livebirths',
       'promo_total_bledina_bledina_spends',
       'competition_retail_competition_nonorganic_dvm',
       'competition_retail_competition_nonorganic_price', '2', '3', '11', '12',
       '2', '3', '4'],
      dtype='object')

In [99]:
n = selected_features.tolist()

In [100]:
n

['digital_programmatic_bledina_brand-equity_spends',
 'digital_sea_bledina_brand-equity_spends',
 'digital_social_bledina_brand-equity_spends',
 'digital_programmatic_bledina_brand-range_spends',
 'digital_sea_bledina_brand-range_spends',
 'crm_event_bledina_brand_emails',
 'competition_ooh_competition-others_total_spends',
 'competition_press_competition-nestle_total_spends',
 'competition_press_competition-others_total_spends',
 'macroeconomic_total_total_total_livebirths',
 'promo_total_bledina_bledina_spends',
 'competition_retail_competition_nonorganic_dvm',
 'competition_retail_competition_nonorganic_price',
 '2',
 '3',
 '11',
 '12',
 '2',
 '3',
 '4']

In [101]:
list(zip(X_train.columns,rfe.support_,rfe.ranking_))

[('digital_catchuptv_bledina_brand-equity_spends', False, 11),
 ('digital_dataretailers_bledina_brand-equity_spends', False, 26),
 ('digital_directbuying_bledina_brand-equity_spends', False, 31),
 ('digital_keywordtargeting_bledina_brand-equity_spends', False, 34),
 ('digital_nativeads_bledina_brand-equity_spends', False, 27),
 ('digital_programmatic_bledina_brand-equity_spends', True, 1),
 ('digital_sea_bledina_brand-equity_spends', True, 1),
 ('digital_social_bledina_brand-equity_spends', True, 1),
 ('press_equity_bledina_brand-equity_spends', False, 19),
 ('tv_equity_bledina_brand-equity_spends', False, 21),
 ('digital_dataretailers_bledina_brand-range_spends', False, 35),
 ('digital_directbuying_bledina_brand-range_spends', False, 10),
 ('digital_keywordtargeting_bledina_brand-range_spends', False, 36),
 ('digital_programmatic_bledina_brand-range_spends', True, 1),
 ('digital_sea_bledina_brand-range_spends', True, 1),
 ('digital_social_bledina_brand-range_spends', False, 37),
 ('pr

In [102]:
n = selected_features.tolist()

In [103]:
n

['digital_programmatic_bledina_brand-equity_spends',
 'digital_sea_bledina_brand-equity_spends',
 'digital_social_bledina_brand-equity_spends',
 'digital_programmatic_bledina_brand-range_spends',
 'digital_sea_bledina_brand-range_spends',
 'crm_event_bledina_brand_emails',
 'competition_ooh_competition-others_total_spends',
 'competition_press_competition-nestle_total_spends',
 'competition_press_competition-others_total_spends',
 'macroeconomic_total_total_total_livebirths',
 'promo_total_bledina_bledina_spends',
 'competition_retail_competition_nonorganic_dvm',
 'competition_retail_competition_nonorganic_price',
 '2',
 '3',
 '11',
 '12',
 '2',
 '3',
 '4']

In [104]:
X_train.columns

Index(['digital_catchuptv_bledina_brand-equity_spends',
       'digital_dataretailers_bledina_brand-equity_spends',
       'digital_directbuying_bledina_brand-equity_spends',
       'digital_keywordtargeting_bledina_brand-equity_spends',
       'digital_nativeads_bledina_brand-equity_spends',
       'digital_programmatic_bledina_brand-equity_spends',
       'digital_sea_bledina_brand-equity_spends',
       'digital_social_bledina_brand-equity_spends',
       'press_equity_bledina_brand-equity_spends',
       'tv_equity_bledina_brand-equity_spends',
       'digital_dataretailers_bledina_brand-range_spends',
       'digital_directbuying_bledina_brand-range_spends',
       'digital_keywordtargeting_bledina_brand-range_spends',
       'digital_programmatic_bledina_brand-range_spends',
       'digital_sea_bledina_brand-range_spends',
       'digital_social_bledina_brand-range_spends',
       'press_product_bledina_brand-range_spends',
       'app_total_bledina_brand_visits', 'crm_event_bled

In [105]:
X_train

Unnamed: 0,digital_catchuptv_bledina_brand-equity_spends,digital_dataretailers_bledina_brand-equity_spends,digital_directbuying_bledina_brand-equity_spends,digital_keywordtargeting_bledina_brand-equity_spends,digital_nativeads_bledina_brand-equity_spends,digital_programmatic_bledina_brand-equity_spends,digital_sea_bledina_brand-equity_spends,digital_social_bledina_brand-equity_spends,press_equity_bledina_brand-equity_spends,tv_equity_bledina_brand-equity_spends,...,7,8,9,10,11,12,2,3,4,5
0,8.915386e-08,0.0,5.752158e-07,0.0,0.150990,6.946401e-07,1.000000,2.314267e-07,0.0,0.008477,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,0.000000e+00,0.0,5.752158e-07,0.0,0.177665,5.788667e-07,0.722114,2.322364e-01,0.0,0.008839,...,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0
2,0.000000e+00,0.0,5.752158e-07,0.0,0.172732,9.261868e-07,0.645672,1.609702e-01,0.0,0.008839,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0
3,0.000000e+00,0.0,0.000000e+00,0.0,0.157785,9.261868e-07,0.602861,1.071508e-01,0.0,0.008839,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0
4,0.000000e+00,0.0,5.752158e-07,0.0,0.039478,1.157733e-07,0.384227,6.738867e-02,0.0,0.008813,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
122,2.969321e-02,0.0,0.000000e+00,0.0,0.000000,5.424324e-02,0.340153,7.535554e-02,0.0,0.006700,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
123,2.963979e-02,0.0,0.000000e+00,0.0,0.000000,5.797471e-02,0.378574,7.604310e-02,0.0,0.008040,...,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0
124,2.929193e-02,0.0,0.000000e+00,0.0,0.000000,6.011218e-02,0.364361,7.516739e-02,0.0,0.008040,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0
125,2.961021e-02,0.0,0.000000e+00,0.0,0.000000,6.599168e-02,0.383797,1.827690e-01,0.0,0.008040,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0


In [106]:
X_train = X_train[selected_features]

# Adding Values

In [107]:
X_train['constant']=1

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  X_train['constant']=1


In [108]:
X_train

Unnamed: 0,digital_programmatic_bledina_brand-equity_spends,digital_sea_bledina_brand-equity_spends,digital_social_bledina_brand-equity_spends,digital_programmatic_bledina_brand-range_spends,digital_sea_bledina_brand-range_spends,crm_event_bledina_brand_emails,competition_ooh_competition-others_total_spends,competition_press_competition-nestle_total_spends,competition_press_competition-others_total_spends,macroeconomic_total_total_total_livebirths,...,3,11,12,2,2.1,3.1,3.2,4,4.1,constant
0,6.946401e-07,1.000000,2.314267e-07,0.000000e+00,0.000000,0.000000,0.003015,0.000000,0.110048,0.644009,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1
1,5.788667e-07,0.722114,2.322364e-01,0.000000e+00,0.000000,0.026748,0.003015,0.000000,0.110048,0.644009,...,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,1
2,9.261868e-07,0.645672,1.609702e-01,7.344175e-07,0.000000,0.000000,0.003015,0.000000,0.110048,0.644009,...,1.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,1
3,9.261868e-07,0.602861,1.071508e-01,7.344175e-07,0.000000,0.000000,0.003015,0.000000,0.110048,0.644009,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,1
4,1.157733e-07,0.384227,6.738867e-02,0.000000e+00,0.000000,0.000000,0.003245,0.000000,0.267727,0.348068,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
122,5.424324e-02,0.340153,7.535554e-02,0.000000e+00,0.165259,0.000000,0.000000,0.235917,0.217703,0.617935,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1
123,5.797471e-02,0.378574,7.604310e-02,0.000000e+00,0.142126,0.000000,0.000000,0.235917,0.217703,0.617935,...,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,1
124,6.011218e-02,0.364361,7.516739e-02,0.000000e+00,0.107287,0.279544,0.000000,0.235917,0.217703,0.617935,...,1.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,1
125,6.599168e-02,0.383797,1.827690e-01,0.000000e+00,0.098090,0.000000,0.000000,0.226792,0.288342,0.611476,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,1


In [109]:
y_train = pd.DataFrame(y_train)

In [110]:
y_train

Unnamed: 0,Sales
0,6886165.0
1,5875465.0
2,5897865.0
3,5431465.0
4,5497440.0
...,...
122,4671190.0
123,4396190.0
124,4487690.0
125,4924492.0


In [111]:
model = sm.OLS(y_train, X_train).fit()

In [112]:
y_train

Unnamed: 0,Sales
0,6886165.0
1,5875465.0
2,5897865.0
3,5431465.0
4,5497440.0
...,...
122,4671190.0
123,4396190.0
124,4487690.0
125,4924492.0


In [113]:
X_train

Unnamed: 0,digital_programmatic_bledina_brand-equity_spends,digital_sea_bledina_brand-equity_spends,digital_social_bledina_brand-equity_spends,digital_programmatic_bledina_brand-range_spends,digital_sea_bledina_brand-range_spends,crm_event_bledina_brand_emails,competition_ooh_competition-others_total_spends,competition_press_competition-nestle_total_spends,competition_press_competition-others_total_spends,macroeconomic_total_total_total_livebirths,...,3,11,12,2,2.1,3.1,3.2,4,4.1,constant
0,6.946401e-07,1.000000,2.314267e-07,0.000000e+00,0.000000,0.000000,0.003015,0.000000,0.110048,0.644009,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1
1,5.788667e-07,0.722114,2.322364e-01,0.000000e+00,0.000000,0.026748,0.003015,0.000000,0.110048,0.644009,...,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,1
2,9.261868e-07,0.645672,1.609702e-01,7.344175e-07,0.000000,0.000000,0.003015,0.000000,0.110048,0.644009,...,1.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,1
3,9.261868e-07,0.602861,1.071508e-01,7.344175e-07,0.000000,0.000000,0.003015,0.000000,0.110048,0.644009,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,1
4,1.157733e-07,0.384227,6.738867e-02,0.000000e+00,0.000000,0.000000,0.003245,0.000000,0.267727,0.348068,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
122,5.424324e-02,0.340153,7.535554e-02,0.000000e+00,0.165259,0.000000,0.000000,0.235917,0.217703,0.617935,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1
123,5.797471e-02,0.378574,7.604310e-02,0.000000e+00,0.142126,0.000000,0.000000,0.235917,0.217703,0.617935,...,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,1
124,6.011218e-02,0.364361,7.516739e-02,0.000000e+00,0.107287,0.279544,0.000000,0.235917,0.217703,0.617935,...,1.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,1
125,6.599168e-02,0.383797,1.827690e-01,0.000000e+00,0.098090,0.000000,0.000000,0.226792,0.288342,0.611476,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,1


In [114]:
print(model.summary())

                            OLS Regression Results                            
Dep. Variable:                  Sales   R-squared:                       0.759
Model:                            OLS   Adj. R-squared:                  0.710
Method:                 Least Squares   F-statistic:                     15.71
Date:                Mon, 24 Apr 2023   Prob (F-statistic):           1.45e-23
Time:                        09:19:09   Log-Likelihood:                -1748.6
No. Observations:                 127   AIC:                             3541.
Df Residuals:                     105   BIC:                             3604.
Df Model:                          21                                         
Covariance Type:            nonrobust                                         
                                                        coef    std err          t      P>|t|      [0.025      0.975]
--------------------------------------------------------------------------------------------

In [115]:
X_train

Unnamed: 0,digital_programmatic_bledina_brand-equity_spends,digital_sea_bledina_brand-equity_spends,digital_social_bledina_brand-equity_spends,digital_programmatic_bledina_brand-range_spends,digital_sea_bledina_brand-range_spends,crm_event_bledina_brand_emails,competition_ooh_competition-others_total_spends,competition_press_competition-nestle_total_spends,competition_press_competition-others_total_spends,macroeconomic_total_total_total_livebirths,...,3,11,12,2,2.1,3.1,3.2,4,4.1,constant
0,6.946401e-07,1.000000,2.314267e-07,0.000000e+00,0.000000,0.000000,0.003015,0.000000,0.110048,0.644009,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1
1,5.788667e-07,0.722114,2.322364e-01,0.000000e+00,0.000000,0.026748,0.003015,0.000000,0.110048,0.644009,...,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,1
2,9.261868e-07,0.645672,1.609702e-01,7.344175e-07,0.000000,0.000000,0.003015,0.000000,0.110048,0.644009,...,1.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,1
3,9.261868e-07,0.602861,1.071508e-01,7.344175e-07,0.000000,0.000000,0.003015,0.000000,0.110048,0.644009,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,1
4,1.157733e-07,0.384227,6.738867e-02,0.000000e+00,0.000000,0.000000,0.003245,0.000000,0.267727,0.348068,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
122,5.424324e-02,0.340153,7.535554e-02,0.000000e+00,0.165259,0.000000,0.000000,0.235917,0.217703,0.617935,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1
123,5.797471e-02,0.378574,7.604310e-02,0.000000e+00,0.142126,0.000000,0.000000,0.235917,0.217703,0.617935,...,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,1
124,6.011218e-02,0.364361,7.516739e-02,0.000000e+00,0.107287,0.279544,0.000000,0.235917,0.217703,0.617935,...,1.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,1
125,6.599168e-02,0.383797,1.827690e-01,0.000000e+00,0.098090,0.000000,0.000000,0.226792,0.288342,0.611476,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,1


In [116]:
y_train

Unnamed: 0,Sales
0,6886165.0
1,5875465.0
2,5897865.0
3,5431465.0
4,5497440.0
...,...
122,4671190.0
123,4396190.0
124,4487690.0
125,4924492.0


In [117]:
import statsmodels.api as sm

# Get the selected features from the previous step
X_selected = X_train
# Calculate the VIF for each feature
vif = pd.DataFrame()
vif["Features"] = X_train.columns
vif["VIF Factor"] = [sm.OLS(X_selected[col], X_selected.drop(col, axis=1)).fit().rsquared for col in X_selected.columns]

ValueError: shapes (127,4) and (127,4) not aligned: 4 (dim 1) != 127 (dim 0)

In [None]:
X_test = X_test[selected_features]

In [None]:
X_test['constant']=1

In [None]:
model2 = sm.OLS(y_test, X_test).fit()

In [None]:
print(model2.summary())