In [111]:
import pandas as pd
import numpy as np
import plotly.graph_objs as go
import plotly.express as px
import matplotlib.pyplot as plt
from sklearn.feature_selection import RFE
from sklearn.linear_model import LinearRegression
import statsmodels.formula.api as smf
import statsmodels.api as sm
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import MinMaxScaler
from sklearn.metrics import r2_score, mean_squared_error, mean_absolute_percentage_error
import seaborn as sns

In [112]:
df = pd.read_csv("ALL_DATA_COMBINED.csv")

In [113]:
df['week_start'].head(35)

0     02-01-2017
1     09-01-2017
2     16-01-2017
3     23-01-2017
4     30-01-2017
5     06-02-2017
6     13-02-2017
7     20-02-2017
8     27-02-2017
9     06-03-2017
10    13-03-2017
11    20-03-2017
12    27-03-2017
13    03-04-2017
14    10-04-2017
15    17-04-2017
16    24-04-2017
17    01-05-2017
18    08-05-2017
19    15-05-2017
20    22-05-2017
21    29-05-2017
22    05-06-2017
23    12-06-2017
24    19-06-2017
25    26-06-2017
26    03-07-2017
27    10-07-2017
28    17-07-2017
29    24-07-2017
30    31-07-2017
31    07-08-2017
32    14-08-2017
33    21-08-2017
34    28-08-2017
Name: week_start, dtype: object

In [114]:
df['week_start'] = pd.to_datetime(df['week_start'], format='%d-%m-%Y')

In [115]:
df['week_start'].head(35)

0    2017-01-02
1    2017-01-09
2    2017-01-16
3    2017-01-23
4    2017-01-30
5    2017-02-06
6    2017-02-13
7    2017-02-20
8    2017-02-27
9    2017-03-06
10   2017-03-13
11   2017-03-20
12   2017-03-27
13   2017-04-03
14   2017-04-10
15   2017-04-17
16   2017-04-24
17   2017-05-01
18   2017-05-08
19   2017-05-15
20   2017-05-22
21   2017-05-29
22   2017-06-05
23   2017-06-12
24   2017-06-19
25   2017-06-26
26   2017-07-03
27   2017-07-10
28   2017-07-17
29   2017-07-24
30   2017-07-31
31   2017-08-07
32   2017-08-14
33   2017-08-21
34   2017-08-28
Name: week_start, dtype: datetime64[ns]

In [116]:
df.shape

(182, 52)

# Month Level Flag

In [117]:
df['month'] = df['week_start'].dt.month

In [118]:
df_months = pd.get_dummies(df['month'],drop_first=True)

In [119]:
df = pd.concat([df,df_months],axis = 1, join='inner', ignore_index=False, keys=None)

# Week Level Flag

In [120]:
#df['week_flag'] = df['week_start'].dt.strftime('%Y-W%U')

In [121]:
#df_weeks = pd.get_dummies(df['week_flag'],drop_first=True)

In [122]:
#df = pd.concat([df,df_weeks],axis = 1, join='inner', ignore_index=False, keys=None)

In [123]:
#df['week_flag']

In [124]:
df['week_start'] = pd.to_datetime(df['week_start'])

In [125]:
def assign_week_numbers(group):
    group = group.sort_values('week_start')
    group['week_number'] = range(1, len(group) + 1)
    month_starts = group['week_start'].dt.to_period('M').unique().to_timestamp()
    for month_start in month_starts:
        mask = group['week_start'] >= month_start
        group.loc[mask, 'week_number'] -= group.loc[mask, 'week_number'].min() - 1
    return group

In [126]:
df = df.groupby(df['week_start'].dt.to_period('M')).apply(assign_week_numbers)

In [127]:
l=['week_start','week_number']

In [128]:
df[l].head(30)

Unnamed: 0,week_start,week_number
0,2017-01-02,1
1,2017-01-09,2
2,2017-01-16,3
3,2017-01-23,4
4,2017-01-30,5
5,2017-02-06,1
6,2017-02-13,2
7,2017-02-20,3
8,2017-02-27,4
9,2017-03-06,1


In [129]:
df_weeks = pd.get_dummies(df['week_number'],drop_first=True)

In [130]:
df = pd.concat([df,df_weeks],axis = 1, join='inner', ignore_index=False, keys=None)

In [131]:
df.head(10)

Unnamed: 0,week_start,digital_catchuptv_bledina_brand-equity_spends,digital_dataretailers_bledina_brand-equity_spends,digital_directbuying_bledina_brand-equity_spends,digital_keywordtargeting_bledina_brand-equity_spends,digital_nativeads_bledina_brand-equity_spends,digital_programmatic_bledina_brand-equity_spends,digital_sea_bledina_brand-equity_spends,digital_social_bledina_brand-equity_spends,press_equity_bledina_brand-equity_spends,...,8,9,10,11,12,week_number,2,3,4,5
0,2017-01-02,0.01,0.0,0.01,0.0,2367.4,0.06,16089.49,0.01,0.0,...,0,0,0,0,0,1,0,0,0,0
1,2017-01-09,0.0,0.0,0.01,0.0,2785.65,0.05,11618.44,10034.99,0.0,...,0,0,0,0,0,2,1,0,0,0
2,2017-01-16,0.0,0.0,0.01,0.0,2708.31,0.08,10388.53,6955.56,0.0,...,0,0,0,0,0,3,0,1,0,0
3,2017-01-23,0.0,0.0,0.0,0.0,2473.94,0.08,9699.73,4630.01,0.0,...,0,0,0,0,0,4,0,0,1,0
4,2017-01-30,0.0,0.0,0.01,0.0,618.99,0.01,6182.01,2911.88,0.0,...,0,0,0,0,0,5,0,0,0,1
5,2017-02-06,319.05,0.0,0.01,0.0,4.19,0.01,3034.42,605.48,0.0,...,0,0,0,0,0,1,0,0,0,0
6,2017-02-13,569.36,0.0,0.01,0.0,5.41,0.0,3856.28,0.0,0.0,...,0,0,0,0,0,2,1,0,0,0
7,2017-02-20,9025.68,0.0,142.3,0.0,2.03,22.43,4816.22,0.0,0.0,...,0,0,0,0,0,3,0,1,0,0
8,2017-02-27,8005.92,0.0,2529.44,0.0,10.86,25.83,5907.24,0.0,9000.0,...,0,0,0,0,0,4,0,0,1,0
9,2017-03-06,1026.42,0.0,857.08,0.0,0.35,0.0,3991.23,0.0,2500.0,...,0,0,0,0,0,1,0,0,0,0


# Dropping

In [132]:
df=df.drop(['week_number','month'],axis=1)

In [133]:
df.shape

(182, 67)

# Columns as String

In [134]:
df.columns = df.columns.astype(str)

In [135]:
df.columns

Index(['week_start', 'digital_catchuptv_bledina_brand-equity_spends',
       'digital_dataretailers_bledina_brand-equity_spends',
       'digital_directbuying_bledina_brand-equity_spends',
       'digital_keywordtargeting_bledina_brand-equity_spends',
       'digital_nativeads_bledina_brand-equity_spends',
       'digital_programmatic_bledina_brand-equity_spends',
       'digital_sea_bledina_brand-equity_spends',
       'digital_social_bledina_brand-equity_spends',
       'press_equity_bledina_brand-equity_spends',
       'tv_equity_bledina_brand-equity_spends',
       'digital_dataretailers_bledina_brand-range_spends',
       'digital_directbuying_bledina_brand-range_spends',
       'digital_keywordtargeting_bledina_brand-range_spends',
       'digital_programmatic_bledina_brand-range_spends',
       'digital_sea_bledina_brand-range_spends',
       'digital_social_bledina_brand-range_spends',
       'press_product_bledina_brand-range_spends',
       'app_total_bledina_brand_visits', '

In [136]:
df.tail()

Unnamed: 0,week_start,digital_catchuptv_bledina_brand-equity_spends,digital_dataretailers_bledina_brand-equity_spends,digital_directbuying_bledina_brand-equity_spends,digital_keywordtargeting_bledina_brand-equity_spends,digital_nativeads_bledina_brand-equity_spends,digital_programmatic_bledina_brand-equity_spends,digital_sea_bledina_brand-equity_spends,digital_social_bledina_brand-equity_spends,press_equity_bledina_brand-equity_spends,...,7,8,9,10,11,12,2,3,4,5
177,2020-05-25,0.0,0.0,0.0,6562.1435,0.0,5501.142043,2537.86,8923.720012,0.0,...,0,0,0,0,0,0,0,0,1,0
178,2020-06-01,0.0,0.0,0.0,0.031994,0.0,0.0,2739.21,1075.980003,0.0,...,0,0,0,0,0,0,0,0,0,0
179,2020-06-08,0.0,0.0,0.0,0.0,0.0,0.0,2782.0,3742.977999,0.0,...,0,0,0,0,0,0,1,0,0,0
180,2020-06-15,0.0,0.0,0.0,0.0,0.0,0.0,2622.45,4007.615998,0.0,...,0,0,0,0,0,0,0,1,0,0
181,2020-06-22,0.0,0.0,0.0,0.031994,0.0,0.0,2422.14,3086.460004,0.0,...,0,0,0,0,0,0,0,0,1,0


# X and Y initialization

In [137]:
X=df.drop(['week_start','Sales','retail_total_bledina_product_volume','retail_total_bledina_product_price','macroeconomic_total_total_total_population','macroeconomic_total_total_total_weeklycases','macroeconomic_total_total_total_unemploymentrate','crm_automatic_bledina_brand_emails'],axis=1)
y=df['Sales']

In [138]:
O = df.drop(['week_start','Sales','retail_total_bledina_product_volume','retail_total_bledina_product_price','macroeconomic_total_total_total_population'],axis=1)

In [139]:
O.head()

Unnamed: 0,digital_catchuptv_bledina_brand-equity_spends,digital_dataretailers_bledina_brand-equity_spends,digital_directbuying_bledina_brand-equity_spends,digital_keywordtargeting_bledina_brand-equity_spends,digital_nativeads_bledina_brand-equity_spends,digital_programmatic_bledina_brand-equity_spends,digital_sea_bledina_brand-equity_spends,digital_social_bledina_brand-equity_spends,press_equity_bledina_brand-equity_spends,tv_equity_bledina_brand-equity_spends,...,7,8,9,10,11,12,2,3,4,5
0,0.01,0.0,0.01,0.0,2367.4,0.06,16089.49,0.01,0.0,2369.39,...,0,0,0,0,0,0,0,0,0,0
1,0.0,0.0,0.01,0.0,2785.65,0.05,11618.44,10034.99,0.0,2470.6,...,0,0,0,0,0,0,1,0,0,0
2,0.0,0.0,0.01,0.0,2708.31,0.08,10388.53,6955.56,0.0,2470.6,...,0,0,0,0,0,0,0,1,0,0
3,0.0,0.0,0.0,0.0,2473.94,0.08,9699.73,4630.01,0.0,2470.6,...,0,0,0,0,0,0,0,0,1,0
4,0.0,0.0,0.01,0.0,618.99,0.01,6182.01,2911.88,0.0,2463.25,...,0,0,0,0,0,0,0,0,0,1


# Scaling - MINMAX transfrom

In [140]:
X.columns = X.columns.astype(str)

In [141]:
X

Unnamed: 0,digital_catchuptv_bledina_brand-equity_spends,digital_dataretailers_bledina_brand-equity_spends,digital_directbuying_bledina_brand-equity_spends,digital_keywordtargeting_bledina_brand-equity_spends,digital_nativeads_bledina_brand-equity_spends,digital_programmatic_bledina_brand-equity_spends,digital_sea_bledina_brand-equity_spends,digital_social_bledina_brand-equity_spends,press_equity_bledina_brand-equity_spends,tv_equity_bledina_brand-equity_spends,...,7,8,9,10,11,12,2,3,4,5
0,0.01,0.0,0.01,0.000000,2367.40,0.060000,16089.49,0.010000,0.0,2369.390000,...,0,0,0,0,0,0,0,0,0,0
1,0.00,0.0,0.01,0.000000,2785.65,0.050000,11618.44,10034.990000,0.0,2470.600000,...,0,0,0,0,0,0,1,0,0,0
2,0.00,0.0,0.01,0.000000,2708.31,0.080000,10388.53,6955.560000,0.0,2470.600000,...,0,0,0,0,0,0,0,1,0,0
3,0.00,0.0,0.00,0.000000,2473.94,0.080000,9699.73,4630.010000,0.0,2470.600000,...,0,0,0,0,0,0,0,0,1,0
4,0.00,0.0,0.01,0.000000,618.99,0.010000,6182.01,2911.880000,0.0,2463.250000,...,0,0,0,0,0,0,0,0,0,1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
177,0.00,0.0,0.00,6562.143500,0.00,5501.142043,2537.86,8923.720012,0.0,2777.054278,...,0,0,0,0,0,0,0,0,1,0
178,0.00,0.0,0.00,0.031994,0.00,0.000000,2739.21,1075.980003,0.0,2347.751201,...,0,0,0,0,0,0,0,0,0,0
179,0.00,0.0,0.00,0.000000,0.00,0.000000,2782.00,3742.977999,0.0,3132.570888,...,0,0,0,0,0,0,1,0,0,0
180,0.00,0.0,0.00,0.000000,0.00,0.000000,2622.45,4007.615998,0.0,3132.570888,...,0,0,0,0,0,0,0,1,0,0


In [142]:
X.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 182 entries, 0 to 181
Data columns (total 59 columns):
 #   Column                                                Non-Null Count  Dtype  
---  ------                                                --------------  -----  
 0   digital_catchuptv_bledina_brand-equity_spends         182 non-null    float64
 1   digital_dataretailers_bledina_brand-equity_spends     182 non-null    float64
 2   digital_directbuying_bledina_brand-equity_spends      182 non-null    float64
 3   digital_keywordtargeting_bledina_brand-equity_spends  182 non-null    float64
 4   digital_nativeads_bledina_brand-equity_spends         182 non-null    float64
 5   digital_programmatic_bledina_brand-equity_spends      182 non-null    float64
 6   digital_sea_bledina_brand-equity_spends               182 non-null    float64
 7   digital_social_bledina_brand-equity_spends            182 non-null    float64
 8   press_equity_bledina_brand-equity_spends              182 no

In [143]:
scaler = MinMaxScaler()

In [144]:
scaler.fit(X)

In [145]:
X=scaler.transform(X)

In [146]:
X = pd.DataFrame(X)

In [147]:
X

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,49,50,51,52,53,54,55,56,57,58
0,8.915386e-08,0.0,5.752158e-07,0.000000,0.150990,6.946401e-07,1.000000,2.314267e-07,0.0,0.008477,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,0.000000e+00,0.0,5.752158e-07,0.000000,0.177665,5.788667e-07,0.722114,2.322364e-01,0.0,0.008839,...,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0
2,0.000000e+00,0.0,5.752158e-07,0.000000,0.172732,9.261868e-07,0.645672,1.609702e-01,0.0,0.008839,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0
3,0.000000e+00,0.0,0.000000e+00,0.000000,0.157785,9.261868e-07,0.602861,1.071508e-01,0.0,0.008839,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0
4,0.000000e+00,0.0,5.752158e-07,0.000000,0.039478,1.157733e-07,0.384227,6.738867e-02,0.0,0.008813,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
177,0.000000e+00,0.0,0.000000e+00,0.385262,0.000000,6.368856e-02,0.157734,2.065187e-01,0.0,0.009935,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0
178,0.000000e+00,0.0,0.000000e+00,0.000002,0.000000,0.000000e+00,0.170248,2.490105e-02,0.0,0.008400,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
179,0.000000e+00,0.0,0.000000e+00,0.000000,0.000000,0.000000e+00,0.172908,8.662250e-02,0.0,0.011207,...,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0
180,0.000000e+00,0.0,0.000000e+00,0.000000,0.000000,0.000000e+00,0.162991,9.274693e-02,0.0,0.011207,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0


# Test-Train Split

In [148]:
X_train,X_test,y_train,y_test=train_test_split(X,y,test_size=0.3, random_state=42)

In [149]:
# Create a linear regression model to use for feature selection
lr = LinearRegression()
lr.fit(X_train,y_train)

# RFE features

In [150]:
rfe = RFE(estimator=lr,n_features_to_select=20, step=1)

rfe.fit(X_train,y_train)

# Get the list of selected features
selected_features = X.columns[rfe.support_]
selected_df = pd.DataFrame({'Selected Features': selected_features})

# Print the dataframe as a table
selected_df

Unnamed: 0,Selected Features
0,2
1,5
2,7
3,12
4,17
5,20
6,28
7,30
8,31
9,40


In [151]:
selected_features

Int64Index([2, 5, 7, 12, 17, 20, 28, 30, 31, 40, 41, 42, 43, 45, 51, 52, 54,
            55, 56, 57],
           dtype='int64')

In [152]:
n = selected_features.tolist()

In [153]:
n

[2, 5, 7, 12, 17, 20, 28, 30, 31, 40, 41, 42, 43, 45, 51, 52, 54, 55, 56, 57]

df[n].corr()

sns.heatmap(df[n].corr())

In [154]:
list(zip(X_train.columns,rfe.support_,rfe.ranking_))

[(0, False, 31),
 (1, False, 2),
 (2, True, 1),
 (3, False, 22),
 (4, False, 35),
 (5, True, 1),
 (6, False, 18),
 (7, True, 1),
 (8, False, 30),
 (9, False, 7),
 (10, False, 5),
 (11, False, 34),
 (12, True, 1),
 (13, False, 19),
 (14, False, 11),
 (15, False, 21),
 (16, False, 37),
 (17, True, 1),
 (18, False, 25),
 (19, False, 20),
 (20, True, 1),
 (21, False, 9),
 (22, False, 36),
 (23, False, 28),
 (24, False, 17),
 (25, False, 26),
 (26, False, 6),
 (27, False, 16),
 (28, True, 1),
 (29, False, 27),
 (30, True, 1),
 (31, True, 1),
 (32, False, 10),
 (33, False, 32),
 (34, False, 4),
 (35, False, 3),
 (36, False, 29),
 (37, False, 23),
 (38, False, 38),
 (39, False, 39),
 (40, True, 1),
 (41, True, 1),
 (42, True, 1),
 (43, True, 1),
 (44, False, 13),
 (45, True, 1),
 (46, False, 12),
 (47, False, 24),
 (48, False, 33),
 (49, False, 14),
 (50, False, 15),
 (51, True, 1),
 (52, True, 1),
 (53, False, 8),
 (54, True, 1),
 (55, True, 1),
 (56, True, 1),
 (57, True, 1),
 (58, False, 4

In [155]:
n = selected_features.tolist()

In [156]:
n

[2, 5, 7, 12, 17, 20, 28, 30, 31, 40, 41, 42, 43, 45, 51, 52, 54, 55, 56, 57]

In [157]:
X_train.columns

RangeIndex(start=0, stop=59, step=1)

In [158]:
X_train = X_train[selected_features]

# Adding Values

In [159]:
X_train['constant']=1

In [160]:
X_train

Unnamed: 0,2,5,7,12,17,20,28,30,31,40,...,42,43,45,51,52,54,55,56,57,constant
162,0.000000,0.000000e+00,0.000012,0.000000,0.430517,0.032040,0.85,0.120573,0.0,0.010259,...,0.377507,0.653051,0.0,0.0,0.0,0.0,1.0,0.0,0.0,1
153,0.000000,0.000000e+00,0.074275,0.000000,0.411815,0.081744,0.70,0.595328,0.0,0.089406,...,0.471900,0.827224,0.0,0.0,0.0,1.0,1.0,0.0,0.0,1
109,0.000000,1.229298e-01,0.000000,0.000000,0.000000,0.000000,0.35,0.000000,0.0,0.015221,...,0.314327,0.627596,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1
148,0.000000,0.000000e+00,0.000023,0.000000,0.230897,0.009855,0.85,0.512434,0.0,0.108650,...,0.516700,0.844931,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1
26,0.000000,0.000000e+00,0.000000,0.000000,0.000000,0.039510,0.80,1.000000,0.0,0.065596,...,0.355736,0.668084,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
106,0.000000,1.404456e-01,0.000000,0.000000,0.000000,0.000000,0.20,0.602864,0.0,0.213791,...,0.437180,0.496982,0.0,0.0,0.0,0.0,1.0,0.0,0.0,1
14,0.986215,9.645217e-02,0.000000,0.000000,0.000000,0.436421,0.60,0.399020,0.0,0.044335,...,0.343450,0.677316,0.0,0.0,0.0,0.0,1.0,0.0,0.0,1
92,0.000000,0.000000e+00,0.000000,0.000000,0.000000,0.000000,0.30,0.880784,0.0,0.032033,...,0.242820,0.593920,0.0,0.0,1.0,0.0,1.0,0.0,0.0,1
179,0.000000,0.000000e+00,0.086622,0.317978,0.614943,0.046458,0.25,0.567898,1.0,0.008841,...,0.631634,0.571320,0.0,0.0,0.0,0.0,1.0,0.0,0.0,1


In [161]:
y_train = pd.DataFrame(y_train)

In [162]:
y_train

Unnamed: 0,Sales
162,4677875.0
153,4913048.0
109,5263122.5
148,5511442.5
26,6100135.0
...,...
106,5316150.0
14,5428030.0
92,5004340.0
179,5183900.0


In [163]:
model = sm.OLS(y_train, X_train).fit()

In [164]:
print(model.summary())

                            OLS Regression Results                            
Dep. Variable:                  Sales   R-squared:                       0.726
Model:                            OLS   Adj. R-squared:                  0.674
Method:                 Least Squares   F-statistic:                     14.01
Date:                Sun, 23 Apr 2023   Prob (F-statistic):           2.08e-21
Time:                        03:29:26   Log-Likelihood:                -1763.0
No. Observations:                 127   AIC:                             3568.
Df Residuals:                     106   BIC:                             3628.
Df Model:                          20                                         
Covariance Type:            nonrobust                                         
                 coef    std err          t      P>|t|      [0.025      0.975]
------------------------------------------------------------------------------
2           3.169e+05   1.78e+05      1.779      0.0

In [168]:
X_train.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 127 entries, 162 to 102
Data columns (total 21 columns):
 #   Column    Non-Null Count  Dtype  
---  ------    --------------  -----  
 0   2         127 non-null    float64
 1   5         127 non-null    float64
 2   7         127 non-null    float64
 3   12        127 non-null    float64
 4   17        127 non-null    float64
 5   20        127 non-null    float64
 6   28        127 non-null    float64
 7   30        127 non-null    float64
 8   31        127 non-null    float64
 9   40        127 non-null    float64
 10  41        127 non-null    float64
 11  42        127 non-null    float64
 12  43        127 non-null    float64
 13  45        127 non-null    float64
 14  51        127 non-null    float64
 15  52        127 non-null    float64
 16  54        127 non-null    float64
 17  55        127 non-null    float64
 18  56        127 non-null    float64
 19  57        127 non-null    float64
 20  constant  127 non-null    int6

In [166]:
O.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 182 entries, 0 to 181
Data columns (total 62 columns):
 #   Column                                                Non-Null Count  Dtype  
---  ------                                                --------------  -----  
 0   digital_catchuptv_bledina_brand-equity_spends         182 non-null    float64
 1   digital_dataretailers_bledina_brand-equity_spends     182 non-null    float64
 2   digital_directbuying_bledina_brand-equity_spends      182 non-null    float64
 3   digital_keywordtargeting_bledina_brand-equity_spends  182 non-null    float64
 4   digital_nativeads_bledina_brand-equity_spends         182 non-null    float64
 5   digital_programmatic_bledina_brand-equity_spends      182 non-null    float64
 6   digital_sea_bledina_brand-equity_spends               182 non-null    float64
 7   digital_social_bledina_brand-equity_spends            182 non-null    float64
 8   press_equity_bledina_brand-equity_spends              182 no

In [167]:
import statsmodels.api as sm

# Get the selected features from the previous step
X_selected = X_train
# Calculate the VIF for each feature
vif = pd.DataFrame()
vif["Features"] = X_selected.columns
vif["VIF Factor"] = [sm.OLS(X_selected[col], X_selected.drop(col, axis=1)).fit().rsquared for col in X_selected.columns]

# Print the VIF for each feature
print(vif)

    Features  VIF Factor
0          2    0.280286
1          5    0.335926
2          7    0.375546
3         12    0.416698
4         17    0.780774
5         20    0.392225
6         28    0.493301
7         30    0.396376
8         31    0.719935
9         40    0.204508
10        41    0.658893
11        42    0.573617
12        43    0.492732
13        45    0.240237
14        51    0.145871
15        52    0.199443
16        54    0.270241
17        55    0.378926
18        56    0.376153
19        57    0.412139
20  constant    0.985539
