In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.preprocessing import LabelEncoder, OrdinalEncoder, StandardScaler

import statsmodels.formula.api as sfa
import statsmodels.api as sma

from sklearn.model_selection import train_test_split,GridSearchCV

from sklearn.linear_model import LinearRegression,Ridge,Lasso

from sklearn.metrics import r2_score, mean_squared_error

from mlxtend.feature_selection import SequentialFeatureSelector
from sklearn.feature_selection import RFE


from statsmodels.stats.outliers_influence import variance_inflation_factor as vif

import warnings
warnings.filterwarnings('ignore')

In [17]:
df = pd.read_csv('SLR_LI.csv')
df.head()

Unnamed: 0,ID,Day_No,Course_ID,Course_Domain,Course_Type,Short_Promotion,Public_Holiday,Long_Promotion,User_Traffic,Competition_Metric,Sales
0,1,1,1,Development,Course,0,1,1,11004,0.007,81
1,2,2,1,Development,Course,0,0,1,13650,0.007,79
2,3,3,1,Development,Course,0,0,1,11655,0.007,75
3,4,4,1,Development,Course,0,0,1,12054,0.007,80
4,5,5,1,Development,Course,0,0,1,6804,0.007,41


In [63]:
df.shape

(512087, 11)

In [3]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 512087 entries, 0 to 512086
Data columns (total 11 columns):
 #   Column              Non-Null Count   Dtype  
---  ------              --------------   -----  
 0   ID                  512087 non-null  int64  
 1   Day_No              512087 non-null  int64  
 2   Course_ID           512087 non-null  int64  
 3   Course_Domain       512087 non-null  object 
 4   Course_Type         512087 non-null  object 
 5   Short_Promotion     512087 non-null  int64  
 6   Public_Holiday      512087 non-null  int64  
 7   Long_Promotion      512087 non-null  int64  
 8   User_Traffic        512087 non-null  int64  
 9   Competition_Metric  510323 non-null  float64
 10  Sales               512087 non-null  int64  
dtypes: float64(1), int64(8), object(2)
memory usage: 43.0+ MB


In [4]:
df.dtypes

ID                      int64
Day_No                  int64
Course_ID               int64
Course_Domain          object
Course_Type            object
Short_Promotion         int64
Public_Holiday          int64
Long_Promotion          int64
User_Traffic            int64
Competition_Metric    float64
Sales                   int64
dtype: object

In [5]:
df.describe()

Unnamed: 0,ID,Day_No,Course_ID,Short_Promotion,Public_Holiday,Long_Promotion,User_Traffic,Competition_Metric,Sales
count,512087.0,512087.0,512087.0,512087.0,512087.0,512087.0,512087.0,510323.0,512087.0
mean,274007.30065,434.917869,300.388344,0.380244,0.031639,0.488968,15375.101198,0.073345,120.826924
std,158228.834029,256.044161,173.365787,0.485447,0.175038,0.499879,7727.231205,0.100115,54.355258
min,1.0,1.0,1.0,0.0,0.0,0.0,168.0,0.0,0.0
25%,136962.5,214.0,150.0,0.0,0.0,0.0,10584.0,0.01,84.0
50%,273984.0,427.0,300.0,0.0,0.0,0.0,13776.0,0.035,111.0
75%,411065.5,658.0,451.0,1.0,0.0,1.0,18123.0,0.094,146.0
max,548027.0,882.0,600.0,1.0,1.0,1.0,100002.0,0.768,682.0


In [43]:
df.skew()

ID                    0.000167
Day_No                0.058609
Course_ID             0.002538
Short_Promotion       0.493386
Public_Holiday        5.351565
Long_Promotion        0.044140
User_Traffic          0.029702
Competition_Metric    0.493669
Sales                 1.374770
dtype: float64

In [19]:
from sklearn.preprocessing import PowerTransformer
pt = PowerTransformer()

df['User_Traffic'] = pt.fit_transform(df[['User_Traffic']])

In [21]:
df['Competition_Metric'] = pt.fit_transform(df[['Competition_Metric']])

In [22]:
df.skew()

  df.skew()


ID                    0.000167
Day_No                0.058609
Course_ID             0.002538
Short_Promotion       0.493386
Public_Holiday        5.351565
Long_Promotion        0.044140
User_Traffic          0.029702
Competition_Metric    0.490800
Sales                 1.374770
dtype: float64

In [25]:
df.isna().sum()

ID                       0
Day_No                   0
Course_ID                0
Course_Domain            0
Course_Type              0
Short_Promotion          0
Public_Holiday           0
Long_Promotion           0
User_Traffic             0
Competition_Metric    1764
Sales                    0
dtype: int64

In [30]:
df['Competition_Metric'].fillna(df['Competition_Metric'].median(), inplace=True)

In [61]:
q1 = df.quantile(0.25)
q3 = df.quantile(0.75)

IQR = q3 - q1
ul = q3 + 1.5 * IQR
ll = q1 - 1.5 * IQR

outliers = df[((df < ll) | (df > ul)).any(axis =1)]

In [62]:
outliers

Unnamed: 0,ID,Day_No,Course_ID,Course_Domain,Course_Type,Short_Promotion,Public_Holiday,Long_Promotion,User_Traffic,Competition_Metric,Sales
0,1,1,1,Development,Course,0,1,1,-0.521700,-1.014316,81
87,88,88,1,Development,Course,1,1,1,-0.985692,-1.014316,60
90,91,91,1,Development,Course,0,1,1,-3.461981,-1.014316,15
120,121,121,1,Development,Course,1,1,1,-1.311467,-1.014316,44
128,129,129,1,Development,Course,0,1,1,-1.199506,-1.014316,41
...,...,...,...,...,...,...,...,...,...,...,...
512027,547968,823,600,Software Marketing,Program,1,1,1,-0.481152,0.484457,141
512030,547971,826,600,Software Marketing,Program,0,1,1,-3.214809,0.484457,36
512055,547996,851,600,Software Marketing,Program,1,1,1,-0.682432,0.484457,128
512068,548009,864,600,Software Marketing,Program,0,1,1,-1.166954,0.484457,88


In [82]:
df_num = df.select_dtypes(include=np.number)
df_cat = df.select_dtypes(include=object)
cat = df_cat.columns
cat

Index(['Course_Domain', 'Course_Type'], dtype='object')

In [83]:
df1 = pd.get_dummies(data=df, columns=cat, drop_first=True)

In [129]:
df1.head()     #ENCODED (DF1) DATASET !

Unnamed: 0,ID,Day_No,Course_ID,Short_Promotion,Public_Holiday,Long_Promotion,User_Traffic,Competition_Metric,Sales,Course_Domain_Development,Course_Domain_Finance & Accounting,Course_Domain_Software Marketing,Course_Type_Degree,Course_Type_Program
0,1,1,1,0,1,1,-0.5217,-1.014316,81,1,0,0,0,0
1,2,2,1,0,0,1,-0.055007,-1.014316,79,1,0,0,0,0
2,3,3,1,0,0,1,-0.398182,-1.014316,75,1,0,0,0,0
3,4,4,1,0,0,1,-0.325518,-1.014316,80,1,0,0,0,0
4,5,5,1,0,0,1,-1.527939,-1.014316,41,1,0,0,0,0


In [130]:
df1.drop('Course_ID', axis=1, inplace=True)  #ID, Course_ID is waste to go with for Modelling...

In [134]:
x = df1.drop('Sales', axis=1)
y = df1['Sales']

xtrain, xtest, ytrain, ytest = train_test_split(x,y,test_size=0.30,random_state=1) 

xc = sma.add_constant(x)

In [135]:
model_1 = sma.OLS(y,xc).fit()
model_1.summary()

0,1,2,3
Dep. Variable:,Sales,R-squared:,0.785
Model:,OLS,Adj. R-squared:,0.785
Method:,Least Squares,F-statistic:,156200.0
Date:,"Wed, 15 Nov 2023",Prob (F-statistic):,0.0
Time:,12:28:13,Log-Likelihood:,-2378700.0
No. Observations:,512087,AIC:,4757000.0
Df Residuals:,512074,BIC:,4758000.0
Df Model:,12,,
Covariance Type:,nonrobust,,

0,1,2,3,4,5,6
,coef,std err,t,P>|t|,[0.025,0.975]
const,80.0626,0.502,159.340,0.000,79.078,81.047
ID,-1.728e-06,2.24e-07,-7.710,0.000,-2.17e-06,-1.29e-06
Day_No,0.0084,0.000,61.146,0.000,0.008,0.009
Short_Promotion,16.9467,0.075,225.108,0.000,16.799,17.094
Public_Holiday,12.0659,0.206,58.612,0.000,11.662,12.469
Long_Promotion,7.3798,0.074,100.027,0.000,7.235,7.524
User_Traffic,49.0399,0.042,1178.373,0.000,48.958,49.121
Competition_Metric,5.5252,0.038,144.744,0.000,5.450,5.600
Course_Domain_Development,20.0820,0.500,40.147,0.000,19.102,21.062

0,1,2,3
Omnibus:,169428.177,Durbin-Watson:,0.723
Prob(Omnibus):,0.0,Jarque-Bera (JB):,1044147.051
Skew:,1.457,Prob(JB):,0.0
Kurtosis:,9.36,Cond. No.,10300000.0


In [153]:
from statsmodels.stats.outliers_influence import variance_inflation_factor as vif
vf = (vif(xc.values,i) for i in range(xc.shape[1]))
pd.DataFrame(vf, index=xc.columns, columns=['vif'])

Unnamed: 0,vif
const,9.561227
ID,1.011486
Day_No,1.001443
Short_Promotion,1.074055
Public_Holiday,1.045985
Long_Promotion,1.079919
User_Traffic,1.317359
Competition_Metric,1.123621
Course_Domain_Finance & Accounting,1.036394
Course_Type_Degree,1.058565


In [150]:
X = df1.drop(['Course_Domain_Development','Course_Domain_Software Marketing','Sales'], axis=1)
Y = df1['Sales']

xtrain, xtest, ytrain, ytest = train_test_split(X,Y,test_size=0.30,random_state=1)

xc = sma.add_constant(X)

In [151]:
model_2 = sma.OLS(Y,xc).fit()
model_2.summary() 

0,1,2,3
Dep. Variable:,Sales,R-squared:,0.768
Model:,OLS,Adj. R-squared:,0.768
Method:,Least Squares,F-statistic:,169500.0
Date:,"Wed, 15 Nov 2023",Prob (F-statistic):,0.0
Time:,13:00:30,Log-Likelihood:,-2398700.0
No. Observations:,512087,AIC:,4797000.0
Df Residuals:,512076,BIC:,4797000.0
Df Model:,10,,
Covariance Type:,nonrobust,,

0,1,2,3,4,5,6
,coef,std err,t,P>|t|,[0.025,0.975]
const,103.3237,0.113,913.233,0.000,103.102,103.545
ID,7.229e-10,2.33e-07,0.003,0.998,-4.55e-07,4.57e-07
Day_No,0.0083,0.000,58.227,0.000,0.008,0.009
Short_Promotion,17.8130,0.078,228.037,0.000,17.660,17.966
Public_Holiday,10.2740,0.214,48.056,0.000,9.855,10.693
Long_Promotion,9.2468,0.076,121.562,0.000,9.098,9.396
User_Traffic,47.2515,0.042,1125.127,0.000,47.169,47.334
Competition_Metric,7.1063,0.039,182.918,0.000,7.030,7.182
Course_Domain_Finance & Accounting,-9.8851,0.104,-94.958,0.000,-10.089,-9.681

0,1,2,3
Omnibus:,149900.567,Durbin-Watson:,0.656
Prob(Omnibus):,0.0,Jarque-Bera (JB):,807700.005
Skew:,1.31,Prob(JB):,0.0
Kurtosis:,8.566,Cond. No.,5560000.0


In [162]:
from mlxtend.feature_selection import SequentialFeatureSelector
from sklearn.feature_selection import RFE
from sklearn.linear_model import LinearRegression
lr = LinearRegression()

sfs_forward = SequentialFeatureSelector(estimator=lr, k_features=5, forward=True)
sfs_forward.fit(xtrain, ytrain)
sfs_forward.k_feature_names_

('Short_Promotion',
 'Long_Promotion',
 'User_Traffic',
 'Competition_Metric',
 'Course_Type_Degree')

In [None]:
train test split pannu

In [None]:
after ols model pannu

In [None]:
ridge/lasso

tune it gridsearchCV

ridge/lasso

final model (ols way/sklearn way)