In [5]:
import numpy as np
import pandas as pd
import statsmodels.api as sm
import statsmodels.formula.api as smf

In [6]:
data = pd.read_csv("sales.csv")

In [7]:
data.head()

Unnamed: 0,sales,price,brand,feat
0,8256.0,3.87,tropicana,0
1,6144.0,3.87,tropicana,0
2,3840.0,3.87,tropicana,0
3,8000.0,3.87,tropicana,0
4,8896.0,3.87,tropicana,0


In [8]:
data["brand"].unique()

array(['tropicana', 'minute.maid', 'dominicks'], dtype=object)

In [12]:
# Regression Model with no interaction terms
#brand value = 0 if absent and 1 if present

model_1 = smf.glm(formula = "np.log(sales) ~ brand + np.log(price)", data = data)
result_1 = model_1.fit()

print(result_1.params)

Intercept               10.279095
brand[T.minute.maid]     0.681564
brand[T.tropicana]       1.301756
np.log(price)           -2.529893
feat                     0.890625
dtype: float64


In [14]:
# With interaction between 'log(price)', 'brand' & 'feat'
model_2 = smf.glm(formula = "np.log(sales) ~ np.log(price) * brand * feat", data = data)
result_2 = model_2.fit()

print(result_2.params)

Intercept                                  10.406576
brand[T.minute.maid]                        0.047203
brand[T.tropicana]                          0.707941
np.log(price)                              -2.774154
np.log(price):brand[T.minute.maid]          0.782932
np.log(price):brand[T.tropicana]            0.735793
feat                                        1.094407
brand[T.minute.maid]:feat                   1.172944
brand[T.tropicana]:feat                     0.785252
np.log(price):feat                         -0.470553
np.log(price):brand[T.minute.maid]:feat    -1.109224
np.log(price):brand[T.tropicana]:feat      -0.986141
dtype: float64


In [16]:
result_2.summary()
#Df Residuals =  Residual Degree of Freedom
#Df Model = Model Degrees of Freedom

0,1,2,3
Dep. Variable:,np.log(sales),No. Observations:,28947.0
Model:,GLM,Df Residuals:,28935.0
Model Family:,Gaussian,Df Model:,11.0
Link Function:,Identity,Scale:,0.48297
Method:,IRLS,Log-Likelihood:,-30534.0
Date:,"Fri, 08 Mar 2024",Deviance:,13975.0
Time:,19:37:52,Pearson chi2:,14000.0
No. Iterations:,3,Pseudo R-squ. (CS):,0.684
Covariance Type:,nonrobust,,

0,1,2,3,4,5,6
,coef,std err,z,P>|z|,[0.025,0.975]
Intercept,10.4066,0.023,445.668,0.000,10.361,10.452
brand[T.minute.maid],0.0472,0.047,1.012,0.311,-0.044,0.139
brand[T.tropicana],0.7079,0.051,13.937,0.000,0.608,0.808
np.log(price),-2.7742,0.039,-71.445,0.000,-2.850,-2.698
np.log(price):brand[T.minute.maid],0.7829,0.061,12.750,0.000,0.663,0.903
np.log(price):brand[T.tropicana],0.7358,0.057,12.946,0.000,0.624,0.847
feat,1.0944,0.038,28.721,0.000,1.020,1.169
brand[T.minute.maid]:feat,1.1729,0.082,14.312,0.000,1.012,1.334
brand[T.tropicana]:feat,0.7853,0.099,7.952,0.000,0.592,0.979


In [25]:
#Null Deviance
sst = np.sum((np.log(data['sales']) - np.mean(np.log(data['sales']))) ** 2)
print(sst)
print(result_2.null_deviance)

30078.713766381134
30078.713766381137


In [28]:
#Residual Deviance or Deviance
sse = np.sum((np.log(data['sales']) - result_2.fittedvalues) ** 2)
print(sse)
print(result_2.deviance)

13974.755295365494
13974.755295365494


In [22]:
#R^2 -  2 Methods
r2 = 1 - (sse / sst)
print(r2)
#By correlation
correlation_coeff = np.corrcoef(result_2.fittedvalues, np.log(data['sales']))[0][1]
print(correlation_coeff ** 2)

0.5353938534770384
0.5353938534770369


In [24]:
#  residual variance
sigma2 = sse / result_2.df_resid
print(sigma2)

0.4829706340198892


In [30]:
#Predictions
# New dataframe for prediction
exp_data = pd.DataFrame({
    'price': [2, 2, 2],
    'brand': ['tropicana', 'minute.maid', 'dominicks'],
    'feat': [1, 1, 1]
})

print(exp_data)

   price        brand  feat
0      2    tropicana     1
1      2  minute.maid     1
2      2    dominicks     1


In [31]:
sales_pred = np.exp(result_2.predict(exp_data))


In [32]:
sales_pred

0    39010.563728
1    28166.852093
2    10424.587380
dtype: float64