# Linear Regression Analysis of Altima Sales

In [16]:
import pandas as pd
import statsmodels.api as sm
from sklearn.metrics import mean_absolute_percentage_error
from statsmodels.stats.outliers_influence import variance_inflation_factor

In [17]:
# Load the dataset
df = pd.read_excel('Modeling dataset.xlsx', sheet_name='modeling dataset', engine='openpyxl')
df.head()

Unnamed: 0,model,month,gas_price,discount,sales,TV,YouTube,Radio,Print
0,altima,2018-04-01,2.757,2169.005767,6799,0.0,6.313814e-06,0.0,1.398882
1,altima,2018-05-01,2.901,4745.168681,14992,0.0,6.313814e-06,3.300356,0.224208
2,altima,2018-06-01,2.891,4895.379638,19572,0.0,7.892268e-07,6.479757,0.147138
3,altima,2018-07-01,2.849,4923.842447,14876,0.0,0.0,11.619001,0.680676
4,altima,2018-08-01,2.836,5026.378048,14656,0.0,0.0,32.4581,1.633934


In [18]:
# Define the dependent variable (Sales) and independent variables
df['week index']=pd.to_datetime(df['month']).dt.month
df["Seasonality"] = df.groupby(['week index'])["sales"].transform('mean')/df['sales'].mean()
X = df[['gas_price', 'discount', 'TV', 'YouTube', 'Radio', 'Print',"Seasonality"]]
y = df['sales']
X = sm.add_constant(X)

In [19]:
# Fit the linear regression model
model = sm.OLS(y, X).fit()
model_summary = model.summary()
print(model_summary)

                            OLS Regression Results                            
Dep. Variable:                  sales   R-squared:                       0.772
Model:                            OLS   Adj. R-squared:                  0.715
Method:                 Least Squares   F-statistic:                     13.56
Date:                Mon, 19 May 2025   Prob (F-statistic):           1.65e-07
Time:                        03:11:11   Log-Likelihood:                -314.52
No. Observations:                  36   AIC:                             645.0
Df Residuals:                      28   BIC:                             657.7
Df Model:                           7                                         
Covariance Type:            nonrobust                                         
                  coef    std err          t      P>|t|      [0.025      0.975]
-------------------------------------------------------------------------------
const       -1.571e+04   2803.587     -5.604      

In [20]:
# Calculate R-squared and Adjusted R-squared
r_squared = model.rsquared
adj_r_squared = model.rsquared_adj
r_squared, adj_r_squared

(0.7721878466149624, 0.715234808268703)

In [26]:
# Calculate VIF to check for multicollinearity
vif_data = pd.DataFrame()
vif_data['feature'] = X.columns[1:]
vif_data['VIF'] = [variance_inflation_factor(X.iloc[:,1:].values, i) for i in range(len(X.columns[1:]))]
vif_data

Unnamed: 0,feature,VIF
0,gas_price,58.035967
1,discount,61.32241
2,TV,2.880998
3,YouTube,2.966796
4,Radio,2.500499
5,Print,2.877877
6,Seasonality,44.265096


In [22]:
# Calculate MAPE
y_pred = model.predict(X)
mape = mean_absolute_percentage_error(y, y_pred)
mape

0.14734763282498048