In [11]:
# imports 

import pandas as pd
import statsmodels.api as sm
from sklearn.linear_model import LinearRegression
import matplotlib.pyplot as plt

In [12]:
# Load the auto-mpg dataset from the local ./data directory and preview
# the first rows to check that the columns were parsed as expected.
df = pd.read_csv('./data/auto-mpg.csv')
df.head()

Unnamed: 0,mpg,cylinders,displacement,horsepower,weight,acceleration,model year,origin,car name
0,18.0,8,307.0,130,3504,12.0,70,1,chevrolet chevelle malibu
1,15.0,8,350.0,165,3693,11.5,70,1,buick skylark 320
2,18.0,8,318.0,150,3436,11.0,70,1,plymouth satellite
3,16.0,8,304.0,150,3433,12.0,70,1,amc rebel sst
4,17.0,8,302.0,140,3449,10.5,70,1,ford torino


In [13]:
# Drop the free-text 'car name' column; it is not used as a predictor below.
# Reassigning (instead of inplace=True) together with errors='ignore' keeps
# this cell idempotent: re-running it after the column has already been
# removed no longer raises a KeyError.
df = df.drop(columns=['car name'], errors='ignore')

In [14]:
# Re-display the frame to confirm the 'car name' column is gone.
df.head()

Unnamed: 0,mpg,cylinders,displacement,horsepower,weight,acceleration,model year,origin
0,18.0,8,307.0,130,3504,12.0,70,1
1,15.0,8,350.0,165,3693,11.5,70,1
2,18.0,8,318.0,150,3436,11.0,70,1
3,16.0,8,304.0,150,3433,12.0,70,1
4,17.0,8,302.0,140,3449,10.5,70,1


In [15]:
# Work on a copy so the loaded frame itself is not modified, then split it
# into the predictor matrix (every column except mpg) and the target (mpg).
model_df = df.copy()
X = model_df.drop(columns='mpg')
y = model_df['mpg']
X.head()

Unnamed: 0,cylinders,displacement,horsepower,weight,acceleration,model year,origin
0,8,307.0,130,3504,12.0,70,1
1,8,350.0,165,3693,11.5,70,1
2,8,318.0,150,3436,11.0,70,1
3,8,304.0,150,3433,12.0,70,1
4,8,302.0,140,3449,10.5,70,1


In [16]:
def print_results(sk_model,ols_model):
    """Print the intercepts and coefficients of both fits for comparison.

    Parameters
    ----------
    sk_model : object exposing ``intercept_`` and ``coef_``
        The fitted scikit-learn estimator.
    ols_model : object exposing ``params``
        The fitted statsmodels results; ``params`` must be indexable by the
        key ``"const"`` (a KeyError is raised otherwise).

    Returns
    -------
    None
        The report is written to standard output only.
    """
    # Read the values in the same order the report lists them.
    ols_intercept = ols_model.params["const"]
    sk_intercept = sk_model.intercept_
    ols_params = ols_model.params
    sk_coefs = sk_model.coef_

    # Leading/trailing empty entries reproduce the blank lines around the report.
    report_lines = [
        "",
        "",
        f"StatsModels intercept:    {ols_intercept}",
        f"scikit-learn intercept:   {sk_intercept}",
        "",
        "StatsModels coefficient:",
        f"{ols_params}",
        f"scikit-learn coefficient: {sk_coefs}",
        "",
    ]
    print("\n".join(report_lines))

In [17]:
def build_ols_sk_model(X,y):
    """Fit the same linear regression with statsmodels and scikit-learn.

    The statsmodels fit is given ``sm.add_constant(X)`` as its design matrix,
    while scikit-learn's ``LinearRegression`` is fitted on ``X`` directly.
    Both sets of estimates are printed via ``print_results`` so they can be
    compared side by side.

    Parameters
    ----------
    X : predictors passed to both libraries.
    y : regression target passed to both libraries.

    Returns
    -------
    tuple
        ``(ols_results, sk_results)``: the object returned by
        ``sm.OLS(...).fit()`` and the object returned by
        ``LinearRegression().fit(...)``, in that order.
    """
    design_matrix = sm.add_constant(X)
    ols_fit = sm.OLS(y, design_matrix).fit()

    sk_fit = LinearRegression().fit(X=X, y=y)

    print_results(sk_fit, ols_fit)

    return ols_fit, sk_fit

In [18]:
# Baseline fit on all predictors in X: the helper prints both libraries'
# estimates, and the statsmodels summary is displayed as the cell output.
ols_results,sk_results =  build_ols_sk_model(X=X,y=y)
ols_results.summary()



StatsModels intercept:    -17.218434622018332
scikit-learn intercept:   -17.21843462201748

StatsModels coefficient:
const          -17.218435
cylinders       -0.493376
displacement     0.019896
horsepower      -0.016951
weight          -0.006474
acceleration     0.080576
model year       0.750773
origin           1.426140
dtype: float64
scikit-learn coefficient: [-0.49337632  0.01989564 -0.01695114 -0.00647404  0.08057584  0.75077268
  1.4261405 ]



0,1,2,3
Dep. Variable:,mpg,R-squared:,0.821
Model:,OLS,Adj. R-squared:,0.818
Method:,Least Squares,F-statistic:,252.4
Date:,"Thu, 27 Nov 2025",Prob (F-statistic):,2.04e-139
Time:,16:03:53,Log-Likelihood:,-1023.5
No. Observations:,392,AIC:,2063.0
Df Residuals:,384,BIC:,2095.0
Df Model:,7,,
Covariance Type:,nonrobust,,

0,1,2,3,4,5,6
,coef,std err,t,P>|t|,[0.025,0.975]
const,-17.2184,4.644,-3.707,0.000,-26.350,-8.087
cylinders,-0.4934,0.323,-1.526,0.128,-1.129,0.142
displacement,0.0199,0.008,2.647,0.008,0.005,0.035
horsepower,-0.0170,0.014,-1.230,0.220,-0.044,0.010
weight,-0.0065,0.001,-9.929,0.000,-0.008,-0.005
acceleration,0.0806,0.099,0.815,0.415,-0.114,0.275
model year,0.7508,0.051,14.729,0.000,0.651,0.851
origin,1.4261,0.278,5.127,0.000,0.879,1.973

0,1,2,3
Omnibus:,31.906,Durbin-Watson:,1.309
Prob(Omnibus):,0.0,Jarque-Bera (JB):,53.1
Skew:,0.529,Prob(JB):,2.95e-12
Kurtosis:,4.46,Cond. No.,85900.0


In [19]:
# Extend the predictors with a weight x horsepower interaction term.
# assign() returns a new frame, so X itself is left unchanged.
X1 = X.assign(**{'W*H': X['weight'] * X['horsepower']})
X1.head()

Unnamed: 0,cylinders,displacement,horsepower,weight,acceleration,model year,origin,W*H
0,8,307.0,130,3504,12.0,70,1,455520
1,8,350.0,165,3693,11.5,70,1,609345
2,8,318.0,150,3436,11.0,70,1,515400
3,8,304.0,150,3433,12.0,70,1,514950
4,8,302.0,140,3449,10.5,70,1,482860


In [20]:
# Refit both models on X1, which includes the 'W*H' interaction column,
# and display the statsmodels summary for this second fit.
ols_X1,sk_X1 =  build_ols_sk_model(X=X1,y=y)
ols_X1.summary()



StatsModels intercept:    2.875748260332685
scikit-learn intercept:   2.8757482603374207

StatsModels coefficient:
const           2.875748
cylinders      -0.029551
displacement    0.005950
horsepower     -0.231327
weight         -0.011215
acceleration   -0.090193
model year      0.769461
origin          0.834402
W*H             0.000055
dtype: float64
scikit-learn coefficient: [-2.95514105e-02  5.94989005e-03 -2.31326725e-01 -1.12146512e-02
 -9.01930212e-02  7.69461261e-01  8.34401609e-01  5.52885253e-05]



0,1,2,3
Dep. Variable:,mpg,R-squared:,0.862
Model:,OLS,Adj. R-squared:,0.859
Method:,Least Squares,F-statistic:,298.6
Date:,"Thu, 27 Nov 2025",Prob (F-statistic):,1.88e-159
Time:,16:05:26,Log-Likelihood:,-973.24
No. Observations:,392,AIC:,1964.0
Df Residuals:,383,BIC:,2000.0
Df Model:,8,,
Covariance Type:,nonrobust,,

0,1,2,3,4,5,6
,coef,std err,t,P>|t|,[0.025,0.975]
const,2.8757,4.511,0.638,0.524,-5.993,11.744
cylinders,-0.0296,0.288,-0.103,0.918,-0.596,0.537
displacement,0.0059,0.007,0.881,0.379,-0.007,0.019
horsepower,-0.2313,0.024,-9.791,0.000,-0.278,-0.185
weight,-0.0112,0.001,-15.393,0.000,-0.013,-0.010
acceleration,-0.0902,0.089,-1.019,0.309,-0.264,0.084
model year,0.7695,0.045,17.124,0.000,0.681,0.858
origin,0.8344,0.251,3.320,0.001,0.340,1.329
W*H,5.529e-05,5.23e-06,10.577,0.000,4.5e-05,6.56e-05

0,1,2,3
Omnibus:,40.936,Durbin-Watson:,1.474
Prob(Omnibus):,0.0,Jarque-Bera (JB):,73.199
Skew:,0.629,Prob(JB):,1.27e-16
Kurtosis:,4.703,Cond. No.,12300000.0
