In [6]:
from pathlib import Path

import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression, Lasso, Ridge, LassoCV, BayesianRidge
import statsmodels.formula.api as sm
import matplotlib.pylab as plt

import dmba
from dmba import regressionSummary, exhaustive_search
from dmba import backward_elimination, forward_selection, stepwise_selection
from dmba import adjusted_r2_score, AIC_score, BIC_score

no display found. Using non-interactive Agg backend


In [15]:
# Load the Toyota Corolla listings via dmba and keep only the first 1000 rows.
car_df = dmba.load_data('ToyotaCorolla.csv').iloc[:1000]

In [16]:
# Summary statistics (count, mean, std, min, quartiles, max) for the numeric columns of car_df.
car_df.describe()


Unnamed: 0,Id,Price,Age_08_04,Mfg_Month,Mfg_Year,KM,HP,Met_Color,Automatic,CC,...,Powered_Windows,Power_Steering,Radio,Mistlamps,Sport_Model,Backseat_Divider,Metallic_Rim,Radio_cassette,Parking_Assistant,Tow_Bar
count,1000.0,1000.0,1000.0,1000.0,1000.0,1000.0,1000.0,1000.0,1000.0,1000.0,...,1000.0,1000.0,1000.0,1000.0,1000.0,1000.0,1000.0,1000.0,1000.0,1000.0
mean,502.538,11860.796,48.034,5.486,2000.29,62669.429,102.188,0.683,0.048,1590.18,...,0.614,0.983,0.158,0.306,0.288,0.798,0.224,0.157,0.004,0.245
std,289.858781,3748.829864,16.696215,3.299195,1.383407,35414.796253,15.571484,0.46554,0.213873,492.203496,...,0.487074,0.129336,0.364924,0.46106,0.453058,0.401693,0.417131,0.363983,0.063151,0.430302
min,1.0,4350.0,1.0,1.0,1999.0,1.0,69.0,0.0,0.0,1300.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
25%,251.75,9450.0,39.0,3.0,1999.0,38924.0,97.0,0.0,0.0,1400.0,...,0.0,1.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0
50%,502.5,10900.0,52.0,5.0,2000.0,58288.5,110.0,1.0,0.0,1600.0,...,1.0,1.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0
75%,753.25,12950.0,62.0,8.0,2001.0,79446.25,110.0,1.0,0.0,1600.0,...,1.0,1.0,0.0,1.0,1.0,1.0,0.0,0.0,0.0,0.0
max,1004.0,32500.0,68.0,12.0,2004.0,243000.0,192.0,1.0,1.0,16000.0,...,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0


In [20]:
# Print each column's name, non-null count and dtype to check for missing values and non-numeric columns.
car_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1000 entries, 0 to 999
Data columns (total 39 columns):
 #   Column             Non-Null Count  Dtype 
---  ------             --------------  ----- 
 0   Id                 1000 non-null   int64 
 1   Model              1000 non-null   object
 2   Price              1000 non-null   int64 
 3   Age_08_04          1000 non-null   int64 
 4   Mfg_Month          1000 non-null   int64 
 5   Mfg_Year           1000 non-null   int64 
 6   KM                 1000 non-null   int64 
 7   Fuel_Type          1000 non-null   object
 8   HP                 1000 non-null   int64 
 9   Met_Color          1000 non-null   int64 
 10  Color              1000 non-null   object
 11  Automatic          1000 non-null   int64 
 12  CC                 1000 non-null   int64 
 13  Doors              1000 non-null   int64 
 14  Cylinders          1000 non-null   int64 
 15  Gears              1000 non-null   int64 
 16  Quarterly_Tax      1000 non-null   int64 
 

In [23]:
# Model specification: the target column and the nine predictor columns.
outcome = 'Price'
predictors = [
    'Age_08_04', 'KM', 'HP', 'Met_Color', 'Automatic',
    'CC', 'Doors', 'Quarterly_Tax', 'Weight',
]

# Target vector and design matrix. get_dummies one-hot encodes object/categorical
# columns (dropping the first level of each); numeric columns pass through unchanged.
y = car_df[outcome]
X = pd.get_dummies(car_df[predictors], drop_first=True)

# 60/40 train/validation partition; random_state is fixed so the split is repeatable.
train_X, valid_X, train_y, valid_y = train_test_split(
    X, y, test_size=0.4, random_state=1
)

# Fit the linear regression on the training partition only.
car_lm = LinearRegression().fit(train_X, train_y)

# Report the fitted intercept, then one coefficient per design-matrix column.
print('intercept ', car_lm.intercept_)
print(pd.DataFrame({
    'Predictor': X.columns,
    'coefficient': car_lm.coef_,
}))

intercept  2237.1286350864284
       Predictor  coefficient
0      Age_08_04  -140.024285
1             KM    -0.020691
2             HP    41.095039
3      Met_Color    55.563729
4      Automatic   437.920023
5             CC     0.005057
6          Doors   -22.436326
7  Quarterly_Tax     6.942553
8         Weight    11.854104


In [24]:
# Print error metrics (ME, RMSE, MAE, MPE, MAPE) comparing the training targets
# with the model's predictions on those same training rows (in-sample fit).
regressionSummary(train_y, car_lm.predict(train_X))


Regression statistics

                      Mean Error (ME) : -0.0000
       Root Mean Squared Error (RMSE) : 1426.6036
            Mean Absolute Error (MAE) : 1070.0948
          Mean Percentage Error (MPE) : -1.0984
Mean Absolute Percentage Error (MAPE) : 9.5823


In [25]:
# Predictions on the training partition.
# NOTE(review): originally labelled "Create r statistics"; presumably pred_y feeds
# R²-type fit statistics further down — confirm against the rest of this cell.
pred_y = car_lm.predict(train_X)

# Predictions on the held-out validation partition.
car_lm_pred = car_lm.predict(valid_X)