In [1]:
#RFE
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import statsmodels.api as sm
from statsmodels.stats.outliers_influence import variance_inflation_factor
from sklearn.feature_selection import RFE
from sklearn.linear_model import LinearRegression
from sklearn.preprocessing import MinMaxScaler

In [2]:
train = pd.read_csv('Data/train.csv')

In [3]:
train.head()

Unnamed: 0,price,wheelbase,curbweight,enginesize,boreratio,horsepower,carlength,carwidth,gas,turbo,...,ohcv,rotor,five,four,six,three,twelve,two,Medium,Highend
0,18344,104.9,2700,134,3.43,72,175.0,66.1,False,False,...,False,False,False,True,False,False,False,False,True,False
1,15580,107.9,3075,120,3.46,95,186.7,68.4,True,False,...,False,False,False,True,False,False,False,False,True,False
2,6918,95.7,2280,92,3.05,62,169.7,63.6,True,False,...,False,False,False,True,False,False,False,False,False,False
3,7499,94.5,1971,97,3.15,69,165.3,63.8,True,False,...,False,False,False,True,False,False,False,False,True,False
4,9095,96.5,2289,110,3.15,86,167.5,65.2,True,False,...,False,False,False,True,False,False,False,False,False,False


In [4]:
scaler = MinMaxScaler()
num_vars = ['wheelbase', 'curbweight', 'enginesize', 'boreratio', 'horsepower','carlength','carwidth','price']
train[num_vars] = scaler.fit_transform(train[num_vars])

In [5]:
columns = list(train.columns)
for column in columns[8:]:
    train[column] = train[column].astype(int)

In [6]:
y_train = train.pop('price')
X_train = train

In [7]:
print(X_train.shape)
print(y_train.shape)

(164, 29)
(164,)


In [8]:
lm = LinearRegression()

In [9]:
rfe = RFE(estimator=LinearRegression(), n_features_to_select=10)
rfe = rfe.fit(X_train,y_train)

In [10]:
list(zip(X_train.columns,rfe.support_,rfe.ranking_))

[('wheelbase', False, 12),
 ('curbweight', True, 1),
 ('enginesize', False, 5),
 ('boreratio', True, 1),
 ('horsepower', True, 1),
 ('carlength', False, 8),
 ('carwidth', True, 1),
 ('gas', False, 13),
 ('turbo', False, 19),
 ('hardtop', False, 14),
 ('hatchback', True, 1),
 ('sedan', True, 1),
 ('wagon', True, 1),
 ('fwd', False, 18),
 ('rwd', False, 17),
 ('dohcv', True, 1),
 ('l', False, 16),
 ('ohc', False, 15),
 ('ohcf', False, 7),
 ('ohcv', False, 6),
 ('rotor', False, 9),
 ('five', False, 4),
 ('four', False, 2),
 ('six', False, 3),
 ('three', False, 20),
 ('twelve', True, 1),
 ('two', False, 11),
 ('Medium', False, 10),
 ('Highend', True, 1)]

In [11]:
#Selects the top 10 columns from the dataframe
X_train.columns[rfe.support_]

Index(['curbweight', 'boreratio', 'horsepower', 'carwidth', 'hatchback',
       'sedan', 'wagon', 'dohcv', 'twelve', 'Highend'],
      dtype='object')

#### Building the models
Using statsmodels for detailed statistics

In [12]:
X_train_rfe = X_train[X_train.columns[rfe.support_]]

In [13]:
X_train_rfe.head()

Unnamed: 0,curbweight,boreratio,horsepower,carwidth,hatchback,sedan,wagon,dohcv,twelve,Highend
0,0.392078,0.635714,0.084746,0.409524,0,1,0,0,0,0
1,0.558968,0.657143,0.182203,0.628571,0,1,0,0,0,0
2,0.205162,0.364286,0.042373,0.171429,0,0,1,0,0,0
3,0.067646,0.435714,0.072034,0.190476,0,1,0,0,0,0
4,0.209168,0.435714,0.144068,0.32381,1,0,0,0,0,0


In [14]:
def build_model(X,y):
    '''
    The below command is used to add an intercept to the equation i.e. y = mx + b. This is done because the model by default
    assumes the value of b to be 0 if not given explicitly. This causes the algorithm to not optimise b at all, which is 
    undesirable.
    '''
    X = sm.add_constant(X)
    lm = sm.OLS(y,X).fit() #fitting the model 
    print(lm.summary()) # model summary
    return X

In [26]:
def checkVIF(X):
    vif = pd.DataFrame()
    vif['features'] = X.columns
    vif['VIF'] = [variance_inflation_factor(X.values, i) for i in range(X.shape[1])]
    vif['VIF'] = round(vif['VIF'],2)
    vif = vif.sort_values(by='VIF',ascending=False)
    return vif

Model 1

In [16]:
X_train_new = build_model(X_train_rfe,y_train)

                            OLS Regression Results                            
Dep. Variable:                  price   R-squared:                       0.938
Model:                            OLS   Adj. R-squared:                  0.934
Method:                 Least Squares   F-statistic:                     231.1
Date:                Wed, 23 Aug 2023   Prob (F-statistic):           5.86e-87
Time:                        00:24:04   Log-Likelihood:                 266.07
No. Observations:                 164   AIC:                            -510.1
Df Residuals:                     153   BIC:                            -476.0
Df Model:                          10                                         
Covariance Type:            nonrobust                                         
                 coef    std err          t      P>|t|      [0.025      0.975]
------------------------------------------------------------------------------
const          0.0811      0.023      3.525      0.0

In [17]:
X_train_new = X_train_rfe.drop(["twelve"],axis=1)

In [20]:
X_train_new.shape

(164, 9)

Model 2

In [21]:
X_train_new = build_model(X_train_new,y_train)

                            OLS Regression Results                            
Dep. Variable:                  price   R-squared:                       0.936
Model:                            OLS   Adj. R-squared:                  0.933
Method:                 Least Squares   F-statistic:                     252.1
Date:                Wed, 23 Aug 2023   Prob (F-statistic):           2.13e-87
Time:                        00:28:18   Log-Likelihood:                 264.15
No. Observations:                 164   AIC:                            -508.3
Df Residuals:                     154   BIC:                            -477.3
Df Model:                           9                                         
Covariance Type:            nonrobust                                         
                 coef    std err          t      P>|t|      [0.025      0.975]
------------------------------------------------------------------------------
const          0.0860      0.023      3.734      0.0

In [23]:
X_train_new = X_train_rfe.drop(['boreratio'],axis=1)

In [27]:
checkVIF(X_train_new)

Unnamed: 0,features,VIF
0,curbweight,19.66
2,carwidth,19.12
1,horsepower,8.57
4,sedan,2.99
3,hatchback,2.32
8,Highend,1.82
5,wagon,1.65
6,dohcv,1.29
7,twelve,1.12


Model 3