In [1]:
# Importing necessary libraries 

import numpy as np 
import pandas as pd
import matplotlib.pyplot as plt
from sklearn import datasets, linear_model
from sklearn.metrics import mean_squared_error, r2_score
from sklearn.model_selection import train_test_split
from sklearn.model_selection import cross_val_score
from sklearn.tree import DecisionTreeRegressor
import statsmodels.api as sm
from scipy import stats

In [2]:
# Load the car price data set from the CSV file in the working directory
CP = pd.read_csv(filepath_or_buffer='CarPrice_Assignment.csv')

# Preview the first 5 rows
CP.head(n=5)

Unnamed: 0,car_ID,symboling,CarName,fueltype,aspiration,doornumber,carbody,drivewheel,enginelocation,wheelbase,...,enginesize,fuelsystem,boreratio,stroke,compressionratio,horsepower,peakrpm,citympg,highwaympg,price
0,1,3,alfa-romero giulia,gas,std,two,convertible,rwd,front,88.6,...,130,mpfi,3.47,2.68,9.0,111,5000,21,27,13495.0
1,2,3,alfa-romero stelvio,gas,std,two,convertible,rwd,front,88.6,...,130,mpfi,3.47,2.68,9.0,111,5000,21,27,16500.0
2,3,1,alfa-romero Quadrifoglio,gas,std,two,hatchback,rwd,front,94.5,...,152,mpfi,2.68,3.47,9.0,154,5000,19,26,16500.0
3,4,2,audi 100 ls,gas,std,four,sedan,fwd,front,99.8,...,109,mpfi,3.19,3.4,10.0,102,5500,24,30,13950.0
4,5,2,audi 100ls,gas,std,four,sedan,4wd,front,99.4,...,136,mpfi,3.19,3.4,8.0,115,5500,18,22,17450.0


In [3]:
# Exploratory Analysis and Summary of the Dataset:
# prints the index range and, for every column, its non-null count and dtype.

CP.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 205 entries, 0 to 204
Data columns (total 26 columns):
 #   Column            Non-Null Count  Dtype  
---  ------            --------------  -----  
 0   car_ID            205 non-null    int64  
 1   symboling         205 non-null    int64  
 2   CarName           205 non-null    object 
 3   fueltype          205 non-null    object 
 4   aspiration        205 non-null    object 
 5   doornumber        205 non-null    object 
 6   carbody           205 non-null    object 
 7   drivewheel        205 non-null    object 
 8   enginelocation    205 non-null    object 
 9   wheelbase         205 non-null    float64
 10  carlength         205 non-null    float64
 11  carwidth          205 non-null    float64
 12  carheight         205 non-null    float64
 13  curbweight        205 non-null    int64  
 14  enginetype        205 non-null    object 
 15  cylindernumber    205 non-null    object 
 16  enginesize        205 non-null    int64  
 1

In [4]:
# Checking for the missing or null values.
# Two kinds of "missing" are counted per column and summed:
#   * real nulls (NaN / None), detected with isna()
#   * the literal '?' string, presumably a placeholder some exports of this
#     data set use for unknown values (TODO confirm against the data source)
# Comparing a numeric column with '?' is simply False everywhere, so the
# check is safe to run across the whole frame.

CP.isna().sum() + CP.eq('?').sum()

car_ID              0
symboling           0
CarName             0
fueltype            0
aspiration          0
doornumber          0
carbody             0
drivewheel          0
enginelocation      0
wheelbase           0
carlength           0
carwidth            0
carheight           0
curbweight          0
enginetype          0
cylindernumber      0
enginesize          0
fuelsystem          0
boreratio           0
stroke              0
compressionratio    0
horsepower          0
peakrpm             0
citympg             0
highwaympg          0
price               0
dtype: int64

In [5]:
# Statistical Dataset Analysis of Numeric Series
# Left as the cell's last expression (not wrapped in print) so the notebook
# renders it as a table instead of a line-wrapped plain-text dump.

CP.describe()

           car_ID   symboling   wheelbase   carlength    carwidth   carheight  \
count  205.000000  205.000000  205.000000  205.000000  205.000000  205.000000   
mean   103.000000    0.834146   98.756585  174.049268   65.907805   53.724878   
std     59.322565    1.245307    6.021776   12.337289    2.145204    2.443522   
min      1.000000   -2.000000   86.600000  141.100000   60.300000   47.800000   
25%     52.000000    0.000000   94.500000  166.300000   64.100000   52.000000   
50%    103.000000    1.000000   97.000000  173.200000   65.500000   54.100000   
75%    154.000000    2.000000  102.400000  183.100000   66.900000   55.500000   
max    205.000000    3.000000  120.900000  208.100000   72.300000   59.800000   

        curbweight  enginesize   boreratio      stroke  compressionratio  \
count   205.000000  205.000000  205.000000  205.000000        205.000000   
mean   2555.565854  126.907317    3.329756    3.255415         10.142537   
std     520.680204   41.642693    0.270844

In [6]:
# Summary statistics (count / unique / top / freq) for the object-typed columns

CP.select_dtypes(include=['object']).describe()

Unnamed: 0,CarName,fueltype,aspiration,doornumber,carbody,drivewheel,enginelocation,enginetype,cylindernumber,fuelsystem
count,205,205,205,205,205,205,205,205,205,205
unique,147,2,2,2,5,3,2,7,7,8
top,toyota corona,gas,std,four,sedan,fwd,front,ohc,four,mpfi
freq,6,185,168,115,96,120,202,148,159,94


In [7]:
# Drop the columns that are not used as predictors of the price:
# the row identifier and the free-text car name.
# A single non-in-place drop with errors='ignore' keeps the cell idempotent,
# so re-running it does not raise KeyError once the columns are gone.

CP = CP.drop(columns=['car_ID', 'CarName'], errors='ignore')

In [8]:
# Collect the distinct levels of every categorical column and print them

categorical_columns = [
    'fueltype', 'aspiration', 'doornumber', 'carbody', 'drivewheel',
    'enginelocation', 'enginetype', 'cylindernumber', 'fuelsystem',
]
levels_by_column = {column: CP[column].unique() for column in categorical_columns}

for column, levels in levels_by_column.items():
    print(f"{column} : {levels}")

# Bind each column's array of levels to a variable named after the column
(fueltype, aspiration, doornumber, carbody, drivewheel,
 enginelocation, enginetype, cylindernumber, fuelsystem) = (
    levels_by_column[column] for column in categorical_columns
)

fueltype : ['gas' 'diesel']
aspiration : ['std' 'turbo']
doornumber : ['two' 'four']
carbody : ['convertible' 'hatchback' 'sedan' 'wagon' 'hardtop']
drivewheel : ['rwd' 'fwd' '4wd']
enginelocation : ['front' 'rear']
enginetype : ['dohc' 'ohcv' 'ohc' 'l' 'rotor' 'ohcf' 'dohcv']
cylindernumber : ['four' 'six' 'five' 'three' 'twelve' 'two' 'eight']
fuelsystem : ['mpfi' '2bbl' 'mfi' '1bbl' 'spfi' '4bbl' 'idi' 'spdi']


In [9]:
# Assigning integer labels to all the factor values of the categorical
# variables and displaying the top 10 rows.
#
# NOTE(review): the codes below are arbitrary integers. A linear model fitted
# on these columns treats them as ordered magnitudes (e.g. 'twelve' cylinders
# coded 4 sits between 'three' and 'two'), which the multi-level nominal
# columns do not justify. One-hot encoding would avoid that, but it changes
# the number of columns, so it has to be done together with the cells that
# select predictors.

fueltype = {'gas': 0,'diesel': 1}
aspiration = {'std': 0, 'turbo': 1}
doornumber = {'two': 0, 'four': 1}
carbody = {'convertible': 0, 'hatchback': 1, 'sedan': 2, 'wagon': 3, 'hardtop': 4}
drivewheel = {'rwd': 0, 'fwd': 1, '4wd': 2}
enginelocation = {'front': 0, 'rear': 1}
enginetype = {'dohc': 0, 'ohcv': 1, 'ohc': 2, 'l': 3, 'rotor': 4, 'ohcf': 5, 'dohcv': 6}
cylindernumber = {'four': 0, 'six': 1, 'five': 2, 'three': 3, 'twelve': 4, 'two': 5, 'eight': 6}
fuelsystem = {'mpfi': 0, '2bbl': 1, 'mfi': 2, '1bbl': 3, 'spfi': 4, '4bbl': 5, 'idi': 6, 'spdi': 7}

# Column name -> {category label: integer code}
category_codes = {
    'fueltype': fueltype,
    'aspiration': aspiration,
    'doornumber': doornumber,
    'carbody': carbody,
    'drivewheel': drivewheel,
    'enginelocation': enginelocation,
    'enginetype': enginetype,
    'cylindernumber': cylindernumber,
    'fuelsystem': fuelsystem,
}

for column, codes in category_codes.items():
    # Re-running the cell must be a no-op: mapping already-encoded integers
    # through the string-keyed dict would turn the whole column into NaN.
    if CP[column].isin(list(codes.values())).all():
        continue

    # Series.map yields NaN for any label missing from the dict; fail loudly
    # instead of passing silent NaNs on to the model. Existing nulls are left
    # alone (they map to NaN exactly as a plain .map would).
    unmapped = set(CP[column].dropna().unique()) - set(codes)
    if unmapped:
        raise ValueError(
            "Column '" + column + "' has values without an integer code: "
            + str(sorted(unmapped, key=str))
        )

    CP[column] = CP[column].map(codes)

CP.head(10)

Unnamed: 0,symboling,fueltype,aspiration,doornumber,carbody,drivewheel,enginelocation,wheelbase,carlength,carwidth,...,enginesize,fuelsystem,boreratio,stroke,compressionratio,horsepower,peakrpm,citympg,highwaympg,price
0,3,0,0,0,0,0,0,88.6,168.8,64.1,...,130,0,3.47,2.68,9.0,111,5000,21,27,13495.0
1,3,0,0,0,0,0,0,88.6,168.8,64.1,...,130,0,3.47,2.68,9.0,111,5000,21,27,16500.0
2,1,0,0,0,1,0,0,94.5,171.2,65.5,...,152,0,2.68,3.47,9.0,154,5000,19,26,16500.0
3,2,0,0,1,2,1,0,99.8,176.6,66.2,...,109,0,3.19,3.4,10.0,102,5500,24,30,13950.0
4,2,0,0,1,2,2,0,99.4,176.6,66.4,...,136,0,3.19,3.4,8.0,115,5500,18,22,17450.0
5,2,0,0,0,2,1,0,99.8,177.3,66.3,...,136,0,3.19,3.4,8.5,110,5500,19,25,15250.0
6,1,0,0,1,2,1,0,105.8,192.7,71.4,...,136,0,3.19,3.4,8.5,110,5500,19,25,17710.0
7,1,0,0,1,3,1,0,105.8,192.7,71.4,...,136,0,3.19,3.4,8.5,110,5500,19,25,18920.0
8,1,0,1,1,2,1,0,105.8,192.7,71.4,...,131,0,3.13,3.4,8.3,140,5500,17,20,23875.0
9,0,0,1,0,1,2,0,99.5,178.2,67.9,...,131,0,3.13,3.4,7.0,160,5500,16,22,17859.167


In [10]:
# Splitting Dataset into Train and Test Set for Target(Dependent) and Independent Variables

# The target stays a one-column DataFrame. The predictors are every column
# except 'price', selected by name so the split does not depend on how many
# columns the frame has or on 'price' being the last one.
Target_Var = CP[['price']]
Independent_Var = CP.drop(columns='price')

# 70/30 split with a fixed seed so the partition is reproducible
Independent_Var_train, Independent_Var_test, Target_Var_train, Target_Var_test = train_test_split(
    Independent_Var, Target_Var, test_size = 0.3, random_state = 25
)

In [11]:
# Fit an ordinary least squares regression of price on the training predictors

# statsmodels does not add an intercept on its own, so prepend a constant column
Constant_Var = sm.add_constant(data=Independent_Var_train)

# Specify the model, estimate it, and print the regression table
Est = sm.OLS(endog=Target_Var_train, exog=Constant_Var)
Est2 = Est.fit()
print(Est2.summary())

                            OLS Regression Results                            
Dep. Variable:                  price   R-squared:                       0.906
Model:                            OLS   Adj. R-squared:                  0.888
Method:                 Least Squares   F-statistic:                     49.76
Date:                Fri, 22 Apr 2022   Prob (F-statistic):           3.04e-50
Time:                        23:34:59   Log-Likelihood:                -1315.6
No. Observations:                 143   AIC:                             2679.
Df Residuals:                     119   BIC:                             2750.
Df Model:                          23                                         
Covariance Type:            nonrobust                                         
                       coef    std err          t      P>|t|      [0.025      0.975]
------------------------------------------------------------------------------------
const            -9029.8705   1.75e+04  

  x = pd.concat(x[::order], 1)


In [12]:
# Reduce the feature set before refitting the model.
# NOTE(review): presumably these are the predictors judged not significant in
# the OLS fit above; the p-values are truncated in the saved output, so
# confirm the selection against a fresh run.
columns_to_drop = [
    'symboling', 'fueltype', 'aspiration', 'doornumber', 'drivewheel',
    'wheelbase', 'carlength', 'carwidth', 'carheight', 'curbweight',
    'enginetype', 'fuelsystem', 'boreratio', 'compressionratio',
    'horsepower', 'citympg', 'highwaympg',
]

# One non-in-place drop; errors='ignore' keeps the cell idempotent so a
# re-run does not raise KeyError for columns that are already gone.
CP = CP.drop(columns=columns_to_drop, errors='ignore')

In [15]:
# Splitting Dataset into Train and Test Set for Target(Dependent) and Independent Variables

# The target stays a one-column DataFrame. The predictors are every remaining
# column except 'price', selected by name so the split does not depend on how
# many columns are left or on 'price' being the last one.
Target_Var = CP[['price']]
Independent_Var = CP.drop(columns='price')

# Same 70/30 proportion and seed as the split used for the full model
Independent_Var_train, Independent_Var_test, Target_Var_train, Target_Var_test = train_test_split(
    Independent_Var, Target_Var, test_size = 0.3, random_state = 25
)

In [16]:
# Refit the OLS regression on the reduced set of predictors

# statsmodels does not add an intercept on its own, so prepend a constant column
Constant_Var = sm.add_constant(data=Independent_Var_train)

# Specify the model, estimate it, and print the regression table
Est = sm.OLS(endog=Target_Var_train, exog=Constant_Var)
Est2 = Est.fit()
print(Est2.summary())

                            OLS Regression Results                            
Dep. Variable:                  price   R-squared:                       0.847
Model:                            OLS   Adj. R-squared:                  0.840
Method:                 Least Squares   F-statistic:                     125.2
Date:                Fri, 22 Apr 2022   Prob (F-statistic):           7.14e-53
Time:                        23:35:23   Log-Likelihood:                -1350.4
No. Observations:                 143   AIC:                             2715.
Df Residuals:                     136   BIC:                             2736.
Df Model:                           6                                         
Covariance Type:            nonrobust                                         
                     coef    std err          t      P>|t|      [0.025      0.975]
----------------------------------------------------------------------------------
const          -1.012e+04   4332.621     -2.

  x = pd.concat(x[::order], 1)
