# AUTO MODEL

#### import libraries

In [1]:
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt

from sklearn.linear_model import LinearRegression
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_absolute_error, mean_squared_error, r2_score
from statsmodels.stats.outliers_influence import variance_inflation_factor

## 1. Problem Statement

In [None]:
price prediction of automobile from given features

## 2. Data Gathering

In [2]:
auto_df=pd.read_csv('autos_dataset.csv')
auto_df.head()

Unnamed: 0,symboling,normalized-losses,make,fuel-type,aspiration,num-of-doors,body-style,drive-wheels,engine-location,wheel-base,length,width,height,curb-weight,engine-type,num-of-cylinders,engine-size,fuel-system,bore,stroke,compression-ratio,horsepower,peak-rpm,city-mpg,highway-mpg,price
0,3,?,alfa-romero,gas,std,two,convertible,rwd,front,88.6,168.8,64.1,48.8,2548,dohc,four,130,mpfi,3.47,2.68,9.0,111,5000,21,27,13495
1,3,?,alfa-romero,gas,std,two,convertible,rwd,front,88.6,168.8,64.1,48.8,2548,dohc,four,130,mpfi,3.47,2.68,9.0,111,5000,21,27,16500
2,1,?,alfa-romero,gas,std,two,hatchback,rwd,front,94.5,171.2,65.5,52.4,2823,ohcv,six,152,mpfi,2.68,3.47,9.0,154,5000,19,26,16500
3,2,164,audi,gas,std,four,sedan,fwd,front,99.8,176.6,66.2,54.3,2337,ohc,four,109,mpfi,3.19,3.4,10.0,102,5500,24,30,13950
4,2,164,audi,gas,std,four,sedan,4wd,front,99.4,176.6,66.4,54.3,2824,ohc,five,136,mpfi,3.19,3.4,8.0,115,5500,18,22,17450


In [3]:
auto_df.shape

(205, 26)

In [5]:
auto_df.axes

[RangeIndex(start=0, stop=205, step=1),
 Index(['symboling', 'normalized-losses', 'make', 'fuel-type', 'aspiration',
        'num-of-doors', 'body-style', 'drive-wheels', 'engine-location',
        'wheel-base', 'length', 'width', 'height', 'curb-weight', 'engine-type',
        'num-of-cylinders', 'engine-size', 'fuel-system', 'bore', 'stroke',
        'compression-ratio', 'horsepower', 'peak-rpm', 'city-mpg',
        'highway-mpg', 'price'],
       dtype='object')]

In [6]:
auto_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 205 entries, 0 to 204
Data columns (total 26 columns):
 #   Column             Non-Null Count  Dtype  
---  ------             --------------  -----  
 0   symboling          205 non-null    int64  
 1   normalized-losses  205 non-null    object 
 2   make               205 non-null    object 
 3   fuel-type          205 non-null    object 
 4   aspiration         205 non-null    object 
 5   num-of-doors       205 non-null    object 
 6   body-style         205 non-null    object 
 7   drive-wheels       205 non-null    object 
 8   engine-location    205 non-null    object 
 9   wheel-base         205 non-null    float64
 10  length             205 non-null    float64
 11  width              205 non-null    float64
 12  height             205 non-null    float64
 13  curb-weight        205 non-null    int64  
 14  engine-type        205 non-null    object 
 15  num-of-cylinders   205 non-null    object 
 16  engine-size        205 non

In [9]:
auto_df.isna().sum()

symboling            0
normalized-losses    0
make                 0
fuel-type            0
aspiration           0
num-of-doors         0
body-style           0
drive-wheels         0
engine-location      0
wheel-base           0
length               0
width                0
height               0
curb-weight          0
engine-type          0
num-of-cylinders     0
engine-size          0
fuel-system          0
bore                 0
stroke               0
compression-ratio    0
horsepower           0
peak-rpm             0
city-mpg             0
highway-mpg          0
price                0
dtype: int64

## 3.Exploratory Data Analysis

In [14]:
#feature-1
auto_df['symboling'].value_counts().to_dict()

{0: 67, 1: 54, 2: 32, 3: 27, -1: 22, -2: 3}

In [61]:
#feature-2
auto_df['normalized-losses'].isna().mean()

0.2

In [16]:
auto_df.replace({'?': np.nan},inplace=True)

In [19]:
#feature-3
auto_df['make'].value_counts()

toyota           32
nissan           18
mazda            17
mitsubishi       13
honda            13
volkswagen       12
subaru           12
peugot           11
volvo            11
dodge             9
mercedes-benz     8
bmw               8
audi              7
plymouth          7
saab              6
porsche           5
isuzu             4
jaguar            3
chevrolet         3
alfa-romero       3
renault           2
mercury           1
Name: make, dtype: int64

In [20]:
#feature-4
auto_df['fuel-type'].value_counts()

gas       185
diesel     20
Name: fuel-type, dtype: int64

In [62]:
auto_df['fuel-type'].value_counts().to_dict()

{'gas': 185, 'diesel': 20}

In [63]:
auto_df['fuel-type'].replace({'gas': 0, 'diesel': 1},inplace=True)

In [77]:
auto_df['fuel-type'].dtypes

dtype('int64')

In [21]:
#feature-5
auto_df['aspiration'].value_counts()

std      168
turbo     37
Name: aspiration, dtype: int64

In [65]:
auto_df['aspiration'].value_counts().to_dict()

{'std': 168, 'turbo': 37}

In [66]:
auto_df['aspiration'].replace({'std': 1, 'turbo': 0},inplace=True)

In [22]:
#feature-6
auto_df['num-of-doors'].value_counts()

four    114
two      89
Name: num-of-doors, dtype: int64

In [67]:
auto_df['num-of-doors'].value_counts().to_dict()

{'four': 114, 'two': 89}

In [68]:
auto_df['num-of-doors'].replace({'four': 4, 'two': 2},inplace=True)

In [23]:
#feature-7
auto_df['body-style'].value_counts()

sedan          96
hatchback      70
wagon          25
hardtop         8
convertible     6
Name: body-style, dtype: int64

In [71]:
body_type=pd.get_dummies(auto_df['body-style'],prefix='bs',drop_first=True )
body_type

Unnamed: 0,bs_hardtop,bs_hatchback,bs_sedan,bs_wagon
0,0,0,0,0
1,0,0,0,0
2,0,1,0,0
3,0,0,1,0
4,0,0,1,0
...,...,...,...,...
200,0,0,1,0
201,0,0,1,0
202,0,0,1,0
203,0,0,1,0


In [72]:
auto_df=pd.concat([auto_df,body_type],axis=1)

In [24]:
#feature-8
auto_df['drive-wheels'].value_counts()

fwd    120
rwd     76
4wd      9
Name: drive-wheels, dtype: int64

In [79]:
drive_wheel=pd.get_dummies(auto_df['drive-wheels'],prefix='dw',drop_first=True)

In [80]:
auto_df=pd.concat([auto_df,drive_wheel],axis=1)

In [25]:
#feature-9
auto_df['engine-location'].value_counts()

front    202
rear       3
Name: engine-location, dtype: int64

In [29]:
#feature-10
auto_df['wheel-base'].nunique()

53

In [31]:
#feature-11
auto_df['length']

0      168.8
1      168.8
2      171.2
3      176.6
4      176.6
       ...  
200    188.8
201    188.8
202    188.8
203    188.8
204    188.8
Name: length, Length: 205, dtype: float64

In [81]:
#feature-12
auto_df['height'].nunique()

49

In [33]:
#feature-13
auto_df['curb-weight'].nunique()

171

In [36]:
#feature-14
auto_df['engine-type'].value_counts()

ohc      148
ohcf      15
ohcv      13
dohc      12
l         12
rotor      4
dohcv      1
Name: engine-type, dtype: int64

In [48]:
auto_df['engine-type'].nunique()

7

In [82]:
#feature-15
auto_df['num-of-cylinders'].value_counts().to_dict()

{'four': 159,
 'six': 24,
 'five': 11,
 'eight': 5,
 'two': 4,
 'three': 1,
 'twelve': 1}

In [83]:
auto_df['num-of-cylinders'].replace({'four': 4,'six': 6,'five': 5,'eight': 8,'two': 2,'three':3,'twelve': 12},inplace=True)

In [39]:
#feature-16
auto_df['engine-size'].nunique()

44

In [41]:
#feature-17
auto_df['fuel-system'].value_counts()

mpfi    94
2bbl    66
idi     20
1bbl    11
spdi     9
4bbl     3
mfi      1
spfi     1
Name: fuel-system, dtype: int64

In [84]:
fuel_sys=pd.get_dummies(auto_df['fuel-system'],prefix='fs',drop_first=True)
auto_df=pd.concat([auto_df,fuel_sys],axis=1)

In [43]:
#feature-18
auto_df['bore'].nunique()

38

In [44]:
#feature-19
auto_df['stroke'].nunique()

36

In [45]:
#feature-20
auto_df['compression-ratio'].nunique()

32

In [46]:
#feature-21
auto_df['horsepower'].nunique()

59

In [47]:
#feature-22
auto_df['peak-rpm'].nunique()

23

In [49]:
#feature-23
auto_df['city-mpg'].nunique()

29

In [50]:
#feature-24
auto_df['highway-mpg'].nunique()

30

In [51]:
#feature-25
auto_df['highway-mpg'].nunique()

30

In [86]:
list_to_drop=["normalized-losses",'make','body-style','engine-location','drive-wheels','engine-type','fuel-system']
auto_df.drop(list_to_drop,axis=1,inplace=True)

In [89]:
auto_df.isna().sum()

symboling            0
fuel-type            0
aspiration           0
num-of-doors         2
wheel-base           0
length               0
width                0
height               0
curb-weight          0
num-of-cylinders     0
engine-size          0
bore                 4
stroke               4
compression-ratio    0
horsepower           2
peak-rpm             2
city-mpg             0
highway-mpg          0
price                4
bs_hardtop           0
bs_hatchback         0
bs_sedan             0
bs_wagon             0
dw_fwd               0
dw_rwd               0
fs_2bbl              0
fs_4bbl              0
fs_idi               0
fs_mfi               0
fs_mpfi              0
fs_spdi              0
fs_spfi              0
dtype: int64

In [93]:
auto_df['num-of-doors'].fillna(auto_df['num-of-doors'].mode()[0],inplace=True)
auto_df['bore'].fillna(auto_df['bore'].median(),inplace=True)
auto_df['stroke'].fillna(auto_df['stroke'].median(),inplace=True)
auto_df['horsepower'].fillna(auto_df['horsepower'].median(),inplace=True)
auto_df['peak-rpm'].fillna(auto_df['peak-rpm'].median(),inplace=True)
auto_df['price'].fillna(auto_df['price'].median(),inplace=True)

In [94]:
auto_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 205 entries, 0 to 204
Data columns (total 32 columns):
 #   Column             Non-Null Count  Dtype  
---  ------             --------------  -----  
 0   symboling          205 non-null    int64  
 1   fuel-type          205 non-null    int64  
 2   aspiration         205 non-null    int64  
 3   num-of-doors       205 non-null    float64
 4   wheel-base         205 non-null    float64
 5   length             205 non-null    float64
 6   width              205 non-null    float64
 7   height             205 non-null    float64
 8   curb-weight        205 non-null    int64  
 9   num-of-cylinders   205 non-null    int64  
 10  engine-size        205 non-null    int64  
 11  bore               205 non-null    object 
 12  stroke             205 non-null    object 
 13  compression-ratio  205 non-null    float64
 14  horsepower         205 non-null    object 
 15  peak-rpm           205 non-null    object 
 16  city-mpg           205 non

In [106]:
auto_df['bore']=auto_df['bore'].astype('float')
auto_df['stroke']=auto_df['stroke'].astype('float')
auto_df['horsepower']=auto_df['horsepower'].astype('float')
auto_df['peak-rpm']=auto_df['peak-rpm'].astype('float')
auto_df['price']=auto_df['price'].astype('float')

In [107]:
auto_df['stroke'].dtype

dtype('float64')

In [108]:
auto_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 205 entries, 0 to 204
Data columns (total 32 columns):
 #   Column             Non-Null Count  Dtype  
---  ------             --------------  -----  
 0   symboling          205 non-null    int64  
 1   fuel-type          205 non-null    int64  
 2   aspiration         205 non-null    int64  
 3   num-of-doors       205 non-null    float64
 4   wheel-base         205 non-null    float64
 5   length             205 non-null    float64
 6   width              205 non-null    float64
 7   height             205 non-null    float64
 8   curb-weight        205 non-null    int64  
 9   num-of-cylinders   205 non-null    int64  
 10  engine-size        205 non-null    int64  
 11  bore               205 non-null    float64
 12  stroke             205 non-null    float64
 13  compression-ratio  205 non-null    float64
 14  horsepower         205 non-null    float64
 15  peak-rpm           205 non-null    float64
 16  city-mpg           205 non

## Model Training

In [109]:
x=auto_df.drop('price',axis=1)
y=auto_df['price']

In [112]:
x_train,x_test,y_train,y_test= train_test_split(x,y,train_size=0.2,random_state=3)

In [113]:
auto_model=LinearRegression()

In [114]:
auto_model.fit(x_train,y_train)

LinearRegression()

In [115]:
auto_model.coef_

array([-1.99839423e+03,  1.76702791e+02, -5.30456505e+03, -8.13554497e+02,
       -1.61490831e+02, -2.11719607e+02,  1.52177993e+03, -2.53223658e+02,
        1.56464444e+00, -2.55410704e+02,  1.80671727e+02, -5.01913795e+03,
       -1.56251041e+03,  4.14526902e+02, -1.58663095e+02,  1.97624239e+00,
       -6.32352592e+02,  3.72097458e+02, -5.15479693e+03, -4.85265431e+03,
       -4.32785464e+03, -2.50041677e+03,  7.57149011e+00,  2.83703580e+03,
        3.97353551e+03,  0.00000000e+00,  1.76702791e+02,  0.00000000e+00,
        5.82971139e+03, -1.07944159e+02,  0.00000000e+00])

In [116]:
auto_model.intercept_

-12790.318749494105

In [119]:
#testing data evaluation
y_pred=auto_model.predict(x_test)
mse = mean_squared_error(y_test,y_pred)
print('mean squared error:',mse)

mae= mean_absolute_error(y_test,y_pred)
print('mean absolute error:',mae)

rmse = np.sqrt(mse)
print("Root Mean Squared error:",rmse)

r_squared= r2_score(y_test,y_pred)
print("R-squared value:",r_squared)

adj_r2 = 1-((1-r_squared**2)*(len(x_test)-1)/(len(x_test)-len(x.columns)-1))
print('adjusted r_square:', adj_r2)

mean squared error: 41669789.49687355
mean absolute error: 4220.174803673224
Root Mean Squared error: 6455.214132534532
R-squared value: 0.3945355719551298
adjusted r_square: -0.04263404728266895
