In [1]:
import pandas as pd
import numpy as np
from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_squared_error
from sklearn.metrics import r2_score
import math
pd.set_option('display.max_columns' , None)
pd.set_option('display.max_rows' , None)



### READING DATA

In [2]:
df = pd.read_csv('car_price.csv')

In [3]:
df.head(15)

Unnamed: 0,symboling,normalized_losses,Fuel_type,make,num_of_doors,aspiration,wheel_base,engine_location,drive_wheels,body_style,length,width,height,curb_weight,engine_type,num_of_cylinders,engine_size,fuel_system,bore,stroke,compression_ratio,horsepower,peak_rpm,city_mpg,highway_mpg,price
0,3,?,gas,std,two,convertible,rwd,front,88.6,alfa-romero,168.8,64.1,48.8,2548,dohc,four,130,mpfi,3.47,2.68,9.0,111,5000,21,27,13495
1,3,?,gas,std,two,convertible,rwd,front,88.6,alfa-romero,168.8,64.1,48.8,2548,dohc,four,130,mpfi,3.47,2.68,9.0,111,5000,21,27,16500
2,1,?,gas,std,two,hatchback,rwd,front,94.5,alfa-romero,171.2,65.5,52.4,2823,ohcv,six,152,mpfi,2.68,3.47,9.0,154,5000,19,26,16500
3,2,164,gas,std,four,sedan,fwd,front,99.8,audi,176.6,66.2,54.3,2337,ohc,four,109,mpfi,3.19,3.4,10.0,102,5500,24,30,13950
4,2,164,gas,std,four,sedan,4wd,front,99.4,audi,176.6,66.4,54.3,2824,ohc,five,136,mpfi,3.19,3.4,8.0,115,5500,18,22,17450
5,2,?,gas,std,two,sedan,fwd,front,99.8,audi,177.3,66.3,53.1,2507,ohc,five,136,mpfi,3.19,3.4,8.5,110,5500,19,25,15250
6,1,158,gas,std,four,sedan,fwd,front,105.8,audi,192.7,71.4,55.7,2844,ohc,five,136,mpfi,3.19,3.4,8.5,110,5500,19,25,17710
7,1,?,gas,std,four,wagon,fwd,front,105.8,audi,192.7,71.4,55.7,2954,ohc,five,136,mpfi,3.19,3.4,8.5,110,5500,19,25,18920
8,1,158,gas,turbo,four,sedan,fwd,front,105.8,audi,192.7,71.4,55.9,3086,ohc,five,131,mpfi,3.13,3.4,8.3,140,5500,17,20,23875
9,0,?,gas,turbo,two,hatchback,4wd,front,99.5,audi,178.2,67.9,52.0,3053,ohc,five,131,mpfi,3.13,3.4,7.0,160,5500,16,22,?


In [4]:
df.shape

(205, 26)

### REMOVING NULL VALUES

In [5]:
df = df.replace({'?':np.nan})

In [6]:
df.isnull().sum()

symboling             0
normalized_losses    41
Fuel_type             0
make                  0
num_of_doors          2
aspiration            0
wheel_base            0
engine_location       0
drive_wheels          0
body_style            0
length                0
width                 0
height                0
curb_weight           0
engine_type           0
num_of_cylinders      0
engine_size           0
fuel_system           0
bore                  4
stroke                4
compression_ratio     0
horsepower            2
peak_rpm              2
city_mpg              0
highway_mpg           0
price                 4
dtype: int64

In [7]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 205 entries, 0 to 204
Data columns (total 26 columns):
 #   Column             Non-Null Count  Dtype  
---  ------             --------------  -----  
 0   symboling          205 non-null    int64  
 1   normalized_losses  164 non-null    object 
 2   Fuel_type          205 non-null    object 
 3   make               205 non-null    object 
 4   num_of_doors       203 non-null    object 
 5   aspiration         205 non-null    object 
 6   wheel_base         205 non-null    object 
 7   engine_location    205 non-null    object 
 8   drive_wheels       205 non-null    float64
 9   body_style         205 non-null    object 
 10  length             205 non-null    float64
 11  width              205 non-null    float64
 12  height             205 non-null    float64
 13  curb_weight        205 non-null    int64  
 14  engine_type        205 non-null    object 
 15  num_of_cylinders   205 non-null    object 
 16  engine_size        205 non

In [8]:
df.describe()

Unnamed: 0,symboling,drive_wheels,length,width,height,curb_weight,engine_size,compression_ratio,city_mpg,highway_mpg
count,205.0,205.0,205.0,205.0,205.0,205.0,205.0,205.0,205.0,205.0
mean,0.834146,98.756585,174.049268,65.907805,53.724878,2555.565854,126.907317,10.142537,25.219512,30.75122
std,1.245307,6.021776,12.337289,2.145204,2.443522,520.680204,41.642693,3.97204,6.542142,6.886443
min,-2.0,86.6,141.1,60.3,47.8,1488.0,61.0,7.0,13.0,16.0
25%,0.0,94.5,166.3,64.1,52.0,2145.0,97.0,8.6,19.0,25.0
50%,1.0,97.0,173.2,65.5,54.1,2414.0,120.0,9.0,24.0,30.0
75%,2.0,102.4,183.1,66.9,55.5,2935.0,141.0,9.4,30.0,34.0
max,3.0,120.9,208.1,72.3,59.8,4066.0,326.0,23.0,49.0,54.0


### LABEL ENCODING

In [9]:
le = LabelEncoder()

In [10]:
df[df.select_dtypes(include = ['object']).columns] = df[df.select_dtypes(include = ['object']).columns].apply(le.fit_transform)

In [11]:
df.head()

Unnamed: 0,symboling,normalized_losses,Fuel_type,make,num_of_doors,aspiration,wheel_base,engine_location,drive_wheels,body_style,length,width,height,curb_weight,engine_type,num_of_cylinders,engine_size,fuel_system,bore,stroke,compression_ratio,horsepower,peak_rpm,city_mpg,highway_mpg,price
0,3,51,1,0,1,0,2,0,88.6,0,168.8,64.1,48.8,2548,0,2,130,5,23,4,9.0,5,10,21,27,31
1,3,51,1,0,1,0,2,0,88.6,0,168.8,64.1,48.8,2548,0,2,130,5,23,4,9.0,5,10,21,27,50
2,1,51,1,0,1,2,2,0,94.5,0,171.2,65.5,52.4,2823,5,3,152,5,1,27,9.0,20,10,19,26,50
3,2,27,1,0,0,3,1,0,99.8,1,176.6,66.2,54.3,2337,3,2,109,5,13,24,10.0,2,16,24,30,36
4,2,27,1,0,0,3,0,0,99.4,1,176.6,66.4,54.3,2824,3,1,136,5,13,24,8.0,8,16,18,22,61


### TRAIN TEST SPLIT

In [12]:
df_train , df_test = train_test_split(df, test_size = .2)

In [13]:
df_train_x = df_train.iloc[: , 0:-1]
df_train_y = df_train.iloc[: , -1]

In [14]:
df_test_x = df_test.iloc[: , 0:-1]
df_test_y = df_test.iloc[: , -1]

In [15]:
df_train_x.shape

(164, 25)

### LINEAR REGRESSION MODEL

In [17]:
lr = LinearRegression()

In [18]:
lr.fit(df_train_x,df_train_y)

LinearRegression()

In [32]:
pred = lr.predict(df_test_x)
pred

array([107.55644908,  43.95573711, 145.59531007, 102.41594132,
       131.56959665, 130.89176846,  91.64985856, 112.9722476 ,
        92.33973982,  67.07037579, 152.25715316,  53.52290869,
       104.36453284,  53.52290869, 123.58238201,  59.19962256,
        84.05985422,  84.79032008,  30.56902983,  81.72499531,
        89.57145799, 116.6125199 ,  86.39271128, 103.22861927,
        68.56566863,  81.23060161,  58.30667107, 131.47787213,
        93.51025903, 170.07658071, 127.84881479, 163.19470571,
        99.50214393, 140.19631397,  93.6092195 , 145.95605963,
       133.87289913,  54.96610841,  68.23505004,  84.60430703,
       119.78369517])

### EVALUATING MODEL

In [33]:
rmse = math.sqrt(mean_squared_error(df_test_y,pred))

In [34]:
rmse

59.19254404927368

In [35]:
r2s = r2_score(df_test_y,pred)

In [36]:
r2s

-0.008001276828966564