# ESTIMATION OF USED CAR PRICE

## 1. IMPORTING LIBRARIES

In [1]:
import numpy as np 
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
from sklearn.preprocessing import LabelEncoder
from sklearn.impute import SimpleImputer
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression
from sklearn import metrics
from sklearn.metrics import r2_score
from sklearn.ensemble import RandomForestRegressor
from xgboost import XGBRegressor
from sklearn.linear_model import Ridge
from sklearn.linear_model import Lasso

## 2. READING AND EXPLORING THE DATA 

In [2]:
train = pd.read_csv("train-data.csv")

In [3]:
train.columns

Index(['Name', 'Location', 'Year', 'Kilometers_Driven', 'Fuel_Type',
       'Transmission', 'Owner_Type', 'Mileage', 'Engine', 'Power', 'Seats',
       'Price'],
      dtype='object')

In [4]:
train.shape

(6019, 12)

In [5]:
train.head()

Unnamed: 0,Name,Location,Year,Kilometers_Driven,Fuel_Type,Transmission,Owner_Type,Mileage,Engine,Power,Seats,Price
0,Maruti Wagon R LXI CNG,Mumbai,2010,72000,CNG,Manual,First,26.6 km/kg,998 CC,58.16 bhp,5.0,1.75
1,Hyundai Creta 1.6 CRDi SX Option,Pune,2015,41000,Diesel,Manual,First,19.67 kmpl,1582 CC,126.2 bhp,5.0,12.5
2,Honda Jazz V,Chennai,2011,46000,Petrol,Manual,First,18.2 kmpl,1199 CC,88.7 bhp,5.0,4.5
3,Maruti Ertiga VDI,Chennai,2012,87000,Diesel,Manual,First,20.77 kmpl,1248 CC,88.76 bhp,7.0,6.0
4,Audi A4 New 2.0 TDI Multitronic,Coimbatore,2013,40670,Diesel,Automatic,Second,15.2 kmpl,1968 CC,140.8 bhp,5.0,17.74


In [6]:
train.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 6019 entries, 0 to 6018
Data columns (total 12 columns):
 #   Column             Non-Null Count  Dtype  
---  ------             --------------  -----  
 0   Name               6019 non-null   object 
 1   Location           6019 non-null   object 
 2   Year               6019 non-null   int64  
 3   Kilometers_Driven  6019 non-null   int64  
 4   Fuel_Type          6019 non-null   object 
 5   Transmission       6019 non-null   object 
 6   Owner_Type         6019 non-null   object 
 7   Mileage            6017 non-null   object 
 8   Engine             5983 non-null   object 
 9   Power              5983 non-null   object 
 10  Seats              5977 non-null   float64
 11  Price              6019 non-null   float64
dtypes: float64(2), int64(2), object(8)
memory usage: 564.4+ KB


In [7]:
train.nunique()

Name                 1876
Location               11
Year                   22
Kilometers_Driven    3093
Fuel_Type               5
Transmission            2
Owner_Type              4
Mileage               442
Engine                146
Power                 372
Seats                   9
Price                1373
dtype: int64

In [8]:
train.describe()

Unnamed: 0,Year,Kilometers_Driven,Seats,Price
count,6019.0,6019.0,5977.0,6019.0
mean,2013.358199,58738.38,5.278735,9.479468
std,3.269742,91268.84,0.80884,11.187917
min,1998.0,171.0,0.0,0.44
25%,2011.0,34000.0,5.0,3.5
50%,2014.0,53000.0,5.0,5.64
75%,2016.0,73000.0,5.0,9.95
max,2019.0,6500000.0,10.0,160.0


## 3. CHECKING FOR NAN VALUES

In [9]:
train.isnull().sum()

Name                  0
Location              0
Year                  0
Kilometers_Driven     0
Fuel_Type             0
Transmission          0
Owner_Type            0
Mileage               2
Engine               36
Power                36
Seats                42
Price                 0
dtype: int64

### The columns Mileage, Engine, Power, Seats have null values

## 4. FILTERING THE DATA

### Removing Outliers from the Data

### Removing the outliers in Fuel_Type

In [10]:
train.shape

(6019, 12)

In [11]:
train.Fuel_Type.value_counts()

Diesel      3205
Petrol      2746
CNG           56
LPG           10
Electric       2
Name: Fuel_Type, dtype: int64

In [12]:
train = train[train['Fuel_Type'] != 'Electric']
train.shape

(6017, 12)

### Removing the outliers in Kilomerers_Driven

In [13]:
print(train.Kilometers_Driven.min())
print(train.Kilometers_Driven.max())

171
6500000


In [14]:
print(len(train[train['Kilometers_Driven'] > 500000]))
print(len(train[train['Kilometers_Driven'] < 1000]))

4
2


In [15]:
train = train[train['Kilometers_Driven'] < 500000]
train = train[train['Kilometers_Driven'] > 1000]
train.shape

(6002, 12)

### Modifying car names to group by brand

In [16]:
train.Name = train.Name.str.split().str.get(0)

In [17]:
train.head()

Unnamed: 0,Name,Location,Year,Kilometers_Driven,Fuel_Type,Transmission,Owner_Type,Mileage,Engine,Power,Seats,Price
0,Maruti,Mumbai,2010,72000,CNG,Manual,First,26.6 km/kg,998 CC,58.16 bhp,5.0,1.75
1,Hyundai,Pune,2015,41000,Diesel,Manual,First,19.67 kmpl,1582 CC,126.2 bhp,5.0,12.5
2,Honda,Chennai,2011,46000,Petrol,Manual,First,18.2 kmpl,1199 CC,88.7 bhp,5.0,4.5
3,Maruti,Chennai,2012,87000,Diesel,Manual,First,20.77 kmpl,1248 CC,88.76 bhp,7.0,6.0
4,Audi,Coimbatore,2013,40670,Diesel,Automatic,Second,15.2 kmpl,1968 CC,140.8 bhp,5.0,17.74


### Removing outliers in car brands

In [18]:
train.Name.value_counts()

Maruti           1208
Hyundai          1104
Honda             608
Toyota            410
Mercedes-Benz     318
Volkswagen        313
Ford              299
Mahindra          271
BMW               266
Audi              236
Tata              184
Skoda             172
Renault           144
Chevrolet         121
Nissan             91
Land               60
Jaguar             39
Fiat               28
Mitsubishi         27
Mini               26
Volvo              21
Porsche            18
Jeep               15
Datsun             13
Force               3
ISUZU               2
Isuzu               1
Lamborghini         1
Smart               1
Bentley             1
Ambassador          1
Name: Name, dtype: int64

In [19]:
train = train[train['Name'] != 'Force']
train = train[train['Name'] != 'ISUZU']
train = train[train['Name'] != 'Bentley']
train = train[train['Name'] != 'Lamborghini']
train = train[train['Name'] != 'Isuzu']
train = train[train['Name'] != 'Smart']
train = train[train['Name'] != 'Ambassador']

In [20]:
train.shape

(5992, 12)

### Removing Outliers in Price

In [21]:
print(train.Price.min())
print(train.Price.max())

0.44
160.0


In [22]:
train = train[train.Price < 120]
train = train[train.Price > 0.5]
train.shape

(5985, 12)

### Converting Mileage, Engine and Power to numerical columns

In [23]:
train.Mileage = train.Mileage.str.split().str.get(0).astype('float')
train.Engine = train.Engine.str.split().str.get(0).astype('int', errors='ignore')
train.Power = train.Power.str.split().str.get(0).astype('float', errors='ignore')
train.head()

Unnamed: 0,Name,Location,Year,Kilometers_Driven,Fuel_Type,Transmission,Owner_Type,Mileage,Engine,Power,Seats,Price
0,Maruti,Mumbai,2010,72000,CNG,Manual,First,26.6,998,58.16,5.0,1.75
1,Hyundai,Pune,2015,41000,Diesel,Manual,First,19.67,1582,126.2,5.0,12.5
2,Honda,Chennai,2011,46000,Petrol,Manual,First,18.2,1199,88.7,5.0,4.5
3,Maruti,Chennai,2012,87000,Diesel,Manual,First,20.77,1248,88.76,7.0,6.0
4,Audi,Coimbatore,2013,40670,Diesel,Automatic,Second,15.2,1968,140.8,5.0,17.74


### Calculating age of the car from Year

In [24]:
train['Car_age'] = 2022 - train['Year']
train.head()

Unnamed: 0,Name,Location,Year,Kilometers_Driven,Fuel_Type,Transmission,Owner_Type,Mileage,Engine,Power,Seats,Price,Car_age
0,Maruti,Mumbai,2010,72000,CNG,Manual,First,26.6,998,58.16,5.0,1.75,12
1,Hyundai,Pune,2015,41000,Diesel,Manual,First,19.67,1582,126.2,5.0,12.5,7
2,Honda,Chennai,2011,46000,Petrol,Manual,First,18.2,1199,88.7,5.0,4.5,11
3,Maruti,Chennai,2012,87000,Diesel,Manual,First,20.77,1248,88.76,7.0,6.0,10
4,Audi,Coimbatore,2013,40670,Diesel,Automatic,Second,15.2,1968,140.8,5.0,17.74,9


### Droping the Unnecessary Columns 

In [25]:
train = train.drop(columns=['Location', 'Year', 'Transmission'])

In [26]:
train.head()

Unnamed: 0,Name,Kilometers_Driven,Fuel_Type,Owner_Type,Mileage,Engine,Power,Seats,Price,Car_age
0,Maruti,72000,CNG,First,26.6,998,58.16,5.0,1.75,12
1,Hyundai,41000,Diesel,First,19.67,1582,126.2,5.0,12.5,7
2,Honda,46000,Petrol,First,18.2,1199,88.7,5.0,4.5,11
3,Maruti,87000,Diesel,First,20.77,1248,88.76,7.0,6.0,10
4,Audi,40670,Diesel,Second,15.2,1968,140.8,5.0,17.74,9


In [27]:
data=pd.DataFrame()
data['Name']=train['Name']
data.sort_values(by=['Name'],inplace=True)
data.drop_duplicates('Name',inplace=True)
data.reset_index()
data=data.iloc[:,-1]
data

1093             Audi
1137              BMW
581         Chevrolet
4169           Datsun
5114             Fiat
2198             Ford
4605            Honda
3437          Hyundai
1652           Jaguar
940              Jeep
5022             Land
1687         Mahindra
3723           Maruti
5089    Mercedes-Benz
5397             Mini
597        Mitsubishi
2947           Nissan
1078          Porsche
434           Renault
2681            Skoda
5738             Tata
4450           Toyota
3058       Volkswagen
1178            Volvo
Name: Name, dtype: object

In [28]:
data=pd.DataFrame()
data['Fuel_Type']=train['Fuel_Type']
data.sort_values(by=['Fuel_Type'],inplace=True)
data.drop_duplicates('Fuel_Type',inplace=True)
data.reset_index()
data=data.iloc[:,-1]
data

0          CNG
3356    Diesel
3595       LPG
4725    Petrol
Name: Fuel_Type, dtype: object

In [29]:
data=pd.DataFrame()
data['Owner_Type']=train['Owner_Type']
data.sort_values(by=['Owner_Type'],inplace=True)
data.drop_duplicates('Owner_Type',inplace=True)
data.reset_index()
data=data.iloc[:,-1]
data

0                First
1699    Fourth & Above
3105            Second
3479             Third
Name: Owner_Type, dtype: object

### Performing label encoding for categorical data

In [30]:
label_encoder = LabelEncoder()

In [31]:
train['Name'] = label_encoder.fit_transform(train['Name'])
train['Fuel_Type'] = label_encoder.fit_transform(train['Fuel_Type'])
train['Owner_Type'] = label_encoder.fit_transform(train['Owner_Type'])
train.head()

Unnamed: 0,Name,Kilometers_Driven,Fuel_Type,Owner_Type,Mileage,Engine,Power,Seats,Price,Car_age
0,12,72000,0,0,26.6,998,58.16,5.0,1.75,12
1,7,41000,1,0,19.67,1582,126.2,5.0,12.5,7
2,6,46000,3,0,18.2,1199,88.7,5.0,4.5,11
3,12,87000,1,0,20.77,1248,88.76,7.0,6.0,10
4,0,40670,1,2,15.2,1968,140.8,5.0,17.74,9


### Replacing the Nan Values

In [32]:
train.isnull().sum()

Name                  0
Kilometers_Driven     0
Fuel_Type             0
Owner_Type            0
Mileage               0
Engine               36
Power                36
Seats                42
Price                 0
Car_age               0
dtype: int64

In [33]:
train.dtypes

Name                   int32
Kilometers_Driven      int64
Fuel_Type              int32
Owner_Type             int32
Mileage              float64
Engine                object
Power                 object
Seats                float64
Price                float64
Car_age                int64
dtype: object

In [34]:
train.Engine = pd.to_numeric(train.Engine, errors='coerce')
train.Power = pd.to_numeric(train.Power, errors='coerce')

In [35]:
imputer = SimpleImputer(missing_values = np.nan, strategy ='mean')
train[["Engine", "Power", "Seats"]] = imputer.fit_transform(train[["Engine", "Power", "Seats"]])

In [36]:
train.isnull().sum()

Name                 0
Kilometers_Driven    0
Fuel_Type            0
Owner_Type           0
Mileage              0
Engine               0
Power                0
Seats                0
Price                0
Car_age              0
dtype: int64

### The data now has no missing values

## 5.SEPARATEING DEPENDENT, INDEPENDENT VARIABLES AND TRAIN, TEST DATA

In [37]:
y = train.Price
X = train.drop(['Price'],axis=1)

In [38]:
X_train, X_valid, y_train, y_valid = train_test_split(X,y,test_size=0.2)

## 6. TRAINING THE MODEL 

### 1. Linear Regression

In [39]:
model1 = LinearRegression() 
model1.fit(X_train, y_train) 
y_pred = model1.predict(X_valid) 

In [40]:
print('Mean Absolute Error:', metrics.mean_absolute_error(y_valid, y_pred))
print('Mean Squared Error:', metrics.mean_squared_error(y_valid, y_pred))
print('Root Mean Squared Error:', np.sqrt(metrics.mean_squared_error(y_valid, y_pred)))
print("R2 score : %f" % r2_score(y_valid,y_pred))

Mean Absolute Error: 3.7383402531337513
Mean Squared Error: 36.84624317135215
Root Mean Squared Error: 6.070110639136008
R2 score : 0.700600


### 2. Random Forest Regressor

In [41]:
model2 = RandomForestRegressor(n_estimators=200)
model2.fit(X_train, y_train)
y_pred2 = model2.predict(X_valid)

In [42]:
print('Mean Absolute Error:', metrics.mean_absolute_error(y_valid, y_pred2))
print('Mean Squared Error:', metrics.mean_squared_error(y_valid, y_pred2))
print('Root Mean Squared Error:', np.sqrt(metrics.mean_squared_error(y_valid, y_pred2)))
print("R2 score : %f" % r2_score(y_valid,y_pred2))

Mean Absolute Error: 1.6253033794307192
Mean Squared Error: 12.101749215879568
Root Mean Squared Error: 3.4787568492034002
R2 score : 0.901665


### 3. XGBoost Regressor

In [43]:
model3 = XGBRegressor(n_estimators=1000, learning_rate=0.05)
model3.fit(X_train, y_train, early_stopping_rounds=5, eval_set=[(X_valid, y_valid)], verbose=False)
y_pred3 = model3.predict(X_valid)

In [44]:
print('Mean Absolute Error:', metrics.mean_absolute_error(y_valid, y_pred3))
print('Mean Squared Error:', metrics.mean_squared_error(y_valid, y_pred3))
print('Root Mean Squared Error:', np.sqrt(metrics.mean_squared_error(y_valid, y_pred3)))
print("R2 score : %f" % r2_score(y_valid,y_pred3))

Mean Absolute Error: 1.5450098653783775
Mean Squared Error: 10.860355979866998
Root Mean Squared Error: 3.2955054210040373
R2 score : 0.911753


### 4. Ridge Regressor

In [45]:
model4 = Ridge(alpha=1.0)
model4.fit(X_train, y_train)
y_pred4 = model4.predict(X_valid)

In [46]:
print('Mean Absolute Error:', metrics.mean_absolute_error(y_valid, y_pred4))
print('Mean Squared Error:', metrics.mean_squared_error(y_valid, y_pred4))
print('Root Mean Squared Error:', np.sqrt(metrics.mean_squared_error(y_valid, y_pred4)))
print("R2 score : %f" % r2_score(y_valid,y_pred4))

Mean Absolute Error: 3.7382752301611073
Mean Squared Error: 36.84607613873041
Root Mean Squared Error: 6.070096880506143
R2 score : 0.700602


### 5. Lasso Regressor

In [47]:
model5 = Lasso(alpha=1.0)
model5.fit(X_train, y_train)
y_pred5 = model5.predict(X_valid)

In [48]:
print('Mean Absolute Error:', metrics.mean_absolute_error(y_valid, y_pred5))
print('Mean Squared Error:', metrics.mean_squared_error(y_valid, y_pred5))
print('Root Mean Squared Error:', np.sqrt(metrics.mean_squared_error(y_valid, y_pred5)))
print("R2 score : %f" % r2_score(y_valid,y_pred5))

Mean Absolute Error: 3.7494484568086803
Mean Squared Error: 38.3706477855957
Root Mean Squared Error: 6.194404554563392
R2 score : 0.688214


### XGBoost Regressor has the best performance among the regressors chosen. So choosing XGBoost for prediciton on test values.

## 7. TESTING THE TRAINED MODEL

In [64]:
y_estimated = model3.predict(X_valid)

In [65]:
y_valid = y_valid.reset_index(drop=True)
y_estimated = pd.DataFrame(y_estimated)

output=pd.concat([y_valid, y_estimated],axis=1)
output.columns = ['Actual Price', 'Estimated Price']

output

Unnamed: 0,Actual Price,Estimated Price
0,17.90,16.199549
1,21.69,20.647757
2,37.30,34.481819
3,4.75,5.915234
4,3.52,3.877139
...,...,...
1192,2.10,2.207733
1193,3.45,4.366333
1194,5.10,5.545352
1195,4.50,3.930534


In [66]:
import pickle
pickle.dump(model3,open('model of used car price estimation.pkl', 'wb'))