# VEHICLE PRICE PREDICTION ML MODEL

# Importing necessary files and libraries

In [67]:
import pandas as pd
import numpy as np
import pickle

from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import train_test_split

from sklearn.linear_model import LinearRegression

import warnings
warnings.filterwarnings('ignore')

# Creating Dataframe

In [2]:
df = pd.read_csv('Vehicle Dataset.csv')

# EXPLORATORY DATA ANALYSIS (EDA)

In [3]:
df.head(5)

Unnamed: 0,Make,Model,Price,Year,Kilometer,Fuel Type,Transmission,Location,Color,Owner,Seller Type,Engine,Max Power,Max Torque,Drivetrain,Length,Width,Height,Seating Capacity,Fuel Tank Capacity
0,Honda,Amaze 1.2 VX i-VTEC,505000,2017,87150,Petrol,Manual,Pune,Grey,First,Corporate,1198 cc,87 bhp @ 6000 rpm,109 Nm @ 4500 rpm,FWD,3990.0,1680.0,1505.0,5.0,35.0
1,Maruti Suzuki,Swift DZire VDI,450000,2014,75000,Diesel,Manual,Ludhiana,White,Second,Individual,1248 cc,74 bhp @ 4000 rpm,190 Nm @ 2000 rpm,FWD,3995.0,1695.0,1555.0,5.0,42.0
2,Hyundai,i10 Magna 1.2 Kappa2,220000,2011,67000,Petrol,Manual,Lucknow,Maroon,First,Individual,1197 cc,79 bhp @ 6000 rpm,112.7619 Nm @ 4000 rpm,FWD,3585.0,1595.0,1550.0,5.0,35.0
3,Toyota,Glanza G,799000,2019,37500,Petrol,Manual,Mangalore,Red,First,Individual,1197 cc,82 bhp @ 6000 rpm,113 Nm @ 4200 rpm,FWD,3995.0,1745.0,1510.0,5.0,37.0
4,Toyota,Innova 2.4 VX 7 STR [2016-2020],1950000,2018,69000,Diesel,Manual,Mumbai,Grey,First,Individual,2393 cc,148 bhp @ 3400 rpm,343 Nm @ 1400 rpm,RWD,4735.0,1830.0,1795.0,7.0,55.0


In [4]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 2059 entries, 0 to 2058
Data columns (total 20 columns):
 #   Column              Non-Null Count  Dtype  
---  ------              --------------  -----  
 0   Make                2059 non-null   object 
 1   Model               2059 non-null   object 
 2   Price               2059 non-null   int64  
 3   Year                2059 non-null   int64  
 4   Kilometer           2059 non-null   int64  
 5   Fuel Type           2059 non-null   object 
 6   Transmission        2059 non-null   object 
 7   Location            2059 non-null   object 
 8   Color               2059 non-null   object 
 9   Owner               2059 non-null   object 
 10  Seller Type         2059 non-null   object 
 11  Engine              1979 non-null   object 
 12  Max Power           1979 non-null   object 
 13  Max Torque          1979 non-null   object 
 14  Drivetrain          1923 non-null   object 
 15  Length              1995 non-null   float64
 16  Width 

In [5]:
#dropping unnecessary columns
df=df.drop(['Kilometer','Location','Color','Seller Type','Max Torque','Length','Width','Height'],axis='columns')
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 2059 entries, 0 to 2058
Data columns (total 12 columns):
 #   Column              Non-Null Count  Dtype  
---  ------              --------------  -----  
 0   Make                2059 non-null   object 
 1   Model               2059 non-null   object 
 2   Price               2059 non-null   int64  
 3   Year                2059 non-null   int64  
 4   Fuel Type           2059 non-null   object 
 5   Transmission        2059 non-null   object 
 6   Owner               2059 non-null   object 
 7   Engine              1979 non-null   object 
 8   Max Power           1979 non-null   object 
 9   Drivetrain          1923 non-null   object 
 10  Seating Capacity    1995 non-null   float64
 11  Fuel Tank Capacity  1946 non-null   float64
dtypes: float64(2), int64(2), object(8)
memory usage: 193.2+ KB


In [6]:
df.isnull().values.any()

True

In [7]:
df.isnull().sum()

Make                    0
Model                   0
Price                   0
Year                    0
Fuel Type               0
Transmission            0
Owner                   0
Engine                 80
Max Power              80
Drivetrain            136
Seating Capacity       64
Fuel Tank Capacity    113
dtype: int64

# Data Preprocessing

In [8]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 2059 entries, 0 to 2058
Data columns (total 12 columns):
 #   Column              Non-Null Count  Dtype  
---  ------              --------------  -----  
 0   Make                2059 non-null   object 
 1   Model               2059 non-null   object 
 2   Price               2059 non-null   int64  
 3   Year                2059 non-null   int64  
 4   Fuel Type           2059 non-null   object 
 5   Transmission        2059 non-null   object 
 6   Owner               2059 non-null   object 
 7   Engine              1979 non-null   object 
 8   Max Power           1979 non-null   object 
 9   Drivetrain          1923 non-null   object 
 10  Seating Capacity    1995 non-null   float64
 11  Fuel Tank Capacity  1946 non-null   float64
dtypes: float64(2), int64(2), object(8)
memory usage: 193.2+ KB


In [9]:
df['Engine']
df['Engine'] = df['Engine'].str.replace("cc","")
df['Engine'] = df['Engine'].astype("float64")

In [10]:
df['Engine'] = df['Engine'].fillna(df['Engine'].mean())

In [11]:
df.isnull().sum()

Make                    0
Model                   0
Price                   0
Year                    0
Fuel Type               0
Transmission            0
Owner                   0
Engine                  0
Max Power              80
Drivetrain            136
Seating Capacity       64
Fuel Tank Capacity    113
dtype: int64

In [12]:
df['Max Power'] = df['Max Power'].str.replace("bhp","")
df['Max Power'] = df['Max Power'].str.replace("@","")
df['Max Power'] = df['Max Power'].str.replace("rpm","")
df[['BHP','RPM']] = df['Max Power'].str.split(expand=True)

In [13]:
df['BHP'] = df['BHP'].astype("float64")
df['RPM'] = df['RPM'].astype("float64")
df['BHP'] = df['BHP'].fillna(df['BHP'].mean())
df['RPM'] = df['RPM'].fillna(df['RPM'].mean())

In [14]:
df['MEAN RPM'] = df['RPM'].mean()
df['Max Power'] = (df['BHP']/df['RPM']) * df['MEAN RPM']

In [15]:
df.isnull().sum()

Make                    0
Model                   0
Price                   0
Year                    0
Fuel Type               0
Transmission            0
Owner                   0
Engine                  0
Max Power               0
Drivetrain            136
Seating Capacity       64
Fuel Tank Capacity    113
BHP                     0
RPM                     0
MEAN RPM                0
dtype: int64

In [16]:
df = df.drop(['BHP','RPM','MEAN RPM'],axis='columns')

In [17]:
df.isnull().sum()

Make                    0
Model                   0
Price                   0
Year                    0
Fuel Type               0
Transmission            0
Owner                   0
Engine                  0
Max Power               0
Drivetrain            136
Seating Capacity       64
Fuel Tank Capacity    113
dtype: int64

In [18]:
df['Drivetrain']
df['Drivetrain'].unique()

array(['FWD', 'RWD', 'AWD', nan], dtype=object)

In [19]:
df['Drivetrain'].value_counts()

Drivetrain
FWD    1330
RWD     321
AWD     272
Name: count, dtype: int64

In [20]:
df['Drivetrain'] = df['Drivetrain'].fillna('FWD')

In [21]:
df.isnull().sum()

Make                    0
Model                   0
Price                   0
Year                    0
Fuel Type               0
Transmission            0
Owner                   0
Engine                  0
Max Power               0
Drivetrain              0
Seating Capacity       64
Fuel Tank Capacity    113
dtype: int64

In [22]:
df['Seating Capacity'] = df['Seating Capacity'].fillna(df['Seating Capacity'].mean())
df['Seating Capacity'] = df['Seating Capacity'].astype("int64")

In [23]:
df.isnull().sum()

Make                    0
Model                   0
Price                   0
Year                    0
Fuel Type               0
Transmission            0
Owner                   0
Engine                  0
Max Power               0
Drivetrain              0
Seating Capacity        0
Fuel Tank Capacity    113
dtype: int64

In [24]:
df['Fuel Tank Capacity'] = df['Fuel Tank Capacity'].fillna(df['Fuel Tank Capacity'].mean())

In [25]:
df.isnull().sum()

Make                  0
Model                 0
Price                 0
Year                  0
Fuel Type             0
Transmission          0
Owner                 0
Engine                0
Max Power             0
Drivetrain            0
Seating Capacity      0
Fuel Tank Capacity    0
dtype: int64

In [26]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 2059 entries, 0 to 2058
Data columns (total 12 columns):
 #   Column              Non-Null Count  Dtype  
---  ------              --------------  -----  
 0   Make                2059 non-null   object 
 1   Model               2059 non-null   object 
 2   Price               2059 non-null   int64  
 3   Year                2059 non-null   int64  
 4   Fuel Type           2059 non-null   object 
 5   Transmission        2059 non-null   object 
 6   Owner               2059 non-null   object 
 7   Engine              2059 non-null   float64
 8   Max Power           2059 non-null   float64
 9   Drivetrain          2059 non-null   object 
 10  Seating Capacity    2059 non-null   int64  
 11  Fuel Tank Capacity  2059 non-null   float64
dtypes: float64(3), int64(3), object(6)
memory usage: 193.2+ KB


In [27]:
#one hot encoding for categorical features
df['Make'].unique()

array(['Honda', 'Maruti Suzuki', 'Hyundai', 'Toyota', 'Mercedes-Benz',
       'BMW', 'Skoda', 'Nissan', 'Renault', 'Tata', 'Volkswagen', 'Ford',
       'Audi', 'Mahindra', 'MG', 'Jeep', 'Porsche', 'Kia', 'Land Rover',
       'Volvo', 'Maserati', 'Jaguar', 'Isuzu', 'Fiat', 'MINI', 'Ferrari',
       'Mitsubishi', 'Datsun', 'Lamborghini', 'Chevrolet', 'Ssangyong',
       'Rolls-Royce', 'Lexus'], dtype=object)

In [28]:
le_Make = LabelEncoder()
df['Make'] = le_Make.fit_transform(df['Make'])
df['Make'].dtype

dtype('int32')

In [29]:
#dropping model as it can create dimensionality curse due to its high number of unique values
df['Model'].nunique()

1050

In [30]:
df['Fuel Type'].unique()

array(['Petrol', 'Diesel', 'CNG', 'LPG', 'Electric', 'CNG + CNG',
       'Hybrid', 'Petrol + CNG', 'Petrol + LPG'], dtype=object)

In [31]:
le_Fuel_Type = LabelEncoder()
df['Fuel Type'] = le_Fuel_Type.fit_transform(df['Fuel Type'])
df['Fuel Type'].dtype

dtype('int32')

In [32]:
df['Transmission'].unique()

array(['Manual', 'Automatic'], dtype=object)

In [33]:
le_Transmission = LabelEncoder()
df['Transmission'] = le_Transmission.fit_transform(df['Transmission'])
df['Transmission'].dtype

dtype('int32')

In [34]:
df['Owner'].unique()

array(['First', 'Second', 'Third', 'Fourth', 'UnRegistered Car',
       '4 or More'], dtype=object)

In [35]:
le_Owner = LabelEncoder()
df['Owner'] = le_Owner.fit_transform(df['Owner'])
df['Owner'].dtype

dtype('int32')

In [36]:
df['Drivetrain'].unique()

array(['FWD', 'RWD', 'AWD'], dtype=object)

In [37]:
le_Drivetrain = LabelEncoder()
df['Drivetrain'] = le_Drivetrain.fit_transform(df['Drivetrain'])
df['Drivetrain'].dtype

dtype('int32')

In [39]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 2059 entries, 0 to 2058
Data columns (total 12 columns):
 #   Column              Non-Null Count  Dtype  
---  ------              --------------  -----  
 0   Make                2059 non-null   int32  
 1   Model               2059 non-null   object 
 2   Price               2059 non-null   int64  
 3   Year                2059 non-null   int64  
 4   Fuel Type           2059 non-null   int32  
 5   Transmission        2059 non-null   int32  
 6   Owner               2059 non-null   int32  
 7   Engine              2059 non-null   float64
 8   Max Power           2059 non-null   float64
 9   Drivetrain          2059 non-null   int32  
 10  Seating Capacity    2059 non-null   int64  
 11  Fuel Tank Capacity  2059 non-null   float64
dtypes: float64(3), int32(5), int64(3), object(1)
memory usage: 152.9+ KB


In [40]:
df.head(5)

Unnamed: 0,Make,Model,Price,Year,Fuel Type,Transmission,Owner,Engine,Max Power,Drivetrain,Seating Capacity,Fuel Tank Capacity
0,7,Amaze 1.2 VX i-VTEC,505000,2017,6,1,1,1198.0,69.711104,1,5,35.0
1,19,Swift DZire VDI,450000,2014,2,1,3,1248.0,88.941753,1,5,42.0
2,8,i10 Magna 1.2 Kappa2,220000,2011,6,1,1,1197.0,63.300887,1,5,35.0
3,30,Glanza G,799000,2019,6,1,1,1197.0,65.704719,1,5,37.0
4,30,Innova 2.4 VX 7 STR [2016-2020],1950000,2018,2,1,1,2393.0,209.274714,2,7,55.0


# Model Training and Testing

In [43]:
X = df[['Make','Year','Fuel Type','Transmission','Owner','Engine','Max Power','Drivetrain','Seating Capacity','Fuel Tank Capacity']]
Y = df['Price']

In [47]:
XTRAIN,XTEST,YTRAIN,YTEST = train_test_split(X,Y,test_size = 0.2,random_state = 42)

In [48]:
model = LinearRegression()

In [49]:
model.fit(XTRAIN,YTRAIN)

In [50]:
model.score(XTEST,YTEST)

0.5030674730981368

In [53]:
model.coef_

array([ 1.78966553e+03,  2.02653737e+05,  8.82853193e+04, -3.02396711e+05,
        6.82921401e+04,  1.85222513e+03, -3.59493512e-01, -1.64836672e+05,
       -6.92585706e+05,  3.91393936e+04])

In [54]:
model.intercept_

-408562988.9890434

In [64]:
XTRAIN.head(5)

Unnamed: 0,Make,Year,Fuel Type,Transmission,Owner,Engine,Max Power,Drivetrain,Seating Capacity,Fuel Tank Capacity
266,1,2012,2,0,3,1692.575543,69826.94477,1,5,52.00221
1133,1,2016,2,0,1,1995.0,221.152468,2,5,63.0
1823,18,2017,2,1,1,1997.0,176.921974,1,7,70.0
1370,21,2021,2,0,1,2925.0,435.360534,0,7,90.0
67,30,2019,2,0,1,2755.0,246.03919,2,7,80.0


In [63]:
XINPUT = np.array([['Honda',2017,'Petrol','Manual','First',1198,221,'FWD',5,35]])
XINPUT

array([['Honda', '2017', 'Petrol', 'Manual', 'First', '1198', '221',
        'FWD', '5', '35']], dtype='<U11')

In [65]:
XINPUT[:,0]=le_Make.transform(XINPUT[:,0])
XINPUT[:,2]=le_Fuel_Type.transform(XINPUT[:,2])
XINPUT[:,3]=le_Transmission.transform(XINPUT[:,3])
XINPUT[:,4]=le_Owner.transform(XINPUT[:,4])
XINPUT[:,7]=le_Drivetrain.transform(XINPUT[:,7])

XINPUT=XINPUT.astype('float')

In [68]:
model.predict(XINPUT)

array([458734.04951459])

# Loading appropriate model and class objects into pickle file

In [69]:
data = {"model":model, "le_Make":le_Make, "le_Fuel_Type":le_Fuel_Type, "le_Transmission":le_Transmission, "le_Owner":le_Owner, "le_Drivetrain":le_Drivetrain}
with open('saved_steps_of_vehicle_price_prediction.pkl','wb') as file:
    pickle.dump(data,file)

In [70]:
with open('saved_steps_of_vehicle_price_prediction.pkl','rb') as file:
    data = pickle.load(file)
    
linearregressor = data["model"]
le_Make = data["le_Make"]
le_Fuel_Type = data["le_Fuel_Type"]
le_Transmission = data["le_Transmission"]
le_Owner = data["le_Owner"]
le_Drivetrain = data["le_Drivetrain"]

In [71]:
linearregressor.predict(XINPUT)

array([458734.04951459])