# Import libraries

In [1]:
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import numpy as np
import missingno as msno
import pickle
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestRegressor
from sklearn.linear_model import LinearRegression
from sklearn import metrics

# Load the corrected dataframe (produced by the "find error" notebook).
# The context manager guarantees the file handle is closed after loading.
# NOTE(review): pickle can execute arbitrary code while loading; only open
# files that come from a trusted source.
with open('df_handled', 'rb') as df_file:
    df = pickle.load(df_file)
df.head()

Unnamed: 0,id,car_name,brand,model,min_cost_price,max_cost_price,vehicle_age,km_driven,seller_type,fuel_type,transmission_type,mileage,engine,max_power,seats,selling_price
0,0,Maruti Alto,Maruti,Alto,560339.9,863436.5,9,120000,Individual,Petrol,Manual,19.7,796,46.3,5,120000
1,1,Hyundai Grand,Hyundai,Grand,711000.0,748000.0,5,20000,Individual,Petrol,Manual,18.9,1197,82.0,5,550000
2,2,Hyundai i20,Hyundai,i20,481651.3,759664.7,11,60000,Individual,Petrol,Manual,17.0,1197,80.0,5,215000
3,3,Maruti Alto,Maruti,Alto,402237.3,660838.9,9,37000,Individual,Petrol,Manual,20.92,998,67.1,5,226000
4,4,Ford Ecosport,Ford,Ecosport,1014000.0,1379000.0,6,30000,Dealer,Diesel,Manual,22.77,1498,98.59,5,570000


In [2]:
# Report the (rows, columns) dimensions of the loaded dataframe.
print(df.shape)
# Optional exploratory histograms of the dataframe's columns (left disabled):
# df.hist(bins=50, figsize=(20,15))

(19544, 16)


In [3]:
# Summarise each column's dtype and non-null count.
df.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 19544 entries, 0 to 19543
Data columns (total 16 columns):
 #   Column             Non-Null Count  Dtype  
---  ------             --------------  -----  
 0   id                 19544 non-null  int64  
 1   car_name           19544 non-null  object 
 2   brand              19544 non-null  object 
 3   model              19544 non-null  object 
 4   min_cost_price     19544 non-null  float64
 5   max_cost_price     19544 non-null  float64
 6   vehicle_age        19544 non-null  int64  
 7   km_driven          19544 non-null  int64  
 8   seller_type        19544 non-null  object 
 9   fuel_type          19544 non-null  object 
 10  transmission_type  19544 non-null  object 
 11  mileage            19544 non-null  float64
 12  engine             19544 non-null  int64  
 13  max_power          19544 non-null  float64
 14  seats              19544 non-null  int64  
 15  selling_price      19544 non-null  int64  
dtypes: float64(4), int64(6

# Dropping the columns that won't contribute much to the prediction

To build the model inputs, we drop the columns that the user will not provide (e.g. `selling_price` is the output to predict, not an input).
We also drop the features that do not contribute significantly to the price prediction.

In [4]:
# Columns left out of the feature matrix: the prediction target itself plus
# the columns that are not used as model inputs.
dropped_columns = ['selling_price', 'id', 'car_name', 'brand', 'model', 'seats']

inputs = df.drop(columns=dropped_columns)
target = df['selling_price']
inputs.head()

Unnamed: 0,min_cost_price,max_cost_price,vehicle_age,km_driven,seller_type,fuel_type,transmission_type,mileage,engine,max_power
0,560339.9,863436.5,9,120000,Individual,Petrol,Manual,19.7,796,46.3
1,711000.0,748000.0,5,20000,Individual,Petrol,Manual,18.9,1197,82.0
2,481651.3,759664.7,11,60000,Individual,Petrol,Manual,17.0,1197,80.0
3,402237.3,660838.9,9,37000,Individual,Petrol,Manual,20.92,998,67.1
4,1014000.0,1379000.0,6,30000,Dealer,Diesel,Manual,22.77,1498,98.59


# Handling non-numeric data

One-hot encoding is used to convert the non-numeric (categorical) columns into numeric indicator columns.

In [5]:
def dummies(x,df):
    """Return a new frame in which column ``x`` is replaced by one-hot columns.

    The indicator columns come from ``pd.get_dummies(..., drop_first=True)``,
    so the first category gets no column of its own. They are named after the
    category values (no prefix) and appended after the existing columns. The
    frame passed in is left unmodified.

    Parameters
    ----------
    x : name of the categorical column to encode.
    df : DataFrame containing column ``x``.
    """
    indicator_cols = pd.get_dummies(df[x], drop_first=True)
    combined = pd.concat([df, indicator_cols], axis=1)
    # Remove the original categorical column now that it is encoded.
    return combined.drop([x], axis=1)

# One-hot encode each categorical feature in turn; the order here fixes the
# order in which the indicator columns are appended to ``inputs``.
for categorical_column in ('fuel_type', 'transmission_type', 'seller_type'):
    inputs = dummies(categorical_column, inputs)
inputs.head(10)

Unnamed: 0,min_cost_price,max_cost_price,vehicle_age,km_driven,mileage,engine,max_power,Diesel,Electric,LPG,Petrol,Manual,Individual,Trustmark Dealer
0,560339.9,863436.5,9,120000,19.7,796,46.3,0,0,0,1,1,1,0
1,711000.0,748000.0,5,20000,18.9,1197,82.0,0,0,0,1,1,1,0
2,481651.3,759664.7,11,60000,17.0,1197,80.0,0,0,0,1,1,1,0
3,402237.3,660838.9,9,37000,20.92,998,67.1,0,0,0,1,1,1,0
4,1014000.0,1379000.0,6,30000,22.77,1498,98.59,1,0,0,0,1,0,0
5,516000.0,694000.0,8,35000,18.9,998,67.1,0,0,0,1,1,1,0
6,654000.0,663000.0,8,40000,20.36,1197,78.9,0,0,0,1,1,0,0
7,526000.0,701000.0,3,17512,20.51,998,67.04,0,0,0,1,1,0,0
8,770000.0,1302000.0,2,20000,18.15,998,118.35,0,0,0,1,0,1,0
9,1206540.0,1627510.0,4,70000,18.49,1493,100.0,1,0,0,0,1,0,0


In [6]:
# inputs.isnull().sum()

# Training the model

In [7]:
# Hold out 20% of the rows for evaluation; the fixed seed keeps the split
# identical across runs.
x_train, x_test, y_train, y_test = train_test_split(
    inputs,
    target,
    test_size=0.2,
    random_state=42,
)

In [8]:
# Random forest regressor.
# - ``bootstrap`` must be the boolean True: the string 'True' only worked by
#   being truthy and is rejected by parameter validation in recent
#   scikit-learn releases.
# - ``random_state`` is fixed so the fitted forest, and every score and
#   prediction derived from it, is reproducible on Restart & Run All.
model = RandomForestRegressor(max_features='sqrt', bootstrap=True, random_state=42)
model.fit(x_train, y_train)

# Linear-regression baseline fitted on the same training split for comparison.
model2 = LinearRegression()
model2.fit(x_train, y_train)

LinearRegression()

# Testing the model

Here we can see that the random forest regressor achieves a much better R² score (the value returned by `score` for regressors) than linear regression.

In [9]:
# Score both fitted models on the held-out split:
# random forest first, then the linear-regression baseline.
for fitted_model in (model, model2):
    print(fitted_model.score(x_test, y_test))

0.9407266721666135
0.7656541091535729


In [10]:
# Inputs from user
vehicle_age = 5
km_driven = 20000
fuel_type = 'petrol'
transmission_type = 'manual'
seller_type = 'Individual'
mileage = 18.9
engine = 1197
max_power = 82
# seats=5

# Manual one-hot encoding of the categorical inputs: each dict maps every
# known category to 0, except the selected one which maps to 1. A value that
# is not a known category leaves its whole dict at 0.
fuel_list = {
    fuel: int(fuel == fuel_type)
    for fuel in ('petrol', 'diesel', 'electric', 'lpg', 'cng')
}

transmission_list = {
    gearbox: int(gearbox == transmission_type)
    for gearbox in ('manual', 'automatic')
}

seller_list = {
    seller: int(seller == seller_type)
    for seller in ('Individual', 'Trustmark Dealer', 'Dealer')
}

# Values later passed to the model as the min/max cost price features.
mins = 711000
maxs = 748000

In [11]:
# Load the pickled lookup objects. Each file is opened in a context manager
# so its handle is closed as soon as the object has been read.
# NOTE(review): pickle can execute arbitrary code while loading; only open
# files that come from a trusted source.
with open('model_list', 'rb') as model_list_file:
    model_list = pickle.load(model_list_file)

# Indexed by car model name further down (e.g. model_min_dict['i20']).
with open('model_min_dict', 'rb') as min_dict_file:
    model_min_dict = pickle.load(min_dict_file)

with open('model_max_dict', 'rb') as max_dict_file:
    model_max_dict = pickle.load(max_dict_file)

In [12]:
# Build the feature row by column NAME rather than by position. The model
# was fitted on a DataFrame, so predicting from a bare list drops the feature
# names (recent scikit-learn warns about this) and silently depends on the
# values being typed in exactly the training column order.
user_row = {
    'min_cost_price': mins,
    'max_cost_price': maxs,
    'vehicle_age': vehicle_age,
    'km_driven': km_driven,
    'mileage': mileage,
    'engine': engine,
    'max_power': max_power,
    'Diesel': fuel_list['diesel'],
    'Electric': fuel_list['electric'],
    'LPG': fuel_list['lpg'],
    'Petrol': fuel_list['petrol'],
    'Manual': transmission_list['manual'],
    'Individual': seller_list['Individual'],
    'Trustmark Dealer': seller_list['Trustmark Dealer'],
}
# Selecting by the training columns puts the row in the fitted order and
# raises KeyError if any expected feature is missing from ``user_row``.
user_features = pd.DataFrame([user_row])[list(x_train.columns)]

predictions = model.predict(user_features)
predictions

array([529860.83333333])

In [13]:
# Feature order follows the training columns: min/max cost price, vehicle
# age, km driven, mileage, engine, max power, then the fuel, transmission
# and seller indicator columns.
i20_features = [
    model_min_dict['i20'], model_max_dict['i20'],
    11, 60000, 17, 1197, 80,
    0, 0, 0, 1,   # Diesel, Electric, LPG, Petrol
    1,            # Manual
    1, 0,         # Individual, Trustmark Dealer
]
pred1 = model.predict([i20_features])
pred1

array([317016.66666667])

In [14]:
# Feature order follows the training columns: min/max cost price, vehicle
# age, km driven, mileage, engine, max power, then the fuel, transmission
# and seller indicator columns.
alto_features = [
    model_min_dict['Alto'], model_max_dict['Alto'],
    9, 37000, 20.92, 998, 67.10,
    0, 0, 0, 1,   # Diesel, Electric, LPG, Petrol
    1,            # Manual
    1, 0,         # Individual, Trustmark Dealer
]
pred2 = model.predict([alto_features])
pred2

array([229280.])

# Dump the model

In [15]:
# Persist the trained random forest. The context manager flushes and closes
# the file, so the pickle on disk is complete before it is loaded again.
with open('save_model1', 'wb') as model_file:
    pickle.dump(model, model_file)


# Load the model

In [16]:
# Reload the saved model and the min/max lookup dictionaries. Each file is
# opened in a context manager so its handle is closed after reading.
# NOTE(review): pickle can execute arbitrary code while loading; only open
# files that come from a trusted source.
with open('save_model1', 'rb') as model_file:
    model1 = pickle.load(model_file)
with open('model_min_dict', 'rb') as min_dict_file:
    min_dict1 = pickle.load(min_dict_file)
with open('model_max_dict', 'rb') as max_dict_file:
    max_dict1 = pickle.load(max_dict_file)

# List every key of the min dictionary with its value rounded to 2 decimals.
for model_name, min_value in min_dict1.items():
    print(model_name, "\t\t", round(min_value, 2))

Alto 		 435075.16
Grand 		 766563.68
i20 		 907119.21
Ecosport 		 1040523.66
Wagon R 		 561001.14
i10 		 606068.67
Venue 		 986468.77
TUV 		 1412335.19
Indigo 		 641383.79
Captur 		 1922018.89
Swift 		 863898.71
Micra 		 811643.04
Verna 		 1196644.8
Duster 		 1213702.96
Cooper 		 4086603.28
Ciaz 		 1219199.57
C-Class 		 4974954.76
Innova 		 2017883.18
Baleno 		 876603.9
Swift Dzire 		 967040.46
Grande 		 719570.07
Vento 		 1107026.86
Creta 		 1317830.38
Xylo 		 1045217.24
City 		 1318524.24
Bolero 		 1024328.53
Fortuner 		 3900155.38
KWID 		 414003.62
Amaze 		 814449.62
Santro 		 499768.74
XUV500 		 1693438.5
Sail 		 682695.49
Xcent 		 962636.4
800 		 311397.45
Avigo 		 586037.13
Nano 		 213697.62
KUV100 		 895638.63
Etios 		 956908.81
Ignis 		 656405.57
Corolla 		 1558181.39
RediGO 		 363092.72
Vista 		 785881.8
Omni 		 433351.22
Scorpio 		 1524078.53
Marazzo 		 1405101.29
Aspire 		 894040.05
Figo 		 809264.68
Supro 		 879336.88
Vitara 		 1592439.28
Tiago 		 666130.73
Polo 		 878013.9

In [17]:
#id=18992
pred5 = model1.predict([[model_min_dict['C-Class'],model_max_dict['C-Class'],9,60000,11.74,1796,186.0,0,0,0,1,0,1,0]])
print(pred5)
pred5x = model2.predict([[model_min_dict['C-Class'],model_max_dict['C-Class'],9,60000,11.74,1796,186.0,0,0,0,1,0,1,0]])
print(pred5x)

[1188690.]
[1786243.39038266]


In [18]:
#id = 19543 (last)
pred6 = model1.predict([[model_min_dict['City'],model_max_dict['City'],2,13000,18.0,1497,117.0,0,0,0,1,0,0,0]])
pred6x =  model2.predict([[model_min_dict['City'],model_max_dict['City'],2,13000,18.0,1497,117.0,0,0,0,1,0,0,0]])
print(pred6)
print(pred6x)

[1095170.]
[1124305.57106687]


In [19]:
#id=8
pred7 = model1.predict([[model_min_dict['Venue'],model_max_dict['Venue'],2,20000,18.5,998,118.35,0,0,0,1,0,1,0]])
print(pred7)
pred7x = model2.predict([[model_min_dict['Venue'],model_max_dict['Venue'],2,20000,18.5,998,118.35,0,0,0,1,0,1,0]])
print(pred7x)

[916335.]
[1000548.79633633]


In [20]:
model_of_brand = {}
