In [2]:
# import libraries
import numpy as np
import pandas as pd
import warnings
warnings.filterwarnings('ignore')

In [3]:
df = pd.read_csv('cleaning_car.csv')
df

Unnamed: 0.1,Unnamed: 0,company,name,year,kms_driven,fuel_type,Price
0,0,Hyundai,Hyundai Santro Xing XO eRLX Euro III,2007,45000,Petrol,80000.0
1,1,Hyundai,Hyundai Grand i10 Magna 1.2 Kappa VTVT,2014,28000,Petrol,325000.0
2,2,Ford,Ford EcoSport Titanium 1.5L TDCi,2014,36000,Diesel,575000.0
3,3,Ford,Ford Figo,2012,41000,Diesel,175000.0
4,4,Hyundai,Hyundai Eon,2013,25000,Petrol,190000.0
...,...,...,...,...,...,...,...
646,646,Maruti,Maruti Suzuki Ritz VXI ABS,2011,50000,Petrol,270000.0
647,647,Tata,Tata Indica V2 DLE BS III,2009,30000,Diesel,110000.0
648,648,Toyota,Toyota Corolla Altis,2009,132000,Petrol,300000.0
649,649,Tata,Tata Zest XM Diesel,2018,27000,Diesel,260000.0


In [4]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 651 entries, 0 to 650
Data columns (total 7 columns):
 #   Column      Non-Null Count  Dtype  
---  ------      --------------  -----  
 0   Unnamed: 0  651 non-null    int64  
 1   company     651 non-null    object 
 2   name        651 non-null    object 
 3   year        651 non-null    int64  
 4   kms_driven  651 non-null    int64  
 5   fuel_type   651 non-null    object 
 6   Price       651 non-null    float64
dtypes: float64(1), int64(3), object(3)
memory usage: 35.7+ KB


In [5]:
x = df[['company', 'name', 'year', 'kms_driven', 'fuel_type']]
y = df[['Price']]

In [6]:
x

Unnamed: 0,company,name,year,kms_driven,fuel_type
0,Hyundai,Hyundai Santro Xing XO eRLX Euro III,2007,45000,Petrol
1,Hyundai,Hyundai Grand i10 Magna 1.2 Kappa VTVT,2014,28000,Petrol
2,Ford,Ford EcoSport Titanium 1.5L TDCi,2014,36000,Diesel
3,Ford,Ford Figo,2012,41000,Diesel
4,Hyundai,Hyundai Eon,2013,25000,Petrol
...,...,...,...,...,...
646,Maruti,Maruti Suzuki Ritz VXI ABS,2011,50000,Petrol
647,Tata,Tata Indica V2 DLE BS III,2009,30000,Diesel
648,Toyota,Toyota Corolla Altis,2009,132000,Petrol
649,Tata,Tata Zest XM Diesel,2018,27000,Diesel


In [7]:
x['company'].unique()

array(['Hyundai', 'Ford', 'Maruti', 'Skoda', 'Mahindra', 'Audi', 'Toyota',
       'Renault', 'Honda', 'Datsun', 'Mitsubishi', 'Tata', 'Volkswagen',
       'Chevrolet', 'Mini', 'BMW', 'Nissan', 'Hindustan', 'Fiat',
       'Mercedes', 'Land', 'Force', 'Jaguar', 'Volvo'], dtype=object)

In [8]:
x['fuel_type'].unique

<bound method Series.unique of 0      Petrol
1      Petrol
2      Diesel
3      Diesel
4      Petrol
        ...  
646    Petrol
647    Diesel
648    Petrol
649    Diesel
650    Diesel
Name: fuel_type, Length: 651, dtype: object>

In [9]:
from sklearn.preprocessing import OneHotEncoder
from sklearn.compose import make_column_transformer
ohe = OneHotEncoder()
ohe.fit(x[['company', 'name', 'fuel_type']])

In [10]:
ct = make_column_transformer((OneHotEncoder(categories = ohe.categories_),['company', 'name', 'fuel_type']),remainder = 'passthrough',force_int_remainder_cols=False)
ct

In [18]:
from sklearn.linear_model import LinearRegression
from sklearn.pipeline import make_pipeline
from sklearn.model_selection import train_test_split
from sklearn.metrics import r2_score, mean_squared_error
from sklearn.tree import DecisionTreeRegressor
from sklearn.ensemble import RandomForestRegressor

regLinear = LinearRegression()
regDec = DecisionTreeRegressor(random_state=0)
regRFR = RandomForestRegressor(n_estimators = 10, random_state = 0)

pipeLinear = make_pipeline(ct, regLinear)
pipeDec = make_pipeline(ct, regDec)
pipeRFR = make_pipeline(ct, regRFR)

scores = []

for i in range(0, 101):
    x_train, x_test, y_train, y_test = train_test_split(x, y, test_size = 0.05, random_state = i)
    pipeLinear.fit(x_train, y_train)
    pipeDec.fit(x_train, y_train)
    pipeRFR.fit(x_train, y_train)
  
    resultLinear = pipeLinear.predict(x_test)
    scoreLinear = r2_score(y_test, resultLinear)
    rmseLinear = mean_squared_error(y_test, resultLinear)  

    resultDec = pipeDec.predict(x_test)
    scoreDec = r2_score(y_test, resultDec)
    rmseDec = mean_squared_error(y_test, resultDec) 

    resultRFR = pipeRFR.predict(x_test)
    scoreRFR = r2_score(y_test, resultRFR)
    rmseRFR = mean_squared_error(y_test, resultRFR) 
   
    scores.append(('Linear', i, scoreLinear, rmseLinear))
    scores.append(('Decision', i, scoreDec, rmseDec))
    scores.append(('Random', i, scoreRFR, rmseRFR))

In [19]:
scoreDf = pd.DataFrame(data = scores, columns= ['Algo', 'Iteration', 'R2 Score', 'RMSE Score'])
resultDf = scoreDf.sort_values(by = 'R2 Score', ascending = False)
resultDf

Unnamed: 0,Algo,Iteration,R2 Score,RMSE Score
260,Random,86,0.948680,8.450102e+09
258,Linear,86,0.900843,1.632654e+10
235,Decision,78,0.895555,1.412894e+10
88,Decision,29,0.886490,2.985494e+10
178,Decision,59,0.876524,1.621262e+10
...,...,...,...,...
69,Linear,23,-3.377180,2.173567e+11
187,Decision,62,-3.392739,2.169655e+11
105,Linear,35,-3.568765,2.292161e+11
265,Decision,88,-5.259084,1.953103e+11


In [21]:
bestindex = resultDf.loc[0, 'Iteration']
bestindex

x_train, x_test, y_train, y_test = train_test_split(x, y, test_size = 0.05, random_state = 86)
pipeLinear.fit(x_train, y_train)
pipeDec.fit(x_train, y_train)
pipeRFR.fit(x_train, y_train)

In [25]:
company = input("Enter a Company Name:-")
name = input("Enter a Car Name:-")
year = int(input("Enter a Year:-"))
kms_driven = int(input("Enter a Kms_Driven:-"))
fuel_type = input("Enter a Fuel_Type:-")
columns = ['company', 'name', 'year', 'kms_driven', 'fuel_type']
myinput = pd.DataFrame(columns = columns, data = [[company, name, year, kms_driven, fuel_type]])
result = pipeRFR.predict(myinput)
print("You Should buy it for price :-",result)

Enter a Company Name:- Ford
Enter a Car Name:- Ford Figo
Enter a Year:- 2020
Enter a Kms_Driven:- 45000
Enter a Fuel_Type:- Petrol


You Should buy it for price :- [471499.8]
