In [1]:
!pip install xgboost==1.6.1



In [2]:
import numpy as np
import pandas as pd

In [3]:
import joblib

In [4]:
# Published-to-web Google Sheets CSV export that holds the car listings.
# Adjacent string literals are concatenated into a single URL.
car_dekho = (
    'https://docs.google.com/spreadsheets/d/e/'
    '2PACX-1vRcQv44MEkBHoH6j1OfUo-J3vDuyv-R7vE-MZzcpBNUGOWc30SZCfMmRwhwO_7SpfH_ca-cRU_0_UUi'
    '/pub?gid=1127755762&single=true&output=csv'
)

In [5]:
# Download the listings CSV from the URL above into a DataFrame.
cars = pd.read_csv(filepath_or_buffer=car_dekho)

In [6]:
# Clean the raw listings in one non-mutating chain. Building a new frame at
# each step (instead of inplace drops and attribute assignment on a filtered
# slice) avoids pandas' SettingWithCopyWarning and yields the same result.
cars = (
    cars
    # Drop the exported index column and the free-text name column.
    .drop(columns=['Unnamed: 0', 'car_name'])
    # Store the model as "<brand> <model>".
    .assign(model=lambda df: df['brand'] + ' ' + df['model'])
    # Discard rows that report zero seats.
    .loc[lambda df: df['seats'] != 0]
    # Rescale the price into units of 100,000
    # (presumably rupees -> lakhs; confirm against the data source).
    .assign(selling_price=lambda df: df['selling_price'] / 100000)
)
cars.head()

Unnamed: 0,brand,model,vehicle_age,km_driven,seller_type,fuel_type,transmission_type,mileage,engine,max_power,seats,selling_price
0,Maruti,Maruti Alto,9,120000,Individual,Petrol,Manual,19.7,796,46.3,5,1.2
1,Hyundai,Hyundai Grand,5,20000,Individual,Petrol,Manual,18.9,1197,82.0,5,5.5
2,Hyundai,Hyundai i20,11,60000,Individual,Petrol,Manual,17.0,1197,80.0,5,2.15
3,Maruti,Maruti Alto,9,37000,Individual,Petrol,Manual,20.92,998,67.1,5,2.26
4,Ford,Ford Ecosport,6,30000,Dealer,Diesel,Manual,22.77,1498,98.59,5,5.7


In [7]:
# Persist the cleaned frame next to the notebook, without the index column.
cars.to_csv(path_or_buf='cars.csv', index=False)

In [8]:
# Separate the regression target from the feature columns.
y = cars['selling_price']
X = cars.drop(columns='selling_price')

In [9]:
from sklearn.model_selection import train_test_split

In [10]:
# Hold out 20% of the rows for evaluation; the fixed seed keeps the split
# identical between runs.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
)

In [11]:
from sklearn.preprocessing import OrdinalEncoder
# Fit the encoder on the full feature frame (not only the training split) so
# that `oe.categories_` lists every category value present in the data.
oe = OrdinalEncoder()
oe.fit(
    X.loc[:, ['brand', 'model', 'seller_type', 'fuel_type', 'transmission_type']]
)

In [12]:
#cat = oe.categories_

In [13]:
from sklearn.compose import make_column_transformer
from sklearn.pipeline import make_pipeline
from sklearn.preprocessing import MinMaxScaler,RobustScaler,StandardScaler
from sklearn.metrics import r2_score,mean_squared_error,mean_absolute_error

In [14]:
# Per-column preprocessing applied in front of every model below.
column_transformer = make_column_transformer(
    # Categorical text columns -> integer codes, reusing the category lists
    # learned by `oe` so the code assignment is fixed up front.
    (
        OrdinalEncoder(categories=oe.categories_),
        ['brand', 'model', 'seller_type', 'fuel_type', 'transmission_type'],
    ),
    # Age is rescaled to the [0, 1] range.
    (MinMaxScaler(), ['vehicle_age']),
    # Mileage is standardised (zero mean, unit variance).
    (StandardScaler(), ['mileage']),
    # These columns are scaled with median/IQR statistics.
    (RobustScaler(), ['km_driven', 'engine', 'max_power']),
    # Any column not listed above is passed through unchanged.
    remainder='passthrough',
)

In [15]:
from sklearn.linear_model import LinearRegression
from sklearn.ensemble import RandomForestRegressor
from xgboost import XGBRegressor

In [16]:
# Candidate regressors. The tree ensembles are seeded so that repeated
# "Restart & Run All" executions give the same metrics and the same saved
# models; the seed matches the one used for the train/test split.
lr = LinearRegression()
rf = RandomForestRegressor(random_state=42)
xg = XGBRegressor(random_state=42)

In [17]:
# Baseline pipeline: shared preprocessing followed by linear regression.
pipe = make_pipeline(
    column_transformer,
    lr,
)

In [18]:
# Fit the preprocessing and the estimator on the training split only.
pipe.fit(X=X_train, y=y_train)

The format of the columns of the 'remainder' transformer in ColumnTransformer.transformers_ will change in version 1.7 to match the format of the other transformers.
At the moment the remainder columns are stored as indices (of type int). With the same ColumnTransformer configuration, in the future they will be stored as column names (of type str).



In [19]:
# Predict prices for the held-out rows.
y_pred = pipe.predict(X=X_test)

In [20]:
# Hold-out metrics, one per line: R^2 (more is good), then MSE and MAE
# (less is good).
for metric in (r2_score, mean_squared_error, mean_absolute_error):
    print(metric(y_test, y_pred))

0.6491200640879395
21.54207148454765
2.652656795409261


In [21]:
# Rebind `pipe` to the random-forest pipeline (same preprocessing, new model).
pipe = make_pipeline(
    column_transformer,
    rf,
)

In [22]:
# Fit the preprocessing and the estimator on the training split only.
pipe.fit(X=X_train, y=y_train)

The format of the columns of the 'remainder' transformer in ColumnTransformer.transformers_ will change in version 1.7 to match the format of the other transformers.
At the moment the remainder columns are stored as indices (of type int). With the same ColumnTransformer configuration, in the future they will be stored as column names (of type str).



In [23]:
# Predict prices for the held-out rows.
y_pred = pipe.predict(X=X_test)

In [24]:
# Hold-out metrics, one per line: R^2 (more is good), then MSE and MAE
# (less is good).
for metric in (r2_score, mean_squared_error, mean_absolute_error):
    print(metric(y_test, y_pred))

0.9432451796658432
3.4844294916808307
0.9479566097378719


In [25]:
# XGBoost pipeline, kept under its own name so `pipe` stays available.
pipe_2 = make_pipeline(
    column_transformer,
    xg,
)

In [26]:
# Fit the preprocessing and the estimator on the training split only.
pipe_2.fit(X=X_train, y=y_train)

The format of the columns of the 'remainder' transformer in ColumnTransformer.transformers_ will change in version 1.7 to match the format of the other transformers.
At the moment the remainder columns are stored as indices (of type int). With the same ColumnTransformer configuration, in the future they will be stored as column names (of type str).



In [27]:
# Predict prices for the held-out rows.
y_pred = pipe_2.predict(X=X_test)

In [28]:
# Hold-out metrics, one per line: R^2 (more is good), then MSE and MAE
# (less is good).
for metric in (r2_score, mean_squared_error, mean_absolute_error):
    print(metric(y_test, y_pred))

0.9476498286685557
3.2140086041651843
0.9076568996310621


In [29]:
#import pickle

In [30]:
# Serialize the fitted pipeline currently bound to `pipe` with joblib.
joblib.dump(value=pipe, filename='RandomForestModel2.pkl')

['RandomForestModel2.pkl']

In [31]:
# Serialize the fitted pipeline bound to `pipe_2` with joblib.
joblib.dump(value=pipe_2, filename='XGBoostModel2.pkl')

['XGBoostModel2.pkl']

In [32]:
# RandomForest Model: spot-check with a single hand-written listing.
pipe.predict(pd.DataFrame({
    'brand': ['Mercedes-Benz'],
    'model': ['Mercedes-Benz C-Class'],
    'vehicle_age': [7],
    'km_driven': [65000],
    'seller_type': ['Dealer'],
    'fuel_type': ['Diesel'],
    'transmission_type': ['Automatic'],
    'mileage': [19.27],
    'engine': [2143],
    'max_power': [170],
    'seats': [5],
}))

array([14.47825])

In [33]:
# XGBoost Model: spot-check with a single hand-written listing
# (note the deliberately extreme vehicle_age of 100).
pipe_2.predict(pd.DataFrame({
    'brand': ['Mercedes-Benz'],
    'model': ['Mercedes-Benz C-Class'],
    'vehicle_age': [100],
    'km_driven': [65000],
    'seller_type': ['Dealer'],
    'fuel_type': ['Petrol'],
    'transmission_type': ['Automatic'],
    'mileage': [19.27],
    'engine': [2143],
    'max_power': [170],
    'seats': [5],
}))

array([6.2763863], dtype=float32)

In [34]:
# Second XGBoost spot-check: fractional vehicle_age and a manual gearbox.
pipe_2.predict(pd.DataFrame({
    'brand': ['Mercedes-Benz'],
    'model': ['Mercedes-Benz C-Class'],
    'vehicle_age': [12.3],
    'km_driven': [65000],
    'seller_type': ['Dealer'],
    'fuel_type': ['Petrol'],
    'transmission_type': ['Manual'],
    'mileage': [19.27],
    'engine': [2143],
    'max_power': [170],
    'seats': [5],
}))

array([6.173309], dtype=float32)

In [35]:
def Pred_model(pipe, pipe_2, data=None, input_fn=None):
    """Prompt for one car's features and predict its price with both models.

    The eleven feature values are requested in the column order the pipelines
    are called with. Each categorical answer must exactly match a value found
    in the column of the same name in ``data``; on the first answer that does
    not, a message is printed and ``None`` is returned without predicting.

    Note that in this notebook's ``cars`` frame the ``model`` column stores
    "<brand> <model>" (e.g. "Maruti Alto"), so the model must be entered in
    that form.

    Parameters
    ----------
    pipe : object with ``predict``
        Fitted pipeline whose prediction is reported as the random-forest
        output.
    pipe_2 : object with ``predict``
        Fitted pipeline whose prediction is reported as the XGBoost output.
    data : pandas.DataFrame, optional
        Frame used to validate categorical answers. Defaults to the
        notebook-level ``cars`` frame.
    input_fn : callable, optional
        Function that receives a prompt string and returns the answer.
        Defaults to the built-in ``input`` (resolved at call time).

    Returns
    -------
    str or None
        A two-line summary with both predictions, or ``None`` when a
        categorical answer is not present in ``data``.

    Raises
    ------
    ValueError
        If a numeric answer cannot be converted with ``float``.
    """
    if data is None:
        data = cars
    if input_fn is None:
        input_fn = input

    # Categorical column -> (prompt shown to the user, message when unknown).
    categorical_fields = {
        'brand': ('Enter the brand of the car: ',
                  'Brand not found in the dataset'),
        'model': ('Enter the model of the car: ',
                  'Model not found in the dataset'),
        'seller_type': ('Enter the seller_type of the car: ',
                        'seller_type not found in the dataset'),
        'fuel_type': ('Enter the fuel_type of the car: ',
                      'fuel_type not found in the dataset'),
        'transmission_type': ('Enter the transmission_type of the car: ',
                              'transmission_type not found in the dataset'),
    }
    # Prompt order and the column order of the frame handed to the pipelines.
    feature_columns = ['brand', 'model', 'vehicle_age', 'km_driven',
                       'seller_type', 'fuel_type', 'transmission_type',
                       'mileage', 'engine', 'max_power', 'seats']

    row = {}
    for column in feature_columns:
        if column in categorical_fields:
            prompt, not_found_message = categorical_fields[column]
            answer = input_fn(prompt).strip()
            # Exact match against this field's own column: the value itself
            # (not a found/not-found flag) is what the model must receive.
            if not data[column].astype(str).eq(answer).any():
                print(not_found_message)
                return None
            row[column] = answer
        else:
            row[column] = float(input_fn(f'Enter the {column}'))

    # Build the single-row frame once and reuse it for both models.
    features = pd.DataFrame([row], columns=feature_columns)
    result_1 = pipe.predict(features)
    result_2 = pipe_2.predict(features)

    return f'The Random forest model output: {result_1}\nThe XGBoost model output: {result_2}'

In [36]:
#Pred_model(pipe,pipe_2)