## Used Car Price Prediction

In [1]:
import matplotlib.pyplot as plt
import pandas as pd
import numpy as np
import seaborn as sns
import plotly.express as px

In [2]:
df = pd.read_csv('cardekho_dataset.csv')
df.head()

Unnamed: 0.1,Unnamed: 0,car_name,brand,model,vehicle_age,km_driven,seller_type,fuel_type,transmission_type,mileage,engine,max_power,seats,selling_price
0,0,Maruti Alto,Maruti,Alto,9,120000,Individual,Petrol,Manual,19.7,796,46.3,5,120000
1,1,Hyundai Grand,Hyundai,Grand,5,20000,Individual,Petrol,Manual,18.9,1197,82.0,5,550000
2,2,Hyundai i20,Hyundai,i20,11,60000,Individual,Petrol,Manual,17.0,1197,80.0,5,215000
3,3,Maruti Alto,Maruti,Alto,9,37000,Individual,Petrol,Manual,20.92,998,67.1,5,226000
4,4,Ford Ecosport,Ford,Ecosport,6,30000,Dealer,Diesel,Manual,22.77,1498,98.59,5,570000


In [3]:
df.isnull().sum()

Unnamed: 0           0
car_name             0
brand                0
model                0
vehicle_age          0
km_driven            0
seller_type          0
fuel_type            0
transmission_type    0
mileage              0
engine               0
max_power            0
seats                0
selling_price        0
dtype: int64

In [4]:
num_features = [feature for feature in df.columns if df[feature].dtype != 'O']
cat_features = [feature for feature in df.columns if df[feature].dtype == 'O']
discrete_features = [feature for feature in num_features if len(df[feature].unique()) <= 25 ]
continuous_features = [feature for feature in num_features if feature not in discrete_features ]
print(len(num_features))
print(len(cat_features))
print(len(discrete_features))
print(len(continuous_features))

8
6
2
6


In [5]:
from sklearn.model_selection import train_test_split
X = df.drop(['selling_price'],axis=1)
y = df['selling_price']

In [6]:
X = X.iloc[:,1:]

In [7]:
X.head()

Unnamed: 0,car_name,brand,model,vehicle_age,km_driven,seller_type,fuel_type,transmission_type,mileage,engine,max_power,seats
0,Maruti Alto,Maruti,Alto,9,120000,Individual,Petrol,Manual,19.7,796,46.3,5
1,Hyundai Grand,Hyundai,Grand,5,20000,Individual,Petrol,Manual,18.9,1197,82.0,5
2,Hyundai i20,Hyundai,i20,11,60000,Individual,Petrol,Manual,17.0,1197,80.0,5
3,Maruti Alto,Maruti,Alto,9,37000,Individual,Petrol,Manual,20.92,998,67.1,5
4,Ford Ecosport,Ford,Ecosport,6,30000,Dealer,Diesel,Manual,22.77,1498,98.59,5


In [8]:
from sklearn.preprocessing import LabelEncoder
le = LabelEncoder()
X['model'] = le.fit_transform(X['model'])

In [9]:
X.head()

Unnamed: 0,car_name,brand,model,vehicle_age,km_driven,seller_type,fuel_type,transmission_type,mileage,engine,max_power,seats
0,Maruti Alto,Maruti,7,9,120000,Individual,Petrol,Manual,19.7,796,46.3,5
1,Hyundai Grand,Hyundai,54,5,20000,Individual,Petrol,Manual,18.9,1197,82.0,5
2,Hyundai i20,Hyundai,118,11,60000,Individual,Petrol,Manual,17.0,1197,80.0,5
3,Maruti Alto,Maruti,7,9,37000,Individual,Petrol,Manual,20.92,998,67.1,5
4,Ford Ecosport,Ford,38,6,30000,Dealer,Diesel,Manual,22.77,1498,98.59,5


In [10]:
X.drop(['car_name','brand'],axis=1,inplace=True)

In [11]:
num_features = X.select_dtypes(exclude='object').columns
onehot_columns = ['seller_type','fuel_type','transmission_type']
label_encoder_columns = ['model']

In [12]:
from sklearn.preprocessing import OneHotEncoder, StandardScaler
from sklearn.compose import ColumnTransformer

numeric_transformer = StandardScaler()
oh_transformer = OneHotEncoder(drop='first')

In [13]:
preprocessor = ColumnTransformer(
    [
        ("OneHotEncoder",oh_transformer,onehot_columns),
        ('StandardScaler',numeric_transformer,num_features)
    ],remainder='passthrough'
)

In [14]:
X = preprocessor.fit_transform(X)

In [15]:
X_train,X_test,y_train, y_test = train_test_split(X,y,test_size=0.2, random_state=42)

In [16]:
pd.DataFrame(X_train)

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,10,11,12,13
0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,1.261053,0.319814,0.283541,-2.038093,1.753906,2.662498,-0.403022
1,1.0,0.0,0.0,0.0,0.0,1.0,1.0,0.793003,-1.339555,-0.883751,0.992261,-0.550880,-0.386028,-0.403022
2,0.0,0.0,1.0,0.0,0.0,0.0,0.0,-1.244390,-1.339555,-0.961245,-0.168096,0.890331,3.274530,-0.403022
3,0.0,0.0,0.0,0.0,0.0,1.0,1.0,-1.024131,0.319814,0.143045,-0.455788,0.020999,0.388902,-0.403022
4,0.0,0.0,0.0,0.0,0.0,1.0,1.0,1.508844,1.315436,0.478051,0.157955,-0.554718,-0.504712,-0.403022
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
12323,0.0,0.0,1.0,0.0,0.0,0.0,1.0,-0.556082,0.319814,1.397111,0.256249,-0.456846,-0.274327,2.073444
12324,0.0,0.0,0.0,0.0,0.0,1.0,1.0,0.903133,1.647309,0.065551,-0.865749,0.214823,0.060778,-0.403022
12325,1.0,0.0,0.0,0.0,0.0,1.0,1.0,1.040794,0.319814,-0.690016,0.193916,-0.936610,-0.780708,-0.403022
12326,0.0,0.0,0.0,0.0,0.0,1.0,1.0,1.536377,-1.339555,-0.786884,-0.263994,-0.554718,-0.435829,-0.403022


In [17]:
## Model Training


In [18]:
from sklearn.ensemble import RandomForestRegressor
from sklearn.linear_model import LinearRegression,Ridge,Lasso
from sklearn.neighbors import KNeighborsRegressor
from sklearn.tree import DecisionTreeRegressor
from sklearn.metrics import r2_score,mean_absolute_error,mean_squared_error

In [19]:
models = {
    'LinearRegression': LinearRegression(),
    'Lasso' : Lasso(),
    'Ridge' : Ridge(),
    'KNeighborsRegressor' : KNeighborsRegressor(),
    'Decision Tree' : DecisionTreeRegressor(),
    'RandomForestRegressor': RandomForestRegressor(),
}

In [20]:
def evaluate(true, predicted):
    mae = mean_absolute_error(true,predicted)
    mse = mean_squared_error(true,predicted)
    rmse = np.sqrt(mse)
    r2 = r2_score(true,predicted)
    return mae,rmse,r2

In [24]:
for i in range(len(list(models))):
    model = list(models.values())[i]
    model.fit(X_train,y_train)

    y_train_pred = model.predict(X_train)
    y_test_pred = model.predict(X_test)

    model_tr_mae,model_tr_rmse,model_tr_r2 = evaluate(y_train_pred,y_train)
    model_ts_mae,model_ts_rmse,model_ts_r2 = evaluate(y_test_pred,y_test)

    print(list(models.keys())[i])
    print("-------Training--------")
    print("RMSE: ",model_tr_rmse)
    print("MAE: ",model_tr_mae)
    print("R2 :",model_tr_r2)
    print("-------Test--------")
    print("RMSE: ",model_ts_rmse)
    print("MAE: ",model_ts_mae)
    print("R2 :",model_ts_r2)
    print("-------------------------")


LinearRegression
-------Training--------
RMSE:  553855.6665411664
MAE:  268101.60708299355
R2 : 0.39169330869029795
-------Test--------
RMSE:  502543.59302309825
MAE:  279618.5794158426
R2 : 0.5223787131063777
-------------------------
Lasso
-------Training--------
RMSE:  553855.6709544231
MAE:  268099.2226498115
R2 : 0.3916884585589191
-------Test--------
RMSE:  502542.66963789385
MAE:  279614.7461034126
R2 : 0.5223774919012767
-------------------------
Ridge
-------Training--------
RMSE:  553856.3159709624
MAE:  268059.80146883137
R2 : 0.3915633156142062
-------Test--------
RMSE:  502533.8229890288
MAE:  279557.21689302777
R2 : 0.5223152844680572
-------------------------
KNeighborsRegressor
-------Training--------
RMSE:  325880.8558351813
MAE:  91392.31018818948
R2 : 0.8224160243542835
-------Test--------
RMSE:  253138.6083086116
MAE:  112578.24359390205
R2 : 0.8973996246111303
-------------------------
Decision Tree
-------Training--------
RMSE:  20797.23516567643
MAE:  5164.819922

In [25]:
knn_params = {'n_neighbors':[2,3,5,10,15,20]}
rf_params = {
    "max_depth":[5,8,15,None,10],
    "max_features":[5,7,"auto",8],
    "min_samples_split":[2,8,15,20],
    "n_estimators":[100,200,500,1000]
}

In [26]:
randomcvmods = [
    ('KNN',KNeighborsRegressor(),knn_params),
    ('RF',RandomForestRegressor(),rf_params)
]

In [27]:
from sklearn.model_selection import RandomizedSearchCV
##hyperparameter tuning
model_param = {}
for name, model,params in randomcvmods:
    random = RandomizedSearchCV(
        estimator=model,
        param_distributions= params,
        n_iter= 100,
        cv = 3,
        verbose= 2, 
        n_jobs=1
    )

    random.fit(X_train, y_train)
    model_param[name] = random.best_params_

for modelname in model_param:
    print(modelname, model_param[modelname] )



Fitting 3 folds for each of 6 candidates, totalling 18 fits
[CV] END ......................................n_neighbors=2; total time=   0.1s
[CV] END ......................................n_neighbors=2; total time=   0.1s
[CV] END ......................................n_neighbors=2; total time=   0.1s
[CV] END ......................................n_neighbors=3; total time=   0.1s
[CV] END ......................................n_neighbors=3; total time=   0.1s
[CV] END ......................................n_neighbors=3; total time=   0.1s
[CV] END ......................................n_neighbors=5; total time=   0.1s
[CV] END ......................................n_neighbors=5; total time=   0.1s
[CV] END ......................................n_neighbors=5; total time=   0.1s
[CV] END .....................................n_neighbors=10; total time=   0.1s
[CV] END .....................................n_neighbors=10; total time=   0.1s
[CV] END .....................................n_n

69 fits failed out of a total of 300.
The score on these train-test partitions for these parameters will be set to nan.
If these failures are not expected, you can try to debug them by setting error_score='raise'.

Below are more details about the failures:
--------------------------------------------------------------------------------
69 fits failed with the following error:
Traceback (most recent call last):
  File "/Library/Frameworks/Python.framework/Versions/3.12/lib/python3.12/site-packages/sklearn/model_selection/_validation.py", line 888, in _fit_and_score
    estimator.fit(X_train, y_train, **fit_params)
  File "/Library/Frameworks/Python.framework/Versions/3.12/lib/python3.12/site-packages/sklearn/base.py", line 1466, in wrapper
    estimator._validate_params()
  File "/Library/Frameworks/Python.framework/Versions/3.12/lib/python3.12/site-packages/sklearn/base.py", line 666, in _validate_params
    validate_parameter_constraints(
  File "/Library/Frameworks/Python.framework/

KNN {'n_neighbors': 5}
RF {'n_estimators': 1000, 'min_samples_split': 2, 'max_features': 5, 'max_depth': None}


In [29]:
models = {
    'Random Forest': RandomForestRegressor(n_estimators = 1000, min_samples_split= 2, max_features= 5, max_depth= None),
    'KNN' : KNeighborsRegressor(n_neighbors= 5)
}

In [30]:
for i in range(len(list(models))):
    model = list(models.values())[i]
    model.fit(X_train,y_train)

    y_train_pred = model.predict(X_train)
    y_test_pred = model.predict(X_test)

    model_tr_mae,model_tr_rmse,model_tr_r2 = evaluate(y_train_pred,y_train)
    model_ts_mae,model_ts_rmse,model_ts_r2 = evaluate(y_test_pred,y_test)

    print(list(models.keys())[i])
    print("-------Training--------")
    print("RMSE: ",model_tr_rmse)
    print("MAE: ",model_tr_mae)
    print("R2 :",model_tr_r2)
    print("-------Test--------")
    print("RMSE: ",model_ts_rmse)
    print("MAE: ",model_ts_mae)
    print("R2 :",model_ts_r2)
    print("-------------------------")


Random Forest
-------Training--------
RMSE:  125103.22420959017
MAE:  38968.76127878484
R2 : 0.9779623676604079
-------Test--------
RMSE:  208172.62648265762
MAE:  97849.41409170622
R2 : 0.9374310242309826
-------------------------
KNN
-------Training--------
RMSE:  325880.8558351813
MAE:  91392.31018818948
R2 : 0.8224160243542835
-------Test--------
RMSE:  253138.6083086116
MAE:  112578.24359390205
R2 : 0.8973996246111303
-------------------------
