# __Used Car Price Prediction__

## 1) Problem statement.

* This dataset comprises used cars sold on cardekho.com in India, along with important features of these cars.
* The goal is to predict the price of a car from its input features.
* Prediction results can be used to give a new seller a price suggestion based on market conditions.

## 2) Data Collection.
* The dataset was collected by scraping the CarDekho website.
* The data consists of 13 columns and 15411 rows.

In [22]:
import pandas as pd 
import numpy as np 
import seaborn as sns
import matplotlib.pyplot as plt 
import warnings
warnings.filterwarnings('ignore')

In [23]:
# Load the scraped listings. Forward slashes keep the relative path portable
# across operating systems and avoid the invalid "\L" / "\c" escape sequences
# that a backslash-separated, non-raw string literal contains.
df = pd.read_csv('../Lesson_016-Random_Forest/cardekho.csv')
df

Unnamed: 0.1,Unnamed: 0,car_name,brand,model,vehicle_age,km_driven,seller_type,fuel_type,transmission_type,mileage,engine,max_power,seats,selling_price
0,0,Maruti Alto,Maruti,Alto,9,120000,Individual,Petrol,Manual,19.70,796,46.30,5,120000
1,1,Hyundai Grand,Hyundai,Grand,5,20000,Individual,Petrol,Manual,18.90,1197,82.00,5,550000
2,2,Hyundai i20,Hyundai,i20,11,60000,Individual,Petrol,Manual,17.00,1197,80.00,5,215000
3,3,Maruti Alto,Maruti,Alto,9,37000,Individual,Petrol,Manual,20.92,998,67.10,5,226000
4,4,Ford Ecosport,Ford,Ecosport,6,30000,Dealer,Diesel,Manual,22.77,1498,98.59,5,570000
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
15406,19537,Hyundai i10,Hyundai,i10,9,10723,Dealer,Petrol,Manual,19.81,1086,68.05,5,250000
15407,19540,Maruti Ertiga,Maruti,Ertiga,2,18000,Dealer,Petrol,Manual,17.50,1373,91.10,7,925000
15408,19541,Skoda Rapid,Skoda,Rapid,6,67000,Dealer,Diesel,Manual,21.14,1498,103.52,5,425000
15409,19542,Mahindra XUV500,Mahindra,XUV500,5,3800000,Dealer,Diesel,Manual,16.00,2179,140.00,7,1225000


## __Data Cleaning__
### Handling Missing values

* Handling Missing values 
* Handling Duplicates
* Check data type
* Understand the dataset

In [24]:
# Count the missing entries in every column
df.isna().sum()

Unnamed: 0           0
car_name             0
brand                0
model                0
vehicle_age          0
km_driven            0
seller_type          0
fuel_type            0
transmission_type    0
mileage              0
engine               0
max_power            0
seats                0
selling_price        0
dtype: int64

In [25]:
# Drop the columns that are not used as features
unused_columns = ['Unnamed: 0', 'car_name', 'brand']

df.drop(columns=unused_columns, inplace=True)
df

Unnamed: 0,model,vehicle_age,km_driven,seller_type,fuel_type,transmission_type,mileage,engine,max_power,seats,selling_price
0,Alto,9,120000,Individual,Petrol,Manual,19.70,796,46.30,5,120000
1,Grand,5,20000,Individual,Petrol,Manual,18.90,1197,82.00,5,550000
2,i20,11,60000,Individual,Petrol,Manual,17.00,1197,80.00,5,215000
3,Alto,9,37000,Individual,Petrol,Manual,20.92,998,67.10,5,226000
4,Ecosport,6,30000,Dealer,Diesel,Manual,22.77,1498,98.59,5,570000
...,...,...,...,...,...,...,...,...,...,...,...
15406,i10,9,10723,Dealer,Petrol,Manual,19.81,1086,68.05,5,250000
15407,Ertiga,2,18000,Dealer,Petrol,Manual,17.50,1373,91.10,7,925000
15408,Rapid,6,67000,Dealer,Diesel,Manual,21.14,1498,103.52,5,425000
15409,XUV500,5,3800000,Dealer,Diesel,Manual,16.00,2179,140.00,7,1225000


In [26]:
## Count the rows that exactly repeat an earlier row
df.duplicated(keep='first').sum()

167

In [27]:
# Keep the first occurrence of each repeated row and discard the rest
df.drop_duplicates(keep='first', inplace=True)

In [28]:
# Distinct model names, in order of first appearance
pd.unique(df['model'])

array(['Alto', 'Grand', 'i20', 'Ecosport', 'Wagon R', 'i10', 'Venue',
       'Swift', 'Verna', 'Duster', 'Cooper', 'Ciaz', 'C-Class', 'Innova',
       'Baleno', 'Swift Dzire', 'Vento', 'Creta', 'City', 'Bolero',
       'Fortuner', 'KWID', 'Amaze', 'Santro', 'XUV500', 'KUV100', 'Ignis',
       'RediGO', 'Scorpio', 'Marazzo', 'Aspire', 'Figo', 'Vitara',
       'Tiago', 'Polo', 'Seltos', 'Celerio', 'GO', '5', 'CR-V',
       'Endeavour', 'KUV', 'Jazz', '3', 'A4', 'Tigor', 'Ertiga', 'Safari',
       'Thar', 'Hexa', 'Rover', 'Eeco', 'A6', 'E-Class', 'Q7', 'Z4', '6',
       'XF', 'X5', 'Hector', 'Civic', 'D-Max', 'Cayenne', 'X1', 'Rapid',
       'Freestyle', 'Superb', 'Nexon', 'XUV300', 'Dzire VXI', 'S90',
       'WR-V', 'XL6', 'Triber', 'ES', 'Wrangler', 'Camry', 'Elantra',
       'Yaris', 'GL-Class', '7', 'S-Presso', 'Dzire LXI', 'Aura', 'XC',
       'Ghibli', 'Continental', 'CR', 'Kicks', 'S-Class', 'Tucson',
       'Harrier', 'X3', 'Octavia', 'Compass', 'CLS', 'redi-GO', 'Glanza',
       

In [29]:
# Partition the columns by dtype, then split the numeric ones by cardinality

num_features = df.select_dtypes(exclude='O').columns
cat_features = df.select_dtypes(include='O').columns

# Numeric columns with at most 25 distinct values are counted as discrete
discrete_features = [col for col in num_features if df[col].nunique() <= 25]
continuous_features = [col for col in num_features if col not in discrete_features]

for label, features in (
    ('Numerical', num_features),
    ('Categorical', cat_features),
    ('Discrete', discrete_features),
    ('Continuous', continuous_features),
):
    print(f'Number of {label} Features :', len(features))
 

Number of Numerical Features : 7
Number of Categorical Features : 4
Number of Discrete Features : 2
Number of Continuous Features : 5


In [30]:
## Separate the prediction target from the input features
y = df['selling_price']
X = df.drop(columns=['selling_price'])

X

Unnamed: 0,model,vehicle_age,km_driven,seller_type,fuel_type,transmission_type,mileage,engine,max_power,seats
0,Alto,9,120000,Individual,Petrol,Manual,19.70,796,46.30,5
1,Grand,5,20000,Individual,Petrol,Manual,18.90,1197,82.00,5
2,i20,11,60000,Individual,Petrol,Manual,17.00,1197,80.00,5
3,Alto,9,37000,Individual,Petrol,Manual,20.92,998,67.10,5
4,Ecosport,6,30000,Dealer,Diesel,Manual,22.77,1498,98.59,5
...,...,...,...,...,...,...,...,...,...,...
15406,i10,9,10723,Dealer,Petrol,Manual,19.81,1086,68.05,5
15407,Ertiga,2,18000,Dealer,Petrol,Manual,17.50,1373,91.10,7
15408,Rapid,6,67000,Dealer,Diesel,Manual,21.14,1498,103.52,5
15409,XUV500,5,3800000,Dealer,Diesel,Manual,16.00,2179,140.00,7


## Feature Encoding and Scaling
**One-hot encoding for columns that have few unique values and are not ordinal**
* One hot encoding is a process by which categorical variables are converted into a form that could be provided to ML algorithms to do a better job in prediction.

In [31]:
# Print the cardinality and the distinct values of every object-typed column
for col in df.select_dtypes(include='O').columns:
    print('\nColumn name is :', col)
    print('Number of Unique values is : {}\n'.format(df[col].nunique()))
    print(df[col].unique())
    print('=' * 35)


Column name is : model
Number of Unique values is : 120

['Alto' 'Grand' 'i20' 'Ecosport' 'Wagon R' 'i10' 'Venue' 'Swift' 'Verna'
 'Duster' 'Cooper' 'Ciaz' 'C-Class' 'Innova' 'Baleno' 'Swift Dzire'
 'Vento' 'Creta' 'City' 'Bolero' 'Fortuner' 'KWID' 'Amaze' 'Santro'
 'XUV500' 'KUV100' 'Ignis' 'RediGO' 'Scorpio' 'Marazzo' 'Aspire' 'Figo'
 'Vitara' 'Tiago' 'Polo' 'Seltos' 'Celerio' 'GO' '5' 'CR-V' 'Endeavour'
 'KUV' 'Jazz' '3' 'A4' 'Tigor' 'Ertiga' 'Safari' 'Thar' 'Hexa' 'Rover'
 'Eeco' 'A6' 'E-Class' 'Q7' 'Z4' '6' 'XF' 'X5' 'Hector' 'Civic' 'D-Max'
 'Cayenne' 'X1' 'Rapid' 'Freestyle' 'Superb' 'Nexon' 'XUV300' 'Dzire VXI'
 'S90' 'WR-V' 'XL6' 'Triber' 'ES' 'Wrangler' 'Camry' 'Elantra' 'Yaris'
 'GL-Class' '7' 'S-Presso' 'Dzire LXI' 'Aura' 'XC' 'Ghibli' 'Continental'
 'CR' 'Kicks' 'S-Class' 'Tucson' 'Harrier' 'X3' 'Octavia' 'Compass' 'CLS'
 'redi-GO' 'Glanza' 'Macan' 'X4' 'Dzire ZXI' 'XC90' 'F-PACE' 'A8' 'MUX'
 'GTC4Lusso' 'GLS' 'X-Trail' 'XE' 'XC60' 'Panamera' 'Alturas' 'Altroz'
 'NX' 'Car

In [32]:
from sklearn.preprocessing import LabelEncoder

# Replace each model name with an integer code
le = LabelEncoder()
le.fit(X['model'])
X['model'] = le.transform(X['model'])

X.head()

Unnamed: 0,model,vehicle_age,km_driven,seller_type,fuel_type,transmission_type,mileage,engine,max_power,seats
0,7,9,120000,Individual,Petrol,Manual,19.7,796,46.3,5
1,54,5,20000,Individual,Petrol,Manual,18.9,1197,82.0,5
2,118,11,60000,Individual,Petrol,Manual,17.0,1197,80.0,5
3,7,9,37000,Individual,Petrol,Manual,20.92,998,67.1,5
4,38,6,30000,Dealer,Diesel,Manual,22.77,1498,98.59,5


In [33]:
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import OneHotEncoder, StandardScaler

# Build a column transformer with two steps: one-hot encoding for the listed
# categorical columns and standard scaling for the non-object columns.
onehot_col = ['seller_type', 'fuel_type', 'transmission_type']
num_features = X.select_dtypes(exclude='O').columns

transformer_steps = [
    ('OneHotEncoder', OneHotEncoder(drop='first'), onehot_col),
    ('StandardScaler', StandardScaler(), num_features),
]
preprocessor = ColumnTransformer(transformer_steps, remainder='passthrough', verbose=3)

preprocessor

0,1,2
,transformers,"[('OneHotEncoder', ...), ('StandardScaler', ...)]"
,remainder,'passthrough'
,sparse_threshold,0.3
,n_jobs,
,transformer_weights,
,verbose,3
,verbose_feature_names_out,True
,force_int_remainder_cols,'deprecated'

0,1,2
,categories,'auto'
,drop,'first'
,sparse_output,True
,dtype,<class 'numpy.float64'>
,handle_unknown,'error'
,min_frequency,
,max_categories,
,feature_name_combiner,'concat'

0,1,2
,copy,True
,with_mean,True
,with_std,True


In [34]:
# Fit the preprocessor on the feature frame and replace X with the transformed output.
# NOTE(review): the fit uses every row, before the train/test split in a later cell,
# so the fitted statistics include rows that end up in the test set - consider
# fitting on the training rows only.
# NOTE(review): X is rebound here, so this cell presumably cannot be re-run on its
# own without first re-running the cells that build X - confirm.
X = preprocessor.fit_transform(X)

# Wrapped in a DataFrame for display only; X itself is not changed by this line
pd.DataFrame(X)

[ColumnTransformer] . (1 of 2) Processing OneHotEncoder, total=   0.0s
[ColumnTransformer]  (2 of 2) Processing StandardScaler, total=   0.0s


Unnamed: 0,0,1,2,3,4,5,6,7,8,9,10,11,12,13
0,1.0,0.0,0.0,0.0,0.0,1.0,1.0,-1.519581,0.981015,1.243329,0.000640,-1.326227,-1.265491,-0.403299
1,1.0,0.0,0.0,0.0,0.0,1.0,1.0,-0.225821,-0.345188,-0.688493,-0.191245,-0.555669,-0.433600,-0.403299
2,1.0,0.0,0.0,0.0,0.0,1.0,1.0,1.535894,1.644117,0.084236,-0.646971,-0.555669,-0.480205,-0.403299
3,1.0,0.0,0.0,0.0,0.0,1.0,1.0,-1.519581,0.981015,-0.360084,0.293264,-0.938066,-0.780804,-0.403299
4,0.0,0.0,1.0,0.0,0.0,0.0,1.0,-0.666250,-0.013637,-0.495311,0.736997,0.022729,-0.047016,-0.403299
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
15239,0.0,0.0,0.0,0.0,0.0,1.0,1.0,1.508367,0.981015,-0.867708,0.027024,-0.768966,-0.758667,-0.403299
15240,0.0,0.0,0.0,0.0,0.0,1.0,1.0,-0.556143,-1.339840,-0.727130,-0.527043,-0.217469,-0.221550,2.069705
15241,0.0,0.0,1.0,0.0,0.0,0.0,1.0,0.407295,-0.013637,0.219463,0.346032,0.022729,0.067864,-0.403299
15242,0.0,0.0,1.0,0.0,0.0,0.0,1.0,1.425787,-0.345188,72.334381,-0.886827,1.331332,0.917931,2.069705


In [35]:
# Train Test Split

from sklearn.model_selection import train_test_split

# Hold out 30% of the rows for testing; the fixed seed keeps the split repeatable
split = train_test_split(X, y, test_size=0.3, random_state=42)
X_train, X_test, y_train, y_test = split

(X_train.shape, X_test.shape)

((10670, 14), (4574, 14))

## __Model Training And Model Selection__

In [36]:
from sklearn.ensemble import RandomForestRegressor
from sklearn.tree import DecisionTreeRegressor
from sklearn.neighbors import KNeighborsRegressor
from sklearn.linear_model import LinearRegression, Ridge, Lasso
from sklearn.ensemble import AdaBoostRegressor
from sklearn.ensemble import GradientBoostingRegressor
from sklearn.metrics import r2_score, mean_squared_error, mean_absolute_error

In [37]:
# Create a function to Evaluate Model

def evaluate_model(true, pred):
    """Score predictions against the ground-truth values.

    Args:
        true: Observed target values.
        pred: Predicted target values, aligned with ``true``.

    Returns:
        A tuple ``(mse, mae, rmse, r2)``, where ``rmse`` is the square root
        of ``mse``.
    """
    mse = mean_squared_error(true, pred)
    return (
        mse,
        mean_absolute_error(true, pred),
        np.sqrt(mse),
        r2_score(true, pred),
    )

In [38]:
# Model training pipeline: fit every candidate and report train/test metrics.
# The estimators that use randomness get a fixed random_state (the same seed as
# the train/test split) so the reported scores are repeatable between runs.
models = {
    'Linear Regression' : LinearRegression(),
    'Lasso' : Lasso(),
    'Ridge' : Ridge(),
    'K-Neighbors Regressor' : KNeighborsRegressor(),
    'Decision Tree' : DecisionTreeRegressor(random_state = 42),
    'Random Forest Regressor' : RandomForestRegressor(random_state = 42),
    'AdaBoost Regressor' : AdaBoostRegressor(random_state = 42),
    'Gradient Boosting' : GradientBoostingRegressor(random_state = 42),
}

for i, model in models.items():
    model.fit(X_train, y_train)

    y_train_pred = model.predict(X_train)
    y_test_pred = model.predict(X_test)

    train_mse, train_mae, train_rmse, train_r2 = evaluate_model(y_train, y_train_pred)

    test_mse, test_mae, test_rmse, test_r2 = evaluate_model(y_test, y_test_pred)

    # The emoji are written as escapes (U+2728 sparkles, U+1F4A0 diamond with a
    # dot) so they cannot be corrupted by a wrong file encoding.
    print(f'\u2728{i}')
    print('\U0001F4A0 Model Performance for Training Set')
    print(f' - MSE : {train_mse:.2f}')
    print(f' - MAE : {train_mae:.2f}')
    print(f' - RMSE : {train_rmse:.2f}')
    print(f' - R2_Score : {train_r2*100:.2f}')

    print('-'*35)

    print('\U0001F4A0 Model Performance for Testing Set')
    print(f' - MSE : {test_mse:.2f}')
    print(f' - MAE : {test_mae:.2f}')
    print(f' - RMSE : {test_rmse:.2f}')
    print(f' - R2_Score : {test_r2*100:.2f}')

    print('='*50)

âœ¨Linear Regression
ðŸ’  Model Performance for Training Set
 - MSE : 223129722333.46
 - MAE : 260406.10
 - RMSE : 472366.09
 - R2_Score : 67.75
-----------------------------------
ðŸ’  Model Performance for Testing Set
 - MSE : 471660202181.19
 - MAE : 261495.13
 - RMSE : 686775.22
 - R2_Score : 55.23
âœ¨Lasso
ðŸ’  Model Performance for Training Set
 - MSE : 223129728450.02
 - MAE : 260404.27
 - RMSE : 472366.10
 - R2_Score : 67.75
-----------------------------------
ðŸ’  Model Performance for Testing Set
 - MSE : 471661195271.09
 - MAE : 261495.55
 - RMSE : 686775.94
 - R2_Score : 55.23
âœ¨Ridge
ðŸ’  Model Performance for Training Set
 - MSE : 223130045616.53
 - MAE : 260377.09
 - RMSE : 472366.43
 - R2_Score : 67.75
-----------------------------------
ðŸ’  Model Performance for Testing Set
 - MSE : 471690953038.77
 - MAE : 261467.71
 - RMSE : 686797.61
 - R2_Score : 55.23
âœ¨K-Neighbors Regressor
ðŸ’  Model Performance for Training Set
 - MSE : 45723157633.79
 - MAE : 90208.14
 - RM

## __HyperParameter Tuning__

In [39]:
# Initialize the parameter grids for hyperparameter tuning

# Neighbourhood sizes to try for the K-Neighbors regressor
knn_params = {
    'n_neighbors' : [2, 3, 10, 20, 40, 50]
}

# Search space for the gradient boosting regressor.
# 'mse' is intentionally not listed under "criterion": it is a removed alias of
# 'squared_error' that current scikit-learn releases reject, so every candidate
# sampled with it would fail to fit and waste search iterations.
gradient_params={"loss": ['squared_error','huber','absolute_error'],
             "criterion": ['friedman_mse','squared_error'],
             "min_samples_split": [2, 8, 15, 20],
             "n_estimators": [100, 200, 500],
              "max_depth": [5, 8, 15, None, 10],
            }

In [40]:
# Each entry is a (short name, estimator, parameter grid) triple
randomcv_models = []
randomcv_models.append(('KNN', KNeighborsRegressor(), knn_params))
randomcv_models.append(('GB', GradientBoostingRegressor(), gradient_params))

In [41]:
from sklearn.model_selection import RandomizedSearchCV

# Best hyperparameters found for each model, keyed by the model's short name
model_params = {}

for name, model, params in randomcv_models:
    # random_state fixes which candidates are sampled from the grid, so repeated
    # runs evaluate the same candidates.
    random = RandomizedSearchCV(
        estimator = model,
        param_distributions = params,
        n_iter = 100,
        cv = 3,
        verbose = 3,
        n_jobs = -1,
        random_state = 42,
    )

    random.fit(X_train, y_train)
    model_params[name] = random.best_params_

for model_name in model_params:
    print(f'\n-----------------Best params for {model_name}---------------')
    print(model_params[model_name])

Fitting 3 folds for each of 6 candidates, totalling 18 fits
Fitting 3 folds for each of 100 candidates, totalling 300 fits

-----------------Best params for KNN---------------
{'n_neighbors': 3}

-----------------Best params for GB---------------
{'n_estimators': 500, 'min_samples_split': 20, 'max_depth': None, 'loss': 'absolute_error', 'criterion': 'friedman_mse'}


In [43]:
## Retraining the models with the best parameters reported by the search.
## Gradient boosting gets a fixed random_state (the same seed as the train/test
## split) so the reported scores are repeatable between runs.
models = {
    "K-Neighbors Regressor": KNeighborsRegressor(n_neighbors=3, n_jobs=-1),
    "Gradient Boosting" : GradientBoostingRegressor(n_estimators = 500, min_samples_split = 20, max_depth = None, loss = 'absolute_error', criterion = 'friedman_mse', random_state = 42)

}

for i, model in models.items():
    model.fit(X_train, y_train)

    y_train_pred = model.predict(X_train)
    y_test_pred = model.predict(X_test)

    train_mse, train_mae, train_rmse, train_r2 = evaluate_model(y_train, y_train_pred)

    test_mse, test_mae, test_rmse, test_r2 = evaluate_model(y_test, y_test_pred)

    # The emoji are written as escapes (U+2728 sparkles, U+1F4A0 diamond with a
    # dot) so they cannot be corrupted by a wrong file encoding.
    print(f'\u2728{i}')
    print('\U0001F4A0 Model Performance for Training Set')
    print(f' - MSE : {train_mse:.2f}')
    print(f' - MAE : {train_mae:.2f}')
    print(f' - RMSE : {train_rmse:.2f}')
    print(f' - R2_Score : {train_r2*100:.2f}')

    print('-'*35)

    print('\U0001F4A0 Model Performance for Testing Set')
    print(f' - MSE : {test_mse:.2f}')
    print(f' - MAE : {test_mae:.2f}')
    print(f' - RMSE : {test_rmse:.2f}')
    print(f' - R2_Score : {test_r2*100:.2f}')

    print('='*50)

âœ¨K-Neighbors Regressor
ðŸ’  Model Performance for Training Set
 - MSE : 32475639821.87
 - MAE : 77480.29
 - RMSE : 180209.99
 - R2_Score : 95.31
-----------------------------------
ðŸ’  Model Performance for Testing Set
 - MSE : 245290308895.69
 - MAE : 117793.62
 - RMSE : 495267.92
 - R2_Score : 76.72
âœ¨Gradient Boosting
ðŸ’  Model Performance for Training Set
 - MSE : 1854686976.11
 - MAE : 10171.61
 - RMSE : 43066.08
 - R2_Score : 99.73
-----------------------------------
ðŸ’  Model Performance for Testing Set
 - MSE : 252790806762.89
 - MAE : 109607.73
 - RMSE : 502783.06
 - R2_Score : 76.00
