##### <strong><code>Aim</code></strong> : To create a model that predicts <i>used car</i> prices based on historical data, with high accuracy.

##### <code>Activities Performed</code> 

1. Importing necessary <code>libraries</code>

In [1]:
import pandas as pd
import numpy as np
import plotly.express as px
import plotly.graph_objects as go
import warnings
warnings.filterwarnings("ignore")

2. Data reading

In [2]:
data = pd.read_csv("./Data/used_car_prices_dataset.csv")

Dropping unnecessary column

In [3]:
data.drop("Unnamed: 0", axis = 1, inplace = True)

In [4]:
data.describe().style.background_gradient().format(precision = 2)

Unnamed: 0,vehicle_age,km_driven,mileage,engine,max_power,seats,selling_price
count,15411.0,15411.0,15411.0,15411.0,15411.0,15411.0,15411.0
mean,6.04,55616.48,19.7,1486.06,100.59,5.33,774971.12
std,3.01,51618.55,4.17,521.11,42.97,0.81,894128.36
min,0.0,100.0,4.0,793.0,38.4,0.0,40000.0
25%,4.0,30000.0,17.0,1197.0,74.0,5.0,385000.0
50%,6.0,50000.0,19.67,1248.0,88.5,5.0,556000.0
75%,8.0,70000.0,22.7,1582.0,117.3,5.0,825000.0
max,29.0,3800000.0,33.54,6592.0,626.0,9.0,39500000.0


- Most cars in the dataset are around 6 years old, have been driven about 55,000 km, and give an average mileage of 19.7 km/l.
- Engine sizes and power vary a lot, but most cars have 5 seats.
- Typical selling prices range widely, with a median around ₹5.5 lakh.

In [5]:
data.head(5)

Unnamed: 0,car_name,brand,model,vehicle_age,km_driven,seller_type,fuel_type,transmission_type,mileage,engine,max_power,seats,selling_price
0,Maruti Alto,Maruti,Alto,9,120000,Individual,Petrol,Manual,19.7,796,46.3,5,120000
1,Hyundai Grand,Hyundai,Grand,5,20000,Individual,Petrol,Manual,18.9,1197,82.0,5,550000
2,Hyundai i20,Hyundai,i20,11,60000,Individual,Petrol,Manual,17.0,1197,80.0,5,215000
3,Maruti Alto,Maruti,Alto,9,37000,Individual,Petrol,Manual,20.92,998,67.1,5,226000
4,Ford Ecosport,Ford,Ecosport,6,30000,Dealer,Diesel,Manual,22.77,1498,98.59,5,570000


3. Checking for <code>nulls</code> and <code>duplicates</code>

In [6]:
data.isna().sum()

car_name             0
brand                0
model                0
vehicle_age          0
km_driven            0
seller_type          0
fuel_type            0
transmission_type    0
mileage              0
engine               0
max_power            0
seats                0
selling_price        0
dtype: int64

In [7]:
data.duplicated().sum()

np.int64(167)

Data is devoid of null values, but contains 167 duplicates. Removing them is necessary for model accuracy.

In [8]:
print(f"Shape of data before removing duplicates: {data.shape}")
data.drop_duplicates(inplace = True)
print(f"Shape of data after removing duplicates: {data.shape}")

Shape of data before removing duplicates: (15411, 13)
Shape of data after removing duplicates: (15244, 13)


4. <code>Data Preprocessing</code>

In [9]:
model_data = data.copy()

In [10]:
model_data.head(5)

Unnamed: 0,car_name,brand,model,vehicle_age,km_driven,seller_type,fuel_type,transmission_type,mileage,engine,max_power,seats,selling_price
0,Maruti Alto,Maruti,Alto,9,120000,Individual,Petrol,Manual,19.7,796,46.3,5,120000
1,Hyundai Grand,Hyundai,Grand,5,20000,Individual,Petrol,Manual,18.9,1197,82.0,5,550000
2,Hyundai i20,Hyundai,i20,11,60000,Individual,Petrol,Manual,17.0,1197,80.0,5,215000
3,Maruti Alto,Maruti,Alto,9,37000,Individual,Petrol,Manual,20.92,998,67.1,5,226000
4,Ford Ecosport,Ford,Ecosport,6,30000,Dealer,Diesel,Manual,22.77,1498,98.59,5,570000


- Preprocessing <code>text</code> columns

Dropping <code>car_name</code> column

In [None]:
model_data.drop("car_name", axis = 1, inplace = True)

In [12]:
model_data.head(5)

Unnamed: 0,brand,model,vehicle_age,km_driven,seller_type,fuel_type,transmission_type,mileage,engine,max_power,seats,selling_price
0,Maruti,Alto,9,120000,Individual,Petrol,Manual,19.7,796,46.3,5,120000
1,Hyundai,Grand,5,20000,Individual,Petrol,Manual,18.9,1197,82.0,5,550000
2,Hyundai,i20,11,60000,Individual,Petrol,Manual,17.0,1197,80.0,5,215000
3,Maruti,Alto,9,37000,Individual,Petrol,Manual,20.92,998,67.1,5,226000
4,Ford,Ecosport,6,30000,Dealer,Diesel,Manual,22.77,1498,98.59,5,570000


Removing <code>Outliers</code>

In [13]:
for col in ["km_driven", "mileage", "engine", "max_power"]:
    Q1 = model_data[col].quantile(0.25)
    Q3 = model_data[col].quantile(0.75)
    IQR = Q3 - Q1
    lower_bound = Q1 - 1.5 * IQR
    upper_bound = Q3 + 1.5 * IQR
    model_data = model_data[(model_data[col] >= lower_bound) & (model_data[col] <= upper_bound)]

In [14]:
model_data.shape

(11960, 12)

Creating <code>Pipelines</code> for other <code>Preprocessing</code> activities

In [15]:
from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import OneHotEncoder, StandardScaler

In [16]:
cat_cols = ["brand", "model", "seller_type", "fuel_type", "transmission_type"]
num_cols = ["vehicle_age", "km_driven", "mileage", "engine", "max_power", "seats"]

In [17]:
preprocessor = ColumnTransformer(
    transformers=[
        ("onehot", OneHotEncoder(handle_unknown="ignore"), cat_cols),
        ("scale", StandardScaler(), num_cols)
    ]   
)

5. Model Building

In [19]:
from sklearn.ensemble import RandomForestRegressor

In [20]:
model = RandomForestRegressor(random_state = 10)

In [24]:
pipeline = Pipeline([
    ("preprocessing", preprocessor),
    ("model", model)
])

In [22]:
parameters = {
    'model__n_estimators': np.arange(100, 500, 50),
    'model__max_depth': np.arange(10, 100, 10),
    'model__min_samples_split': np.arange(2, 8, 1),
    'model__min_samples_leaf': np.arange(1, 5, 1)
}

<code>Hypertuning</code> Parameters

In [23]:
from sklearn.model_selection import RandomizedSearchCV

In [35]:
random_search = RandomizedSearchCV(
    estimator = pipeline,
    param_distributions = parameters,
    n_iter = 20,
    cv = 5,
    scoring = "neg_mean_squared_error",
    random_state = 10
)

Train-Validation split

In [32]:
from sklearn.model_selection import train_test_split

In [51]:
X = model_data.drop("selling_price", axis = 1)
y = model_data["selling_price"]

In [52]:
trainX, valX, trainY, valY = train_test_split(X, y, test_size = 0.2, random_state = 10)

In [56]:
pipeline.fit(trainX, trainY)

0,1,2
,steps,"[('preprocessing', ...), ('model', ...)]"
,transform_input,
,memory,
,verbose,False

0,1,2
,transformers,"[('onehot', ...), ('scale', ...)]"
,remainder,'drop'
,sparse_threshold,0.3
,n_jobs,
,transformer_weights,
,verbose,False
,verbose_feature_names_out,True
,force_int_remainder_cols,'deprecated'

0,1,2
,categories,'auto'
,drop,
,sparse_output,True
,dtype,<class 'numpy.float64'>
,handle_unknown,'ignore'
,min_frequency,
,max_categories,
,feature_name_combiner,'concat'

0,1,2
,copy,True
,with_mean,True
,with_std,True

0,1,2
,n_estimators,100
,criterion,'squared_error'
,max_depth,
,min_samples_split,2
,min_samples_leaf,1
,min_weight_fraction_leaf,0.0
,max_features,1.0
,max_leaf_nodes,
,min_impurity_decrease,0.0
,bootstrap,True


Evaluating Performance

In [37]:
from sklearn.metrics import mean_squared_error, r2_score

In [57]:
yhat = pipeline.predict(valX)
mse = mean_squared_error(valY, yhat)
r2 = r2_score(valY, yhat)
print(f"Mean Squared Error : {mse:.2f}")
print(f"R2 Score : {r2:.2f}")

Mean Squared Error : 8427553670.14
R2 Score : 0.88


<code>MSE</code> is 8427553670.14. So our model has <code>+-90K error range</code>.

Saving the model pipeline

In [58]:
import joblib

In [59]:
joblib.dump(pipeline, "./models/used_car_price_model.pkl")

['./models/used_car_price_model.pkl']