In [15]:
import pandas as pd
import numpy as np
import plotly.express as px
import plotly.graph_objects as go
import warnings
warnings.filterwarnings("ignore")

In [9]:
COLOR_SCHEME = px.colors.sequential.Teal_r

### <code>Data Reading</code> and <code>Preprocessing</code>

In [3]:
data = pd.read_csv("./Data/used_car_prices_dataset.csv")

In [4]:
data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 15411 entries, 0 to 15410
Data columns (total 14 columns):
 #   Column             Non-Null Count  Dtype  
---  ------             --------------  -----  
 0   Unnamed: 0         15411 non-null  int64  
 1   car_name           15411 non-null  object 
 2   brand              15411 non-null  object 
 3   model              15411 non-null  object 
 4   vehicle_age        15411 non-null  int64  
 5   km_driven          15411 non-null  int64  
 6   seller_type        15411 non-null  object 
 7   fuel_type          15411 non-null  object 
 8   transmission_type  15411 non-null  object 
 9   mileage            15411 non-null  float64
 10  engine             15411 non-null  int64  
 11  max_power          15411 non-null  float64
 12  seats              15411 non-null  int64  
 13  selling_price      15411 non-null  int64  
dtypes: float64(2), int64(6), object(6)
memory usage: 1.6+ MB


In [5]:
print(f"Duplicate Records : {data.duplicated().sum()}")

Duplicate Records : 0


- Data contains around <code>15.4k</code> records and <code>14</code> columns.
- <code>Null values</code> are absent.
- There are <code>167</code> duplicate records.

Dropping <code>duplicate</code> records

In [6]:
data.drop_duplicates(inplace = True)
print(f"Duplicate Records : {data.duplicated().sum()}")

Duplicate Records : 0


Dropping unnecessary column <code>Unnamed: 0</code>

In [7]:
data.drop("Unnamed: 0", axis = 1, inplace = True)

In [8]:
data.describe().T.style.background_gradient()

Unnamed: 0,count,mean,std,min,25%,50%,75%,max
vehicle_age,15411.0,6.036338,3.013291,0.0,4.0,6.0,8.0,29.0
km_driven,15411.0,55616.480631,51618.548422,100.0,30000.0,50000.0,70000.0,3800000.0
mileage,15411.0,19.701151,4.171265,4.0,17.0,19.67,22.7,33.54
engine,15411.0,1486.057751,521.106696,793.0,1197.0,1248.0,1582.0,6592.0
max_power,15411.0,100.588254,42.972979,38.4,74.0,88.5,117.3,626.0
seats,15411.0,5.325482,0.807628,0.0,5.0,5.0,5.0,9.0
selling_price,15411.0,774971.11641,894128.363263,40000.0,385000.0,556000.0,825000.0,39500000.0


- <code>vehicle_age</code> : Around 6 years (most between 4–8 years)
- <code>km_driven</code> : Median around 50,000 km (large outliers present)
- <code>mileage</code> : Avg around 20 km/l
- <code>engine</code> : Typical size is around 1,200 – 1,600 cc
- <code>max_power</code> : Around 75 – 117 bhp
- <code>seats</code> : Mostly 5 seats
- <code>selling_price</code> : Median is around Rs 5.6 lakh (most between Rs 3.8–8.2 lakh)

### <code>Data Analysis</code>

Distribution of Selling Price

In [42]:
fig = go.Figure()
fig.add_trace(
    go.Box(
        x = data["selling_price"],
        marker = dict(
            color = COLOR_SCHEME[0]
        ),
        name = "Selling Price Box Plot"
    )
)
fig.add_trace(
    go.Violin(
        x = data["selling_price"],
        marker = dict(
            color = COLOR_SCHEME[1]
        ),
        name = "Selling Price Violin Plot"
    )
)
fig.update_layout(
    title = "Selling Price Distribution - Box and Violin Plot",
    xaxis = dict(
        title = "Selling Price"
    ),
    yaxis = dict(
        showticklabels = False
    )
)
fig.update_traces(
    hovertemplate = "<b>Selling Price</b> : %{x}<br>"
)
fig.show()

In [65]:
median_selling_price_by_seller_type = data.groupby("seller_type")["selling_price"].median().reset_index().sort_values(by = "selling_price", ascending = False)

fig = px.bar(
    data_frame = median_selling_price_by_seller_type,
    x = "seller_type",
    y = "selling_price",
    color = "seller_type",
    color_discrete_sequence = COLOR_SCHEME,
    text_auto = True
)
fig.update_layout(
    title = "Median Selling Price by Seller Type",
    xaxis = dict(
        title = "Seller Type"
    ),
    yaxis = dict(
        title = "Median Selling Price",
        range = [500000, 600000]
    ),
    showlegend = False
)
fig.update_traces(
    hovertemplate = "<b>Seller Type</b> : %{x}<br>" + "<b>Median Selling Price</b> : %{y}<br>"
)
fig.show()

Vehicle Age Distribution

In [32]:
fig = px.histogram(
    data_frame = data,
    x = "vehicle_age",
    color_discrete_sequence = COLOR_SCHEME
)
fig.update_layout(
    title = "Vehicle Age Distribution",
    xaxis = dict(
        title = "Vehicle Age (in years)"
    ),
    yaxis = dict(
        title = "Frequency"
    )
)
fig.update_traces(
    hovertemplate = "<b>Vehicle Age</b> : %{x}<br>" + "<b>Frequency</b> : %{y}<br>"
)
fig.show()

Vehicle Age vs KM Driven

In [73]:
fig = px.scatter(
    data_frame = data,
    x = "km_driven",
    y = "vehicle_age",
    color_discrete_sequence = COLOR_SCHEME
)
fig.update_layout(
    title = "Vehicle Age vs KMs Driven",
    xaxis = dict(
        title = "KMs Driven"
    ),
    yaxis = dict(
        title = "Vehicle Age (in years)"
    )
)
fig.update_traces(
    hovertemplate = "<b>KMs Driven</b> : %{x}<br>" + "<b>Vehicle Age</b> : %{y} years<br>"
)
fig.show()

Mileage vs Engine Size

In [72]:
fig = px.scatter(
    data_frame = data,
    x = "mileage",
    y = "engine",
    color_discrete_sequence = COLOR_SCHEME
)
fig.update_layout(
    title = "Mileage vs Engine Size",
    xaxis = dict(
        title = "Mileage"
    ),
    yaxis = dict(
        title = "Engine Size"
    )
)
fig.update_traces(
    hovertemplate = "<b>Mileage</b> : %{x}<br>" + "<b>Engine Size</b> : %{y}<br>"
)
fig.show()

Engine size vs Max power

In [50]:
fig = px.scatter(
    data_frame = data,
    x = "max_power",
    y = "engine",
    color_discrete_sequence = COLOR_SCHEME
)
fig.update_layout(
    title = "Max Power vs Engine Size",
    xaxis = dict(
        title = "Max Power"
    ),
    yaxis = dict(
        title = "Engine Size"
    )
)
fig.update_traces(
    hovertemplate = "<b>Max Power</b> : %{x}<br>" + "<b>Engine Size</b> : %{y}<br>"
)
fig.show()

Vehicle Age vs Mileage

In [56]:
data["age_groups"] = pd.cut(
    data["vehicle_age"],
    bins = [0, 5, 10, 15, 20],
    labels = ["0 - 5 years", "6 - 10 years", "11 - 15 years", "16 - 20 years"]
)

fig = px.box(
    data_frame = data.sort_values(by = "age_groups"),
    y = "age_groups",
    x = "selling_price",
    color_discrete_sequence = COLOR_SCHEME
)
fig.update_layout(
    title = "Selling Price vs Vehicle Age Groups",
    yaxis = dict(
        title = "Vehicle Age Groups"
    ),
    xaxis = dict(
        title = "Selling Price"
    )
)
fig.update_traces(
    hovertemplate = "<b>Vehicle Age Group</b> : %{y}<br>" + "<b>Selling Price</b> : %{x}<br>"
)
fig.show()

KM Driven vs Mileage

In [67]:
fig = px.scatter(
    data_frame = data,
    x = "mileage",
    y = "km_driven",
    color_discrete_sequence = COLOR_SCHEME
)
fig.update_layout(
    title = "Mileage vs KMs Driven",
    xaxis = dict(
        title = "Mileage"
    ),
    yaxis = dict(
        title = "KMs Driven"
    )
)
fig.update_traces(
    hovertemplate = "<b>Mileage</b> : %{x}<br>" + "<b>KMs Driven</b> : %{y} km<br>"
)
fig.show()

Correlation Matrix

In [71]:
fig = px.imshow(
    data[data.select_dtypes(include = np.number).columns].corr().round(2),
    color_continuous_scale = COLOR_SCHEME[::-1],
    text_auto = True
)
fig.update_layout(
    title = "Correlation Matrix of Numerical Features"
)
fig.show()

### <code>Observations</code>

- Most vehicles in the dataset are between <code>4</code> and <code>10</code> years old.
- <code>Newer cars</code> around <code>0 – 5</code> years old command significantly <code>higher</code> prices than older vehicles.
- <code>Dealer-listed</code> cars have the <code>highest</code> median prices, while <code>individual</code> sellers list cars <code>cheaper</code>.
- <code>Older</code> vehicles generally have <code>higher</code> kilometers driven.
- <code>Mileage</code> <code>decreases</code> as <code>engine size</code> <code>increases</code>.
- <code>Mileage</code> shows little clear trend with <code>distance driven</code>, but extreme <code>outliers</code> exist.
- <code>Larger</code> <code>engine sizes</code> generally produce <code>higher</code> <code>max power</code>.
- <code>Selling price</code> is <code>highly</code> correlated with <code>max power</code> and <code>engine size</code>.