# Cars Price Prediction: An Applied Machine Learning Pipeline
Using EDA, Feature Engineering, and PyCaret


## Scenario

Imagine you are a data scientist working with an automotive analytics firm.
The goal is to predict car prices based on easily available specifications
(engine size, horsepower, fuel type, seating capacity, etc.)


## Our Mission

- Goal: Predict car price accurately
- Constraint: Use publicly available car specifications
- Impact: Support pricing, comparison, and market analysis


In [90]:

import numpy as np
import pandas as pd
import os
from pathlib import Path
import kagglehub

import plotly.express as px
import plotly.graph_objects as go
from plotly.subplots import make_subplots

from sklearn.model_selection import train_test_split, cross_val_score
from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.linear_model import LinearRegression
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_absolute_error, mean_squared_error, r2_score

from pycaret.regression import *


## 1. Data Understanding


In [91]:
path = kagglehub.dataset_download("abdulmalik1518/cars-datasets-2025")

csv_path = os.path.join(path, "Cars Datasets 2025.csv")

df = pd.read_csv(csv_path, encoding="latin1")
df.head()

Unnamed: 0,Company Names,Cars Names,Engines,CC/Battery Capacity,HorsePower,Total Speed,Performance(0 - 100 )KM/H,Cars Prices,Fuel Types,Seats,Torque
0,FERRARI,SF90 STRADALE,V8,3990 cc,963 hp,340 km/h,2.5 sec,"$1,100,000",plug in hyrbrid,2,800 Nm
1,ROLLS ROYCE,PHANTOM,V12,6749 cc,563 hp,250 km/h,5.3 sec,"$460,000",Petrol,5,900 Nm
2,Ford,KA+,1.2L Petrol,"1,200 cc",70-85 hp,165 km/h,10.5 sec,"$12,000-$15,000",Petrol,5,100 - 140 Nm
3,MERCEDES,GT 63 S,V8,"3,982 cc",630 hp,250 km/h,3.2 sec,"$161,000",Petrol,4,900 Nm
4,AUDI,AUDI R8 Gt,V10,"5,204 cc",602 hp,320 km/h,3.6 sec,"$253,290",Petrol,2,560 Nm


In [92]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1218 entries, 0 to 1217
Data columns (total 11 columns):
 #   Column                     Non-Null Count  Dtype 
---  ------                     --------------  ----- 
 0   Company Names              1218 non-null   object
 1   Cars Names                 1218 non-null   object
 2   Engines                    1218 non-null   object
 3   CC/Battery Capacity        1215 non-null   object
 4   HorsePower                 1218 non-null   object
 5   Total Speed                1218 non-null   object
 6   Performance(0 - 100 )KM/H  1212 non-null   object
 7   Cars Prices                1218 non-null   object
 8   Fuel Types                 1218 non-null   object
 9   Seats                      1218 non-null   object
 10  Torque                     1217 non-null   object
dtypes: object(11)
memory usage: 104.8+ KB


In [93]:
df.describe().T

Unnamed: 0,count,unique,top,freq
Company Names,1218,37,Nissan,149
Cars Names,1218,1201,Macan T,2
Engines,1218,356,I4,64
CC/Battery Capacity,1215,311,"2,000 cc",31
HorsePower,1218,456,355 hp,23
Total Speed,1218,114,250 km/h,145
Performance(0 - 100 )KM/H,1212,180,6.5 sec,45
Cars Prices,1218,535,"$35,000",36
Fuel Types,1218,23,Petrol,871
Seats,1218,19,5,692


## 2. Data Preparation

### 2.1 Column Renaming


In [94]:
df = df.rename(columns={
    "Company Names": "company",
    "Cars Names": "car_name",
    "Engines": "engine",
    "CC/Battery Capacity": "cc_battery",
    "HorsePower": "horsepower",
    "Total Speed": "top_speed",
    "Performance(0 - 100 )KM/H": "acceleration_0_100",
    "Cars Prices": "price",
    "Fuel Types": "fuel_type",
    "Seats": "seats",
    "Torque": "torque"
})


### 2.2 Target Variable Preparation

In [95]:
df["price"] = (
    df["price"]
    .str.replace("$", "", regex=False)
    .str.replace(",", "", regex=False)
    .str.replace(" ", "", regex=False)
    .str.replace("/", "-", regex=False)
)

price_split = df["price"].str.split("-", expand=True)

price_split = price_split.apply(
    pd.to_numeric, errors="coerce"
)

df["price"] = price_split.mean(axis=1) 

df["price"].head()

0    1100000.0
1     460000.0
2      13500.0
3     161000.0
4     253290.0
Name: price, dtype: float64

### 2.3 Numerical Feature Cleaning

In [96]:
hp_clean = (
    df["horsepower"]
    .astype("string")
    .str.lower()
    .str.replace("hp", "", regex=False)
    .str.replace(" ", "", regex=False)
)

hp_split = hp_clean.str.split("-", expand=True)

hp_split = hp_split.apply(pd.to_numeric, errors="coerce")

df["horsepower"] = hp_split.mean(axis=1)

df["horsepower"].head()

0    963.0
1    563.0
2     77.5
3    630.0
4    602.0
Name: horsepower, dtype: Float64

In [97]:
speed_clean = (
    df["top_speed"]
    .astype("string")
    .str.lower()
    .str.replace("km/h", "", regex=False)
    .str.replace(" ", "", regex=False)
)

speed_split = speed_clean.str.split("-", expand=True)
speed_split = speed_split.apply(pd.to_numeric, errors="coerce")

df["top_speed"] = speed_split.mean(axis=1)

df["top_speed"].head()

0    340.0
1    250.0
2    165.0
3    250.0
4    320.0
Name: top_speed, dtype: Float64

In [98]:
acc_clean = (
    df["acceleration_0_100"]
    .astype("string")
    .str.lower()
    .str.replace("sec", "", regex=False)
    .str.replace(" ", "", regex=False)
)

df["acceleration_0_100"] = pd.to_numeric(acc_clean, errors="coerce")

df["acceleration_0_100"].head()

0     2.5
1     5.3
2    10.5
3     3.2
4     3.6
Name: acceleration_0_100, dtype: Float64

In [99]:
torque_clean = (
    df["torque"]
    .astype("string")
    .str.lower()
    .str.replace("nm", "", regex=False)
    .str.replace(" ", "", regex=False)
)

torque_split = torque_clean.str.split("-", expand=True)
torque_split = torque_split.apply(pd.to_numeric, errors="coerce")

df["torque"] = torque_split.mean(axis=1)

df["torque"].head()


0    800.0
1    900.0
2    120.0
3    900.0
4    560.0
Name: torque, dtype: Float64

In [101]:
cc_clean = (
    df["cc_battery"]
    .astype("string")
    .str.lower()
    .str.replace("cc", "", regex=False)
    .str.replace(",", "", regex=False)
    .str.replace(" ", "", regex=False)
)

print(cc_clean.head())
df["cc_battery"] = pd.to_numeric(cc_clean, errors="coerce")

df["cc_battery"].head()


0    3990
1    6749
2    1200
3    3982
4    5204
Name: cc_battery, dtype: string


0    3990
1    6749
2    1200
3    3982
4    5204
Name: cc_battery, dtype: Int64

In [102]:
df["seats"] = pd.to_numeric(df["seats"], errors="coerce")

df["seats"].head()

0    2.0
1    5.0
2    5.0
3    4.0
4    2.0
Name: seats, dtype: float64

### 2.4 Categorical Feature Cleaning


In [103]:
df["company"] = (
    df["company"]
    .astype("string")
    .str.strip()
    .str.upper()
)

car_name was removed due to high cardinality and limited predictive value, which can negatively impact model generalization.

In [105]:
df["engine"] = (
    df["engine"]
    .astype("string")
    .str.strip()
    .str.upper()
)

In [106]:
df["fuel_type"] = (
    df["fuel_type"]
    .astype("string")
    .str.strip()
    .str.lower()
)

df["fuel_type"] = df["fuel_type"].replace({
    "plug in hybrid": "hybrid",
    "plug-in hybrid": "hybrid",
})

In [107]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1218 entries, 0 to 1217
Data columns (total 10 columns):
 #   Column              Non-Null Count  Dtype  
---  ------              --------------  -----  
 0   company             1218 non-null   string 
 1   engine              1218 non-null   string 
 2   cc_battery          991 non-null    Int64  
 3   horsepower          1197 non-null   Float64
 4   top_speed           1217 non-null   Float64
 5   acceleration_0_100  1201 non-null   Float64
 6   price               1213 non-null   float64
 7   fuel_type           1218 non-null   string 
 8   seats               1206 non-null   float64
 9   torque              1193 non-null   Float64
dtypes: Float64(4), Int64(1), float64(2), string(3)
memory usage: 101.2 KB


In [None]:
# 1) Choose columns to drop (screening = keep only "easy to use" features)
cols_to_drop = [
    # examples (only drop if they exist):
    "car_name",          # if you had it
]

cols_to_drop = [c for c in cols_to_drop if c in df.columns]

df_screening = df.drop(columns=cols_to_drop).copy()

print("Dropped:", cols_to_drop)
print("Remaining columns:", df_screening.columns.tolist())


Dropped: []
Remaining columns: ['company', 'engine', 'cc_battery', 'horsepower', 'top_speed', 'acceleration_0_100', 'price', 'fuel_type', 'seats', 'torque']


In [112]:
target = "price"  # your target

# Categorical = object/string/category, Numerical = number
cat_features = (
    df_screening.select_dtypes(include=["object", "string", "category"])
    .columns
    .drop([c for c in [target] if c in df_screening.columns])
    .tolist()
)

num_features = (
    df_screening.select_dtypes(include=["number"])
    .columns
    .drop([c for c in [target] if c in df_screening.columns])
    .tolist()
)

print("Categorical:", cat_features)
print("Numerical:", num_features)


KeyError: "['price'] not found in axis"

In [113]:
features = cat_features + num_features
df_clean = df_screening[features + [target]].copy()

X = df_clean[features]
y = df_clean[target]


In [114]:

from sklearn.impute import SimpleImputer

X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42
)

numeric_pipe = Pipeline([
    ("imputer", SimpleImputer(strategy="median")),
    ("scaler", StandardScaler()),
])

categorical_pipe = Pipeline([
    ("imputer", SimpleImputer(strategy="most_frequent")),
    ("onehot", OneHotEncoder(handle_unknown="ignore")),
])

preprocessor = ColumnTransformer(
    transformers=[
        ("num", numeric_pipe, num_features),
        ("cat", categorical_pipe, cat_features),
    ],
)

pipe = Pipeline([
    ("preprocess", preprocessor),
    ("model", LinearRegression()),
])

pipe.fit(X_train, y_train)
pred = pipe.predict(X_test)

print("MAE :", mean_absolute_error(y_test, pred))
print("RMSE:", np.sqrt(mean_squared_error(y_test, pred)))
print("R2  :", r2_score(y_test, pred))


ValueError: Input y contains NaN.

In [109]:

df_model = df.dropna(subset=["price"]).copy()

target = "price"

categorical_features = (
    df_model.select_dtypes(include=["object", "string", "category"])
    .columns
    .tolist()
)

numerical_features = (
    df_model.select_dtypes(include="number")
    .columns
    .drop(target)
    .tolist()
)


cols_to_drop = ['car_name']
categorical_features = [c for c in categorical_features if c not in cols_to_drop]
numerical_features = [c for c in numerical_features if c not in cols_to_drop]

feature_cols = numerical_features + categorical_features
df_baseline = df_model[feature_cols + [target]].copy()

print(f"Target: {target}")
print(f"Rows for modeling: {len(df_baseline)}")
print(f"Numerical Features ({len(numerical_features)}): {numerical_features}")
print(f"Categorical Features ({len(categorical_features)}): {categorical_features}")

df_baseline.head()


Target: price
Rows for modeling: 1213
Numerical Features (6): ['cc_battery', 'horsepower', 'top_speed', 'acceleration_0_100', 'seats', 'torque']
Categorical Features (3): ['company', 'engine', 'fuel_type']


Unnamed: 0,cc_battery,horsepower,top_speed,acceleration_0_100,seats,torque,company,engine,fuel_type,price
0,3990,963.0,340.0,2.5,2.0,800.0,FERRARI,V8,plug in hyrbrid,1100000.0
1,6749,563.0,250.0,5.3,5.0,900.0,ROLLS ROYCE,V12,petrol,460000.0
2,1200,77.5,165.0,10.5,5.0,120.0,FORD,1.2L PETROL,petrol,13500.0
3,3982,630.0,250.0,3.2,4.0,900.0,MERCEDES,V8,petrol,161000.0
4,5204,602.0,320.0,3.6,2.0,560.0,AUDI,V10,petrol,253290.0
