In [322]:
import pandas as pd

from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.compose import ColumnTransformer
from sklearn.linear_model import Ridge
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_absolute_error, r2_score


In [323]:
# Read the raw car listings CSV (path is relative to the current working directory).
data = pd.read_csv(filepath_or_buffer="./Data/car_Data.csv")

In [324]:
# -----------------------------
# Clean kms_driven
# -----------------------------
# Work on a string view of the column so the literal replacements below apply
# uniformly, whatever dtype the column was read as.
kms_text = data["kms_driven"].astype(str)
kms_text = kms_text.str.replace(",", "", regex=False)      # remove comma separators
kms_text = kms_text.str.replace(" kms", "", regex=False)   # remove the " kms" suffix

# Anything that still is not a number becomes NaN, and those rows are dropped.
data["kms_driven"] = pd.to_numeric(kms_text, errors="coerce")
data.dropna(axis=0, subset=["kms_driven"], inplace=True)

In [325]:
# -----------------------------
# Clean year
# -----------------------------
# Coerce to numeric: entries that cannot be parsed turn into NaN ...
year_numeric = pd.to_numeric(data["year"], errors="coerce")
data["year"] = year_numeric
# ... and the rows holding them are removed from the frame.
data.dropna(axis=0, subset=["year"], inplace=True)

In [326]:
# -----------------------------
# Clean fuel_type
# -----------------------------
# Missing fuel types are imputed with the fixed label "Petrol" (no rows are dropped here).
fuel_series = data["fuel_type"]
data["fuel_type"] = fuel_series.fillna(value="Petrol")

In [327]:
# -----------------------------
# Clean Price (TARGET)
# -----------------------------
# Remove comma separators from the textual price before parsing it.
price_text = data["Price"].astype(str)
price_text = price_text.str.replace(",", "", regex=False)

# Non-numeric prices become NaN; rows without a usable target are dropped.
data["Price"] = pd.to_numeric(price_text, errors="coerce")
data.dropna(axis=0, subset=["Price"], inplace=True)

In [328]:
# -----------------------------
# Features & Target
# -----------------------------
# Columns routed to the numeric and categorical branches of the preprocessor.
num_cols = ["year", "kms_driven"]
cat_cols = ["company", "fuel_type"]

# Target is the cleaned price; the feature frame is every other column of `data`.
y = data["Price"]
X = data.drop(labels=["Price"], axis="columns")

In [329]:
# Numeric branch: a single standardisation step.
num_steps = [("scaler", StandardScaler())]
num_transformer = Pipeline(steps=num_steps)

# Categorical branch: one-hot encoding, with handle_unknown="ignore" set for
# categories not seen during fitting.
cat_steps = [("onehot", OneHotEncoder(handle_unknown="ignore"))]
cat_transformer = Pipeline(steps=cat_steps)

# Route each column group (num_cols / cat_cols) to its own branch.
column_branches = [
    ("num", num_transformer, num_cols),
    ("cat", cat_transformer, cat_cols),
]
preprocessor = ColumnTransformer(transformers=column_branches)

# Full model: preprocessing followed by a Ridge regressor (alpha=1.0).
model_steps = [
    ("preprocessor", preprocessor),
    ("regressor", Ridge(alpha=1.0)),
]
model = Pipeline(steps=model_steps)


In [330]:
# -----------------------------
# Train / Test
# -----------------------------
# Hold out 20% of the rows for evaluation; the fixed random_state keeps the
# split the same from run to run.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=0
)

model.fit(X_train, y_train)

y_pred = model.predict(X_test)

# Keep each metric under its own name: previously the R^2 score was stored in a
# variable called `mae`, which mislabelled it as mean absolute error.
r2 = r2_score(y_test, y_pred)
mae = mean_absolute_error(y_test, y_pred)  # average absolute error, in the units of Price

print("Score:", r2)
print("MAE:", mae)

Score: 0.6504090052710914
