In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

from sklearn.model_selection import train_test_split
from sklearn.preprocessing import OneHotEncoder, StandardScaler
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.linear_model import LinearRegression, Ridge, Lasso
from sklearn.ensemble import RandomForestRegressor, GradientBoostingRegressor
from sklearn.metrics import mean_absolute_error, mean_squared_error, r2_score

import joblib

In [2]:
# Load the cleaned listings file and preview its first rows.
data_path = '../Data/cleaned_data_updated.csv'
cars = pd.read_csv(data_path)
cars.head()

Unnamed: 0,price,brand,model,year,condition,transmission,fuel_type,kilometers_run,age,km_per_year
0,2450000,toyota,fielder,2020,reconditioned,automatic,hybrid,86000,5,17200.0
1,4500000,toyota,noah,2020,reconditioned,automatic,octane,68000,5,13600.0
2,2750000,honda,cr-v,2012,used,automatic,octane,92000,13,7076.923077
3,1350000,toyota,passo,2010,used,automatic,"octane, lpg",87369,15,5824.6
4,1760000,toyota,axio,2016,used,automatic,"hybrid, octane",66000,9,7333.333333


In [3]:
# Column roles: the value to predict, plus the numeric and categorical inputs.
target = 'price'
neumeric_feature = ['year', 'kilometers_run', 'age', 'km_per_year']
categorical_feature = ['brand', 'model', 'condition', 'transmission', 'fuel_type']

# Feature matrix keeps numeric columns first, then categorical ones.
feature_columns = [*neumeric_feature, *categorical_feature]
X = cars[feature_columns]
y = cars[target]

In [4]:
# Hold out 20% of the rows for evaluation; the fixed seed keeps the split reproducible.
split_parts = train_test_split(X, y, test_size=0.2, random_state=42)
X_train, X_test, y_train, y_test = split_parts
print(f"Train shape: {X_train.shape}, Test shape: {X_test.shape}")

Train shape: (2308, 9), Test shape: (578, 9)


In [5]:
# Count missing values per training feature column.
X_train.isnull().sum()

year              0
kilometers_run    0
age               0
km_per_year       0
brand             0
model             0
condition         0
transmission      0
fuel_type         0
dtype: int64

In [6]:
# Drop feature rows with missing values, then realign each target Series
# to the row labels that survived so X and y stay matched.
X_train, X_test = X_train.dropna(), X_test.dropna()
y_train, y_test = y_train.loc[X_train.index], y_test.loc[X_test.index]

In [7]:
# Scale numeric columns; one-hot encode categorical columns, encoding
# categories unseen during fit as all-zero rows instead of raising.
numeric_step = ('num', StandardScaler(), neumeric_feature)
categorical_step = ('cat', OneHotEncoder(handle_unknown='ignore'), categorical_feature)

preprocessor = ColumnTransformer(transformers=[numeric_step, categorical_step])

In [8]:
# Candidate regressors, keyed by the display name used when reporting scores.
# The tree ensembles pin random_state=42 so reruns reproduce the same fits.
models = {
    "LinearRegression": LinearRegression(),
    "Ridge": Ridge(alpha=1.0),
    # The recorded run emitted a coordinate-descent solver warning while
    # fitting Lasso, so allow more iterations than the default of 1000.
    "Lasso": Lasso(alpha=0.01, max_iter=10_000),
    "RandomForest": RandomForestRegressor(n_estimators=100, random_state=42),
    "GradientBoosting": GradientBoostingRegressor(n_estimators=100, learning_rate=0.1, random_state=42)
}

In [9]:
# Fit each candidate behind the shared preprocessor and score it on the
# held-out rows; metrics are collected per model name in `results`.
results = {}

for name, model in models.items():
    print(f"\nTraining {name}...")
    pipeline = Pipeline(steps=[
        ('preprocessor', preprocessor),
        ('model', model),
    ])
    # Pipeline.fit returns the fitted pipeline, so prediction can be chained.
    y_pred = pipeline.fit(X_train, y_train).predict(X_test)

    mae, rmse, r2 = (
        mean_absolute_error(y_test, y_pred),
        np.sqrt(mean_squared_error(y_test, y_pred)),
        r2_score(y_test, y_pred),
    )

    results[name] = dict(MAE=mae, RMSE=rmse, R2=r2)
    print(f"{name} -> MAE: {mae:.2f}, RMSE: {rmse:.2f}, R2: {r2:.3f}")



Training LinearRegression...
LinearRegression -> MAE: 284640.19, RMSE: 397913.69, R2: 0.827

Training Ridge...
Ridge -> MAE: 289152.26, RMSE: 398364.72, R2: 0.827

Training Lasso...


  model = cd_fast.sparse_enet_coordinate_descent(


Lasso -> MAE: 285341.39, RMSE: 401393.75, R2: 0.824

Training RandomForest...
RandomForest -> MAE: 244285.58, RMSE: 387915.23, R2: 0.836

Training GradientBoosting...
GradientBoosting -> MAE: 275781.81, RMSE: 389844.41, R2: 0.834


In [10]:
# Build a model-by-metric table and rank it by RMSE, lowest error first.
metrics_by_model = pd.DataFrame(results).transpose()
results_df = metrics_by_model.sort_values(by="RMSE")
print("\nModel Comparison (sorted by RMSE):")
print(results_df)


Model Comparison (sorted by RMSE):
                            MAE           RMSE        R2
RandomForest      244285.577192  387915.232164  0.835970
GradientBoosting  275781.810902  389844.410319  0.834335
LinearRegression  284640.186887  397913.694026  0.827406
Ridge             289152.261370  398364.720871  0.827014
Lasso             285341.386626  401393.752379  0.824373


In [11]:
# Take the first row label of results_df as the winning model, refit it
# on the training data inside a full pipeline, and persist it with joblib.
best_model_name = results_df.index[0]
print(f"\nBest model: {best_model_name}")

best_pipeline = Pipeline(steps=[
    ('preprocessor', preprocessor),
    ('model', models[best_model_name]),
]).fit(X_train, y_train)

joblib.dump(best_pipeline, "used_car_price_model.pkl")
print("Best model saved as 'used_car_price_model.pkl'")


Best model: RandomForest


Best model saved as 'used_car_price_model.pkl'


In [12]:
# Preview the first five rows of the loaded listings.
cars.head(n=5)

Unnamed: 0,price,brand,model,year,condition,transmission,fuel_type,kilometers_run,age,km_per_year
0,2450000,toyota,fielder,2020,reconditioned,automatic,hybrid,86000,5,17200.0
1,4500000,toyota,noah,2020,reconditioned,automatic,octane,68000,5,13600.0
2,2750000,honda,cr-v,2012,used,automatic,octane,92000,13,7076.923077
3,1350000,toyota,passo,2010,used,automatic,"octane, lpg",87369,15,5824.6
4,1760000,toyota,axio,2016,used,automatic,"hybrid, octane",66000,9,7333.333333
