In [None]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split, RandomizedSearchCV
from sklearn.metrics import mean_absolute_error, r2_score
from sklearn.ensemble import RandomForestRegressor
from xgboost import XGBRegressor
from sklearn.linear_model import LinearRegression
from scipy.stats import uniform, randint

In [None]:
# Read the raw vehicle listings into a DataFrame and preview the first rows.
csv_path = "/content/Vehicle-Price-Prediction.csv"
df = pd.read_csv(csv_path)
df.head()

Unnamed: 0,name,description,make,model,year,price,engine,cylinders,fuel,mileage,transmission,trim,body,doors,exterior_color,interior_color,drivetrain
0,2024 Jeep Wagoneer Series II,"\n \n Heated Leather Seats, Nav Sy...",Jeep,Wagoneer,2024,74600.0,24V GDI DOHC Twin Turbo,6.0,Gasoline,10.0,8-Speed Automatic,Series II,SUV,4.0,White,Global Black,Four-wheel Drive
1,2024 Jeep Grand Cherokee Laredo,Al West is committed to offering every custome...,Jeep,Grand Cherokee,2024,50170.0,OHV,6.0,Gasoline,1.0,8-Speed Automatic,Laredo,SUV,4.0,Metallic,Global Black,Four-wheel Drive
2,2024 GMC Yukon XL Denali,,GMC,Yukon XL,2024,96410.0,"6.2L V-8 gasoline direct injection, variable v...",8.0,Gasoline,0.0,Automatic,Denali,SUV,4.0,Summit White,Teak/Light Shale,Four-wheel Drive
3,2023 Dodge Durango Pursuit,White Knuckle Clearcoat 2023 Dodge Durango Pur...,Dodge,Durango,2023,46835.0,16V MPFI OHV,8.0,Gasoline,32.0,8-Speed Automatic,Pursuit,SUV,4.0,White Knuckle Clearcoat,Black,All-wheel Drive
4,2024 RAM 3500 Laramie,\n \n 2024 Ram 3500 Laramie Billet...,RAM,3500,2024,81663.0,24V DDI OHV Turbo Diesel,6.0,Diesel,10.0,6-Speed Automatic,Laramie,Pickup Truck,4.0,Silver,Black,Four-wheel Drive


In [None]:
# Handling the engine column
def extract_engine_features(engine):
    """Derive coarse engine attributes from a free-text engine description.

    Parameters
    ----------
    engine : str or missing value
        Raw text from the ``engine`` column.

    Returns
    -------
    pd.Series
        Four positional values: engine type, valve count, aspiration and
        fuel-injection label. A missing description, or one that is just
        "c" after trimming and lower-casing, yields
        ``["electric", NaN, NaN, NaN]``.
    """

    def first_match(text, candidates, default):
        # Return the label of the first keyword found in `text`, in order.
        for keyword, label in candidates:
            if keyword in text:
                return label
        return default

    if pd.isna(engine) or engine.strip().lower() == "c":
        return pd.Series(["electric", np.nan, np.nan, np.nan])

    text = engine.lower()

    engine_type = first_match(
        text,
        [("diesel", "diesel"), ("hybrid", "hybrid"), ("gasoline", "gasoline"), ("petrol", "petrol")],
        "unknown",
    )
    valves = first_match(text, [("16v", 16), ("24v", 24), ("32v", 32)], None)
    # "twin turbo" must be tested before "turbo" because it contains it.
    aspiration = first_match(
        text,
        [("twin turbo", "twin turbo"), ("turbo", "turbo"), ("ohv", "natural"), ("dohc", "natural")],
        "unknown",
    )
    fuel_injection = first_match(
        text,
        [("gdi", "gdi"), ("mpfi", "mpfi"), ("pdi", "pdi")],
        "unknown",
    )

    return pd.Series([engine_type, valves, aspiration, fuel_injection])

# Expand each engine description into four new columns on df.
engine_feature_cols = ["engine_type", "engine_valves", "aspiration", "fuel_injection"]
df[engine_feature_cols] = df["engine"].apply(extract_engine_features)

In [None]:
# Dropping/filling missing values, updating dataset.
# Rows without a target price cannot be used for training. `.copy()` makes the
# filtered frame an independent object, so the column assignments below no
# longer raise SettingWithCopyWarning.
df = df.dropna(subset=["price"]).copy()

# Numeric gaps: 0 cylinders, median mileage, most frequent door count.
df["cylinders"] = df["cylinders"].fillna(0)
df["mileage"] = df["mileage"].fillna(df["mileage"].median())
df["doors"] = df["doors"].fillna(df["doors"].mode()[0])

# Categorical gaps become the literal "Unknown"; values are whitespace-trimmed.
cat_cols = ["fuel", "transmission", "trim", "body", "exterior_color", "interior_color"]
for col in cat_cols:
    df[col] = df[col].fillna("Unknown").astype(str).str.strip()

# Remove columns that are not used as model inputs. Rebinding instead of
# `inplace=True` keeps the statement free of hidden mutation.
# NOTE(review): the `Unnamed: 0` column visible in the preview is not dropped
# here, so it flows into the model features — confirm that is intended.
df = df.drop(columns=["name", "description", "engine", "exterior_color", "interior_color", "engine_type"])

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df["cylinders"] = df["cylinders"].fillna(0)
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df["mileage"] = df["mileage"].fillna(df["mileage"].median())
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df["doors"] = df["doors"].fillna(df["doors"].mode()[0])
A value is trying to be set on a copy of a s

In [None]:
# One-hot encode the remaining categorical columns; drop_first=True omits the
# first level of each category.
df_model = pd.get_dummies(data=df, drop_first=True)

In [None]:

import plotly.express as px
import plotly.graph_objects as go


In [None]:

# Interactive histogram of the target with a box plot in the margin.
fig = px.histogram(
    data_frame=df,
    x='price',
    nbins=30,
    marginal='box',
    title='Distribution of Vehicle Prices',
)
fig.update_layout(yaxis_title='Count', xaxis_title='Price')
fig.show()


In [None]:

import numpy as np

# Pairwise correlation of the numeric columns only. df still contains string
# columns at this point, and DataFrame.corr() raises on non-numeric data in
# pandas >= 2.0 unless numeric_only=True is passed.
corr_matrix = df.corr(numeric_only=True).round(2)
fig = go.Figure(data=go.Heatmap(
    z=corr_matrix.values,
    x=corr_matrix.columns,
    y=corr_matrix.index,
    colorscale='RdBu',
    zmin=-1,  # fix the colour range to the full correlation interval
    zmax=1,
    colorbar=dict(title='Correlation')
))
fig.update_layout(title='Correlation Heatmap', xaxis_title='Features', yaxis_title='Features')
fig.show()


In [None]:

import seaborn as sns
import matplotlib.pyplot as plt

# Static histogram of the target variable with a KDE overlay.
fig, ax = plt.subplots(figsize=(8, 5))
sns.histplot(df['price'], kde=True, bins=30, ax=ax)
ax.set_title('Distribution of Vehicle Prices')
ax.set_xlabel('Price')
ax.set_ylabel('Frequency')
plt.show()


In [None]:

# Annotated correlation heatmap of the numeric columns. numeric_only=True is
# required because df still holds string columns, which DataFrame.corr()
# rejects in pandas >= 2.0.
plt.figure(figsize=(12, 8))
correlation_matrix = df.corr(numeric_only=True)
sns.heatmap(correlation_matrix, annot=True, cmap='coolwarm', fmt='.2f')
plt.title('Correlation Heatmap')
plt.show()


In [None]:
# Separate the target from the encoded features and hold out 20% for testing.
y = df_model["price"]
X = df_model.drop(columns="price")
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Fill missing valve counts with 0 in both partitions.
for partition in (X_train, X_test):
    partition["engine_valves"] = partition["engine_valves"].fillna(0)

In [None]:
# Baseline models
rf = RandomForestRegressor(random_state=42)
xgb = XGBRegressor(random_state=42, verbosity=0)
lr = LinearRegression()

# Fit each model on the training partition, in the order rf, xgb, lr.
for estimator in (rf, xgb, lr):
    estimator.fit(X_train, y_train)

# Predictions on the held-out partition, in the same order.
rf_preds, xgb_preds, lr_preds = [estimator.predict(X_test) for estimator in (rf, xgb, lr)]

In [None]:
# Evaluation helper
def evaluate_model(name, preds):
    """Print MAE and R-squared of ``preds`` against the global ``y_test``.

    Parameters
    ----------
    name : str
        Label printed as the heading of the report.
    preds : array-like
        Predictions passed to the metric functions together with ``y_test``.
    """
    print(f"{name}:")
    mae = mean_absolute_error(y_test, preds)
    print(f"  MAE : {mae:,.2f}")
    r2 = r2_score(y_test, preds)
    print(f"  R-Square  : {r2:.4f}\n")

# Report every baseline model in a fixed order.
baseline_results = (
    ("Random Forest", rf_preds),
    ("XGBoost", xgb_preds),
    ("Linear Regression", lr_preds),
)
for label, predictions in baseline_results:
    evaluate_model(label, predictions)

Random Forest:
  MAE : 4,359.00
  R-Square  : 0.8047

XGBoost:
  MAE : 4,275.70
  R-Square  : 0.8406

Linear Regression:
  MAE : 4,090.87
  R-Square  : 0.8409



In [None]:
# Search space for the randomized hyper-parameter search.
# scipy's uniform(loc, scale) draws from [loc, loc + scale].
param_dist = dict(
    n_estimators=randint(100, 300),
    max_depth=randint(3, 10),
    learning_rate=uniform(0.01, 0.3),
    subsample=uniform(0.7, 0.3),
    colsample_bytree=uniform(0.7, 0.3),
)

# 20 random combinations, each scored by 3-fold cross-validated R^2.
search_settings = dict(
    n_iter=20,
    scoring='r2',
    cv=3,
    verbose=1,
    n_jobs=-1,
    random_state=42,
)
random_search = RandomizedSearchCV(
    XGBRegressor(random_state=42, verbosity=0),
    param_dist,
    **search_settings,
)

random_search.fit(X_train, y_train)

Fitting 3 folds for each of 20 candidates, totalling 60 fits


In [None]:
# Take the best estimator found by the search and show its parameters.
best_xgb = random_search.best_estimator_
print("Best Parameters (Random Search):\n", random_search.best_params_)

# Score the tuned model on the held-out partition.
tuned_preds = best_xgb.predict(X_test)
print("Tuned XGBoost (Random Search):")
tuned_mae = mean_absolute_error(y_test, tuned_preds)
print(f"  MAE : {tuned_mae:,.2f}")
tuned_r2 = r2_score(y_test, tuned_preds)
print(f"  R-Square  : {tuned_r2:.4f}")

Best Parameters (Random Search):
 {'colsample_bytree': 0.7692681476866446, 'learning_rate': 0.0823076398078035, 'max_depth': 6, 'n_estimators': 298, 'subsample': 0.8829989973347863}
Tuned XGBoost (Random Search):
  MAE : 4,130.34
  R-Square  : 0.8588


In [None]:
# Note: before using the saved model, make sure the input data is preprocessed
# into the same format (same columns/shape) the model was trained on.
# Saving is optional.
import joblib

model_filename = "best_xgb_model.pkl"
joblib.dump(value=best_xgb, filename=model_filename)

In [None]:

import pandas as pd
import seaborn as sns

# Feature importance of the tuned XGBoost model.
# `model` is not defined in any visible cell, so the tuned estimator `best_xgb`
# is used. Feature names come from X_train (the one-hot encoded frame the model
# was fitted on); the columns of `df` would not match the importances in length.
importances = best_xgb.feature_importances_
features = X_train.columns
feature_importance_df = pd.DataFrame({'Feature': features, 'Importance': importances})
feature_importance_df = feature_importance_df.sort_values(by='Importance', ascending=False)

# One-hot encoding can produce many columns; plot only the strongest ones so
# the chart stays readable. The full ranking remains in feature_importance_df.
top_n = 20
plt.figure(figsize=(10, 6))
sns.barplot(data=feature_importance_df.head(top_n), x='Importance', y='Feature')
plt.title(f'Feature Importance (Top {top_n})')
plt.show()


In [None]:

# Plot actual vs predicted prices for the tuned XGBoost model.
# `y_pred` is not defined in any visible cell; `tuned_preds` holds the tuned
# model's predictions on X_test.
plt.figure(figsize=(8, 6))
sns.scatterplot(x=y_test, y=tuned_preds)
plt.xlabel('Actual Price')
plt.ylabel('Predicted Price')
plt.title('Actual vs Predicted Prices')
# Dashed red diagonal marks where predicted == actual.
plt.plot([y_test.min(), y_test.max()], [y_test.min(), y_test.max()], '--r')
plt.show()


In [None]:

# Interactive feature-importance chart for the tuned XGBoost model.
# `model` is not defined in any visible cell, so the tuned estimator `best_xgb`
# is used. Feature names come from X_train (the one-hot encoded frame the model
# was fitted on); the columns of `df` would not match the importances in length.
importances = best_xgb.feature_importances_
features = X_train.columns
feature_importance_df = pd.DataFrame({'Feature': features, 'Importance': importances})
feature_importance_df = feature_importance_df.sort_values(by='Importance', ascending=True)

# Plot only the strongest features so the chart stays readable. With ascending
# order, the tail holds the largest importances and the biggest bar ends up on top.
top_n = 20
fig = px.bar(feature_importance_df.tail(top_n), x='Importance', y='Feature', orientation='h',
             title=f'Feature Importance (Top {top_n})')
fig.show()


In [None]:

# Interactive actual-vs-predicted scatter for the tuned XGBoost model.
# `y_pred` is not defined in any visible cell; `tuned_preds` holds the tuned
# model's predictions on X_test.
fig = px.scatter(x=y_test, y=tuned_preds, labels={'x': 'Actual Price', 'y': 'Predicted Price'},
                 title='Actual vs Predicted Prices')
# Draw the ideal y == x line from its two endpoints instead of threading a
# line through every (unsorted) test point.
ideal_range = [y_test.min(), y_test.max()]
fig.add_trace(go.Scatter(x=ideal_range, y=ideal_range, mode='lines', name='Ideal Line'))
fig.show()
