In [1]:
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [2]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

# Load data
df = pd.read_csv("/content/drive/MyDrive/raw_data_sampled.csv",low_memory=False)
df_mini = df.sample(n=30000, random_state=42)

In [3]:
df_mini.info()

<class 'pandas.core.frame.DataFrame'>
Index: 30000 entries, 75721 to 34122
Data columns (total 19 columns):
 #   Column        Non-Null Count  Dtype  
---  ------        --------------  -----  
 0   Unnamed: 0    30000 non-null  int64  
 1   region        30000 non-null  object 
 2   price         30000 non-null  int64  
 3   year          29907 non-null  float64
 4   manufacturer  28688 non-null  object 
 5   model         29655 non-null  object 
 6   condition     17770 non-null  object 
 7   cylinders     17398 non-null  object 
 8   fuel          29797 non-null  object 
 9   odometer      29713 non-null  float64
 10  title_status  29386 non-null  object 
 11  transmission  29807 non-null  object 
 12  VIN           18632 non-null  object 
 13  drive         20804 non-null  object 
 14  size          8470 non-null   object 
 15  type          23469 non-null  object 
 16  paint_color   20872 non-null  object 
 17  state         30000 non-null  object 
 18  Age           29907 non-nul

In [4]:
df_mini.drive.value_counts()

Unnamed: 0_level_0,count
drive,Unnamed: 1_level_1
4wd,9189
fwd,7483
rwd,4132


In [5]:
df_mini.manufacturer.value_counts(ascending=False)

Unnamed: 0_level_0,count
manufacturer,Unnamed: 1_level_1
ford,4968
chevrolet,3746
toyota,2413
honda,1508
nissan,1375
jeep,1337
ram,1293
gmc,1215
bmw,1011
dodge,979


In [6]:
df_mini.describe()

Unnamed: 0.1,Unnamed: 0,price,year,odometer,Age
count,30000.0,30000.0,29907.0,29713.0,29907.0
mean,50080.3931,247298.6,2011.283613,99004.88,13.716387
std,28952.721798,27710910.0,9.367113,220412.3,9.367113
min,0.0,0.0,1900.0,0.0,3.0
25%,25036.5,5895.0,2008.0,37647.0,8.0
50%,49977.5,13911.0,2014.0,85843.0,11.0
75%,75293.75,26500.0,2017.0,133572.0,17.0
max,99997.0,3736929000.0,2022.0,10000000.0,125.0


In [7]:
df_mini.isna().sum()

Unnamed: 0,0
Unnamed: 0,0
region,0
price,0
year,93
manufacturer,1312
model,345
condition,12230
cylinders,12602
fuel,203
odometer,287


In [8]:
tmp=df.copy()
tmp['Age'] = 2025 - tmp['year']

In [9]:
tmp.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 100000 entries, 0 to 99999
Data columns (total 19 columns):
 #   Column        Non-Null Count   Dtype  
---  ------        --------------   -----  
 0   Unnamed: 0    100000 non-null  int64  
 1   region        100000 non-null  object 
 2   price         100000 non-null  int64  
 3   year          99741 non-null   float64
 4   manufacturer  95866 non-null   object 
 5   model         98767 non-null   object 
 6   condition     59449 non-null   object 
 7   cylinders     58469 non-null   object 
 8   fuel          99255 non-null   object 
 9   odometer      98983 non-null   float64
 10  title_status  98040 non-null   object 
 11  transmission  99388 non-null   object 
 12  VIN           62217 non-null   object 
 13  drive         69485 non-null   object 
 14  size          28193 non-null   object 
 15  type          78404 non-null   object 
 16  paint_color   69652 non-null   object 
 17  state         100000 non-null  object 
 18  Age  

In [11]:
tmp = tmp.dropna(subset=['price'])
tmp = tmp[(tmp['price'] > 0) & (tmp['price'] <= 300000)]
tmp['price'] = np.log1p(tmp['price']) #Ensure price column still has standard values

In [12]:
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.compose import ColumnTransformer
from sklearn.metrics import mean_squared_error, mean_absolute_error, r2_score
from sklearn.impute import SimpleImputer
from sklearn.pipeline import Pipeline

import joblib
X = tmp.drop(columns=["price"])
y = tmp["price"]

X_train, X_temp, y_train, y_temp = train_test_split(
    X, y, test_size=0.3, random_state=42
)

X_val, X_test, y_val, y_test = train_test_split(
    X_temp, y_temp, test_size=0.5, random_state=42
)

In [13]:
numeric_features = X.select_dtypes(include=['int64', 'float64']).columns
categorical_features = X.select_dtypes(include=['object']).columns

In [14]:
numeric_transformer = Pipeline(steps=[
    ('imputer', SimpleImputer(strategy='median')),   # fill missing with median
    ('scaler', StandardScaler())
])

# Categorical pipeline: impute missing → one-hot encode
categorical_transformer = Pipeline(steps=[
    ('imputer', SimpleImputer(strategy='most_frequent')),  # fill missing with most common
    ('encoder', OneHotEncoder(handle_unknown='ignore'))
])

In [15]:
preprocessor = ColumnTransformer(
    transformers=[
        ('num', numeric_transformer, numeric_features),
        ('cat', categorical_transformer, categorical_features)
    ]
)
X_train_proc = preprocessor.fit_transform(X_train)
X_val_proc   = preprocessor.transform(X_val)
X_test_proc  = preprocessor.transform(X_test)

In [16]:
import xgboost as xgb
import random

In [17]:
dtrain=xgb.DMatrix(X_train_proc, label=y_train)
dval=xgb.DMatrix(X_val_proc, label=y_val)
dtest=xgb.DMatrix(X_test_proc, label=y_test)

In [18]:
param_grid = {
    "max_depth": [3, 5, 7, 9],
    "eta": [0.01, 0.05, 0.1, 0.3],
    "subsample": [0.8, 0.9, 1.0],
    "colsample_bytree": [0.8, 0.9, 1.0]
}
n_iter=10
results=[]
for i in range(n_iter):
  params = {
      "objective": "reg:squarederror",
        "tree_method": "hist",
        "eval_metric": "mae",
        "max_depth": random.choice(param_grid["max_depth"]),
        "eta": random.choice(param_grid["eta"]),
        "subsample": random.choice(param_grid["subsample"]),
        "colsample_bytree": random.choice(param_grid["colsample_bytree"]),
        "seed": 42
  }
  model = xgb.train(
        params=params,
        dtrain=dtrain,
        num_boost_round=500,
        evals=[(dval, "validation")],
        early_stopping_rounds=30,
        verbose_eval=False
    )
  y_val_pred = model.predict(dval)
  val_mae = mean_absolute_error(y_val, y_val_pred)
  results.append((params, val_mae, model.best_iteration))

best_params, best_val_mae, best_iter = min(results, key=lambda x: x[1])
print("Best Params:", best_params)
print("Validation MAE:", best_val_mae)

Best Params: {'objective': 'reg:squarederror', 'tree_method': 'hist', 'eval_metric': 'mae', 'max_depth': 9, 'eta': 0.3, 'subsample': 1.0, 'colsample_bytree': 1.0, 'seed': 42}
Validation MAE: 0.3684280850991959


In [19]:
from scipy import sparse
X_full = sparse.vstack([X_train_proc, X_val_proc])
y_full = np.hstack([y_train, y_val])
dtrain_full = xgb.DMatrix(
    X_full,
    label=y_full
)

final_model = xgb.train(
    params=best_params,
    dtrain=dtrain_full,
    num_boost_round=best_iter,
    evals=[(dtest, "test")],
    verbose_eval=50
)


[0]	test-mae:0.72474
[50]	test-mae:0.41784
[100]	test-mae:0.40825
[150]	test-mae:0.40228
[200]	test-mae:0.39796
[250]	test-mae:0.39412
[300]	test-mae:0.38907
[350]	test-mae:0.38646
[400]	test-mae:0.38230
[450]	test-mae:0.37988
[498]	test-mae:0.37770


In [20]:
y_pred = final_model.predict(dtest)
actual_price = np.expm1(y_test)
predicted_price = np.expm1(y_pred)
print("\nTest MAE:", mean_absolute_error(actual_price, predicted_price))
print("Test R²:", r2_score(actual_price, predicted_price))


Test MAE: 4781.277272533885
Test R²: 0.6709213258458488


In [21]:
rmse = np.sqrt(mean_squared_error(actual_price, predicted_price))
print("Test RMSE:", rmse)

Test RMSE: 8979.981794269846


In [22]:
feature_names=preprocessor.get_feature_names_out()
joblib.dump(preprocessor, "/content/drive/MyDrive/preprocessor.pkl")
joblib.dump(feature_names, "/content/drive/MyDrive/feature_names.pkl")
final_model.save_model("/content/drive/MyDrive/model.json")