In [3]:
import numpy as np
import pandas as pd

In [4]:
# Load the cleaned used-car listings from disk (path is relative to the notebook).
cleaned_data = pd.read_csv(filepath_or_buffer='dataset/used_cars_cleaned.csv')

In [5]:
# Preview the first rows of the loaded frame (DataFrame.head defaults to 5 rows).
cleaned_data.head()

Unnamed: 0,region,price,year,manufacturer,model,condition,cylinders,fuel,odometer,title_status,...,drive,size,type,paint_color,description,state,lat,long,is_condition_imputed,is_title_status_imputed
0,auburn,33590,2014,gmc,sierra 1500 crew cab slt,good,8 cylinders,gas,57923.0,clean,...,4wd,unspecified,pickup,white,Carvana is the safer way to buy a car During t...,al,32.59,-85.48,False,False
1,auburn,22590,2010,chevrolet,silverado 1500,good,8 cylinders,gas,71229.0,clean,...,4wd,full-size,pickup,blue,Carvana is the safer way to buy a car During t...,al,32.59,-85.48,False,False
2,auburn,39590,2020,chevrolet,silverado 1500 crew,good,8 cylinders,gas,19160.0,clean,...,4wd,unspecified,pickup,red,Carvana is the safer way to buy a car During t...,al,32.59,-85.48,False,False
3,auburn,30990,2017,toyota,tundra double cab sr,good,8 cylinders,gas,41124.0,clean,...,4wd,full-size,pickup,red,Carvana is the safer way to buy a car During t...,al,32.59,-85.48,False,False
4,auburn,15000,2013,ford,f-150 xlt,excellent,6 cylinders,gas,128000.0,clean,...,rwd,full-size,truck,black,2013 F-150 XLT V6 4 Door. Good condition. Leve...,al,32.592,-85.5189,False,False


In [6]:
# Print the column names, non-null counts and dtypes of the loaded frame.
cleaned_data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 246680 entries, 0 to 246679
Data columns (total 21 columns):
 #   Column                   Non-Null Count   Dtype  
---  ------                   --------------   -----  
 0   region                   246680 non-null  object 
 1   price                    246680 non-null  int64  
 2   year                     246680 non-null  int64  
 3   manufacturer             246680 non-null  object 
 4   model                    246680 non-null  object 
 5   condition                246680 non-null  object 
 6   cylinders                246680 non-null  object 
 7   fuel                     246680 non-null  object 
 8   odometer                 246680 non-null  float64
 9   title_status             246680 non-null  object 
 10  transmission             246680 non-null  object 
 11  drive                    246680 non-null  object 
 12  size                     246680 non-null  object 
 13  type                     246680 non-null  object 
 14  pain

# Feature Engineering

In [7]:
from sklearn.model_selection   import train_test_split
from sklearn.pipeline          import Pipeline
from sklearn.compose           import ColumnTransformer
from sklearn.preprocessing     import (
    FunctionTransformer,
    StandardScaler,
    OneHotEncoder
)
import category_encoders as ce

Let's first drop the features we're not going to use

In [8]:
# Columns left out of the model: the free-text description and the raw coordinates.
drop_features = ['description', 'lat', 'long']

In [9]:
# Build the modelling frame as a new object; `cleaned_data` itself is not modified.
modelling_data = cleaned_data.drop(drop_features, axis='columns')

In [10]:
# Location columns whose less frequent values are collapsed into 'other' below.
binned_cols = ['state', 'region']

In [11]:
k = 10  # Number of most frequent categories to keep per column; all others become 'other'

for col in binned_cols:
    # Rank categories by frequency, leaving any existing 'other' bucket out of the
    # ranking so that re-running this cell cannot let 'other' push a real category
    # out of the top k (the cell overwrites `modelling_data[col]` in place).
    category_counts = modelling_data[col].value_counts().drop('other', errors='ignore')
    top_categories = category_counts.head(k).index

    # Vectorised relabelling: keep values that are in the top k, replace the rest.
    # (The previous version stored the index in a variable named `map`, which
    # shadowed the builtin for the remainder of the kernel session.)
    modelling_data[col] = modelling_data[col].where(
        modelling_data[col].isin(top_categories), 'other'
    )

## Encoding

### Numerical Features

In [12]:
# NOTE: the lambda-based steps below cannot be serialised with the standard
# `pickle` module, so a fitted pipeline containing them cannot be pickled as-is.

# Odometer: select the column, apply log1p, then standardise.
odo_transformers = Pipeline(steps=[
    ('select', FunctionTransformer(lambda frame: frame[['odometer']], validate=False)),
    ('log', FunctionTransformer(np.log1p, validate=False)),
    ('scale', StandardScaler()),
])

# Year: select the column and standardise; the 'identity' step returns its input unchanged.
year_transformer = Pipeline(steps=[
    ('select', FunctionTransformer(lambda frame: frame[['year']], validate=False)),
    ('identity', FunctionTransformer(lambda frame: frame, validate=False)),
    ('scale', StandardScaler()),
])

### Ordinal Categories

In [13]:
# Integer rank assigned to each `condition` label (0 = 'salvage' ... 5 = 'new').
condition_mapping = {
    'salvage': 0,
    'fair': 1,
    'good': 2,
    'excellent': 3,
    'like new': 4,
    'new': 5,
}

# Integer rank assigned to each `title_status` label (-3 = 'unspecified' ... 3 = 'clean').
title_mapping = {
    'unspecified': -3,
    'parts only': -2,
    'missing': -1,
    'lien': 0,
    'salvage': 1,
    'rebuilt': 2,
    'clean': 3,
}

In [14]:
def _to_ordinal(frame):
    """Return a two-column frame with the integer ranks of 'condition' and 'title_status'.

    Labels are looked up in `condition_mapping` / `title_mapping` via `Series.map`,
    so a label with no entry in the mapping becomes NaN.
    """
    return pd.DataFrame({
        'condition_ord': frame['condition'].map(condition_mapping),
        'title_status_ord': frame['title_status'].map(title_mapping),
    })


# Ordinal features: select the two columns, convert labels to ranks, then standardise.
ord_transformer = Pipeline(steps=[
    ('select', FunctionTransformer(lambda frame: frame[['condition', 'title_status']], validate=False)),
    ('map', FunctionTransformer(_to_ordinal, validate=False)),
    ('scale', StandardScaler()),
])

### Nominal Categories

Low-cardinality nominal features are one-hot encoded. 'state' and 'region' were binned to their most frequent categories above, so they are treated as low-cardinality features as well.

In [15]:
# Nominal columns that are one-hot encoded.
nominal_cols = [
    'region',
    'cylinders',
    'fuel',
    'transmission',
    'drive',
    'size',
    'type',
    'paint_color',
    'state',
]

# NOTE(review): with drop='first' and handle_unknown='ignore', a category unseen
# during fit is encoded as all zeros, which is the same encoding as the dropped
# first category. Confirm this ambiguity is acceptable.
nom_transformer = Pipeline(steps=[
    ('select', FunctionTransformer(lambda frame: frame[nominal_cols], validate=False)),
    ('onehot', OneHotEncoder(drop='first', handle_unknown='ignore')),
])

High-cardinality nominal features ('manufacturer' and 'model') are target-mean encoded.

In [16]:
# One target encoder per high-cardinality column, each configured for its own column.
manuf_te, model_te = (
    ce.TargetEncoder(cols=[column_name])
    for column_name in ('manufacturer', 'model')
)

### Flags

In [17]:
# Imputation flags: select the two indicator columns and cast them to integers.
flag_transformer = Pipeline(steps=[
    (
        'select',
        FunctionTransformer(
            lambda frame: frame[['is_condition_imputed', 'is_title_status_imputed']],
            validate=False,
        ),
    ),
    (
        'to_int',
        FunctionTransformer(lambda flags: flags.astype(int), validate=False),
    ),
])

### Column Transformer

In [18]:
# Route each group of columns to its transformer. Entries are (name, transformer, columns).
tf = ColumnTransformer(transformers=[
    ('odo', odo_transformers, ['odometer']),
    ('year', year_transformer, ['year']),
    ('ord', ord_transformer, ['condition', 'title_status']),
    ('manuf', manuf_te, ['manufacturer']),
    ('model', model_te, ['model']),
    ('nom', nom_transformer, nominal_cols),
    ('flags', flag_transformer, ['is_condition_imputed', 'is_title_status_imputed']),
])

# Model

## Train-Test Split

In [19]:
# Separate the predictors from the target column ('price').
y = modelling_data.loc[:, 'price']
df_features = modelling_data.drop('price', axis='columns')

In [20]:
# Hold out 20% of the rows for testing; the fixed seed keeps the split reproducible.
X_train, X_test, y_train, y_test = train_test_split(
    df_features,
    y,
    test_size=0.2,
    random_state=42,
)

## Train Process

In [21]:
from sklearn.ensemble import RandomForestRegressor

In [None]:
# Chain preprocessing and the random forest into a single estimator, so the
# preprocessing steps are fitted only on the rows passed to `fit`.
pipeline = Pipeline(steps=[
    ('pre', tf),
    ('model', RandomForestRegressor(
        n_estimators=100,
        max_depth=20,
        min_samples_leaf=5,
        max_features="sqrt",
        random_state=42,
    )),
])

# The model is trained on log1p(price) rather than on the raw price.
pipeline.fit(X_train, np.log1p(y_train))

## Test

In [None]:
from sklearn.metrics import mean_absolute_error, r2_score, root_mean_squared_error

In [None]:
# Predictions come out on the log1p scale used for training.
y_pred_log = pipeline.predict(X_test)

# Undo the log1p transform so predictions are comparable with `y_test`.
y_pred = np.expm1(y_pred_log)

In [None]:
# Score the back-transformed predictions against the held-out prices.
rmse = root_mean_squared_error(y_true=y_test, y_pred=y_pred)
r2 = r2_score(y_true=y_test, y_pred=y_pred)

print(f'R²: {r2:.3f}')
print(f'RMSE: {rmse:.3f}')