In [1]:
import pandas as pd
import numpy as np

# 1. Data Preprocessing and Feature Engineering

## Clean and Prepare

In [2]:
df = pd.read_csv('../data/ads_all_cleaned.csv')

### Missing Values

One missing value in title which we will ignore

In [3]:
df.isnull().sum()

id                  0
num_of_pic          0
price               0
brand               0
model               0
title               1
year                0
km                  0
steering_side       0
region_specs        0
location            0
location_cleaned    0
dtype: int64

### Outliers

I will choose to keep outliers.

## Feature Engineering

Below is the wordcloud that was generated in the data analysis phase in the ads_all_after_cleaning.html report

Some of the more common words which could be useful are **warranty**, **accident** (to determine ads where there is specific mention that the car is accident free), **service** (to indicate a full service history) 

![wordcloud](../data/title-column-wordcloud.png)

In [4]:
# Feature Engineering
df['warranty'] = df['title'].apply(lambda x: True if 'warranty' in str(x).lower() else False)
df['service_hist'] = df['title'].apply(lambda x: True if 'service' in str(x).lower() else False)
df['no_accident'] = df['title'].apply(lambda x: True if 'accident' in str(x).lower() else False)

luxury_brands = ['mercedez-benz','lexus','bmw','audi']
df['luxury'] = df['brand'].apply(lambda x: 1 if x.lower() in luxury_brands else 0)

df['age'] = 2024 - df['year']

df.drop(['id','title','year','location'], axis=1, inplace=True)


## Data Preprocessing

In [16]:
from sklearn.compose import ColumnTransformer
from sklearn.impute import SimpleImputer
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import OneHotEncoder, StandardScaler

# Define feature columns
categorical_features = ['brand','model','steering_side','region_specs','location_cleaned']
numerical_features = ['km','age','num_of_pic']
binary_features = ['warranty', 'service_hist', 'no_accident','luxury']

# Preprocessing pipelines
categorical_transformer = Pipeline(steps=[
    ('imputer', SimpleImputer(strategy='constant', fill_value='missing')),
    ('onehot', OneHotEncoder(handle_unknown='ignore', sparse_output=False))
])

numerical_transformer = Pipeline(steps=[
    ('imputer', SimpleImputer(strategy='mean')),
    ('scaler', StandardScaler())
])

binary_transformer = Pipeline(steps=[
    ('imputer', SimpleImputer(strategy='constant', fill_value=False))
])

# Column transformer
preprocessor = ColumnTransformer(
    transformers=[
        ('num', numerical_transformer, numerical_features),
        ('cat', categorical_transformer, categorical_features),
        ('bin', binary_transformer, binary_features)
    ])

In [17]:
X = df.drop('price', axis=1)
y = df['price']

In [18]:
X

Unnamed: 0,num_of_pic,brand,model,km,steering_side,region_specs,location_cleaned,warranty,service_hist,no_accident,luxury,age
0,17,Ford,F-Series Pickup,107000,Left Hand,GCC Specs,Dubai,False,True,False,0,6
1,18,Ford,Mustang,2000,Left Hand,American Specs,Sharjah,True,False,False,0,3
2,19,Ford,Edge,79000,Left Hand,GCC Specs,Dubai,False,True,False,0,8
3,19,Ford,Mustang,55000,Left Hand,GCC Specs,Dubai,True,True,False,0,4
4,19,Ford,Mustang,29000,Left Hand,GCC Specs,Dubai,True,True,False,0,1
...,...,...,...,...,...,...,...,...,...,...,...,...
20867,20,Volkswagen,Passat,107624,Left Hand,GCC Specs,Dubai,False,True,False,0,10
20868,17,Volkswagen,Tiguan,0,Left Hand,Chinese Specs,Dubai,False,False,False,0,1
20869,17,Volkswagen,Transporter,0,Left Hand,GCC Specs,Dubai,False,False,False,0,5
20870,17,Volkswagen,Touareg,54300,Left Hand,GCC Specs,Dubai,False,False,False,0,3


In [19]:
y

0        159000
1        276000
2         62500
3        169000
4        209000
          ...  
20867     26000
20868    110000
20869     79000
20870    185000
20871     57000
Name: price, Length: 20872, dtype: int64

In [20]:
# train_test_split
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(X, y, train_size=0.2, random_state=0)

2. Model Selection and Evaluation

In [21]:
import pandas as pd
from sklearn.linear_model import LinearRegression
from sklearn.ensemble import RandomForestRegressor, HistGradientBoostingRegressor
from sklearn.svm import LinearSVR
from sklearn.metrics import mean_absolute_error, r2_score
import xgboost as xgb
import lightgbm as lgb
from datetime import datetime

# Define the models
models = {
    'linear_regression': LinearRegression(),
    'svm': LinearSVR(max_iter=1000),
    'random_forest': RandomForestRegressor(),
    'gradient_boosted_rf': HistGradientBoostingRegressor(),
    'xgboost': xgb.XGBRegressor(),
    'lightgbm': lgb.LGBMRegressor()
}

# Dictionary to hold the results
results = {
    'model': [],
    'MAE': [],
    'R2': [],
    'training_time': []
}

# Iterate over the models
for model_name, model in models.items():
    # Create pipeline with preprocessor and model
    pipeline = Pipeline(steps=[
        ('preprocessor', preprocessor),
        ('model', model)
    ])
    train_start = datetime.now()
    pipeline.fit(X_train, y_train)
    train_stop = datetime.now()
    train_time = train_stop - train_start

    y_pred = pipeline.predict(X_test)
    
    # Collect the results
    results['model'].append(model_name)
    results['MAE'].append(mean_absolute_error(y_test, y_pred))
    results['R2'].append(r2_score(y_test, y_pred))
    results['training_time'].append(train_time.total_seconds())  # Convert to seconds for easier readability

# Convert the results to a DataFrame
results_df = pd.DataFrame(results).set_index('model')

# Display the DataFrame
results_df




[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.000221 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 497
[LightGBM] [Info] Number of data points in the train set: 4174, number of used features: 95
[LightGBM] [Info] Start training from score 152932.704121


Unnamed: 0_level_0,MAE,R2,training_time
model,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
linear_regression,179394100000000.0,-2.766232e+19,0.175103
svm,137505.8,-0.1397364,0.035664
random_forest,40534.77,0.316334,10.359811
gradient_boosted_rf,52717.57,0.2844169,4.750468
xgboost,40971.55,0.3432632,0.230021
lightgbm,53001.2,0.2797819,0.133906


Linear regression doesn't handle the data well since I chose not to remove outliers. The others seem to have done pretty well.