In [11]:
import pandas as pd
import numpy as np
from sklearn.pipeline import Pipeline
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import OneHotEncoder, FunctionTransformer
from category_encoders.target_encoder import TargetEncoder
from sklearn.compose import ColumnTransformer
from xgboost import XGBRegressor
from sklearn.model_selection import RandomizedSearchCV, GridSearchCV, train_test_split,  KFold, cross_validate
from sklearn.metrics import r2_score, mean_squared_error, mean_absolute_error
import pickle


In [5]:
offers = pd.read_csv('total_offers.csv')
offers.head()

Unnamed: 0,Make,Model,Trim,Year,Mileage(km),kW,Fuel type,Body type,Color,Transmission,Emission standard,Country,City,Price(EUR)
0,Volkswagen,Sharan,2.0 tdi,2013,182000,104.0,Diesel,Van,Gray,Manual,5,Austria,Braunau Am Inn,15499.0
1,Audi,A6,2.7,2009,250000,141.0,Diesel,Sedan,Silver,Manual,5,Austria,Salzburg,8900.0
2,Porsche,Cayenne,3.0 diesel,2011,256000,182.0,Diesel,Suv,Black,Automatic,5,Austria,Braunau Am Inn,25999.0
3,Volkswagen,Passat,2.0 tdi scr,2019,94187,111.0,Diesel,Wagon,Silver,Manual,6,Austria,Wolfsberg,18790.0
4,Volkswagen,Passat,2.0 tdi scr,2019,77167,111.0,Diesel,Wagon,Black,Automatic,6,Austria,Wels,18990.0


In [16]:
# Function to generate new numerical column based on compound score by make
def generate_sia_column(X, comp_score_make_dict_path):
    with open(comp_score_make_dict_path, 'rb') as f:
        comp_score_make_dict = pickle.load(f)
    X['SIA_Compound'] = X['Make'].apply(lambda x: comp_score_make_dict.get(x, 0))#replace the missing key with value 0 neutral
    return X

In [14]:
# Path to the pickle file containing the compound score by make dictionary
comp_score_make_dict_path = 'comp_score_make_dict.p'

In [8]:
X=offers.drop(columns=['Price(EUR)'])
y=offers['Price(EUR)']

In [9]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.25, random_state=10)

In [17]:
# Define numerical, one-hot encoded, and target encoded features
numerical_features = ['Year', 'Mileage(km)', 'kW', 'Emission standard', 'SIA_Compound']
onehot_features = ['Fuel type', 'Body type', 'Color', 'Transmission', 'Country']
target_features = ['Make', 'Model', 'Trim']

# Preprocessing steps for numerical features
numerical_transformer = SimpleImputer(strategy='mean')

# Preprocessing steps for one-hot encoded features
onehot_transformer = Pipeline(steps=[
    ('imputer', SimpleImputer(strategy='most_frequent')),
    ('onehot', OneHotEncoder(handle_unknown='ignore'))
])

# Preprocessing steps for target encoded features
target_transformer = Pipeline(steps=[
    ('imputer', SimpleImputer(strategy='most_frequent')),
    ('target_encoder', TargetEncoder())
])

# Combine preprocessing steps for numerical, one-hot encoded, and target encoded features
preprocessor = ColumnTransformer(
    transformers=[
        ('num', numerical_transformer, numerical_features),
        ('onehot', onehot_transformer, onehot_features),
        ('target', target_transformer, target_features)
    ])

# Define the model
model = XGBRegressor()

# Construct the pipeline
pipeline = Pipeline(steps=[
    ('compound_score', FunctionTransformer(generate_sia_column, kw_args={'comp_score_make_dict_path': comp_score_make_dict_path})),
    ('preprocessor', preprocessor),
    ('model', model)
])

# Define hyperparameters for random search
param_grid = {
    'model__n_estimators': [50, 100, 200, 400, 600, 800, 1000, 1200, 1400, 1600, 1800, 2000],
    'model__max_depth': [None, 2, 5, 10, 20, 30],
    'model__max_features': ['sqrt', 'log2'],
    'model__min_samples_split': [2, 5, 10],
    'model__min_samples_leaf': [1, 2, 4],
    'model__bootstrap': [True, False]
}

# Define cross-validation strategy
kf = KFold(n_splits=5, shuffle=True, random_state=10)

# Randomized search for hyperparameter tuning
random_search = RandomizedSearchCV(estimator=pipeline, param_distributions=param_grid, n_iter=50, cv=kf, scoring='r2', verbose=2, n_jobs=10, random_state=10)
random_search.fit(X_train, y_train)

# Get the best model
best_model = random_search.best_estimator_

# Predict on the testing set
y_pred = best_model.predict(X_test)

# Evaluate the model
r2 = r2_score(y_test, y_pred)
mse = mean_squared_error(y_test, y_pred)
mae = mean_absolute_error(y_test, y_pred)

print("Testing R2 Score:", r2)
print("Testing Mean Squared Error:", mse)
print("Testing Mean Absolute Error:", mae)


Fitting 5 folds for each of 50 candidates, totalling 250 fits


Parameters: { "bootstrap", "max_features", "min_samples_leaf", "min_samples_split" } are not used.



Testing R2 Score: 0.9064682426103239
Testing Mean Squared Error: 49531099.3606752
Testing Mean Absolute Error: 2083.9404441587617


In [18]:
random_search.best_score_

0.8755126397179911

In [19]:
best_model

In [20]:
best_model.get_params()

{'memory': None,
 'steps': [('compound_score',
   FunctionTransformer(func=<function generate_sia_column at 0x00000173CE50AAC0>,
                       kw_args={'comp_score_make_dict_path': 'comp_score_make_dict.p'})),
  ('preprocessor',
   ColumnTransformer(transformers=[('num', SimpleImputer(),
                                    ['Year', 'Mileage(km)', 'kW',
                                     'Emission standard', 'SIA_Compound']),
                                   ('onehot',
                                    Pipeline(steps=[('imputer',
                                                     SimpleImputer(strategy='most_frequent')),
                                                    ('onehot',
                                                     OneHotEncoder(handle_unknown='ignore'))]),
                                    ['Fuel type', 'Body type', 'Color',
                                     'Transmission', 'Country']),
                                   ('target',
            

In [21]:
random_search.cv_results_

{'mean_fit_time': array([  0.76114888, 212.03101616, 308.17902679,  11.98297758,
         27.41611147,   7.87049274,  16.42097898,  13.30639162,
          8.69052434,   0.75988922,   1.61368494,   1.04235225,
         25.96658864, 264.11741996,  81.61071339,   4.54558053,
         17.70088081,   4.98906331,   0.51068115,   9.10585022,
          1.39283261,  30.55401335,  72.94042621,   1.98096519,
          0.88658547,   1.36095824,   4.2853385 , 193.65085063,
         38.04117723,   4.21415019,   7.1600461 ,   2.78928246,
         10.93866434,   0.3938971 ,   0.7666225 ,   1.81291018,
          2.49830775,   6.12492967,   2.13121367,   5.43528819,
          2.20277739, 290.80611429,   1.2709537 , 176.95692396,
         92.91676497,   1.04770365,   1.59869437,  43.05525236,
          0.37779512,   1.01209455]),
 'std_fit_time': array([2.26424874e-02, 2.36322572e+01, 4.53700900e+01, 2.37277607e+00,
        6.31221400e+00, 1.82790160e+00, 4.41583753e+00, 2.99149151e+00,
        1.5949101

In [22]:
random_search.best_params_

{'model__n_estimators': 800,
 'model__min_samples_split': 10,
 'model__min_samples_leaf': 2,
 'model__max_features': 'log2',
 'model__max_depth': None,
 'model__bootstrap': False}

The scores are much better than the XGB Regressor model without SIA compound score. Try another roound of random search

In [27]:
# Define numerical, one-hot encoded, and target encoded features
numerical_features = ['Year', 'Mileage(km)', 'kW', 'Emission standard', 'SIA_Compound']
onehot_features = ['Fuel type', 'Body type', 'Color', 'Transmission', 'Country']
target_features = ['Make', 'Model', 'Trim']

# Preprocessing steps for numerical features
numerical_transformer = SimpleImputer(strategy='mean')

# Preprocessing steps for one-hot encoded features
onehot_transformer = Pipeline(steps=[
    ('imputer', SimpleImputer(strategy='most_frequent')),
    ('onehot', OneHotEncoder(handle_unknown='ignore'))
])

# Preprocessing steps for target encoded features
target_transformer = Pipeline(steps=[
    ('imputer', SimpleImputer(strategy='most_frequent')),
    ('target_encoder', TargetEncoder())
])

# Combine preprocessing steps for numerical, one-hot encoded, and target encoded features
preprocessor = ColumnTransformer(
    transformers=[
        ('num', numerical_transformer, numerical_features),
        ('onehot', onehot_transformer, onehot_features),
        ('target', target_transformer, target_features)
    ])

# Define the model
model = XGBRegressor()

# Construct the pipeline
pipeline = Pipeline(steps=[
    ('compound_score', FunctionTransformer(generate_sia_column, kw_args={'comp_score_make_dict_path': comp_score_make_dict_path})),
    ('preprocessor', preprocessor),
    ('model', model)
])

# Define hyperparameters for random search
param_grid = {
    'model__n_estimators': [80, 150, 300, 500, 700, 900, 1100, 1300, 1500, 1600, 1700, 1900],
    'model__max_depth': [None, 1, 3, 4, 5, 6, 7, 8, 9],
    'model__max_features': ['sqrt', 'log2'],
    'model__min_samples_split': [8, 9, 10, 11, 12, 13],
    'model__min_samples_leaf': [1, 2, 3, 4, 5],
    'model__bootstrap': [True, False]
}

# Define cross-validation strategy
kf = KFold(n_splits=5, shuffle=True, random_state=10)

# Randomized search for hyperparameter tuning
random_search = RandomizedSearchCV(estimator=pipeline, param_distributions=param_grid, n_iter=50, cv=kf, scoring='r2', verbose=2, n_jobs=10, random_state=10)
random_search.fit(X_train, y_train)

# Get the best model
best_model = random_search.best_estimator_

# Predict on the testing set
y_pred = best_model.predict(X_test)

# Evaluate the model
r2 = r2_score(y_test, y_pred)
mse = mean_squared_error(y_test, y_pred)
mae = mean_absolute_error(y_test, y_pred)

print("Testing R2 Score:", r2)
print("Testing Mean Squared Error:", mse)
print("Testing Mean Absolute Error:", mae)


Fitting 5 folds for each of 50 candidates, totalling 250 fits


Parameters: { "bootstrap", "max_features", "min_samples_leaf", "min_samples_split" } are not used.



Testing R2 Score: 0.9065224658426563
Testing Mean Squared Error: 49502384.661162734
Testing Mean Absolute Error: 2057.6936190042184


In [28]:
random_search.best_score_

0.8755073975445711

In [29]:
random_search.best_params_

{'model__n_estimators': 1300,
 'model__min_samples_split': 13,
 'model__min_samples_leaf': 4,
 'model__max_features': 'sqrt',
 'model__max_depth': None,
 'model__bootstrap': True}

In [31]:
random_search.cv_results_

{'mean_fit_time': array([ 0.8065064 ,  2.3572485 , 14.01219258,  0.35058684, 11.16539378,
         1.08239474,  0.51136556,  0.75571785,  7.08847771,  2.15043378,
         0.92235527,  7.65541625,  3.42270269,  0.58553615,  9.13774257,
        10.78446856,  1.27674341, 10.78420806,  0.9214365 , 10.56009545,
         2.54546328,  0.51179662,  4.06323524, 13.81395969,  1.02019787,
         8.83646431,  1.77891259,  3.23536296,  2.4904798 , 10.67101502,
         2.44238825, 10.06065369,  0.66640944,  0.32756963,  3.60083838,
         6.86123939,  1.65527134,  3.42241321,  0.33687472,  0.35219994,
         0.28663321,  1.09247236,  5.91562953, 14.48415413,  1.1880024 ,
         1.53593459,  5.95104704,  0.64569111,  8.33522544,  4.13838811]),
 'std_fit_time': array([0.09896089, 0.04330435, 2.78031779, 0.0363704 , 0.1100633 ,
        0.21900266, 0.07586786, 0.12551829, 1.15891338, 0.46760692,
        0.2000938 , 1.60475587, 0.67637912, 0.10851408, 1.99127474,
        1.4471322 , 0.00527825,

The performance is not improved. Next, narrow down the hyperparameters by using Grid search

In [32]:
# Define numerical, one-hot encoded, and target encoded features
numerical_features = ['Year', 'Mileage(km)', 'kW', 'Emission standard', 'SIA_Compound']
onehot_features = ['Fuel type', 'Body type', 'Color', 'Transmission', 'Country']
target_features = ['Make', 'Model', 'Trim']

# Preprocessing steps for numerical features
numerical_transformer = SimpleImputer(strategy='mean')

# Preprocessing steps for one-hot encoded features
onehot_transformer = Pipeline(steps=[
    ('imputer', SimpleImputer(strategy='most_frequent')),
    ('onehot', OneHotEncoder(handle_unknown='ignore'))
])

# Preprocessing steps for target encoded features
target_transformer = Pipeline(steps=[
    ('imputer', SimpleImputer(strategy='most_frequent')),
    ('target_encoder', TargetEncoder())
])

# Combine preprocessing steps for numerical, one-hot encoded, and target encoded features
preprocessor = ColumnTransformer(
    transformers=[
        ('num', numerical_transformer, numerical_features),
        ('onehot', onehot_transformer, onehot_features),
        ('target', target_transformer, target_features)
    ])

# Define the model
model = XGBRegressor()

# Construct the pipeline
pipeline = Pipeline(steps=[
    ('compound_score', FunctionTransformer(generate_sia_column, kw_args={'comp_score_make_dict_path': comp_score_make_dict_path})),
    ('preprocessor', preprocessor),
    ('model', model)
])

# Define hyperparameters for random search
param_grid = {
    'model__n_estimators': [800, 850, 950, 1050, 1150, 1250, 1300],
    'model__max_depth': [None, 1, 2],
    'model__max_features': ['sqrt', 'log2'],
    'model__min_samples_split': [10, 13],
    'model__min_samples_leaf': [2, 4],
    'model__bootstrap': [True, False]
}

# Define cross-validation strategy
kf = KFold(n_splits=5, shuffle=True, random_state=10)

# Randomized search for hyperparameter tuning
grid_search = GridSearchCV(estimator=pipeline, param_grid=param_grid, cv=kf, n_jobs=10)
grid_search.fit(X_train, y_train)

# Get the best model
best_model_grid = grid_search.best_estimator_

# Predict on the testing set
y_pred = best_model_grid.predict(X_test)

# Evaluate the model
r2 = r2_score(y_test, y_pred)
mse = mean_squared_error(y_test, y_pred)
mae = mean_absolute_error(y_test, y_pred)

print("Testing R2 Score:", r2)
print("Testing Mean Squared Error:", mse)
print("Testing Mean Absolute Error:", mae)


Parameters: { "bootstrap", "max_features", "min_samples_leaf", "min_samples_split" } are not used.



Testing R2 Score: 0.9064682426103239
Testing Mean Squared Error: 49531099.3606752
Testing Mean Absolute Error: 2083.9404441587617


In [33]:
grid_search.best_score_

0.8755126397179911

In [34]:
grid_search.best_params_

{'model__bootstrap': True,
 'model__max_depth': None,
 'model__max_features': 'sqrt',
 'model__min_samples_leaf': 2,
 'model__min_samples_split': 10,
 'model__n_estimators': 800}

The performance is not better than the model without SIA compound score