In [1]:
import pandas as pd
import numpy as np
from sklearn.pipeline import Pipeline
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import OneHotEncoder
from category_encoders.target_encoder import TargetEncoder
from sklearn.compose import ColumnTransformer
from sklearn.ensemble import ExtraTreesRegressor
from sklearn.model_selection import RandomizedSearchCV, GridSearchCV, train_test_split,  KFold, cross_validate
from sklearn.metrics import r2_score, mean_squared_error, mean_absolute_error



In [2]:
offers = pd.read_csv('total_offers.csv')
offers.head()

Unnamed: 0,Make,Model,Trim,Year,Mileage(km),kW,Fuel type,Body type,Color,Transmission,Emission standard,Country,City,Price(EUR)
0,Volkswagen,Sharan,2.0 tdi,2013,182000,104.0,Diesel,Van,Gray,Manual,5,Austria,Braunau Am Inn,15499.0
1,Audi,A6,2.7,2009,250000,141.0,Diesel,Sedan,Silver,Manual,5,Austria,Salzburg,8900.0
2,Porsche,Cayenne,3.0 diesel,2011,256000,182.0,Diesel,Suv,Black,Automatic,5,Austria,Braunau Am Inn,25999.0
3,Volkswagen,Passat,2.0 tdi scr,2019,94187,111.0,Diesel,Wagon,Silver,Manual,6,Austria,Wolfsberg,18790.0
4,Volkswagen,Passat,2.0 tdi scr,2019,77167,111.0,Diesel,Wagon,Black,Automatic,6,Austria,Wels,18990.0


In [3]:
offers.shape

(10250, 14)

In [4]:
X=offers.drop(columns=['Price(EUR)'])
y=offers['Price(EUR)']

In [5]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.25, random_state=10)

In [6]:
X_train.head()

Unnamed: 0,Make,Model,Trim,Year,Mileage(km),kW,Fuel type,Body type,Color,Transmission,Emission standard,Country,City
4693,Bmw,X4,xdrive20d,2017,130451,141.0,Diesel,Suv,White,Automatic,6,Italy,Padova
9391,Mazda,Cx-5,2.2 d,2012,187000,110.0,Diesel,Crossover,,Manual,5,Romania,Bacău
2717,Renault,Clio,0.9 tce,2019,68074,89.0,Petrol,Sedan,Gray,Manual,6,Spain,Valencia
2957,Audi,Sq2,2.0 tfsi s tronic,2024,7,224.0,Petrol,Crossover,,Automatic,6,Switzerland,
8160,Chevrolet,Trax,1.4 ltz,2014,137000,97.0,Diesel,Crossover,Black,Automatic,6,Italy,Ossona


In [7]:
# Define numerical, one-hot encoded, and target encoded features
numerical_features = ['Year', 'Mileage(km)', 'kW', 'Emission standard']
onehot_features = ['Fuel type', 'Body type', 'Color', 'Transmission', 'Country']
target_features = ['Make', 'Model', 'Trim']

# Preprocessing steps for numerical features
numerical_transformer = SimpleImputer(strategy='mean')

# Preprocessing steps for one-hot encoded features
onehot_transformer = Pipeline(steps=[
    ('imputer', SimpleImputer(strategy='most_frequent')),
    ('onehot', OneHotEncoder(handle_unknown='ignore'))
])

# Preprocessing steps for target encoded features
target_transformer = Pipeline(steps=[
    ('imputer', SimpleImputer(strategy='most_frequent')),
    ('target_encoder', TargetEncoder())
])

# Combine preprocessing steps for numerical, one-hot encoded, and target encoded features
preprocessor = ColumnTransformer(
    transformers=[
        ('num', numerical_transformer, numerical_features),
        ('onehot', onehot_transformer, onehot_features),
        ('target', target_transformer, target_features)
    ])

# Define the model
model = ExtraTreesRegressor(n_jobs=-1)

# Construct the pipeline
pipeline = Pipeline(steps=[
    ('preprocessor', preprocessor),
    ('model', model)
])

# Define hyperparameters for random search
param_grid = {
    'model__n_estimators': [50, 100, 200, 400, 600, 800, 1000, 1200, 1400, 1600, 1800, 2000],
    'model__max_depth': [None, 2, 5, 10, 20, 30],
    'model__max_features': ['sqrt', 'log2'],
    'model__min_samples_split': [2, 5, 10],
    'model__min_samples_leaf': [1, 2, 4],
    'model__bootstrap': [True, False]
}

# Define cross-validation strategy
kf = KFold(n_splits=5, shuffle=True, random_state=10)

# Randomized search for hyperparameter tuning
random_search = RandomizedSearchCV(estimator=pipeline, param_distributions=param_grid, n_iter=50, cv=kf, scoring='r2', verbose=2, n_jobs=10, random_state=10)
random_search.fit(X_train, y_train)

# Get the best model
best_model = random_search.best_estimator_

# Predict on the testing set
y_pred = best_model.predict(X_test)

# Evaluate the model
r2 = r2_score(y_test, y_pred)
mse = mean_squared_error(y_test, y_pred)
mae = mean_absolute_error(y_test, y_pred)

print("Testing R2 Score:", r2)
print("Testing Mean Squared Error:", mse)
print("Testing Mean Absolute Error:", mae)


Fitting 5 folds for each of 50 candidates, totalling 250 fits
Testing R2 Score: 0.8807892863163971
Testing Mean Squared Error: 63129763.29226203
Testing Mean Absolute Error: 3179.4302983226025


In [8]:
best_model

In [9]:
random_search.best_score_

0.8494318987873035

In [10]:
best_model.get_params()

{'memory': None,
 'steps': [('preprocessor',
   ColumnTransformer(transformers=[('num', SimpleImputer(),
                                    ['Year', 'Mileage(km)', 'kW',
                                     'Emission standard']),
                                   ('onehot',
                                    Pipeline(steps=[('imputer',
                                                     SimpleImputer(strategy='most_frequent')),
                                                    ('onehot',
                                                     OneHotEncoder(handle_unknown='ignore'))]),
                                    ['Fuel type', 'Body type', 'Color',
                                     'Transmission', 'Country']),
                                   ('target',
                                    Pipeline(steps=[('imputer',
                                                     SimpleImputer(strategy='most_frequent')),
                                                    ('target_e

In [11]:
random_search.best_params_

{'model__n_estimators': 200,
 'model__min_samples_split': 5,
 'model__min_samples_leaf': 1,
 'model__max_features': 'sqrt',
 'model__max_depth': 30,
 'model__bootstrap': False}

The result is very similar to the Best model given by Extra Tree Regressor in pycaret
Training (R2: 0.8481, MAE: 2639.4325, MSE: 135654375 )
Testing (R2: 0.8811, MAE: 2300.0798, MSE: 68845246)

Next Random search is to narrow down the hyperparameters

In [13]:
# Define numerical, one-hot encoded, and target encoded features
numerical_features = ['Year', 'Mileage(km)', 'kW', 'Emission standard']
onehot_features = ['Fuel type', 'Body type', 'Color', 'Transmission', 'Country']
target_features = ['Make', 'Model', 'Trim']

# Preprocessing steps for numerical features
numerical_transformer = SimpleImputer(strategy='mean')

# Preprocessing steps for one-hot encoded features
onehot_transformer = Pipeline(steps=[
    ('imputer', SimpleImputer(strategy='most_frequent')),
    ('onehot', OneHotEncoder(handle_unknown='ignore'))
])

# Preprocessing steps for target encoded features
target_transformer = Pipeline(steps=[
    ('imputer', SimpleImputer(strategy='most_frequent')),
    ('target_encoder', TargetEncoder())
])

# Combine preprocessing steps for numerical, one-hot encoded, and target encoded features
preprocessor = ColumnTransformer(
    transformers=[
        ('num', numerical_transformer, numerical_features),
        ('onehot', onehot_transformer, onehot_features),
        ('target', target_transformer, target_features)
    ])

# Define the model
model = ExtraTreesRegressor(n_jobs=-1)

# Construct the pipeline
pipeline = Pipeline(steps=[
    ('preprocessor', preprocessor),
    ('model', model)
])

# Define hyperparameters for random search
param_grid = {
    'model__n_estimators': [150, 180, 200, 220, 250],
    'model__max_depth': [1, 3, 8, 15, 25, 30, 32, 35, 40],
    'model__max_features': ['sqrt', 'log2'],
    'model__min_samples_split': [3, 4, 5, 6, 7, 8, 9],
    'model__min_samples_leaf': [1, 2, 3],
    'model__bootstrap': [True, False]
}


# Define cross-validation strategy
kf = KFold(n_splits=5, shuffle=True, random_state=10)

# Randomized search for hyperparameter tuning
random_search_next = RandomizedSearchCV(estimator=pipeline, param_distributions=param_grid, n_iter=50, cv=kf, scoring='r2', verbose=2, n_jobs=10, random_state=10)
random_search_next.fit(X_train, y_train)

# Get the best model
best_model_next = random_search_next.best_estimator_

# Predict on the testing set
y_pred = best_model_next.predict(X_test)

# Evaluate the model
r2 = r2_score(y_test, y_pred)
mse = mean_squared_error(y_test, y_pred)
mae = mean_absolute_error(y_test, y_pred)

print("Testing R2 Score:", r2)
print("Testing Mean Squared Error:", mse)
print("Testing Mean Absolute Error:", mae)


Fitting 5 folds for each of 50 candidates, totalling 250 fits
Testing R2 Score: 0.8749269024940962
Testing Mean Squared Error: 66234273.71413987
Testing Mean Absolute Error: 3472.862699714454


In [14]:
random_search_next.best_params_

{'model__n_estimators': 200,
 'model__min_samples_split': 7,
 'model__min_samples_leaf': 1,
 'model__max_features': 'sqrt',
 'model__max_depth': 35,
 'model__bootstrap': False}

The Next Random Research gives slightly worse score. Last we use Grid search to narrow down the hyperparameters even further 

In [16]:
# Define numerical, one-hot encoded, and target encoded features
numerical_features = ['Year', 'Mileage(km)', 'kW', 'Emission standard']
onehot_features = ['Fuel type', 'Body type', 'Color', 'Transmission', 'Country']
target_features = ['Make', 'Model', 'Trim']

# Preprocessing steps for numerical features
numerical_transformer = SimpleImputer(strategy='mean')

# Preprocessing steps for one-hot encoded features
onehot_transformer = Pipeline(steps=[
    ('imputer', SimpleImputer(strategy='most_frequent')),
    ('onehot', OneHotEncoder(handle_unknown='ignore'))
])

# Preprocessing steps for target encoded features
target_transformer = Pipeline(steps=[
    ('imputer', SimpleImputer(strategy='most_frequent')),
    ('target_encoder', TargetEncoder())
])

# Combine preprocessing steps for numerical, one-hot encoded, and target encoded features
preprocessor = ColumnTransformer(
    transformers=[
        ('num', numerical_transformer, numerical_features),
        ('onehot', onehot_transformer, onehot_features),
        ('target', target_transformer, target_features)
    ])

# Define the model
model = ExtraTreesRegressor(n_jobs=-1)

# Construct the pipeline
pipeline = Pipeline(steps=[
    ('preprocessor', preprocessor),
    ('model', model)
])

# Define hyperparameters for random search
param_grid = {
    'model__n_estimators': [190, 200, 210],
    'model__max_depth': [29, 30, 31],
    'model__max_features': ['sqrt'],
    'model__min_samples_split': [5, 6, 7],
    'model__min_samples_leaf': [1],
    'model__bootstrap': [False]
}

# Define cross-validation strategy
kf = KFold(n_splits=5, shuffle=True, random_state=10)

# Randomized search for hyperparameter tuning
grid_search = GridSearchCV(estimator=pipeline, param_grid=param_grid, cv=kf, n_jobs=10)
grid_search.fit(X_train, y_train)

# Get the best model
best_model_grid = grid_search.best_estimator_



In [17]:
# Predict on the testing set
y_pred = best_model_grid.predict(X_test)

# Evaluate the model
r2 = r2_score(y_test, y_pred)
mse = mean_squared_error(y_test, y_pred)
mae = mean_absolute_error(y_test, y_pred)

print("Testing R2 Score:", r2)
print("Testing Mean Squared Error:", mse)
print("Testing Mean Absolute Error:", mae)


Testing R2 Score: 0.8818256593040567
Testing Mean Squared Error: 62580936.9378868
Testing Mean Absolute Error: 3170.6878798827815


In [20]:
grid_search.best_score_

0.8511500582231116

In [22]:
grid_search.best_params_

{'model__bootstrap': False,
 'model__max_depth': 30,
 'model__max_features': 'sqrt',
 'model__min_samples_leaf': 1,
 'model__min_samples_split': 5,
 'model__n_estimators': 190}

In [23]:
import pickle

In [24]:
pickle.dump(best_model_grid, open('etr_model.pkl', 'wb'))

In [None]:
X_test.to_pickle('xtest.p')
y_test.to_pickle('ytest.p')