In [1]:
import xgboost as xgb
import pandas as pd
import seaborn as sns
import numpy as np
import matplotlib.pyplot as plt
from xgboost import XGBRegressor, XGBClassifier,XGBRFRegressor
from sklearn.model_selection import train_test_split, GridSearchCV, cross_val_score
from sklearn.ensemble import GradientBoostingRegressor
from sklearn.metrics import mean_absolute_error,mean_squared_error, mean_absolute_error, r2_score

In [2]:
df = pd.read_csv('autoscout_cleaned.csv')

In [3]:
df = df.drop('emission_class',axis = 1)

In [4]:
df.shape

(15919, 202)

In [10]:
df.to_csv('autoscout_cleaned2.csv')

In [8]:
for i in df.columns:
    print(i)

price
km
registration
hp_kw
comfort_convenience_air_conditioning
comfort_convenience_air_suspension
comfort_convenience_armrest
comfort_convenience_automatic_climate_control
comfort_convenience_auxiliary_heating
comfort_convenience_cruise_control
comfort_convenience_electric_starter
comfort_convenience_electric_tailgate
comfort_convenience_electrical_side_mirrors
comfort_convenience_electrically_adjustable_seats
comfort_convenience_electrically_heated_windshield
comfort_convenience_heads_up_display
comfort_convenience_heated_steering_wheel
comfort_convenience_hill_holder
comfort_convenience_keyless_central_door_lock
comfort_convenience_leather_seats
comfort_convenience_leather_steering_wheel
comfort_convenience_light_sensor
comfort_convenience_lumbar_support
comfort_convenience_massage_seats
comfort_convenience_multi_function_steering_wheel
comfort_convenience_navigation_system
comfort_convenience_panorama_roof
comfort_convenience_park_distance_control
comfort_convenience_parking_assis

In [5]:
X = df.drop('price', axis=1)
y = df['price']

In [6]:
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state = 42)

In [7]:
from xgboost import XGBRegressor
xgb_model = XGBRegressor().fit(X_train, y_train)
y_pred = xgb_model.predict(X_test)

In [8]:
def eval_metrics(actual, pred):
    mae = mean_absolute_error(actual, pred)
    mse = mean_squared_error(actual, pred)
    rmse = np.sqrt(mean_squared_error(actual, pred))
    score = r2_score(actual, pred)
    return print("r2_score:", score, "\n","mae:", mae, "\n","mse:",mse, "\n","rmse:",rmse)

In [9]:
eval_metrics(y_test, y_pred)

r2_score: 0.9567880886181679 
 mae: 922.0930080701359 
 mse: 2345111.313088822 
 rmse: 1531.37562769192


In [15]:
xgb = XGBRegressor()
xgb_params = {"n_estimators": [50, 100, 300], "subsample":[0.5,0.8,1], "max_depth":[3,5,7], "learning_rate":[0.1,0.01,0.3]}
xgb_cv_model = GridSearchCV(xgb, xgb_params, cv = 3, n_jobs = -1, verbose = 2).fit(X_train, y_train)

Fitting 3 folds for each of 81 candidates, totalling 243 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 4 concurrent workers.
[Parallel(n_jobs=-1)]: Done  33 tasks      | elapsed:  2.5min
[Parallel(n_jobs=-1)]: Done 154 tasks      | elapsed: 17.2min
[Parallel(n_jobs=-1)]: Done 243 out of 243 | elapsed: 29.1min finished


In [16]:
xgb_cv_model.best_params_

{'learning_rate': 0.1, 'max_depth': 7, 'n_estimators': 300, 'subsample': 0.8}

In [19]:
xgb_tuned = XGBRegressor(learning_rate= 0.1, max_depth= 7, 
                         n_estimators= 300, 
                         subsample= 0.8).fit(X_train, y_train)
y_pred = xgb_tuned.predict(X_test)
eval_metrics(y_test, y_pred)

r2_score: 0.9620051131250813 
 mae: 858.5102278359572 
 mse: 2061983.2865658381 
 rmse: 1435.9607538389891


In [20]:
xgb_tuned = XGBRegressor(learning_rate= 0.1, max_depth= 7, 
                         n_estimators= 350, 
                         subsample= 0.8).fit(X_train, y_train)
y_pred = xgb_tuned.predict(X_test)
eval_metrics(y_test, y_pred)

r2_score: 0.9622851750556805 
 mae: 848.7530937769905 
 mse: 2046784.3198732936 
 rmse: 1430.658701393625


In [21]:
xgb_tuned = XGBRegressor(learning_rate= 0.1, max_depth= 7, 
                         n_estimators= 400, 
                         subsample= 0.8).fit(X_train, y_train)
y_pred = xgb_tuned.predict(X_test)
eval_metrics(y_test, y_pred)

r2_score: 0.962342536667314 
 mae: 843.5489807128906 
 mse: 2043671.304038604 
 rmse: 1429.570321473765


In [22]:
xgb_tuned = XGBRegressor(learning_rate= 0.1, max_depth= 7, 
                         n_estimators= 500, 
                         subsample= 0.8).fit(X_train, y_train)
y_pred = xgb_tuned.predict(X_test)
eval_metrics(y_test, y_pred)

r2_score: 0.9624010862449915 
 mae: 835.339544401696 
 mse: 2040493.8172624512 
 rmse: 1428.4585458676954


In [23]:
xgb_tuned = XGBRegressor(learning_rate= 0.1, max_depth= 7, 
                         n_estimators= 1000, 
                         subsample= 0.8).fit(X_train, y_train)
y_pred = xgb_tuned.predict(X_test)
eval_metrics(y_test, y_pred)

r2_score: 0.9622888724231048 
 mae: 824.2201009012347 
 mse: 2046583.6636676758 
 rmse: 1430.5885724650802


In [24]:
xgb_tuned = XGBRegressor(learning_rate= 0.1, max_depth= 9, 
                         n_estimators= 500, 
                         subsample= 0.8).fit(X_train, y_train)
y_pred = xgb_tuned.predict(X_test)
eval_metrics(y_test, y_pred)

r2_score: 0.9625472110091504 
 mae: 811.0700786341375 
 mse: 2032563.623327647 
 rmse: 1425.6800564389077


In [25]:
xgb_tuned = XGBRegressor(learning_rate= 0.1, max_depth= 15, 
                         n_estimators= 500, 
                         subsample= 0.8).fit(X_train, y_train)
y_pred = xgb_tuned.predict(X_test)
eval_metrics(y_test, y_pred)

r2_score: 0.9611094500612094 
 mae: 806.584771084426 
 mse: 2110590.9393318016 
 rmse: 1452.7873000999841


In [26]:
xgb_tuned = XGBRegressor(learning_rate= 0.1, max_depth= 15, 
                         n_estimators= 1000, 
                         subsample= 0.8).fit(X_train, y_train)
y_pred = xgb_tuned.predict(X_test)
eval_metrics(y_test, y_pred)

r2_score: 0.9610020215538256 
 mae: 808.0700100846027 
 mse: 2116421.0866212337 
 rmse: 1454.7924548268848


In [28]:
xgb_tuned = XGBRegressor(learning_rate= 0.15, max_depth= 7, 
                         n_estimators= 300, 
                         subsample= 0.8).fit(X_train, y_train)
y_pred = xgb_tuned.predict(X_test)
eval_metrics(y_test, y_pred)

r2_score: 0.9621557459131751 
 mae: 847.3064509252807 
 mse: 2053808.4420800305 
 rmse: 1433.1114548701473


In [29]:
xgb_tuned = XGBRegressor(learning_rate= 0.2, max_depth= 7, 
                         n_estimators= 300, 
                         subsample= 0.8).fit(X_train, y_train)
y_pred = xgb_tuned.predict(X_test)
eval_metrics(y_test, y_pred)

r2_score: 0.962196987811894 
 mae: 838.9241248662747 
 mse: 2051570.2433943893 
 rmse: 1432.3303541412467


In [33]:
xgb_tuned = XGBRegressor(learning_rate= 0.2, max_depth= 7, 
                         n_estimators= 600, 
                         subsample= 0.8).fit(X_train, y_train)
y_pred = xgb_tuned.predict(X_test)
eval_metrics(y_test, y_pred)

r2_score: 0.962230307503898 
 mae: 829.0428405455009 
 mse: 2049761.9830289315 
 rmse: 1431.6989847830903


In [34]:
xgb_tuned = XGBRegressor(learning_rate= 0.2, max_depth= 7, 
                         n_estimators= 700, 
                         subsample= 0.8).fit(X_train, y_train)
y_pred = xgb_tuned.predict(X_test)
eval_metrics(y_test, y_pred)

r2_score: 0.962224967446696 
 mae: 827.8473585885973 
 mse: 2050051.7880423355 
 rmse: 1431.8001913822807


In [35]:
xgb_tuned = XGBRegressor(learning_rate= 0.2, max_depth= 7, 
                         n_estimators= 800, 
                         subsample= 0.8).fit(X_train, y_train)
y_pred = xgb_tuned.predict(X_test)
eval_metrics(y_test, y_pred)

r2_score: 0.9622279345513421 
 mae: 826.7533512594712 
 mse: 2049890.7632125954 
 rmse: 1431.7439586785745


In [40]:
xgb_tuned = XGBRegressor(learning_rate= 0.1, max_depth= 15, 
                         n_estimators= 550, 
                         subsample= 0.9).fit(X_train, y_train)
y_pred = xgb_tuned.predict(X_test)
eval_metrics(y_test, y_pred)

r2_score: 0.9619754701329204 
 mae: 801.9925435895296 
 mse: 2063592.0123557253 
 rmse: 1436.5208012262563


In [41]:
xgb_tuned = XGBRegressor(learning_rate= 0.1, max_depth= 16, 
                         n_estimators= 550, 
                         subsample= 0.9).fit(X_train, y_train)
y_pred = xgb_tuned.predict(X_test)
eval_metrics(y_test, y_pred)

r2_score: 0.9609718729053722 
 mae: 807.9307623628396 
 mse: 2118057.254388441 
 rmse: 1455.3546833636262


In [43]:
xgb_tuned = XGBRegressor(learning_rate= 0.1, max_depth= 15, 
                         n_estimators= 590, 
                         subsample= 0.9).fit(X_train, y_train)
y_pred = xgb_tuned.predict(X_test)
eval_metrics(y_test, y_pred)

r2_score: 0.9619760349631811 
 mae: 801.8833740847793 
 mse: 2063561.3590059418 
 rmse: 1436.510131884193


In [47]:
xgb_tuned = XGBRegressor(learning_rate= 0.1, max_depth= 9, 
                         n_estimators= 650, 
                         subsample= 0.9).fit(X_train, y_train)
y_pred = xgb_tuned.predict(X_test)
eval_metrics(y_test, y_pred)

r2_score: 0.9630028625641658 
 mae: 810.9428559116383 
 mse: 2007835.4041324568 
 rmse: 1416.9810881350734


In [10]:
xgb_tuned = XGBRegressor(learning_rate= 0.1, max_depth= 33, 
                         n_estimators= 650, max_features= 50,
                         subsample= 0.9).fit(X_train, y_train)
y_pred = xgb_tuned.predict(X_test)
eval_metrics(y_test, y_pred)

Parameters: { max_features } might not be used.

  This may not be accurate due to some parameters are only used in language bindings but
  passed down to XGBoost core.  Or some parameters are not used but slip through this
  verification. Please open an issue if you find above cases.


r2_score: 0.9622445410601117 
 mae: 814.8487237518157 
 mse: 2048989.5276425679 
 rmse: 1431.4291905793202


In [None]:
xgb_tuned = XGBRegressor(learning_rate= 0.1, max_depth= 33, 
                         n_estimators= 650, max_features= 50,
                         subsample= 0.9).fit(X_train, y_train)
y_pred = xgb_tuned.predict(X_test)
eval_metrics(y_test, y_pred)

In [11]:
# xgb = XGBRegressor()
# xgb_params = {"n_estimators": [600, 700], "subsample":[0.8,0.9], "max_depth":[9,11], "learning_rate":[0.1]}
# xgb_cv_model = GridSearchCV(xgb, xgb_params, cv = 3, n_jobs = -1, verbose = 2).fit(X_train, y_train)