In [1]:
!pip install category_encoders &> /dev/null

In [2]:
# Importing libraries
# Numpy for array operations and also to do calculations
import numpy as np
# Pandas for loading datasets and manipulating tabular data
import pandas as pd
# For plotting
import seaborn as sns
import plotly.express as px
import plotly.figure_factory as ff
import matplotlib.pyplot as plt
%matplotlib inline

#from matplotlib import rcParams
#import matplotlib.gridspec as gridspec
#from IPython.core.pylabtools import figsize
#from datasist.structdata import detect_outliers

# For importing dataset from url
import urllib.request
# To interact with the underlying Operating System
import os
# For numerical computation and ML modeling phase
#import tensorflow as tf
# to manipulate date and time
#from datetime import datetime, timedelta
# Generate words cloud
#from wordcloud import WordCloud
# Import Images to notebook
#from IPython.display import Image
# For statistical analysis
from scipy import stats
# Set warnings to be ignored
import warnings
warnings.filterwarnings("ignore")

In [3]:

from sklearn.model_selection import train_test_split
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import OneHotEncoder
from sklearn.preprocessing import RobustScaler
from category_encoders import OrdinalEncoder
from category_encoders import BinaryEncoder
from sklearn.model_selection import cross_validate
from sklearn.neural_network import MLPRegressor
from sklearn.metrics import mean_squared_error
from sklearn.pipeline import Pipeline
from sklearn import set_config
from sklearn.metrics import r2_score,mean_absolute_error

from sklearn.ensemble import GradientBoostingRegressor
from sklearn.ensemble import RandomForestRegressor
from sklearn.linear_model import LinearRegression
from sklearn.neighbors import KNeighborsRegressor
from sklearn.tree import DecisionTreeRegressor
from sklearn.svm import SVR

In [4]:
def remove_outliers(df, columns, iqr_coefficient=1.5):
    """Drop rows that lie outside the IQR fences in any of the given columns.

    For each selected column the fences are ``Q1 - k * IQR`` and
    ``Q3 + k * IQR`` with ``k = iqr_coefficient``. A row is removed as soon as
    one of its selected values is strictly below the lower fence or strictly
    above the upper fence.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame to filter; it is not modified.
    columns : str or list of str
        Column name(s) to test for outliers.
    iqr_coefficient : float, default 1.5
        Multiplier applied to the inter-quartile range to build the fences.

    Returns
    -------
    pandas.DataFrame
        The rows of ``df`` with no outlier in ``columns`` (original index kept).
    """
    # A bare column name would select a Series, on which ``.any(axis=1)``
    # raises; wrap it so the selection below is always a DataFrame.
    if isinstance(columns, str):
        columns = [columns]

    subset = df[columns]
    q1 = subset.quantile(0.25)
    q3 = subset.quantile(0.75)
    iqr = q3 - q1
    lower_fence = q1 - iqr_coefficient * iqr
    upper_fence = q3 + iqr_coefficient * iqr

    is_outlier = ((subset < lower_fence) | (subset > upper_fence)).any(axis=1)
    return df[~is_outlier]

In [5]:
# Load the dataset from the CSV file in the working directory.
df = pd.read_csv('cleaned_autos.csv')

In [6]:
# Preview the first rows to sanity-check the load.
df.head()

Unnamed: 0,seller,offerType,price,vehicleType,yearOfRegistration,gearbox,powerPS,model,kilometer,monthOfRegistration,fuelType,brand,notRepairedDamage
0,privat,Angebot,1500,kleinwagen,2001,manuell,75,golf,150000,6,benzin,volkswagen,nein
1,privat,Angebot,3600,kleinwagen,2008,manuell,69,fabia,90000,7,diesel,skoda,nein
2,privat,Angebot,2200,cabrio,2004,manuell,109,2_reihe,150000,8,benzin,peugeot,nein
3,privat,Angebot,2000,limousine,2004,manuell,105,3_reihe,150000,12,benzin,mazda,nein
4,privat,Angebot,2799,kombi,2005,manuell,140,passat,150000,12,diesel,volkswagen,ja


In [7]:
# (rows, columns) of the loaded frame.
df.shape

(185575, 13)

In [8]:
# Column dtypes and non-null counts.
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 185575 entries, 0 to 185574
Data columns (total 13 columns):
 #   Column               Non-Null Count   Dtype 
---  ------               --------------   ----- 
 0   seller               185575 non-null  object
 1   offerType            185575 non-null  object
 2   price                185575 non-null  int64 
 3   vehicleType          185575 non-null  object
 4   yearOfRegistration   185575 non-null  int64 
 5   gearbox              185575 non-null  object
 6   powerPS              185575 non-null  int64 
 7   model                185575 non-null  object
 8   kilometer            185575 non-null  int64 
 9   monthOfRegistration  185575 non-null  int64 
 10  fuelType             185575 non-null  object
 11  brand                185575 non-null  object
 12  notRepairedDamage    185575 non-null  object
dtypes: int64(5), object(8)
memory usage: 18.4+ MB


In [9]:
# Summary statistics for the numeric columns.
df.describe()

Unnamed: 0,price,yearOfRegistration,powerPS,kilometer,monthOfRegistration
count,185575.0,185575.0,185575.0,185575.0,185575.0
mean,6200.991243,2004.011499,127.963341,124152.876196,6.271071
std,4795.735412,6.077993,145.953033,38625.278471,3.432309
min,1001.0,1923.0,0.0,5000.0,0.0
25%,2400.0,2001.0,90.0,100000.0,3.0
50%,4600.0,2005.0,118.0,150000.0,6.0
75%,8700.0,2008.0,155.0,150000.0,9.0
max,21243.0,2018.0,20000.0,150000.0,12.0


In [10]:
# Column names currently in df.
df.columns

Index(['seller', 'offerType', 'price', 'vehicleType', 'yearOfRegistration',
       'gearbox', 'powerPS', 'model', 'kilometer', 'monthOfRegistration',
       'fuelType', 'brand', 'notRepairedDamage'],
      dtype='object')

In [11]:
# Discard the two registration-date columns.
df = df.drop(columns=['yearOfRegistration', 'monthOfRegistration'])

In [12]:
# Shape of the subset whose powerPS lies strictly between 0 and 50.
df[(df['powerPS'] > 0) & (df['powerPS'] < 50)].shape

(1299, 11)

In [13]:
# Keep only rows with powerPS of at least 40.
df = df.loc[df['powerPS'] >= 40]

In [14]:
# Remove rows whose powerPS falls outside the default 1.5 * IQR fences.
df = remove_outliers(df, columns=['powerPS'])

In [15]:
# Re-check the powerPS distribution after the filtering steps.
df.powerPS.describe()

count    175481.000000
mean        125.721218
std          45.956403
min          40.000000
25%          90.000000
50%         120.000000
75%         150.000000
max         265.000000
Name: powerPS, dtype: float64

In [16]:
# Columns remaining in df at this point.
df.columns

Index(['seller', 'offerType', 'price', 'vehicleType', 'gearbox', 'powerPS',
       'model', 'kilometer', 'fuelType', 'brand', 'notRepairedDamage'],
      dtype='object')

In [17]:
# Print the distinct values of every categorical column, one group at a time,
# with an empty line between consecutive groups.
for position, group in enumerate([
    ["seller", "offerType", "gearbox", "notRepairedDamage"],
    ["vehicleType", "fuelType"],
    ["model"],
    ["brand"],
]):
    if position > 0:
        print("")
    for column in group:
        print(df[column].unique())


['privat' 'gewerblich']
['Angebot' 'Gesuch']
['manuell' 'automatik']
['nein' 'ja']

['kleinwagen' 'cabrio' 'limousine' 'kombi' 'suv' 'bus' 'coupe' 'andere']
['benzin' 'diesel' 'lpg' 'andere' 'hybrid' 'cng' 'elektro']

['golf' 'fabia' '2_reihe' '3_reihe' 'passat' 'navara' 'twingo' 'c_max'
 'a_klasse' 'scirocco' '5er' 'civic' 'transporter' 'e_klasse' '3er'
 'andere' 'one' '1er' 'b_klasse' 'fortwo' 'a8' 'jetta' 'c_klasse' 'micra'
 'vito' 'sprinter' 'forester' 'fiesta' 'scenic' 'a1' 'focus' 'tt' 'astra'
 'a6' 'jazz' 'polo' 'slk' 'combo' '80' '147' 'z_reihe' 'ibiza' 'eos' 'a4'
 'touran' 'getz' 'insignia' 'megane' 'a3' 'lupo' 'clio' 'berlingo' '7er'
 'tiguan' 'mustang' '6_reihe' 'c4' 'panda' 'up' 'i_reihe' 'ceed' '5_reihe'
 'yeti' 'octavia' 'zafira' 'mii' 'rx_reihe' 'corsa' '6er' 'punto' 'fox'
 'vectra' 'matiz' 'beetle' 'rio' 'touareg' 'logan' 'cuore' 's_max' 'modus'
 'a2' 'a5' 'galaxy' 'c3' 'viano' 'mondeo' 'sharan' 'avensis' 'roomster'
 'sl' 'kaefer' 'santa' 'cooper' 'sportage' 'caddy' 'cl

In [18]:
# Number of distinct values in the model column.
len(df.model.unique())

247

In [19]:
# Splitting the data
# Separate the regression target (price) from the feature columns.
y = df['price']
X = df.drop(columns='price')

In [20]:
# Feature columns (everything except the price target).
X.columns

Index(['seller', 'offerType', 'vehicleType', 'gearbox', 'powerPS', 'model',
       'kilometer', 'fuelType', 'brand', 'notRepairedDamage'],
      dtype='object')

In [21]:
from category_encoders import TargetEncoder
from sklearn.model_selection import KFold
from sklearn.preprocessing import LabelEncoder, StandardScaler
from xgboost import XGBRegressor
from sklearn.model_selection import GridSearchCV

In [22]:
Encoder = ColumnTransformer(
    transformers=[
        ("BE", BinaryEncoder(), ['seller', 'offerType','gearbox','notRepairedDamage']),
         ('TE', TargetEncoder(),['vehicleType', 'fuelType','model', 'brand'])],
    remainder = "passthrough")

In [97]:
# Baseline pipeline: encode the categoricals, standardise, then fit a k-NN regressor.
steps = [
    ("Encoder", Encoder),
    ("Scaler", StandardScaler()),
    ("Model", KNeighborsRegressor()),
]
pipeline = Pipeline(steps=steps)

In [98]:
# Shuffled 5-fold CV with a fixed seed; R2 is recorded on both the training
# and the held-out part of every fold.
kfold = KFold(n_splits=5, shuffle=True, random_state=0)
results = cross_validate(
    pipeline,
    X,
    y,
    scoring="r2",
    cv=kfold,
    return_train_score=True,
)


In [99]:
# Mean R2 over the training folds.
results["train_score"].mean()

0.8008112409049897

In [100]:
# Mean R2 over the held-out folds.
results["test_score"].mean()

0.7419029113023534

In [143]:
# Candidate regressors as (pipeline step name, estimator) pairs.
# The tree-based models involve randomness, so they are seeded to make the
# comparison repeatable from run to run.
models = [
    ("LR", LinearRegression()),
    ("KNN", KNeighborsRegressor()),
    ("DT", DecisionTreeRegressor(random_state=0)),
    ("RF", RandomForestRegressor(random_state=0)),
    ("xg", XGBRegressor(random_state=0)),
]

In [144]:
# The splitter does not depend on the model, so build it once: with a fixed
# integer seed it yields the same folds for every candidate.
kfold = KFold(n_splits=5, shuffle=True, random_state=0)

for model in models:
    # ``model`` is a (step name, estimator) pair and is used directly as the
    # final pipeline step.
    steps = [
        ("Encoder", Encoder),
        ("Scaler", RobustScaler()),
        model,
    ]
    pipeline = Pipeline(steps=steps)
    scores = cross_validate(
        pipeline, X, y, scoring="r2", cv=kfold, return_train_score=True
    )

    # Report the mean train / test R2 for this candidate.
    print(model[0])
    print("Train_r2" , scores["train_score"].mean() )
    print("-" * 10)
    print("Test_r2" , scores["test_score"].mean())
    print("-" * 20)
    print("\n")

LR
Train_r2 0.5900514160084663
----------
Test_r2 0.5894575196865732
--------------------


KNN
Train_r2 0.7997059201011641
----------
Test_r2 0.7361963499864377
--------------------


CART
Train_r2 0.8897854153893089
----------
Test_r2 0.7341029083009953
--------------------


RF
Train_r2 0.8796922963277473
----------
Test_r2 0.7871476855522468
--------------------


xg
Train_r2 0.794094967759326
----------
Test_r2 0.7747066093004443
--------------------




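Best model is XGBRegressor: in the run above Random Forest has a slightly higher test R² (0.787 vs 0.775), but XGBoost shows a much smaller train–test gap (0.794 → 0.775 versus 0.880 → 0.787), so it overfits less and is the model tuned below.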

In [23]:
# Cross-validate the XGBoost pipeline and keep each fold's fitted pipeline
# (return_estimator=True) so it can be inspected afterwards.
steps = [
    ("Encoder", Encoder),
    ("Scaler", RobustScaler()),
    ("xg", XGBRegressor()),
]
pipeline = Pipeline(steps=steps)
kfold = KFold(n_splits=5, shuffle=True, random_state=0)
# Pass the splitter itself: ``cv=5`` would ignore ``kfold`` and fall back to
# five unshuffled folds, which is not the scheme used for the other models.
scores = cross_validate(
    pipeline,
    X,
    y,
    scoring="r2",
    cv=kfold,
    return_train_score=True,
    return_estimator=True,
)

In [24]:
# Constructor parameters of the "xg" step in the first fold's fitted pipeline.
scores["estimator"][0]["xg"].get_params()

{'objective': 'reg:squarederror',
 'base_score': None,
 'booster': None,
 'callbacks': None,
 'colsample_bylevel': None,
 'colsample_bynode': None,
 'colsample_bytree': None,
 'device': None,
 'early_stopping_rounds': None,
 'enable_categorical': False,
 'eval_metric': None,
 'feature_types': None,
 'gamma': None,
 'grow_policy': None,
 'importance_type': None,
 'interaction_constraints': None,
 'learning_rate': None,
 'max_bin': None,
 'max_cat_threshold': None,
 'max_cat_to_onehot': None,
 'max_delta_step': None,
 'max_depth': None,
 'max_leaves': None,
 'min_child_weight': None,
 'missing': nan,
 'monotone_constraints': None,
 'multi_strategy': None,
 'n_estimators': None,
 'n_jobs': None,
 'num_parallel_tree': None,
 'random_state': None,
 'reg_alpha': None,
 'reg_lambda': None,
 'sampling_method': None,
 'scale_pos_weight': None,
 'subsample': None,
 'tree_method': None,
 'validate_parameters': None,
 'verbosity': None}

In [33]:
# Single-point grid for the "xg" pipeline step; keys follow the
# "<step name>__<parameter>" convention used by scikit-learn pipelines.
param_grid = {
    "xg__learning_rate": [0.2],
    "xg__max_depth": [10],
    "xg__n_estimators": [240],
    "xg__reg_lambda": [3],
}
# Best parameters: learning_rate=0.2, max_depth=10, n_estimators=240,
# reg_lambda=3, with mean train / test R2 of 0.8666 / 0.8020.

In [34]:
# Grid-search the XGBoost pipeline over ``param_grid`` using shuffled,
# seeded 5-fold CV, scoring with R2 and using all available cores.
steps = [
    ("Encoder", Encoder),
    ("Scaler", RobustScaler()),
    ("xg", XGBRegressor()),
]
pipeline = Pipeline(steps=steps)
kfold = KFold(n_splits=5, shuffle=True, random_state=0)
grid_search = GridSearchCV(
    estimator=pipeline,
    param_grid=param_grid,
    scoring="r2",
    cv=kfold,
    return_train_score=True,
    n_jobs=-1,
)
grid_search.fit(X, y)


In [35]:
# Hyper-parameter combination selected by the grid search.
grid_search.best_params_

{'xg__learning_rate': 0.2,
 'xg__max_depth': 10,
 'xg__n_estimators': 240,
 'xg__reg_lambda': 3}

In [36]:
# Mean train R2 of the selected candidate. Indexing with best_index_ (instead
# of averaging over every candidate) stays correct when the grid has more
# than one combination.
grid_search.cv_results_["mean_train_score"][grid_search.best_index_]

0.8666530567851561

In [37]:
# Mean cross-validated test R2 of the selected candidate (rather than the
# average over every candidate in the grid).
grid_search.best_score_

0.8020188190355118