In [1]:
# Boosting

# xgBoost ( Extreme Gradient Boosting)

# xgB-Reg

!pip install xgboost



In [2]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

from sklearn.model_selection import train_test_split

# Read 'cardekho.csv' and discard three columns that are not used as features.
# NOTE(review): 'Unnamed: 0' looks like a saved index column -- confirm.
df = pd.read_csv('cardekho.csv')
df.drop(columns=['Unnamed: 0', 'car_name', 'brand'], inplace=True)

# Target is 'selling_price'; every remaining column is a feature.
y = df['selling_price']
X = df.drop(columns='selling_price')

# Hold out 20% of the rows for testing; the fixed seed keeps the split stable.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.20,
    random_state=42,
)

# Feature names grouped by dtype: object columns vs. everything else.
cat_cols = X_train.select_dtypes(include='object').columns
num_cols = X_train.select_dtypes(exclude='object').columns

from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import OneHotEncoder
from xgboost import XGBRegressor

# Columns that are one-hot encoded; all other columns pass through unchanged.
oh_cols = ['model', 'seller_type', 'fuel_type', 'transmission_type']

preprocessor = ColumnTransformer(
    transformers=[
        (
            'OneHotEn',
            OneHotEncoder(handle_unknown='ignore', sparse_output=False),
            oh_cols,
        ),
    ],
    remainder='passthrough',
)

# Chain the preprocessing step and an XGBoost regressor left at its defaults.
# The step name 'XGB-Reg' is the prefix used to address its hyperparameters.
model_pipeline = Pipeline(
    steps=[
        ('preprocessor', preprocessor),
        ('XGB-Reg', XGBRegressor()),
    ]
)

# Fit on the training split, then predict for the held-out rows.
model_pipeline.fit(X_train, y_train)
y_pred = model_pipeline.predict(X_test)

In [3]:
from sklearn.metrics import mean_absolute_error, mean_squared_error, r2_score

# Score the baseline pipeline's predictions against the held-out targets.
mse = mean_squared_error(y_test, y_pred)
rmse = np.sqrt(mse)  # root of the MSE, i.e. error in the target's own units
mae = mean_absolute_error(y_test, y_pred)
score = r2_score(y_test, y_pred)

# One metric per line, in the order R^2, MAE, RMSE.
print(
    f'r2_score={score}',
    f'mae = {mae}',
    f'rmse = {rmse}',
    sep='\n',
)

r2_score=0.9409081339836121
mae = 98409.546875
rmse = 210910.6090077026


In [None]:
from sklearn.model_selection import RandomizedSearchCV

# Candidate hyperparameters for the 'XGB-Reg' pipeline step.
# The grid holds 4 * 3 * 3 * 2 = 72 combinations in total.
params = {
    'XGB-Reg__colsample_bytree': [0.5, 0.8, 1, 0.3],
    'XGB-Reg__max_depth': [5, 10, None],
    'XGB-Reg__n_estimators': [100, 200, 300],
    'XGB-Reg__learning_rate': [0.1, 0.001],
}

# Evaluate 28 randomly sampled combinations with 3-fold cross-validation on
# the training split. random_state fixes which combinations are drawn, so the
# selected model and the metrics below are reproducible across runs (same
# seed as the train/test split).
randomcv = RandomizedSearchCV(
    model_pipeline,
    param_distributions=params,
    cv=3,
    n_iter=28,
    random_state=42,
)

randomcv.fit(X_train, y_train)

# Pipeline refit on the whole training split with the best sampled parameters.
model = randomcv.best_estimator_

y_pred_ = model.predict(X_test)

# Same metrics as for the baseline model, now for the tuned pipeline.
# These overwrite the baseline values held in score / mae / mse / rmse.
score = r2_score(y_test, y_pred_)
mae = mean_absolute_error(y_test, y_pred_)
mse = mean_squared_error(y_test, y_pred_)
rmse = np.sqrt(mse)

print(f'r2_score={score}')
print(f'mae = {mae}')
print(f'rmse = {rmse}')

In [None]:
# Actual vs. predicted selling price for the tuned model.
# x and y must be passed by keyword: seaborn >= 0.12 rejects them positionally.
sns.scatterplot(x=y_test, y=y_pred_)

# Diagonal y = x for reference: points on it are predicted exactly.
plt.plot([y_test.min(), y_test.max()], [y_test.min(), y_test.max()], 'p--')

plt.xlabel('Actual selling_price')
plt.ylabel('Predicted selling_price')
plt.title('Tuned XGBRegressor: actual vs. predicted');