# Importing the packages

In [1]:
!pip install xgboost
!pip install plotly

import warnings

warnings.filterwarnings("ignore")

# Libraries to help with reading and manipulating data
import numpy as np
import pandas as pd

# Library to split data
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder,StandardScaler

# libaries to help with data visualization
import matplotlib.pyplot as plt
import seaborn as sns

# Removes the limit for the number of displayed columns
pd.set_option("display.max_columns", None)
# Sets the limit for the number of displayed rows
pd.set_option("display.max_rows", 100)
# for cross validation and hyperparameter tuning to find optimal
from sklearn.model_selection import RandomizedSearchCV, GridSearchCV

# Libraries different ensemble classifiers
from sklearn.ensemble import RandomForestClassifier,AdaBoostClassifier

from xgboost import XGBClassifier

# Libraries to get different metric scores
from sklearn import metrics
from sklearn.metrics import (
    confusion_matrix,
    accuracy_score,
    precision_score,
    recall_score,
    f1_score,
)

from sklearn import metrics 
import plotly.express as px
from sklearn.linear_model import LinearRegression# import linear regression models
from sklearn.ensemble import RandomForestRegressor , GradientBoostingRegressor# import random forest regressor
# importing the perofrmace metrics
from sklearn.metrics import mean_squared_error
from sklearn.metrics import mean_absolute_error
from sklearn.metrics import r2_score
# check xgboost version
from  xgboost import XGBRegressor



In [3]:
# loading the car price dataset
dataframe = pd.read_csv("Car_prices_project_2.csv")
dataframe

Unnamed: 0,mark,model,generation_name,year,mileage,vol_engine,fuel,city,province,price
0,seat,leon,gen-i-1999-2005-leon,1999,169000,1600,Gasoline,Gubin,Lubuskie,3999
1,volvo,xc-60,gen-ii-2017,2019,65942,1969,Diesel,Warszawa,Mazowieckie,169999
2,mazda,cx-3,,2019,76000,1998,Gasoline,Zielona Góra,Lubuskie,95900
3,citroen,xsara-picasso,,2007,179000,1587,Gasoline,Rybnik,Śląskie,8450
4,honda,accord,gen-vii-2002-2008,2008,265000,2204,Diesel,Łódź,Łódzkie,16800
...,...,...,...,...,...,...,...,...,...,...
106129,volvo,v50,,2007,219000,1798,Gasoline,Starachowice,Świętokrzyskie,17000
106130,renault,clio,gen-iv-2012-clio,2018,161853,1461,Diesel,Wrocław,Dolnośląskie,35200
106131,audi,a6,gen-c6-2004-2011,2004,262100,3123,Gasoline,Kraków,Małopolskie,21900
106132,mercedes-benz,e-klasa,gen-w124-1993-1997,1978,227792,2717,Gasoline,Gdynia,Pomorskie,75000


# Data Analysis

In [4]:
# some basic stats about the data
dataframe.describe()

Unnamed: 0,year,mileage,vol_engine,price
count,106134.0,106134.0,106134.0,106134.0
mean,2012.928647,140964.2,1813.301383,70402.88
std,5.688403,92447.68,644.012722,84878.79
min,1945.0,0.0,0.0,500.0
25%,2009.0,67000.0,1461.0,21000.0
50%,2013.0,146146.5,1796.0,41900.0
75%,2018.0,203000.0,1995.0,83900.0
max,2022.0,2800000.0,7600.0,1966770.0


In [5]:
# check data types
dataframe.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 106134 entries, 0 to 106133
Data columns (total 10 columns):
 #   Column           Non-Null Count   Dtype 
---  ------           --------------   ----- 
 0   mark             106134 non-null  object
 1   model            106134 non-null  object
 2   generation_name  78999 non-null   object
 3   year             106134 non-null  int64 
 4   mileage          106134 non-null  int64 
 5   vol_engine       106134 non-null  int64 
 6   fuel             106134 non-null  object
 7   city             106134 non-null  object
 8   province         106134 non-null  object
 9   price            106134 non-null  int64 
dtypes: int64(4), object(6)
memory usage: 8.1+ MB


In [6]:
# check for the missing values
dataframe.isna().sum()

mark                   0
model                  0
generation_name    27135
year                   0
mileage                0
vol_engine             0
fuel                   0
city                   0
province               0
price                  0
dtype: int64

#### Observations:
- There are lots of missing 27k missing values in the feature genreration _name

In [7]:
# Replace the missign values by NA 
dataframe = dataframe.fillna("NA")

In [8]:
# again check for the missing values
dataframe.isna().sum()

mark               0
model              0
generation_name    0
year               0
mileage            0
vol_engine         0
fuel               0
city               0
province           0
price              0
dtype: int64

In [9]:
# seperate the class label from features
X= dataframe.drop("price",axis=1)
y= dataframe["price"]

In [10]:
numeric_cols = X.select_dtypes(['float64','int64']).columns
categoric_cols = X.select_dtypes('object').columns
X_numeric = X[numeric_cols]
X_categoric = X[categoric_cols]

In [11]:
# label encoding the categoric feature
le = LabelEncoder()
for i in X_categoric.columns:
    X_categoric[i]= le.fit_transform(X_categoric[i])

In [12]:
X_categoric

Unnamed: 0,mark,model,generation_name,fuel,city,province
0,18,179,104,3,969,5
1,22,319,207,1,3625,6
2,10,100,0,3,4008,5
3,4,323,0,3,2935,21
4,7,33,322,1,4123,20
...,...,...,...,...,...,...
106129,22,298,0,3,3185,22
106130,17,84,260,1,3804,2
106131,1,29,43,3,1542,7
106132,11,109,329,3,812,14


In [13]:
X = pd.concat([X_numeric,X_categoric], axis=1)

In [14]:
# scaling of numeric features
scaler = StandardScaler()
X = scaler.fit_transform(X)

In [15]:
# split dataset for train and test
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3)

# Machine Learning Models

In [16]:
def performance_metrics(model,X,y):
    print(model)
    y_pred = model.predict(X)
    print("MSE : ",mean_squared_error(y_pred,y))
    print("MAE :",mean_absolute_error(y_pred,y))
    print("R2_score :",r2_score(y_pred,y))
    print("Actual Values:", (y[:10]))
    print("Predicted Values:",(y_pred[:10]))
    fig=px.scatter(x=y,y=y_pred,title="Actual vs Predicted plot")
    fig.show()

In [17]:
xgb = XGBRegressor().fit(X_train, y_train)
rs = RandomForestRegressor(random_state=17, n_jobs=-1).fit(X_train, y_train)



In [18]:
print("For Training Data")
performance_metrics(xgb ,X_train, y_train)
print("For Testing Data")
performance_metrics(xgb,X_test, y_test)

For Training Data
XGBRegressor()
MSE :  768433243.8482447
MAE : 14498.521775993491
R2_score : 0.8742416040667821
Actual Values: 43553     35900
98617     51000
28078     22900
67203     31900
50285     61890
81063     27900
42926     49900
64033    284400
56004     76900
10580     23900
Name: price, dtype: int64
Predicted Values: [ 32376.318  51177.215  19588.758  37224.723  49071.8    25607.582
  47970.594 261639.86   88220.33   21584.21 ]


For Testing Data
XGBRegressor()
MSE :  848811758.7576594
MAE : 14390.986047429305
R2_score : 0.8552422484946036
Actual Values: 61966     41900
67759    101900
88061     38900
54355     42900
35893     69900
35313     32000
65378     28900
77523     11600
96110     64900
49904     18800
Name: price, dtype: int64
Predicted Values: [ 45469.32  131196.11   53097.81   57751.62   65959.96   31694.422
  36700.684  12393.067  95888.63   20210.113]


In [19]:
print("For Training Data")
performance_metrics(rs,X_train, y_train)
print("For Testing Data")
performance_metrics(rs,X_test, y_test)

For Training Data
RandomForestRegressor(n_jobs=-1, random_state=17)
MSE :  75196358.35791944
MAE : 3515.8489534711107
R2_score : 0.9892932568226638
Actual Values: 43553     35900
98617     51000
28078     22900
67203     31900
50285     61890
81063     27900
42926     49900
64033    284400
56004     76900
10580     23900
Name: price, dtype: int64
Predicted Values: [ 35876.98        50905.98        21787.97        30576.66
  63867.49        26857.09        48626.9        274154.11111111
  79100.99        23335.97      ]


For Testing Data
RandomForestRegressor(n_jobs=-1, random_state=17)
MSE :  448412784.0092449
MAE : 8591.236814337344
R2_score : 0.9313777301085866
Actual Values: 61966     41900
67759    101900
88061     38900
54355     42900
35893     69900
35313     32000
65378     28900
77523     11600
96110     64900
49904     18800
Name: price, dtype: int64
Predicted Values: [44230.   97582.01 48789.79 45527.   54714.69 22634.03 26637.24  8329.75
 68342.49 19285.42]


# HyperParameter Tuning

In [20]:
def grid_search (estimator, param):
    gscv = GridSearchCV(estimator, param, cv=10, n_jobs=-1, scoring='neg_mean_squared_error')
    gscv.fit(X_train, y_train)
    return gscv.best_estimator_

def random_search (estimator, param):
    rscv = RandomizedSearchCV(estimator, param, cv=10, n_jobs=-1, scoring='neg_mean_squared_error')
    rscv.fit(X_train, y_train)
    return rscv.best_estimator_

In [21]:
rs_params = {
    "max_depth": list(np.arange(5, 15, 5)),
    "max_features": ["sqrt", "log2"],
    "min_samples_split": [5, 7],
    "n_estimators": np.arange(100, 551, 50),
}

xgb_params = parameters = {
    "n_estimators": np.arange(150, 250, 50),
    "scale_pos_weight": [1, 2],
    "subsample": [0.9, 1],
    "learning_rate": np.arange(0.1, 0.21, 0.1),
    "gamma": [3, 5],
    "colsample_bytree": [0.8, 0.9],
    "colsample_bylevel": [0.9, 1],
}

rscv_param = [xgb_params, rs_params]


In [None]:
best_est = []
n=0
for model in [xgb, rs]:
    best_est.append(random_search(model, rscv_param[n]))
    n+=1

In [None]:
# Performance Evaluation
for model in best_est:
    print("For Training Data")
    performance_metrics(model,X_train, y_train)
    print("For Testing Data")
    performance_metrics(model,X_test, y_test)

Observations:
- XGB Regressor works  better as compared to the Random Forest Regressor.

In [None]:
feature_names = dataframe.drop('price',axis=1).columns
importances = xgb.feature_importances_
indices = np.argsort(importances)

plt.figure(figsize=(12, 12))
plt.title("Feature Importances")
plt.barh(range(len(indices)), importances[indices], color="violet", align="center")
plt.yticks(range(len(indices)), [feature_names[i] for i in indices])
plt.xlabel("Relative Importance")
plt.show()