In [1]:
# Explore and compare the performance of Bagging and Random Forest regression techniques applied to regression tasks in predicting the prices of used cars based on their features. A dataset containing information about thousands of used cars sold in a particular region. Each data point includes various features such as mileage, age, brand, model, fuel type, and engine size, along with the corresponding price of the car.
# Task is to develop predictive models using Bagging and Random Forest regression techniques to estimate the prices of used cars accurately.

In [2]:
# mileage, age, brand, model, fuel type, and engine size, along with the corresponding price of the car.

In [1]:
import numpy as np 
import pandas as pd 
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.preprocessing import StandardScaler, LabelEncoder
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_squared_error,r2_score
from sklearn.ensemble import BaggingRegressor
from sklearn.ensemble import RandomForestRegressor
from sklearn.neighbors import KNeighborsRegressor
from sklearn.model_selection import GridSearchCV

import warnings
warnings.filterwarnings('ignore')

In [4]:
df=pd.read_csv("archive/used_cars.csv")

In [5]:
#Preprocessing

#missing value check
df.isnull().sum()
#duplicate value check
df.duplicated().sum()
#ata types
df.dtypes
#unique elements in dataset
df.nunique()
#mean, median, mode, etc
df.describe()

Unnamed: 0,model_year
count,4009.0
mean,2015.51559
std,6.104816
min,1974.0
25%,2012.0
50%,2017.0
75%,2020.0
max,2024.0


In [6]:
#modify milage column (taking only value by removing , and mi.) and converting to float
df['milage'] = df['milage'].str.replace(',', '').str.replace(' mi.', '').astype(float)
df['milage']

0       51000.0
1       34742.0
2       22372.0
3       88900.0
4        9835.0
         ...   
4004      714.0
4005    10900.0
4006     2116.0
4007    33000.0
4008    43000.0
Name: milage, Length: 4009, dtype: float64

In [7]:
# Removing $ sign and , and converting to float
df['price']=df['price'].str.replace('$','').str.replace(',','').astype(float)
df['price']

0        10300.0
1        38005.0
2        54598.0
3        15500.0
4        34999.0
          ...   
4004    349950.0
4005     53900.0
4006     90998.0
4007     62999.0
4008     40000.0
Name: price, Length: 4009, dtype: float64

In [8]:
# Replacing null values with most frequent value
most_frequent_val=df['fuel_type'].value_counts().idxmax()
most_frequent_val

'Gasoline'

In [9]:
df['fuel_type']=df['fuel_type'].fillna(most_frequent_val)

In [10]:
df['fuel_type'].value_counts()

fuel_type
Gasoline          3479
Hybrid             194
E85 Flex Fuel      139
Diesel             116
–                   45
Plug-In Hybrid      34
not supported        2
Name: count, dtype: int64

In [11]:
# Replacing "-" and "not supported" value with Electric (No strong reason)
df['fuel_type']=df['fuel_type'].replace('–', 'Electric')
df['fuel_type']=df['fuel_type'].replace('not supported', 'Electric')
df['fuel_type'].value_counts()

fuel_type
Gasoline          3479
Hybrid             194
E85 Flex Fuel      139
Diesel             116
Electric            47
Plug-In Hybrid      34
Name: count, dtype: int64

In [12]:
# Age = current_year - year_of_birth_of_model
df['age']=2024-df['model_year']
df.drop('model_year',axis=1,inplace=True)

In [13]:
df.head()

Unnamed: 0,brand,model,milage,fuel_type,engine,transmission,ext_col,int_col,accident,clean_title,price,age
0,Ford,Utility Police Interceptor Base,51000.0,E85 Flex Fuel,300.0HP 3.7L V6 Cylinder Engine Flex Fuel Capa...,6-Speed A/T,Black,Black,At least 1 accident or damage reported,Yes,10300.0,11
1,Hyundai,Palisade SEL,34742.0,Gasoline,3.8L V6 24V GDI DOHC,8-Speed Automatic,Moonlight Cloud,Gray,At least 1 accident or damage reported,Yes,38005.0,3
2,Lexus,RX 350 RX 350,22372.0,Gasoline,3.5 Liter DOHC,Automatic,Blue,Black,None reported,,54598.0,2
3,INFINITI,Q50 Hybrid Sport,88900.0,Hybrid,354.0HP 3.5L V6 Cylinder Engine Gas/Electric H...,7-Speed A/T,Black,Black,None reported,Yes,15500.0,9
4,Audi,Q3 45 S line Premium Plus,9835.0,Gasoline,2.0L I4 16V GDI DOHC Turbo,8-Speed Automatic,Glacier White Metallic,Black,None reported,,34999.0,3


In [14]:
# !pip install category_encoders

In [15]:
# https://towardsdatascience.com/dealing-with-categorical-variables-by-using-target-encoder-a0f1733a4c69
# Target encoders (read from above link) 
import category_encoders as ce
encoder = ce.TargetEncoder(cols=['brand'])
df['brand_target'] = encoder.fit_transform(df['brand'], df['price'])
encoder = ce.TargetEncoder(cols=['model'])
df['model_target'] = encoder.fit_transform(df['model'], df['price'])
encoder = ce.TargetEncoder(cols=['fuel_type'])
df['fuel_type_target'] = encoder.fit_transform(df['fuel_type'], df['price'])
encoder = ce.TargetEncoder(cols=['engine'])
df['engine_target'] = encoder.fit_transform(df['engine'], df['price'])
df.head()

Unnamed: 0,brand,model,milage,fuel_type,engine,transmission,ext_col,int_col,accident,clean_title,price,age,brand_target,model_target,fuel_type_target,engine_target
0,Ford,Utility Police Interceptor Base,51000.0,E85 Flex Fuel,300.0HP 3.7L V6 Cylinder Engine Flex Fuel Capa...,6-Speed A/T,Black,Black,At least 1 accident or damage reported,Yes,10300.0,11,36240.880829,39313.75062,22744.263199,39431.914328
1,Hyundai,Palisade SEL,34742.0,Gasoline,3.8L V6 24V GDI DOHC,8-Speed Automatic,Moonlight Cloud,Gray,At least 1 accident or damage reported,Yes,38005.0,3,19087.3589,42799.774211,45021.046853,44029.715721
2,Lexus,RX 350 RX 350,22372.0,Gasoline,3.5 Liter DOHC,Automatic,Blue,Black,None reported,,54598.0,2,35668.526945,45860.105184,45021.046853,46198.538269
3,INFINITI,Q50 Hybrid Sport,88900.0,Hybrid,354.0HP 3.5L V6 Cylinder Engine Gas/Electric H...,7-Speed A/T,Black,Black,None reported,Yes,15500.0,9,23356.701517,40773.124054,51426.195686,40749.497945
4,Audi,Q3 45 S line Premium Plus,9835.0,Gasoline,2.0L I4 16V GDI DOHC Turbo,8-Speed Automatic,Glacier White Metallic,Black,None reported,,34999.0,3,39907.430071,43310.109195,45021.046853,48588.566851


In [16]:
# Making x and y set
x=df[['milage','age','brand_target','model_target','fuel_type_target','engine_target']]
y=df[['price']]

In [17]:
# train test split
x_train,x_test,y_train,y_test=train_test_split(x,y,test_size=0.2,random_state=42)

In [18]:
# Bagging Regressor
from sklearn.ensemble import BaggingRegressor
bag_regressor = BaggingRegressor(random_state=1)
bag_regressor.fit(x_train,y_train.values.ravel())
# https://medium.com/@megha.natarajan/ravel-vs-values-navigating-the-nuances-in-python-9d92868fd094

In [19]:
y_pred=bag_regressor.predict(x_test)
r2_score(y_test,y_pred)

0.25411199667641604

In [20]:
# GridSeearchCv : Instead of looping done in assignment-1 and 2.
# There is inbuilt sklearn implmentation of finding best hyperparameter
# by just specifying combinations to try for getting best result.
# Then GridSearchCV finds the best results and along with
# values of hyperparameter

%%time
import warnings
warnings.filterwarnings('ignore')
n_samples = x.shape[0]
n_features = x.shape[1]+1

# max_samples = How many rows for samples to take
# max_features = How much fraction of features to take for sample
# bootstrap = True (Replacement allowed on rows)
# bootstrap_features = True (Replacement allowed on columns) 
params = {
          'n_estimators': [20,50,100],
          'max_samples': [0.5,1.0],
          'max_features': [0.5,1.0],
          'bootstrap': [True, False],
          'bootstrap_features': [True, False]}

# n_jobs=-1 (using all cores) : parallel computing
# verbose = getting proper information of running status
bagging_regressor_grid = GridSearchCV(BaggingRegressor(random_state=1, n_jobs=-1), param_grid =params, cv=3, n_jobs=-1, verbose=1)
bagging_regressor_grid.fit(x_train, y_train.values.ravel())

print('Train R^2 Score : %.3f'%bagging_regressor_grid.best_estimator_.score(x_train, y_train))
print('Test R^2 Score : %.3f'%bagging_regressor_grid.best_estimator_.score(x_test, y_test))
print('Best R^2 Score Through Grid Search : %.3f'%bagging_regressor_grid.best_score_)
print('Best Parameters : ',bagging_regressor_grid.best_params_)

Fitting 3 folds for each of 48 candidates, totalling 144 fits
Train R^2 Score : 0.999
Test R^2 Score : 0.244
Best R^2 Score Through Grid Search : 0.910
Best Parameters :  {'bootstrap': False, 'bootstrap_features': True, 'max_features': 1.0, 'max_samples': 1.0, 'n_estimators': 100}
CPU times: user 408 ms, sys: 218 ms, total: 625 ms
Wall time: 11.8 s


In [24]:
# Random Forest Regressor
rfr=RandomForestRegressor(random_state=1)
rfr.fit(x_train,y_train.values.ravel())

In [25]:
y_pred=rfr.predict(x_test)
r2_score(y_test,y_pred)

0.3734990763548862

In [26]:
# GridSearchCV
%%time

n_samples = x.shape[0]
n_features = x.shape[1]+1

params = {
          'n_estimators': [20,50,100],
          'max_samples': [0.5,1.0],
          'max_features': [0.5,1.0],
          'bootstrap': [True, False],}


random_forest_regressor_grid = GridSearchCV(RandomForestRegressor(random_state=1, n_jobs=-1), param_grid =params, cv=3, n_jobs=-1, verbose=1)
random_forest_regressor_grid.fit(x_train, y_train.values.ravel())

print('Train R^2 Score : %.3f'%random_forest_regressor_grid.best_estimator_.score(x_train, y_train))
print('Test R^2 Score : %.3f'%random_forest_regressor_grid.best_estimator_.score(x_test, y_test))
print('Best R^2 Score Through Grid Search : %.3f'%random_forest_regressor_grid.best_score_)
print('Best Parameters : ',random_forest_regressor_grid.best_params_)

Fitting 3 folds for each of 24 candidates, totalling 72 fits
Train R^2 Score : 0.984
Test R^2 Score : 0.328
Best R^2 Score Through Grid Search : 0.908
Best Parameters :  {'bootstrap': True, 'max_features': 0.5, 'max_samples': 1.0, 'n_estimators': 100}
CPU times: user 952 ms, sys: 28.4 ms, total: 980 ms
Wall time: 2.75 s
