**Name : Niket Ralebhat** \
**Section : Cse 2** \
**Scholar Number : 211112268**  

# Lab 6

### Explore and compare the performance of Bagging and Random Forest regression techniques applied to regression tasks in predicting the prices of used cars based on their features. A dataset containing information about thousands of used cars sold in a particular region. Each data point includes various features such as mileage, age, brand, model, fuel type, and engine size, along with the corresponding price of the car.Task is to develop predictive models using Bagging and Random Forest regression techniques to estimate the prices of used cars accurately.

In [1]:
import pandas as pd
import matplotlib.pyplot as plt
import numpy as np
import seaborn as sns
import sklearn
import warnings
 
from sklearn.preprocessing import LabelEncoder
from sklearn.impute import KNNImputer
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import f1_score
from sklearn.ensemble import RandomForestRegressor
from sklearn.model_selection import cross_val_score
 
warnings.filterwarnings('ignore')

In [2]:
df = pd.read_csv('used_cars.csv')
df.head()

Unnamed: 0,brand,model,model_year,milage,fuel_type,engine,price
0,Ford,Utility Police Interceptor Base,2013,51000,E85 Flex Fuel,300.0HP 3.7L V6 Cylinder Engine Flex Fuel Capa...,10300
1,Hyundai,Palisade SEL,2021,34742,Gasoline,3.8L V6 24V GDI DOHC,38005
2,Lexus,RX 350 RX 350,2022,22372,Gasoline,3.5 Liter DOHC,54598
3,INFINITI,Q50 Hybrid Sport,2015,88900,Hybrid,354.0HP 3.5L V6 Cylinder Engine Gas/Electric H...,15500
4,Audi,Q3 45 S line Premium Plus,2021,9835,Gasoline,2.0L I4 16V GDI DOHC Turbo,34999


In [3]:
df.drop_duplicates(subset=None, keep='first', inplace=True)
df.shape

(4009, 7)

In [4]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 4009 entries, 0 to 4008
Data columns (total 7 columns):
 #   Column      Non-Null Count  Dtype 
---  ------      --------------  ----- 
 0   brand       4009 non-null   object
 1   model       4009 non-null   object
 2   model_year  4009 non-null   int64 
 3   milage      4009 non-null   int64 
 4   fuel_type   3839 non-null   object
 5   engine      4009 non-null   object
 6   price       4009 non-null   int64 
dtypes: int64(3), object(4)
memory usage: 219.4+ KB


In [5]:
df.describe()

Unnamed: 0,model_year,milage,price
count,4009.0,4009.0,4009.0
mean,2015.51559,64717.55101,44553.19
std,6.104816,52296.599459,78710.64
min,1974.0,100.0,2000.0
25%,2012.0,23044.0,17200.0
50%,2017.0,52775.0,31000.0
75%,2020.0,94100.0,49990.0
max,2024.0,405000.0,2954083.0


In [6]:
y = df.iloc[:,-1].values  #Target variable
y

array([10300, 38005, 54598, ..., 90998, 62999, 40000], dtype=int64)

In [7]:
df.drop(['price'], axis=1 , inplace=True)
df.head()

Unnamed: 0,brand,model,model_year,milage,fuel_type,engine
0,Ford,Utility Police Interceptor Base,2013,51000,E85 Flex Fuel,300.0HP 3.7L V6 Cylinder Engine Flex Fuel Capa...
1,Hyundai,Palisade SEL,2021,34742,Gasoline,3.8L V6 24V GDI DOHC
2,Lexus,RX 350 RX 350,2022,22372,Gasoline,3.5 Liter DOHC
3,INFINITI,Q50 Hybrid Sport,2015,88900,Hybrid,354.0HP 3.5L V6 Cylinder Engine Gas/Electric H...
4,Audi,Q3 45 S line Premium Plus,2021,9835,Gasoline,2.0L I4 16V GDI DOHC Turbo


# Random Forest regression

In [8]:
import pandas as pd
from sklearn.ensemble import RandomForestRegressor
from sklearn.preprocessing import LabelEncoder

#  Check for and handle categorical variables
label_encoder = LabelEncoder()
x_categorical = df.select_dtypes(include=['object']).apply(label_encoder.fit_transform)
x_numerical = df.select_dtypes(exclude=['object']).values
x = pd.concat([pd.DataFrame(x_numerical), x_categorical], axis=1).values

# Split the data into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(x, y, test_size=0.2, random_state=42)

# Fitting Random Forest Regression to the dataset
regressor = RandomForestRegressor(n_estimators=10, random_state=0, oob_score=True)

# Fit the regressor with x and y data
regressor.fit(X_train, y_train)

# Evaluating the model
from sklearn.metrics import mean_squared_error, r2_score

# Making predictions on the same data or new data
predictions = regressor.predict(X_test)

# Evaluating the model
mse = mean_squared_error(y_test, predictions)
print(f'Mean Squared Error: {mse}')

r2 = r2_score(y_test, predictions)
print(f'R-squared: {r2}')


Mean Squared Error: 17566613703.477356
R-squared: 0.14055859279352978


# Bagging

In [9]:
from sklearn.tree import DecisionTreeClassifier

class BaggingClassifier:
	def __init__(self, base_classifier, n_estimators):
		self.base_classifier = base_classifier
		self.n_estimators = n_estimators
		self.classifiers = []
		
	def fit(self, X, y):
		for _ in range(self.n_estimators):
			# Bootstrap sampling with replacement
			indices = np.random.choice(len(X), len(X), replace=True)
			X_sampled = X[indices]
			y_sampled = y[indices]

			# Create a new base classifier and train it on the sampled data
			classifier = self.base_classifier.__class__()
			classifier.fit(X_sampled, y_sampled)

			# Store the trained classifier in the list of classifiers
			self.classifiers.append(classifier)
		return self.classifiers
	def predict(self, X):
		# Make predictions using all the base classifiers
		predictions = [classifier.predict(X) for classifier in self.classifiers]
		# Aggregate predictions using majority voting
		majority_votes = np.apply_along_axis(lambda x: np.bincount(x).argmax(), axis=0, arr=predictions)

		return majority_votes


# Create the base classifier
dc = DecisionTreeClassifier()
model = BaggingClassifier(base_classifier=dc, n_estimators=10)
classifiers = model.fit(X_train, y_train)
 
# Make predictions on the test set
y_pred = model.predict(X_test)

# Evaluating the model
mse = mean_squared_error(y_test, y_pred)
print(f'Mean Squared Error: {mse}')

r2 = r2_score(y_test, y_pred)
print(f'R-squared: {r2}')

Mean Squared Error: 20650429199.577305
R-squared: -0.010316173070343826
