# <p style="font-family: Helvetica, fantasy; line-height: 1.3; font-size: 26px; letter-spacing: 3px; text-align: center; color: #99e600">Prediction of car prices using Linear Regressors &amp; Ensemble methods</p>

![](https://www.newneuromarketing.com/media/zoo/images/NNM-2015-019-Cost-consciousness-increase-product-sales-with-Price-Primacy_6a73d15598e2d828b0e141642ebb5de3.png)

# Beginning libraries 📚

In [None]:
import pandas as pd
import matplotlib.pyplot as plt
import warnings
warnings.filterwarnings('ignore')
import seaborn as sns
import numpy as np

# Reading the data & gathering short information 📖

In [None]:
# Relative path of the CSV file holding the car listings.
DATA_PATH = '../input/cars-dataset-audi-bmw-ford-hyundai-skoda-vw/cars_dataset.csv'

# Load the file, report its (rows, columns) size and preview the first rows.
df = pd.read_csv(DATA_PATH)
print(df.shape)
df.head()

In [None]:
# Column names, dtypes, non-null counts and memory usage of the loaded frame.
df.info()

In [None]:
# Summary statistics (count, mean, std, min, quartiles, max) of the numerical columns.
df.describe()

In [None]:
# Split the column names by kind.
# 'category' is accepted alongside 'object' so that re-running this cell after
# the text columns have been converted to the 'category' dtype still returns
# the same categorical columns instead of an empty list.
categoricals = list(df.select_dtypes(include=['object', 'category']).columns)
numericals = [col for col in df.columns if col not in categoricals]
print(categoricals)
print(numericals)

In [None]:
# Store every categorical column with the pandas 'category' dtype
# (less memory than 'object'), then display the resulting dtypes.
for cat_col in categoricals:
    df[cat_col] = df[cat_col].astype('category')
df.dtypes[categoricals]

# Exploratory Data Analysis (EDA) 🧭

In [None]:
# Pairwise relationships between the numerical features.
# corner=True draws only the lower triangle of the grid, so each pair appears once.
sns.pairplot(df, corner=True)

In [None]:
# Report, for each categorical column, how many distinct values it holds and what they are.
for col in categoricals:
    distinct = df[col].unique()
    print(f'We have {len(distinct)} unique values in --{col}-- column: {distinct}', '\n')

In [None]:
# Same pairwise view, now colored by a categorical variable --- transmission
sns.pairplot(df, hue='transmission', corner=True)

In [None]:
# Same pairwise view, now colored by a categorical variable --- fuelType
sns.pairplot(df, hue='fuelType', corner=True)

In [None]:
# Same pairwise view, now colored by a categorical variable --- Make
sns.pairplot(df, hue='Make', corner=True)

## Explore categorical features 📊

In [None]:
# Count plots for 'transmission', 'fuelType', 'Make'
# (categoricals[1:] skips the first categorical column).
fig = plt.figure(figsize=(20, 10))
plt.subplots_adjust(wspace=0.5)
# 'serif' is a generic font family that matplotlib can always resolve.
plt.suptitle("Count of 'transmission', 'fuelType', and 'Make'", x=0.4, y=0.95, family='serif', size=18, weight='bold')
for position, col in enumerate(categoricals[1:], start=1):
    # Panels fill a 2x4 grid from left to right.
    ax = plt.subplot(2, 4, position)
    sns.countplot(data=df, y=col, color='#a6ff4d', ax=ax)
    ax.grid(axis='x')

In [None]:
# Shared green palette, ordered from the darkest to the lightest shade.
colors = [
    '#101907',
    '#314c17',
    '#63992e',
    '#95e545',
    '#aeff5e',
    '#c0ff82',
    '#dbffb7',
]

In [None]:
# Count of 'transmission' by 'Make'
fig, ax = plt.subplots(figsize=(15, 8))
# 'serif' is a generic font family that matplotlib can always resolve.
fig.suptitle("Count of 'transmission' by 'Make'", x=0.5, y=0.92, family='serif', size=18, weight='bold')
sns.countplot(data=df, x='transmission', hue='Make', palette=colors, ax=ax)
ax.grid(axis='y')

In [None]:
# Count of 'fuelType' by 'Make'
fig, ax = plt.subplots(figsize=(15, 8))
# 'serif' is a generic font family that matplotlib can always resolve.
fig.suptitle("Count of 'fuelType' by 'Make'", x=0.5, y=0.92, family='serif', size=18, weight='bold')
sns.countplot(data=df, x='fuelType', hue='Make', palette=colors, ax=ax)
ax.grid(axis='y')

## Explore numerical features 📈

In [None]:
# Variance of numerical features.
# Only the numerical columns are selected: the categorical columns do not
# support the 'var' reduction and make DataFrame.var() raise on recent pandas.
df[numericals].var()

In [None]:
# Correlation matrix of the numerical features only (the categorical columns
# cannot be correlated and make DataFrame.corr() fail on recent pandas).
corr = df[numericals].corr()

# Boolean mask hiding the upper triangle, diagonal included, so every pair is shown once.
mask = np.triu(np.ones_like(corr, dtype=bool))
with sns.axes_style('white'):
    fig, ax = plt.subplots(figsize=(18, 10))
    sns.heatmap(corr, mask=mask, annot=True, cmap=colors, center=0, square=True, ax=ax)

 - Moderate positive correlation between price & engineSize
 - Moderate negative correlation between year & mileage

In [None]:
# Show spines (black border of the plot) on all four sides
plt.rcParams.update({
    'axes.spines.left': True,
    'axes.spines.right': True,
    'axes.spines.top': True,
    'axes.spines.bottom': True,
})

In [None]:
# Distribution of numerical features: one histogram per column on a 2x3 grid.
# Spacing is left to constrained_layout alone; plt.subplots_adjust is not
# combined with it because the two layout mechanisms are incompatible.
fig = plt.figure(figsize=(15, 10), constrained_layout=True)
# 'serif' is a generic font family that matplotlib can always resolve.
fig.suptitle("Distribution of numerical variables", y=0.95, family='serif', size=18, weight='bold')
for position, col in enumerate(numericals, start=1):
    ax = fig.add_subplot(2, 3, position)
    sns.histplot(data=df, x=col, bins=20, color='#a6ff4d', ax=ax)

## Investigate both categorical & numerical features 🤝


Answering some questions:

 - **What is the mean price of a car by its 'Make'?**

 - **What is the mean price of a car by its 'transmission'?**

 - **What is the mean price of a car by its 'fuelType'?**

In [None]:
# Hide spines (black border of the plot) on all four sides
plt.rcParams.update({
    'axes.spines.left': False,
    'axes.spines.right': False,
    'axes.spines.top': False,
    'axes.spines.bottom': False,
})

In [None]:
# Mean 'price' per level of each categorical feature, sorted ascending so the
# most expensive group ends up at the top of its horizontal bar chart.
mp_make = df.groupby('Make')['price'].mean().sort_values()
mp_transmission = df.groupby('transmission')['price'].mean().sort_values()
mp_fueltype = df.groupby('fuelType')['price'].mean().sort_values()


fig, ax = plt.subplots(1, 3, figsize=[15, 7], constrained_layout=True)
# 'serif' is a generic font family that matplotlib can always resolve.
fig.suptitle("Mean price of a car by certain feature", y=1.15, family='serif', size=18, weight='bold')

# One entry per panel: (axes, mean prices, panel title, barh styling).
panels = [
    (ax[0], mp_make, "Make", {'color': '#95e545'}),
    (ax[1], mp_transmission, "Transmission", {'height': 0.45, 'color': '#c0ff82'}),
    (ax[2], mp_fueltype, "fuelType", {'height': 0.6, 'color': '#537f26'}),
]
for panel, mean_prices, title, bar_style in panels:
    panel.barh(mean_prices.index, mean_prices, **bar_style)
    panel.set_title(title)
    # No x ticks: the rounded value is written at the end of each bar instead.
    panel.set_xticks([])
    for position, mean_price in enumerate(mean_prices):
        label = round(mean_price)
        panel.text(label, position, str(label))

# Layout spacing
fig.set_constrained_layout_pads(w_pad=2 / 72, h_pad=2 / 72, hspace=0.2,
                                wspace=0.2)

The same approach can be reused with other aggregate functions (e.g. median, min, max) or with other numerical columns such as 'mileage', 'tax', 'mpg' or 'engineSize'.

**Explore 'mileage' on 'year'**

In [None]:
# Show spines (black border of the plot) on all four sides
plt.rcParams.update({
    'axes.spines.left': True,
    'axes.spines.right': True,
    'axes.spines.top': True,
    'axes.spines.bottom': True,
})

In [None]:

fig, axes = plt.subplots(4, 1, figsize=(18, 12), constrained_layout=True)

# One tick per year; the stop value is 2021 so that 2020, the upper x limit,
# gets a tick as well (np.arange excludes its stop value).
year_ticks = np.arange(1996, 2021, 1)

# One entry per panel: (hue column, title). hue=None plots the overall mean.
panels = [
    (None, "Mean 'mileage' by 'year'"),
    ('transmission', "Mean 'mileage' by 'year' for each 'transmission'"),
    ('fuelType', "Mean 'mileage' by 'year' for each 'fuelType'"),
    ('Make', "Mean 'mileage' by 'year' for each 'Make'"),
]
for panel_ax, (hue, title) in zip(axes, panels):
    sns.lineplot(ax=panel_ax, data=df, x='year', y='mileage', hue=hue, ci=None)
    panel_ax.set_xticks(year_ticks)
    panel_ax.set_title(title)
    panel_ax.set_xlim(1996, 2020)

# Pin the legend of the last ('Make') panel to the upper right corner.
axes[3].legend(loc='upper right')

plt.show()

In [None]:
# Bins number preset
b = 30

fig, axes = plt.subplots(2, 3, figsize=(20, 7), constrained_layout=True)

# One histogram per numerical feature, split by 'transmission'.
# Titles are built from the column name so they always match the plotted data.
features = ['price', 'mileage', 'year', 'mpg', 'engineSize', 'tax']
for position, (panel_ax, feature) in enumerate(zip(axes.flat, features)):
    sns.histplot(
        ax=panel_ax, data=df, x=feature, hue='transmission', element='poly', bins=b,
        legend=(position == 0),  # the hue legend is drawn once, on the first panel
    )
    panel_ax.set_title(f"'{feature}' distribution by 'transmission'")

plt.show()

In [None]:
# Split data by 'Make': one sub-frame per label, in the order of the names on the left.
bmw, ford, hyundai, audi, skoda, toyota, vw = (
    df[df['Make'] == make_label]
    for make_label in ('BMW', 'Ford', 'Hyundai', 'audi', 'skoda', 'toyota', 'vw')
)

In [None]:
# Bins number & color preset
b = 40
c = '#a6ff4d'

# Price distribution of each 'Make' on a 3x3 grid (7 of the 9 panels are used).
fig, axes = plt.subplots(3, 3, figsize=(20, 7), constrained_layout=True)
# 'serif' is a generic font family that matplotlib can always resolve.
fig.suptitle("Price distribution for each 'Make'", y=1.15, family='serif', size=18, weight='bold')

# (panel title, sub-frame) pairs, drawn row by row.
make_frames = [
    ('BMW', bmw),
    ('Ford', ford),
    ('Hyundai', hyundai),
    ('Audi', audi),
    ('Skoda', skoda),
    ('Toyota', toyota),
    ('VW', vw),
]
for (title, make_df), ax in zip(make_frames, axes.flat):
    sns.histplot(ax=ax, data=make_df, x='price', element='poly', bins=b, color=c)
    ax.set_title(title)

# Hide the panels that received no plot.
for unused_ax in axes.flat[len(make_frames):]:
    unused_ax.axis('off')

plt.show()

To investigate the distribution of other numerical features for each 'Make', the same approach can be applied by changing the 'x' of each plot to a different numerical variable.

# Data Cleaning 🧹

In [None]:
# Inspect the rows whose 'transmission' is labelled 'Other'
transmission_other = df[df['transmission'] == 'Other']
print(transmission_other)

# Replace 'Other' with 'Manual' (the most frequent value, per the author): 'Other' holds only 4 rows, considered too little information for the model
df['transmission'] = df['transmission'].replace('Other', 'Manual')

In [None]:
# Inspect the rows whose 'fuelType' is 'Electric'
electric_fuel = df[df['fuelType'] == 'Electric']
print(electric_fuel)

# Inspect the rows whose 'fuelType' is 'Other'
other_fuel = df[df['fuelType'] == 'Other']
print(other_fuel)

In [None]:
# Merge the 'Electric' fuelType into 'Other': it holds only 5 rows, considered too little information for the model.
df['fuelType'] = df['fuelType'].replace('Electric', 'Other')

Several 'mpg' values are higher than 400 and appear as outliers in the scatterplots.

However, these values seem explainable: they all belong to cars of the BMW 'Make'. For that reason, they are kept.

# Feature engineering ⚙️

In [None]:
# Turn categoricals into numeric: each categorical column is replaced by the
# integer codes produced by a LabelEncoder fitted on that column.
from sklearn.preprocessing import LabelEncoder

le = LabelEncoder()
df = df.assign(**{cat_col: le.fit_transform(df[cat_col]) for cat_col in categoricals})

In [None]:
# Convert our categorical columns to dummies: build one indicator frame per
# column, then append them all to df in a single concat.
# NOTE(review): every level is kept (no drop_first), so the indicators of one
# feature always sum to 1 -- worth confirming this is intended for the
# unregularized linear models.
dummy_frames = [
    pd.get_dummies(df[col], prefix=str(col) + '_', dtype=int)
    for col in categoricals
]
df = pd.concat([df, *dummy_frames], axis=1)

In [None]:
# The one-hot columns now carry the information, so the source columns are removed.
df = df.drop(columns=categoricals)
df.shape

# Data preparation 🍳

In [None]:
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler

# Separate features & target
y = df['price']
X = df.drop('price', axis=1)
print(X.shape, y.shape)
print('\n')

# Split into training (80%) and testing set (20%) BEFORE scaling, so that no
# statistic of the test rows leaks into the preprocessing.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=123)

# Standardize features: mean/std are learned on the training set only and the
# same transformation is then applied to the test set.
scaler = StandardScaler()
X_train = scaler.fit_transform(X_train)
X_test = scaler.transform(X_test)

print(X_train.shape, y_train.shape)
print(X_test.shape, y_test.shape)

# Modelling 🏗️

# Linear regressors

In [None]:
from sklearn.linear_model import LinearRegression, Ridge, SGDRegressor, ElasticNet, Lars, Lasso, BayesianRidge, HuberRegressor
from sklearn.metrics import mean_squared_error

# Test-set scores, appended in the same order as `lin_reg`.
r_squared = []
rmses = []

# (short name, estimator) pairs, all with default hyperparameters.
# SGDRegressor shuffles the data, so it is seeded to make the run reproducible.
lin_reg = [('LR', LinearRegression()), ('Ridge', Ridge()), ('SGDR', SGDRegressor(random_state=123)),
           ('ElasticNet', ElasticNet()), ('Lars', Lars()), ('Lasso', Lasso()),
           ('BayesianRidge', BayesianRidge()), ('HuberRegressor', HuberRegressor())]


for name, model in lin_reg:
    model.fit(X_train, y_train)
    preds = model.predict(X_test)
    # Root mean squared error, in the units of 'price'.
    rmse = np.sqrt(mean_squared_error(y_test, preds))
    # For regressors, score() returns the coefficient of determination R^2.
    rs = model.score(X_test, y_test)
    r_squared.append(rs)
    rmses.append(rmse)
    print(f'The RMSE of {name} is {rmse:.3f}')
    print(f'The R^2 of {name} is {rs:.3f}')
    print('\n')

In [None]:
# Create a dataframe that contains relevant performances of the linear regressors.
# The score lists are left untouched (a filtered copy is built instead), so this
# cell can be re-run without the lists and the model names getting out of sync.

# Models excluded from the comparison (flagged by the author as the worst ones).
excluded_models = ['LR', 'SGDR', 'Lars']

all_scores = pd.DataFrame({
    'Model': [name for name, _ in lin_reg],
    'Test_R^2': r_squared,
    'Test_RMSE': rmses,
})
scores_lin_reg = all_scores[~all_scores['Model'].isin(excluded_models)].reset_index(drop=True)

# Names of the models that were kept.
models = scores_lin_reg['Model'].tolist()

In [None]:
# Hide spines (black border of the plot)
plt.rcParams['axes.spines.left'] = False
plt.rcParams['axes.spines.right'] = False
plt.rcParams['axes.spines.top'] = False
plt.rcParams['axes.spines.bottom'] = False

# Plot linear regressors performance: RMSE on the left (lower is better, sorted
# ascending) and R^2 on the right (higher is better, sorted descending).
fig, axes = plt.subplots(1, 2, figsize=(15, 5), constrained_layout=True)
# 'serif' is a generic font family that matplotlib can always resolve.
fig.suptitle("Linear Regressors performance", x=0.5, y=1.15, family='serif', size=18, weight='bold')

ax = sns.barplot(ax=axes[0], data=scores_lin_reg.sort_values('Test_RMSE'), x='Model', y='Test_RMSE', palette=colors)
ax.set_title('Root Mean Squared Error')
ax.grid(axis='y')

ax1 = sns.barplot(ax=axes[1], data=scores_lin_reg.sort_values('Test_R^2', ascending=False), x='Model', y='Test_R^2', palette=colors)
ax1.set_title('R^2')
ax1.grid(axis='y')

As can be observed there are 3 models with similar results.

However, the one that wins by a slight margin is the BayesianRidge Regressor:
 - RMSE: 3104.238
 - R^2: 0.891

 Let's try other modeling approaches.

# Ensemble methods 

## Random Forest

In [None]:
from sklearn.ensemble import RandomForestRegressor

# Seed shared by the ensemble models.
SEED = 123

# Random forest with default hyperparameters; fit() returns the estimator
# itself, so construction and training are chained.
rf = RandomForestRegressor(random_state=SEED).fit(X_train, y_train)

# Test-set metrics: RMSE from the predictions, R^2 from the estimator's score().
preds = rf.predict(X_test)
mse_rf = mean_squared_error(y_test, preds)
rmse_rf = np.sqrt(mse_rf)
rs_rf = rf.score(X_test, y_test)

print(f'Random Forest RMSE is: {rmse_rf}')
print(f'Random Forest R^2 is: {rs_rf}')

## Gradient Boosting

In [None]:
from sklearn.ensemble import GradientBoostingRegressor

# Gradient boosting with default hyperparameters; fit() returns the estimator
# itself, so construction and training are chained.
gb = GradientBoostingRegressor(random_state=SEED).fit(X_train, y_train)

# Test-set metrics: RMSE from the predictions, R^2 from the estimator's score().
preds = gb.predict(X_test)
mse_gb = mean_squared_error(y_test, preds)
rmse_gb = np.sqrt(mse_gb)
rs_gb = gb.score(X_test, y_test)

print(f'Gradient Boosting RMSE is: {rmse_gb}')
print(f'Gradient Boosting R^2 is: {rs_gb}')

## Extreme Gradient Boosting (XGBoost)

In [None]:
import xgboost as xgb

# XGBoost regressor with default hyperparameters, seeded for reproducibility.
xgb_reg = xgb.XGBRegressor(seed=SEED)
xgb_reg.fit(X_train, y_train)

# Test-set metrics: RMSE from the predictions, R^2 from the estimator's score().
preds = xgb_reg.predict(X_test)
rmse_xgb = np.sqrt(mean_squared_error(y_test, preds))
rs_xgb = xgb_reg.score(X_test, y_test)

print(f'XGBoost RMSE is: {rmse_xgb}')
# Report XGBoost's own R^2 (rs_xgb), not the gradient boosting one.
print(f'XGBoost R^2 is: {rs_xgb}')

In [None]:
# Collect the test-set scores of the three ensemble models, one row per model.
models_en = ['RandomForest', 'GradientBoosting', 'XGBoost']
r_squared_en = [rs_rf, rs_gb, rs_xgb]
rmses_en = [rmse_rf, rmse_gb, rmse_xgb]

scores_en = pd.DataFrame([
    {'Model': model_name, 'Test_R^2': r2, 'Test_RMSE': rmse_value}
    for model_name, r2, rmse_value in zip(models_en, r_squared_en, rmses_en)
])

In [None]:
# Plot ensemble methods performance: RMSE on the left (lower is better, sorted
# ascending) and R^2 on the right (higher is better, sorted descending).
fig, axes = plt.subplots(1, 2, figsize=(15, 5), constrained_layout=True)
# 'serif' is a generic font family that matplotlib can always resolve.
fig.suptitle("Ensemble methods performance", x=0.5, y=1.15, family='serif', size=18, weight='bold')

ax = sns.barplot(ax=axes[0], data=scores_en.sort_values('Test_RMSE'), x='Model', y='Test_RMSE', palette=colors)
ax.set_title('Root Mean Squared Error')
ax.grid(axis='y')

ax1 = sns.barplot(ax=axes[1], data=scores_en.sort_values('Test_R^2', ascending=False), x='Model', y='Test_R^2', palette=colors)
ax1.set_title('R^2')
ax1.grid(axis='y')

It can be observed that the ensemble methods perform much better than the linear regressors.

The best performance is obtained by the RandomForest:
 - RMSE: 1821.37
 - R^2:  0.96


 Let's see if these results can be improved.

# Hyperparameters tuning for Random Forest ✨

In [None]:
from sklearn.model_selection import GridSearchCV

# Hyperparameter grid for the random forest.
# min_samples_leaf uses integer sample counts only: as a float it is read as a
# fraction of the samples and must stay below 1, so 1.5 is rejected by
# scikit-learn and 0.5 would require half of the data in every leaf.
params_rf = {'n_estimators': [50, 100, 200],
             'max_depth': [None, 1, 2],
             'min_samples_leaf': [1, 2, 4]}

# 5-fold cross-validated search; the score is the negated MSE, so the
# combination with the highest score has the lowest error.
grid_rf = GridSearchCV(estimator=rf,
                       param_grid=params_rf,
                       cv=5,
                       scoring='neg_mean_squared_error',
                       )

grid_rf.fit(X_train, y_train)

best_hyperparams = grid_rf.best_params_

print(f'The best hyperparameters found for RF are: {best_hyperparams}')

## Evaluation

In [None]:
# Forest with the best cross-validated hyperparameters found by the grid search.
best_rf = grid_rf.best_estimator_

# Test-set metrics: RMSE from the predictions, R^2 from the estimator's score().
preds = best_rf.predict(X_test)
mse_best_rf = mean_squared_error(y_test, preds)
rmse_best_rf = np.sqrt(mse_best_rf)
rs_best_rf = best_rf.score(X_test, y_test)

print(f'Best RF RMSE is: {rmse_best_rf}')
print(f'Best RF R^2 is: {rs_best_rf}')

The performance of the RandomForest Regressor was slightly improved.

The RMSE was reduced to 1814.25