<a href="https://www.kaggle.com/code/i200605salehahmad/model-performance-before-and-after-eda?scriptVersionId=131127547" target="_blank"><img align="left" alt="Kaggle" title="Open in Kaggle" src="https://kaggle.com/static/images/open-in-kaggle.svg"></a>

In [None]:
import numpy as np
import pandas as pd
import os
import tqdm as tqdm
import matplotlib.pyplot as plt
import seaborn as sns
import pickle

import sklearn
from sklearn.model_selection import train_test_split, RandomizedSearchCV
from sklearn.metrics import mean_squared_error
from sklearn.preprocessing import StandardScaler, MinMaxScaler
from sklearn.linear_model import LinearRegression, Ridge, Lasso, ElasticNet, RidgeCV, LassoCV, ElasticNetCV
from sklearn.tree import DecisionTreeRegressor
from sklearn.ensemble import RandomForestRegressor, AdaBoostRegressor, GradientBoostingRegressor, ExtraTreesRegressor, BaggingRegressor

# Read Data Set

In [None]:
# Load the used-car listings. The 'Unnamed: 0' column (presumably a saved
# row index) is dropped and the target column gets a short name.
# `Original_DF` is kept as the raw reference; `DF` is the working copy.
Original_DF = (
    pd.read_csv('/kaggle/input/user-car-prices-barcelona-2022/used_cars_data.csv')
    .drop(columns='Unnamed: 0')
    .rename(columns={'price (eur)': 'Price'})
)
DF = Original_DF.copy()
Original_DF

# Id2Label and Label2Id

In [None]:
# Integer codes for each categorical column, numbered in the order the values
# are returned by `unique()`. Label2Id maps label -> code, Id2Label code -> label.
Label2Id = {
    column: {label: code for code, label in enumerate(DF[column].unique())}
    for column in ('brand', 'model', 'engine', 'fuel', 'gearbox', 'location')
}
Id2Label = {
    column: {code: label for code, label in enumerate(DF[column].unique())}
    for column in ('brand', 'model', 'engine', 'fuel', 'gearbox', 'location')
}

# Per-column aliases; each one is the same dict object stored in the table above.
Label2Id_brand = Label2Id['brand']
Label2Id_model = Label2Id['model']
Label2Id_engine = Label2Id['engine']
Label2Id_fuel = Label2Id['fuel']
Label2Id_gearbox = Label2Id['gearbox']
Label2Id_location = Label2Id['location']

Id2Label_brand = Id2Label['brand']
Id2Label_model = Id2Label['model']
Id2Label_engine = Id2Label['engine']
Id2Label_fuel = Id2Label['fuel']
Id2Label_gearbox = Id2Label['gearbox']
Id2Label_location = Id2Label['location']

# Models Declaration

In [None]:
# Regressors to compare, keyed by display name. Every estimator is created
# with its default hyper-parameters.
Models = dict(
    LinearRegression=LinearRegression(),
    Ridge=Ridge(),
    Lasso=Lasso(),
    ElasticNet=ElasticNet(),
    DecisionTreeRegressor=DecisionTreeRegressor(),
    RandomForestRegressor=RandomForestRegressor(),
    AdaBoostRegressor=AdaBoostRegressor(),
    GradientBoostingRegressor=GradientBoostingRegressor(),
    ExtraTreesRegressor=ExtraTreesRegressor(),
    BaggingRegressor=BaggingRegressor(),
)

# Model Estimation on Basic Preprocessing

In [None]:
def Train_Predict_Metrics(model,X_train, X_test, Y_train, Y_test):
    """Fit `model` on the training split and return its test-set RMSE.

    Parameters:
        model: estimator exposing `fit(X, y)` and `predict(X)`; it is fitted
            in place.
        X_train, Y_train: features and target used for fitting.
        X_test, Y_test: features and target used for scoring.

    Returns:
        The root mean squared error of the predictions on the test split.
    """
    model.fit(X_train, Y_train)
    Y_pred = model.predict(X_test)
    # `mean_squared_error(..., squared=False)` was deprecated in scikit-learn
    # 1.4 and removed in 1.6, where it raises TypeError. Taking the root of
    # the per-output MSE and averaging gives the same value and works on
    # every scikit-learn version.
    per_output_mse = mean_squared_error(Y_test, Y_pred, multioutput='raw_values')
    RMSE = (per_output_mse ** 0.5).mean()
    return RMSE

def Map_String_to_Ids(DF):
    """Replace every object-dtype column of `DF` with its integer codes.

    Codes are looked up in the module-level `Label2Id` table under the
    column's name. The frame is modified in place and also returned.
    """
    for column in DF.columns:
        if DF[column].dtypes != 'object':
            continue  # non-object columns are left untouched
        DF[column] = DF[column].map(Label2Id[column])
    return DF
            
def Map_Ids_to_Strings(DF):
    """Decode integer-coded categorical columns of `DF` back to their labels.

    A column is decoded when it holds numeric codes and has an entry in the
    module-level `Id2Label` table. Columns that still contain strings, and
    numeric columns without a table entry, are left as they are. The frame
    is modified in place and also returned.
    """
    for col in DF.columns:
        # Encoded columns are numeric, so the previous `dtype == 'object'`
        # gate never matched them and the function decoded nothing; on a
        # column that still held strings it would map every value to NaN.
        if not pd.api.types.is_numeric_dtype(DF[col]):
            continue
        if col not in Id2Label:
            continue
        DF[col] = DF[col].map(Id2Label[col])
    return DF

# Baseline run: label-encode the string columns, hold out 20% of the rows and
# score every model in `Models` on that hold-out.
DF = Map_String_to_Ids(DF)
xtrain, xtest, ytrain, ytest = train_test_split(
    DF.drop(columns='Price'),
    DF['Price'],
    test_size=0.2,
    random_state=42,
)

TestMetrics = []
for ModelName, Model in Models.items():
    TempMetrics = Train_Predict_Metrics(Model, xtrain, xtest, ytrain, ytest)
    TestMetrics += [TempMetrics]
    print(ModelName, " has been trained")

# One row per model, best (lowest error) first.
DF_Metrics = pd.DataFrame(
    TestMetrics,
    index=Models.keys(),
    columns=['Root Mean Squared Error'],
)
DF_Metrics = DF_Metrics.sort_values(by='Root Mean Squared Error')
DF_Metrics

# Exploratory Data Analysis

In [None]:
# Restart from the untouched raw frame: the baseline run above replaced DF's
# string columns with integer ids, so the EDA below works on the original values.
DF = Original_DF.copy(deep=True)

In [None]:
# Report only the columns that contain at least one missing value.
for key, val in DF.isna().sum().items():
    if val <= 0:
        continue
    print('Columns:', key, 'Nulls:', val)

In [None]:
DF.describe()  # Descriptive statistics, shown as a quick check of value ranges

In [None]:
DF.info()  # Per-column overview of the frame as printed by pandas

In [None]:
# Number of distinct brands, followed by the listing count of each brand.
brand_column = DF['brand']
print(brand_column.nunique())
for key, val in brand_column.value_counts().items():
    print(key, val)

In [None]:
# Number of distinct models, followed by the listing count of each model.
model_column = DF['model']
print(model_column.nunique())
for key, val in model_column.value_counts().items():
    print(key, val)

In [None]:
# Average price per brand, most expensive brand first.
# NOTE: despite its name, `SumOfBrandPrice` holds the per-brand *mean* price.
SumOfBrandPrice = (
    DF.groupby('brand', as_index=False)['Price']
    .mean()
    .sort_values('Price', ascending=False)
)

plt.figure(figsize=(20, 10))
plt.xticks(rotation=90)
plt.plot(SumOfBrandPrice.brand, SumOfBrandPrice.Price)

In [None]:
# Distribution of listing prices.
fig, ax = plt.subplots(figsize=(10, 5))
ax.hist(DF['Price'], rwidth=0.9, label='Price')
ax.set_xlabel('Price')
ax.set_ylabel('Count')
ax.legend()
plt.show()

In [None]:
# Distribution of the `year` column.
fig, ax = plt.subplots(figsize=(10, 5))
ax.hist(DF['year'], rwidth=0.9, label='Year')
ax.set_xlabel('Year')
ax.set_ylabel('Count')
ax.legend()
plt.show()

In [None]:
# Distribution of mileage.
# `sns.displot` is a figure-level function that always opens its own figure,
# so a preceding `plt.figure(figsize=(10, 5))` only left an empty figure behind
# and did not size the plot. The 10x5 size is requested through
# `height`/`aspect` instead (width = height * aspect).
sns.displot(DF['mileage (kms)'], label='mileage (kms)', height=5, aspect=2)
plt.xlabel('mileage (kms)')
plt.ylabel('Count')
plt.legend()
plt.show()

In [None]:
# Number of listings per fuel type.
ax = plt.figure(figsize=(5, 4)).add_subplot()
ax.hist(DF.fuel, rwidth=0.9, label='Fuel')
ax.set_xlabel('Fuel')
ax.set_ylabel('Count')
ax.legend()
plt.show()

In [None]:
# Number of listings per gearbox type.
ax = plt.figure(figsize=(5, 3)).add_subplot()
ax.hist(DF.gearbox, rwidth=0.9, label='Gearbox')
ax.set_xlabel('Gearbox')
ax.set_ylabel('Count')
ax.legend()
plt.show()

In [None]:
# Number of listings per location.
fig, ax = plt.subplots(figsize=(10, 5))
ax.hist(DF.location, rwidth=0.9, label='Location')
ax.set_xlabel('Location')
ax.set_ylabel('Count')
ax.legend()
plt.show()

# Model Training

## Some Advanced Preprocessing for Model Performance Improvement

In [None]:
# Values singled out for removal before retraining.
# NOTE(review): only `BrandsToRemove` is applied below. `YearsToRemove` and
# `FuelsToRemove` are declared but never used (the year cut is a literal
# `> 2010`) — confirm whether the fuel filter was meant to be applied.
BrandsToRemove = ['Chevrolet', 'Skoda', 'Cupra']
YearsToRemove = [2010]
FuelsToRemove = ['GLP', 'Eléctrico']

# Keep rows whose brand is not excluded and whose year is after 2010, then
# renumber the index from zero.
keep_rows = ~DF['brand'].isin(BrandsToRemove) & (DF['year'] > 2010)
DF = DF.loc[keep_rows].reset_index(drop=True)

# Rescale the two numeric features into the range [0.01, 5].
# NOTE(review): the scaler is fitted on all rows, i.e. before the train/test
# split performed in a later cell — confirm this is acceptable.
Scaler = MinMaxScaler(feature_range=(0.01, 5))
DF[['year', 'mileage (kms)']] = Scaler.fit_transform(DF[['year', 'mileage (kms)']])

In [None]:
def Train_Predict_Metrics(model,X_train, X_test, Y_train, Y_test):
    """Fit `model` on the training split and return its test-set RMSE.

    Parameters:
        model: estimator exposing `fit(X, y)` and `predict(X)`; it is fitted
            in place.
        X_train, Y_train: features and target used for fitting.
        X_test, Y_test: features and target used for scoring.

    Returns:
        The root mean squared error of the predictions on the test split.
    """
    model.fit(X_train, Y_train)
    Y_pred = model.predict(X_test)
    # `mean_squared_error(..., squared=False)` was deprecated in scikit-learn
    # 1.4 and removed in 1.6, where it raises TypeError. Taking the root of
    # the per-output MSE and averaging gives the same value and works on
    # every scikit-learn version.
    per_output_mse = mean_squared_error(Y_test, Y_pred, multioutput='raw_values')
    RMSE = (per_output_mse ** 0.5).mean()
    return RMSE

def Map_String_to_Ids(DF):
    """Replace every object-dtype column of `DF` with its integer codes.

    Codes are looked up in the module-level `Label2Id` table under the
    column's name. The frame is modified in place and also returned.
    """
    for column in DF.columns:
        if DF[column].dtypes != 'object':
            continue  # non-object columns are left untouched
        DF[column] = DF[column].map(Label2Id[column])
    return DF
            
def Map_Ids_to_Strings(DF):
    """Decode integer-coded categorical columns of `DF` back to their labels.

    A column is decoded when it holds numeric codes and has an entry in the
    module-level `Id2Label` table. Columns that still contain strings, and
    numeric columns without a table entry, are left as they are. The frame
    is modified in place and also returned.
    """
    for col in DF.columns:
        # Encoded columns are numeric, so the previous `dtype == 'object'`
        # gate never matched them and the function decoded nothing; on a
        # column that still held strings it would map every value to NaN.
        if not pd.api.types.is_numeric_dtype(DF[col]):
            continue
        if col not in Id2Label:
            continue
        DF[col] = DF[col].map(Id2Label[col])
    return DF

# Second run on the filtered and rescaled frame: label-encode the string
# columns, hold out 20% of the rows and score every model in `Models` again.
DF = Map_String_to_Ids(DF)
xtrain, xtest, ytrain, ytest = train_test_split(
    DF.drop(columns='Price'),
    DF['Price'],
    test_size=0.2,
    random_state=42,
)

TestMetrics = []
for ModelName, Model in Models.items():
    TempMetrics = Train_Predict_Metrics(Model, xtrain, xtest, ytrain, ytest)
    TestMetrics += [TempMetrics]
    print(ModelName, " has been trained")

# One row per model, best (lowest error) first.
DF_Metrics = pd.DataFrame(
    TestMetrics,
    index=Models.keys(),
    columns=['Root Mean Squared Error'],
)
DF_Metrics = DF_Metrics.sort_values(by='Root Mean Squared Error')
DF_Metrics