In [2]:
#Importing the basic librarires fot analysis

import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

In [None]:
df = pd.read_csv('coches-de-segunda-mano-sample.csv')

# Look at the variables names
print(df.columns)

# Check the data types
print(df.dtypes)

# Summary Statistics
print(df.describe)

df.head()

In [None]:
# Check for missing values
print(df.isnull().sum())

# Delete column price_financed (not useful and lots of na)
df.drop(['price_financed'], axis = 1, inplace = True)

# Delete other non useful columns
df.drop(columns=['url', 'company', 'publish_date', 'insert_date'], inplace=True)

# Drop rows with missing values
cars = df.dropna()

# Show new dataframe
cars.shape[0]

In [None]:
# Calculate the average price by make
avg_price_by_make = cars.groupby("make")["price"].mean()
print(avg_price_by_make)

# Find the cars with high km and low price
high_kms_low_price =  cars[(cars['kms'] > 100000) & (cars['price'] < 5000)]
print(high_kms_low_price)

# MODELS 

## Linear Regression

In [29]:
# Build a linear regression model to predict

from sklearn.linear_model import LinearRegression
from sklearn.model_selection import train_test_split


# Select the variables to use in the model
X = cars[["kms", "year","power","doors"]]
y = cars["price"]

# Split the data into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3)

# Train a linear regression model
model = LinearRegression()
model.fit(X_train, y_train)

# Evaluate the model on the testing set
y_pred = model.predict(X_test)

# Calculate the error metrics
from sklearn.metrics import mean_absolute_error, mean_squared_error

mae = mean_absolute_error(y_test, y_pred)
rmse = np.sqrt(mean_squared_error(y_test, y_pred))
mape = np.mean(np.abs((y_test - y_pred)/y_test))*100

print("Mean Absolute Error:", mae)
print("Root Mean Squared Error:", rmse)
print("Mean Absolute Percentage Error:", mape)

Mean Absolute Error: 4981.140719377973
Root Mean Squared Error: 10364.418870650548
Mean Absolute Percentage Error: 55.42219767138427


### Object to numeric Codification

In [None]:
from sklearn.preprocessing import LabelEncoder 

# Create a LabelEncoder object
make_le = LabelEncoder()
model_le = LabelEncoder()
version_le = LabelEncoder()
fuel_le = LabelEncoder()
shift_le = LabelEncoder()
color_le = LabelEncoder()
province_le = LabelEncoder()

# Fit the encoder to the "make" column
make_le.fit(cars["make"])
model_le.fit(cars["model"])
version_le.fit(cars["version"])
fuel_le.fit(cars["fuel"])
shift_le.fit(cars["shift"])
color_le.fit(cars["color"]) 
province_le.fit(cars["province"])

# Transform the "make" column into numeric values
make_encoded = make_le.transform(cars["make"])
model_encoded = model_le.transform(cars["model"])
version_encoded = version_le.transform(cars["version"])
fuel_encoded = fuel_le.transform(cars["fuel"])
shift_encoded = shift_le.transform(cars["shift"])
color_encoded = color_le.transform(cars["color"])  
province_encoded = province_le.transform(cars["province"]) 

# Add the encoded values to the dataset
cars["make_encoded"] = make_encoded
cars["model_encoded"] = model_encoded
cars["version_encoded"] = version_encoded
cars["fuel_encoded"] = fuel_encoded
cars["shift_encoded"] = shift_encoded
cars["color_encoded"] = color_encoded 
cars["province_encoded"] = province_encoded

## Linear regression endcoded

In [31]:
# Build a linear regression model to predict

from sklearn.linear_model import LinearRegression
from sklearn.model_selection import train_test_split


# Select the variables to use in the model
X = cars[["make_encoded", "model_encoded", "version_encoded", "fuel_encoded", "kms", "year", "power", "doors", "shift_encoded", "color_encoded", "is_professional", "province_encoded"]]
y = cars["price"]

# Split the data into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3)

# Train a linear regression model
model = LinearRegression()
model.fit(X_train, y_train)

# Evaluate the model on the testing set
y_pred = model.predict(X_test)

# Calculate the error metrics
from sklearn.metrics import mean_absolute_error, mean_squared_error

mae = mean_absolute_error(y_test, y_pred)
rmse = np.sqrt(mean_squared_error(y_test, y_pred))
mape = np.mean(np.abs((y_test - y_pred)/y_test))*100

print("Mean Absolute Error:", mae)
print("Root Mean Squared Error:", rmse)
print("Mean Absolute Percentage Error:", mape)

Mean Absolute Error: 4911.869906607731
Root Mean Squared Error: 9979.93540226983
Mean Absolute Percentage Error: 60.1027186278361


In [None]:
# We Calculate Pearson's correlation coefficient again but this time we have encoded the object data types to numeric
corr = cars.corr()['price'].sort_values()
print(corr)

## Decision Tree Regressor Model 

In [12]:
# Import the necessary libraries and packages

from sklearn.model_selection import train_test_split
from sklearn.tree import DecisionTreeRegressor


# Select the variables to use in the model
X = cars[["make_encoded", "model_encoded", "version_encoded", "fuel_encoded", "kms", "year", "power", "doors", "shift_encoded", "color_encoded", "is_professional", "province_encoded"]]
y = cars["price"]

# Split the data into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3)

# Train a tree model
model = DecisionTreeRegressor()
model.fit(X_train, y_train)

# Evaluate the model on the testing set
y_pred = model.predict(X_test)

# Calculate the error metrics
from sklearn.metrics import mean_absolute_error, mean_squared_error

mae = mean_absolute_error(y_test, y_pred)
rmse = np.sqrt(mean_squared_error(y_test, y_pred))
mape = np.mean(np.abs((y_test - y_pred)/y_test))*100

print("Mean Absolute Error:", mae)
print("Root Mean Squared Error:", rmse)
print("Mean Absolute Percentage Error:", mape)


Mean Absolute Error: 3048.527705942568
Root Mean Squared Error: 7690.492365134777
Mean Absolute Percentage Error: 23.06490087117314


## Random Forest Regressor Model

In [27]:
# Import the necessary libraries and packages

from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestRegressor

# Select the variables to use in the model
X = cars[["make_encoded", "model_encoded", "version_encoded", "fuel_encoded",  "year", "kms", "power", "doors", "shift_encoded", "color_encoded", "is_professional", "province_encoded"]]
y = cars["price"]

# Split the data into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3)

# Train a random forest model
model = RandomForestRegressor()
model.fit(X_train, y_train)

# Evaluate the model on the testing set
y_pred = model.predict(X_test)

# Calculate the error metrics
from sklearn.metrics import mean_absolute_error, mean_squared_error

mae = mean_absolute_error(y_test, y_pred)
rmse = np.sqrt(mean_squared_error(y_test, y_pred))
mape = np.mean(np.abs((y_test - y_pred)/y_test))*100

print("Mean Absolute Error:", mae)
print("Root Mean Squared Error:", rmse)
print("Mean Absolute Percentage Error:", mape)


Mean Absolute Error: 2084.4004781877516
Root Mean Squared Error: 4378.824474296932
Mean Absolute Percentage Error: 16.99128811865897


## Multi-layer Perceptron Regressor (NeuralNetwork)

In [25]:
# Import the necessary libraries and packages

from sklearn.model_selection import train_test_split
from sklearn.neural_network import MLPRegressor

# Select the variables to use in the model
X = cars[["make_encoded", "model_encoded", "version_encoded", "fuel_encoded",  "year", "kms", "power", "doors", "shift_encoded", "color_encoded", "is_professional", "province_encoded"]]
y = cars["price"]

# Split the data into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3)

# Train a neural network model
model = MLPRegressor(hidden_layer_sizes=(32,16,8), activation="relu", solver="adam")
model.fit(X_train, y_train)

# Evaluate the model on the testing set
y_pred = model.predict(X_test)

# Calculate the error metrics
from sklearn.metrics import mean_absolute_error, mean_squared_error

mae = mean_absolute_error(y_test, y_pred)
rmse = np.sqrt(mean_squared_error(y_test, y_pred))
mape = np.mean(np.abs((y_test - y_pred)/y_test))*100

print("Mean Absolute Error:", mae)
print("Root Mean Squared Error:", rmse)
print("Mean Absolute Percentage Error:", mape)


Mean Absolute Error: 4009.6578186718025
Root Mean Squared Error: 6685.446174152231
Mean Absolute Percentage Error: 40.150904374197445


## K-Nearest Neighbors Regressor

In [39]:
# Import the necessary libraries and packages

from sklearn.model_selection import train_test_split
from sklearn.neighbors import KNeighborsRegressor

# Select the variables to use in the model
X = cars[["make_encoded", "model_encoded", "version_encoded", "fuel_encoded",  "year", "kms", "power", "doors", "shift_encoded", "color_encoded", "is_professional", "province_encoded"]]
y = cars["price"]

# Split the data into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3)

# Train a KNN regressor
model = KNeighborsRegressor(n_neighbors=3, weights='uniform')
model.fit(X_train, y_train)

# Evaluate the model on the testing set
y_pred = model.predict(X_test)

# Calculate the error metrics
from sklearn.metrics import mean_absolute_error, mean_squared_error

mae = mean_absolute_error(y_test, y_pred)
rmse = np.sqrt(mean_squared_error(y_test, y_pred))
mape = np.mean(np.abs((y_test - y_pred)/y_test))*100

print("Mean Absolute Error:", mae)
print("Root Mean Squared Error:", rmse)
print("Mean Absolute Percentage Error:", mape)


Mean Absolute Error: 6993.415440978396
Root Mean Squared Error: 14521.684774843667
Mean Absolute Percentage Error: 65.85970377933408


## Extreme Gradient Boosting Regressor

In [41]:
# Import the necessary libraries and packages
from xgboost import XGBRegressor
from sklearn.model_selection import train_test_split

# Select the variables to use in the model
X = cars[["make_encoded", "model_encoded", "version_encoded", "fuel_encoded",  "year", "kms", "power", "doors", "shift_encoded", "color_encoded", "is_professional", "province_encoded"]]
y = cars["price"]

# Split the data into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3)

# Train an XGBoost model
model = XGBRegressor()
model.fit(X_train, y_train)

# Evaluate the model on the testing set
y_pred = model.predict(X_test)

# Calculate the error metrics
from sklearn.metrics import mean_absolute_error, mean_squared_error

mae = mean_absolute_error(y_test, y_pred)
rmse = np.sqrt(mean_squared_error(y_test, y_pred))
mape = np.mean(np.abs((y_test - y_pred)/y_test))*100

print("Mean Absolute Error:", mae)
print("Root Mean Squared Error:", rmse)
print("Mean Absolute Percentage Error:", mape)


Mean Absolute Error: 2145.538816545436
Root Mean Squared Error: 5763.356462236575
Mean Absolute Percentage Error: 16.17041733237943


# Example Evaluation

In [44]:

new_row_2 = {'make':'CITROEN', 'model': 'C1', 'version': 'CITROEN C1 PureTech 60KW 82CV Feel 5p.', 'fuel': 'Gasolina', 'year': 2000.0, 'kms': 55505, 'power': 82.0, 
'doors': 5, 'shift': 'Manual', 'color':'Blanco', 'is_professional': True, 'province': 'Tenerife'}

new_row = {'make':'BMW', 'model': 'Serie 3', 'version': 'BMW Serie 3 320D 4p.', 'fuel': 'Diésel', 'year': 2004.0, 'kms': 220000.0, 'power': 150.0, 
'doors': 5, 'shift': 'Manual', 'color':'Azul', 'is_professional': False, 'province': 'Valencia'}

my_car = pd.DataFrame(new_row, index=[0])

# Transform the "make" column into numeric values
make_encoded = make_le.transform(my_car["make"])
model_encoded = model_le.transform(my_car["model"])
version_encoded = version_le.transform(my_car["version"])
fuel_encoded = fuel_le.transform(my_car["fuel"])
shift_encoded = shift_le.transform(my_car["shift"])
color_encoded = color_le.transform(my_car["color"])  
province_encoded = province_le.transform(my_car["province"]) 

# Add the encoded values to the dataset
my_car["make_encoded"] = make_encoded
my_car["model_encoded"] = model_encoded
my_car["version_encoded"] = version_encoded
my_car["fuel_encoded"] = fuel_encoded
my_car["shift_encoded"] = shift_encoded
my_car["color_encoded"] = color_encoded 
my_car["province_encoded"] = province_encoded

X = my_car[["make_encoded", "model_encoded", "version_encoded", "fuel_encoded",  "year", "kms", "power", "doors", "shift_encoded", "color_encoded", "is_professional", "province_encoded"]]

my_car.head(1)

y_pred = model.predict(X)

print("An aproximated fair Price would be: ", y_pred,  "€")

An aproximated fair Price would be:  [3735.23]
