In [19]:
# Import the basic libraries for analysis

import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

In [None]:
# Load the car listings sample (path is relative to the notebook's working directory)
df = pd.read_csv('coches-de-segunda-mano-sample.csv')

# Look at the variable names
print(df.columns)

# Check the data types
print(df.dtypes)

# Summary statistics.
# describe must be *called*: printing the bare attribute only shows the
# bound-method repr, not the statistics.
print(df.describe())

# Preview the first rows (last expression of the cell, so it is rendered as a table)
df.head()

In [None]:
# Histogram of prices
sns.histplot(df['price'])
plt.show()

# Scatter plot of price vs. km.
# Mileage goes on the x-axis and price on the y-axis, matching the stated intent
# (the previous version plotted price against color).
sns.scatterplot(x='kms', y='price', data=df)
plt.show()

# Box plot of price by make
sns.boxplot(x="make", y="price", data=df)
plt.show()

In [None]:
# Check for missing values
print(df.isnull().sum())

# Delete column price_financed (not useful and lots of NA).
# Rebinding with errors='ignore' instead of dropping in place keeps this cell
# re-runnable: a second run no longer raises KeyError on the already-removed column.
df = df.drop(columns=['price_financed'], errors='ignore')

# Drop rows with missing values.
# .copy() gives `cars` its own data, so columns assigned to it later do not
# write into (or warn about) a derived view of `df`.
cars = df.dropna().copy()

# Show the number of rows remaining after cleaning
cars.shape[0]

In [None]:
# Mean price for every make
price_groups = cars.groupby("make")["price"]
avg_price_by_make = price_groups.mean()
print(avg_price_by_make)

# Listings that combine high mileage (kms > 100000) with a low price (price < 5000)
is_high_km = cars['kms'] > 100000
is_low_price = cars['price'] < 5000
high_kms_low_price = cars.loc[is_high_km & is_low_price]
print(high_kms_low_price)

In [None]:
# Visualize the pairwise relationships between the variables
sns.pairplot(cars)
plt.show()

# Calculate Pearson's correlation coefficient.
# Restrict to numeric columns first: DataFrame.corr raises on non-numeric data
# in pandas >= 2.0 (older versions silently dropped such columns), and `cars`
# presumably still holds text columns such as make/model -- verify with df.dtypes.
corr = cars.select_dtypes(include="number").corr()
print(corr)

In [25]:
# Build a linear regression model to predict price from the numeric features

from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_absolute_error, mean_squared_error
from sklearn.model_selection import train_test_split


# Select the variables to use in the model
X = cars[["kms", "year", "power", "doors"]]
y = cars["price"]

# Split the data into training and testing sets.
# A fixed random_state makes the split -- and therefore the metrics printed
# below -- reproducible across kernel restarts.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.3, random_state=42
)

# Train a linear regression model
model = LinearRegression()
model.fit(X_train, y_train)

# Evaluate the model on the testing set
y_pred = model.predict(X_test)

# Calculate the error metrics
mae = mean_absolute_error(y_test, y_pred)
rmse = np.sqrt(mean_squared_error(y_test, y_pred))
# MAPE expressed in percent.
# NOTE(review): a price of 0 in y_test would make this infinite -- confirm prices are strictly positive.
mape = np.mean(np.abs((y_test - y_pred) / y_test)) * 100

print("Mean Absolute Error:", mae)
print("Root Mean Squared Error:", rmse)
print("Mean Absolute Percentage Error:", mape)

Mean Absolute Error: 5134.039085329691
Root Mean Squared Error: 10325.95646774424
Mean Absolute Percentage Error: 55.50875942241478


In [None]:
from sklearn.preprocessing import LabelEncoder

# One LabelEncoder per categorical column. Each encoder keeps its own name so
# the fitted mapping stays available afterwards (e.g. for inverse_transform).
make_le = LabelEncoder()
model_le = LabelEncoder()
version_le = LabelEncoder()
fuel_le = LabelEncoder()
shift_le = LabelEncoder()
color_le = LabelEncoder()

# Source column -> encoder that handles it
encoders = {
    "make": make_le,
    "model": model_le,
    "version": version_le,
    "fuel": fuel_le,
    "shift": shift_le,
    "color": color_le,
}

# Fit every encoder on its column and transform that column into integer codes
encoded_columns = {
    f"{column}_encoded": encoder.fit_transform(cars[column])
    for column, encoder in encoders.items()
}

# Add the encoded values to the dataset.
# "color_encoded" is included here: the tree-based models select it as a
# feature, and it was previously computed but never stored on `cars`.
# assign() builds a new frame, so re-running the cell simply overwrites the
# *_encoded columns and no chained-assignment warning is raised.
cars = cars.assign(**encoded_columns)

In [31]:
# Import the necessary libraries and packages

from sklearn.metrics import mean_absolute_error, mean_squared_error
from sklearn.model_selection import train_test_split
from sklearn.tree import DecisionTreeRegressor


# Select the variables to use in the model (numeric features plus label-encoded categoricals)
feature_columns = [
    "make_encoded", "model_encoded", "version_encoded", "fuel_encoded",
    "kms", "year", "power", "doors", "shift_encoded", "color_encoded",
]
X = cars[feature_columns]
y = cars["price"]

# Split the data into training and testing sets.
# A fixed random_state makes the split reproducible across kernel restarts.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.3, random_state=42
)

# Train a tree model; random_state also fixes the tree's internal randomness,
# so the fitted tree and its metrics are the same on every run.
model = DecisionTreeRegressor(random_state=42)
model.fit(X_train, y_train)

# Evaluate the model on the testing set
y_pred = model.predict(X_test)

# Calculate the error metrics
mae = mean_absolute_error(y_test, y_pred)
rmse = np.sqrt(mean_squared_error(y_test, y_pred))
# MAPE expressed in percent.
# NOTE(review): a price of 0 in y_test would make this infinite -- confirm prices are strictly positive.
mape = np.mean(np.abs((y_test - y_pred) / y_test)) * 100

print("Mean Absolute Error:", mae)
print("Root Mean Squared Error:", rmse)
print("Mean Absolute Percentage Error:", mape)


Mean Absolute Error: 2984.5687867033025
Root Mean Squared Error: 7760.163329674625
Mean Absolute Percentage Error: 22.677228778256975


In [30]:
# Import the necessary libraries and packages

from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_absolute_error, mean_squared_error
from sklearn.model_selection import train_test_split

# Select the variables to use in the model (numeric features plus label-encoded categoricals)
feature_columns = [
    "make_encoded", "model_encoded", "version_encoded", "fuel_encoded",
    "kms", "year", "power", "doors", "shift_encoded", "color_encoded",
]
X = cars[feature_columns]
y = cars["price"]

# Split the data into training and testing sets.
# A fixed random_state makes the split reproducible across kernel restarts.
# NOTE(review): test_size is 0.2 here but 0.3 for the linear and tree models,
# so the metrics are computed on different hold-out sets -- confirm this is intended.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42
)

# Train a random forest model; random_state fixes the bootstrap sampling and
# feature selection so the fitted forest is the same on every run.
model = RandomForestRegressor(random_state=42)
model.fit(X_train, y_train)

# Evaluate the model on the testing set
y_pred = model.predict(X_test)

# Calculate the error metrics
mae = mean_absolute_error(y_test, y_pred)
rmse = np.sqrt(mean_squared_error(y_test, y_pred))
# MAPE expressed in percent.
# NOTE(review): a price of 0 in y_test would make this infinite -- confirm prices are strictly positive.
mape = np.mean(np.abs((y_test - y_pred) / y_test)) * 100

print("Mean Absolute Error:", mae)
print("Root Mean Squared Error:", rmse)
print("Mean Absolute Percentage Error:", mape)


Mean Absolute Error: 2194.693146370009
Root Mean Squared Error: 5745.761038859308
Mean Absolute Percentage Error: 16.60734193096273
