In [1]:
#Importing the basic librarires fot analysis

import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

In [2]:
df = pd.read_csv('coches-de-segunda-mano-sample.csv')

# Look at the variables names
print(df.columns)

# Check the data types
print(df.dtypes)

# Summary Statistics
print(df.describe)

df.head()

Index(['url', 'company', 'make', 'model', 'version', 'price', 'price_financed',
       'fuel', 'year', 'kms', 'power', 'doors', 'shift', 'color', 'photos',
       'is_professional', 'dealer', 'province', 'country', 'publish_date',
       'insert_date'],
      dtype='object')
url                 object
company             object
make                object
model               object
version             object
price                int64
price_financed     float64
fuel                object
year               float64
kms                  int64
power              float64
doors                int64
shift               object
color               object
photos               int64
is_professional       bool
dealer              object
province            object
country             object
publish_date        object
insert_date         object
dtype: object
<bound method NDFrame.describe of                                     url                           company  \
0      e158ae0ca53119ca199c28c36

Unnamed: 0,url,company,make,model,version,price,price_financed,fuel,year,kms,...,doors,shift,color,photos,is_professional,dealer,province,country,publish_date,insert_date
0,e158ae0ca53119ca199c28c36b5c2fcd,9881bcdd5a0ad4733037b3fb25e69c3a,SEAT,Toledo,SEAT Toledo 4p.,950,,Diésel,2000.0,227000,...,4,Manual,Verde,5,False,0f4bb8455d27349b8273109b66a847f3,Navarra,Spain,2020-12-18 10:47:13,2021-01-15 00:00:00
1,ff267ebb7e700246f47f84f3db660b4b,9881bcdd5a0ad4733037b3fb25e69c3a,CITROEN,C1,CITROEN C1 PureTech 60KW 82CV Feel 5p.,6200,,Gasolina,2017.0,50071,...,5,Manual,Blanco,6,True,Autos Raymara,Tenerife,Spain,2021-01-02 11:25:40,2021-01-15 00:00:00
2,de4b02db28ea7786c622b969be10c7c7,9881bcdd5a0ad4733037b3fb25e69c3a,FORD,Transit Connect,FORD Transit Connect Van 1.5 TDCi 100cv Ambien...,7851,7024.0,Diésel,2016.0,103000,...,4,Manual,Blanco,10,True,Auto 96,Barcelona,Spain,2020-12-16 10:51:45,2021-01-15 00:00:00
3,0449972a4d07594acf92e9a7dd28b39c,9881bcdd5a0ad4733037b3fb25e69c3a,VOLKSWAGEN,Caravelle,VOLKSWAGEN Caravelle Largo 2.0 TDI 140 Comfort...,19426,,Diésel,2014.0,120000,...,4,Manual,Blanco,9,True,Inniauto,Navarra,Spain,2020-11-25 11:09:14,2021-01-15 00:00:00
4,12c4fa49bd4fdf23f19ecf396d3f02ef,9881bcdd5a0ad4733037b3fb25e69c3a,FORD,Transit,FORD Transit 350 96kW L4 Ambiente Propulsion T...,22850,22800.0,Diésel,2017.0,107000,...,2,Manual,Blanco,4,True,"Autofleet España,s.l",Sevilla,Spain,2021-01-12 20:00:34,2021-01-15 00:00:00


In [3]:
# Check for missing values
print(df.isnull().sum())

# Delete column price_financed (not useful and lots of na)
df.drop(['price_financed'], axis = 1, inplace = True)

# Delete other non useful columns
df.drop(columns=['url', 'company', 'publish_date', 'insert_date', 'dealer', 'country'], inplace=True)

# Drop rows with missing values
cars = df.dropna()

# Show new dataframe
cars.shape[0]

url                    0
company                0
make                   2
model                  5
version                0
price                  0
price_financed     26437
fuel                  46
year                   2
kms                    0
power               8528
doors                  0
shift                111
color                594
photos                 0
is_professional        0
dealer                 0
province               6
country                0
publish_date           0
insert_date            0
dtype: int64


41245

In [4]:
# Calculate the average price by make
avg_price_by_make = cars.groupby("make")["price"].mean()
print(avg_price_by_make)

# Find the cars with high km and low price
high_kms_low_price =  cars[(cars['kms'] > 100000) & (cars['price'] < 5000)]
print(high_kms_low_price)

make
ABARTH          19098.333333
ALFA ROMEO      16249.721992
ALPINE          66000.000000
ASTON MARTIN    89578.666667
AUDI            21693.847582
                    ...     
TESLA           53672.956522
TOYOTA          13848.601591
UMM              8000.000000
VOLKSWAGEN      15160.267028
VOLVO           23194.260749
Name: price, Length: 71, dtype: float64
             make        model  \
12     VOLKSWAGEN         Polo   
43        PEUGEOT         1007   
75        PEUGEOT          406   
83     VOLKSWAGEN       Passat   
88           FIAT  Doblò Cargo   
...           ...          ...   
49937        SEAT        Ibiza   
49944     RENAULT       Mégane   
49958        FORD        Focus   
49982     CITROEN     Berlingo   
49999     CITROEN           C2   

                                                 version  price  \
12                 VOLKSWAGEN Polo 75 Trendline Auto 3p.   1300   
43                   PEUGEOT 1007 1.6 Sporty 2Tronic 3p.   4200   
75                        

# MODELS 

## Linear Regression

In [5]:
# Build a linear regression model to predict

from sklearn.linear_model import LinearRegression
from sklearn.model_selection import train_test_split


# Select the variables to use in the model
X = cars[["kms", "year","power","doors"]]
y = cars["price"]

# Split the data into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3)

# Train a linear regression model
model = LinearRegression()
model.fit(X_train, y_train)

# Evaluate the model on the testing set
y_pred = model.predict(X_test)

# Calculate the error metrics
from sklearn.metrics import mean_absolute_error, mean_squared_error

mae = mean_absolute_error(y_test, y_pred)
rmse = np.sqrt(mean_squared_error(y_test, y_pred))
mape = np.mean(np.abs((y_test - y_pred)/y_test))*100

print("Mean Absolute Error:", mae)
print("Root Mean Squared Error:", rmse)
print("Mean Absolute Percentage Error:", mape)

Mean Absolute Error: 5075.437847824343
Root Mean Squared Error: 10314.337195311751
Mean Absolute Percentage Error: 58.12185101910225


### Object to numeric Codification

In [6]:
from sklearn.preprocessing import LabelEncoder 

# Create a LabelEncoder object
make_le = LabelEncoder()
model_le = LabelEncoder()
fuel_le = LabelEncoder()
shift_le = LabelEncoder()
color_le = LabelEncoder()
province_le = LabelEncoder()

# Fit the encoder to the "make" column
make_le.fit(cars["make"])
model_le.fit(cars["model"]) 
fuel_le.fit(cars["fuel"])
shift_le.fit(cars["shift"])
color_le.fit(cars["color"]) 
province_le.fit(cars["province"])

# Transform the "make" column into numeric values
make_encoded = make_le.transform(cars["make"])
model_encoded = model_le.transform(cars["model"]) 
fuel_encoded = fuel_le.transform(cars["fuel"])
shift_encoded = shift_le.transform(cars["shift"])
color_encoded = color_le.transform(cars["color"])  
province_encoded = province_le.transform(cars["province"]) 

# Add the encoded values to the dataset
cars["make_encoded"] = make_encoded
cars["model_encoded"] = model_encoded
cars["fuel_encoded"] = fuel_encoded
cars["shift_encoded"] = shift_encoded
cars["color_encoded"] = color_encoded 
cars["province_encoded"] = province_encoded

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  cars["make_encoded"] = make_encoded
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  cars["model_encoded"] = model_encoded
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  cars["fuel_encoded"] = fuel_encoded
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,c

## Linear regression endcoded

In [7]:
# Build a linear regression model to predict

from sklearn.linear_model import LinearRegression
from sklearn.model_selection import train_test_split


# Select the variables to use in the model
X = cars[["make_encoded", "model_encoded", "fuel_encoded", "kms", "year", "power", "doors", "shift_encoded", "color_encoded", "photos", "is_professional", "province_encoded"]]
y = cars["price"]

# Split the data into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3)

# Train a linear regression model
model = LinearRegression()
model.fit(X_train, y_train)

# Evaluate the model on the testing set
y_pred = model.predict(X_test)

# Calculate the error metrics
from sklearn.metrics import mean_absolute_error, mean_squared_error

mae = mean_absolute_error(y_test, y_pred)
rmse = np.sqrt(mean_squared_error(y_test, y_pred))
mape = np.mean(np.abs((y_test - y_pred)/y_test))*100

print("Mean Absolute Error:", mae)
print("Root Mean Squared Error:", rmse)
print("Mean Absolute Percentage Error:", mape)

Mean Absolute Error: 4911.665824847058
Root Mean Squared Error: 9745.86050526411
Mean Absolute Percentage Error: 58.06627529155273


In [8]:
# We Calculate Pearson's correlation coefficient again but this time we have encoded the object data types to numeric
corr = cars.corr()['price'].sort_values()
print(corr)

shift_encoded      -0.424406
kms                -0.404284
make_encoded       -0.057559
doors              -0.029989
province_encoded   -0.004448
color_encoded       0.019932
model_encoded       0.039197
fuel_encoded        0.103929
photos              0.194913
is_professional     0.205932
year                0.410204
power               0.693618
price               1.000000
Name: price, dtype: float64


  corr = cars.corr()['price'].sort_values()


## Decision Tree Regressor Model 

In [31]:
# Import the necessary libraries and packages

from sklearn.model_selection import train_test_split
from sklearn.tree import DecisionTreeRegressor


# Select the variables to use in the model
X = cars[["make_encoded", "model_encoded", "fuel_encoded", "kms", "year", "power", "doors", "shift_encoded", "color_encoded", "photos", "is_professional", "province_encoded"]]
y = cars["price"]

# Split the data into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2)

# Train a tree model
model = DecisionTreeRegressor()
model.fit(X_train, y_train)

# Evaluate the model on the testing set
y_pred = model.predict(X_test)

# Calculate the error metrics
from sklearn.metrics import mean_absolute_error, mean_squared_error

mae = mean_absolute_error(y_test, y_pred)
rmse = np.sqrt(mean_squared_error(y_test, y_pred))
mape = np.mean(np.abs((y_test - y_pred)/y_test))*100

print("Mean Absolute Error:", mae)
print("Root Mean Squared Error:", rmse)
print("Mean Absolute Percentage Error:", mape)


Mean Absolute Error: 3043.620499454479
Root Mean Squared Error: 6874.627093387844
Mean Absolute Percentage Error: 22.332703212738185


## Random Forest Regressor Model

In [36]:
# Import the necessary libraries and packages

from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestRegressor

# Select the variables to use in the model
X = cars[["make_encoded", "model_encoded", "fuel_encoded",  "year", "kms", "power", "doors", "shift_encoded", "color_encoded", "photos", "is_professional", "province_encoded"]]
y = cars["price"]

# Split the data into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2)

# Train a random forest model
model = RandomForestRegressor()
model.fit(X_train, y_train)

# Evaluate the model on the testing set
y_pred = model.predict(X_test)

# Calculate the error metrics
from sklearn.metrics import mean_absolute_error, mean_squared_error

mae = mean_absolute_error(y_test, y_pred)
rmse = np.sqrt(mean_squared_error(y_test, y_pred))
mape = np.mean(np.abs((y_test - y_pred)/y_test))*100

print("Mean Absolute Error:", mae)
print("Root Mean Squared Error:", rmse)
print("Mean Absolute Percentage Error:", mape)


Mean Absolute Error: 2177.448084168933
Root Mean Squared Error: 5298.654756951197
Mean Absolute Percentage Error: 17.174194612471535


## Multi-layer Perceptron Regressor (NeuralNetwork)

In [11]:
# Import the necessary libraries and packages

from sklearn.model_selection import train_test_split
from sklearn.neural_network import MLPRegressor

# Select the variables to use in the model
X = cars[["make_encoded", "model_encoded", "fuel_encoded",  "year", "kms", "power", "doors", "shift_encoded", "color_encoded", "photos", "is_professional", "province_encoded"]]
y = cars["price"]

# Split the data into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3)

# Train a neural network model
model = MLPRegressor(hidden_layer_sizes=(32,16,8), activation="relu", solver="adam")
model.fit(X_train, y_train)

# Evaluate the model on the testing set
y_pred = model.predict(X_test)

# Calculate the error metrics
from sklearn.metrics import mean_absolute_error, mean_squared_error

mae = mean_absolute_error(y_test, y_pred)
rmse = np.sqrt(mean_squared_error(y_test, y_pred))
mape = np.mean(np.abs((y_test - y_pred)/y_test))*100

print("Mean Absolute Error:", mae)
print("Root Mean Squared Error:", rmse)
print("Mean Absolute Percentage Error:", mape)


Mean Absolute Error: 4327.088390829207
Root Mean Squared Error: 7653.607892528918
Mean Absolute Percentage Error: 47.34976894649453


## K-Nearest Neighbors Regressor

In [12]:
# Import the necessary libraries and packages

from sklearn.model_selection import train_test_split
from sklearn.neighbors import KNeighborsRegressor

# Select the variables to use in the model
X = cars[["make_encoded", "model_encoded", "fuel_encoded",  "year", "kms", "power", "doors", "shift_encoded", "color_encoded", "photos", "is_professional", "province_encoded"]]
y = cars["price"]

# Split the data into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3)

# Train a KNN regressor
model = KNeighborsRegressor(n_neighbors=3, weights='uniform')
model.fit(X_train, y_train)

# Evaluate the model on the testing set
y_pred = model.predict(X_test)

# Calculate the error metrics
from sklearn.metrics import mean_absolute_error, mean_squared_error

mae = mean_absolute_error(y_test, y_pred)
rmse = np.sqrt(mean_squared_error(y_test, y_pred))
mape = np.mean(np.abs((y_test - y_pred)/y_test))*100

print("Mean Absolute Error:", mae)
print("Root Mean Squared Error:", rmse)
print("Mean Absolute Percentage Error:", mape)


Mean Absolute Error: 7045.240046333712
Root Mean Squared Error: 13474.93651194146
Mean Absolute Percentage Error: 64.98677265962746


## Extreme Gradient Boosting Regressor

In [13]:
# Import the necessary libraries and packages
from xgboost import XGBRegressor
from sklearn.model_selection import train_test_split

# Select the variables to use in the model
X = cars[["make_encoded", "model_encoded", "fuel_encoded",  "year", "kms", "power", "doors", "shift_encoded", "color_encoded", "photos", "is_professional", "province_encoded"]]
y = cars["price"]

# Split the data into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2)

# Train an XGBoost model
model = XGBRegressor()
model.fit(X_train, y_train)

# Evaluate the model on the testing set
y_pred = model.predict(X_test)

# Calculate the error metrics
from sklearn.metrics import mean_absolute_error, mean_squared_error

mae = mean_absolute_error(y_test, y_pred)
rmse = np.sqrt(mean_squared_error(y_test, y_pred))
mape = np.mean(np.abs((y_test - y_pred)/y_test))*100

print("Mean Absolute Error:", mae)
print("Root Mean Squared Error:", rmse)
print("Mean Absolute Percentage Error:", mape)


Mean Absolute Error: 2115.631680928255
Root Mean Squared Error: 4567.94622404456
Mean Absolute Percentage Error: 16.475996327354895


# Example Evaluation

In [14]:

new_row_2 = {'make':'CITROEN', 'model': 'C1', 'fuel': 'Gasolina', 'year': 2000.0, 'kms': 55505, 'power': 82.0, 
'doors': 5, 'shift': 'Manual', 'color':'Blanco', 'photos': 7, 'is_professional': True, 'province': 'Tenerife'}

new_row = {'make':'BMW', 'model': 'Serie 3', 'fuel': 'Diésel', 'year': 2004.0, 'kms': 220000.0, 'power': 150.0, 
'doors': 5, 'shift': 'Manual', 'color':'Azul', 'photos': 7, 'is_professional': False, 'province': 'Valencia'}

my_car = pd.DataFrame(new_row, index=[0])

# Transform the "make" column into numeric values
make_encoded = make_le.transform(my_car["make"])
model_encoded = model_le.transform(my_car["model"])
fuel_encoded = fuel_le.transform(my_car["fuel"])
shift_encoded = shift_le.transform(my_car["shift"])
color_encoded = color_le.transform(my_car["color"])  
province_encoded = province_le.transform(my_car["province"]) 

# Add the encoded values to the dataset
my_car["make_encoded"] = make_encoded
my_car["model_encoded"] = model_encoded
my_car["fuel_encoded"] = fuel_encoded
my_car["shift_encoded"] = shift_encoded
my_car["color_encoded"] = color_encoded 
my_car["province_encoded"] = province_encoded

X = my_car[["make_encoded", "model_encoded", "fuel_encoded",  "year", "kms", "power", "doors", "shift_encoded", "color_encoded", "photos", "is_professional", "province_encoded"]]

my_car.head(1)

y_pred = model.predict(X)

print("An aproximated fair Price would be: ", y_pred,  "€")

An aproximated fair Price would be:  [3418.0222] €


In [15]:
le_make_mapping = dict(zip(make_le.classes_, make_le.transform(make_le.classes_)))
print(le_make_mapping)

{'ABARTH': 0, 'ALFA ROMEO': 1, 'ALPINE': 2, 'ASTON MARTIN': 3, 'AUDI': 4, 'AUSTIN': 5, 'BENTLEY': 6, 'BMW': 7, 'CADILLAC': 8, 'CHEVROLET': 9, 'CHRYSLER': 10, 'CITROEN': 11, 'CORVETTE': 12, 'CUPRA': 13, 'DACIA': 14, 'DAEWOO': 15, 'DAIHATSU': 16, 'DFSK': 17, 'DODGE': 18, 'DS': 19, 'FERRARI': 20, 'FIAT': 21, 'FORD': 22, 'GALLOPER': 23, 'HONDA': 24, 'HUMMER': 25, 'HYUNDAI': 26, 'INFINITI': 27, 'ISUZU': 28, 'IVECO': 29, 'IVECO-PEGASO': 30, 'JAGUAR': 31, 'JEEP': 32, 'KIA': 33, 'LAMBORGHINI': 34, 'LANCIA': 35, 'LAND-ROVER': 36, 'LDV': 37, 'LEXUS': 38, 'LOTUS': 39, 'MAHINDRA': 40, 'MASERATI': 41, 'MAXUS': 42, 'MAZDA': 43, 'MERCEDES-BENZ': 44, 'MG': 45, 'MINI': 46, 'MITSUBISHI': 47, 'MORGAN': 48, 'NISSAN': 49, 'OPEL': 50, 'PEUGEOT': 51, 'PIAGGIO': 52, 'PONTIAC': 53, 'PORSCHE': 54, 'RENAULT': 55, 'ROVER': 56, 'SAAB': 57, 'SANTANA': 58, 'SEAT': 59, 'SKODA': 60, 'SMART': 61, 'SSANGYONG': 62, 'SUBARU': 63, 'SUZUKI': 64, 'TATA': 65, 'TESLA': 66, 'TOYOTA': 67, 'UMM': 68, 'VOLKSWAGEN': 69, 'VOLVO': 70

In [16]:
le_model_mapping = dict(zip(model_le.classes_, model_le.transform(model_le.classes_)))
print(le_model_mapping)

{'100': 0, '1007': 1, '100D': 2, '106': 3, '107': 4, '108': 5, '124 Spider': 6, '140D': 7, '147': 8, '156': 9, '159': 10, '164': 11, '19': 12, '190': 13, '200': 14, '2008': 15, '205': 16, '206': 17, '206 +': 18, '206 SW': 19, '207': 20, '207 +': 21, '208': 22, '208 XAD': 23, '220': 24, '230': 25, '240': 26, '25': 27, '260': 28, '2CV': 29, '300': 30, '300 GT': 31, '300 ZX': 32, '3008': 33, '3008 Hybrid': 34, '300C': 35, '300M': 36, '306': 37, '307': 38, '307 SW': 39, '308': 40, '309': 41, '323': 42, '33': 43, '350': 44, '350Z': 45, '355': 46, '360': 47, '370Z': 48, '4007': 49, '4008': 50, '406': 51, '407': 52, '407 SW': 53, '45': 54, '458': 55, '488': 56, '4Runner': 57, '500': 58, '5008': 59, '500C': 60, '500L': 61, '500X': 62, '505': 63, '508': 64, '508 Hybrid': 65, '580': 66, '599': 67, '600': 68, '607': 69, '626': 70, '718': 71, '740': 72, '75': 73, '806': 74, '807': 75, '812': 76, '9-3': 77, '9-5': 78, '90': 79, '911': 80, '928': 81, '944': 82, '960': 83, '968': 84, 'A1': 85, 'A110'

In [17]:
le_fuel_mapping = dict(zip(fuel_le.classes_, fuel_le.transform(fuel_le.classes_)))
print(le_fuel_mapping)

{'Diésel': 0, 'Eléctrico': 1, 'Gas licuado (GLP)': 2, 'Gas natural (CNG)': 3, 'Gasolina': 4, 'Híbrido': 5, 'Híbrido enchufable': 6}


In [18]:
le_shift_mapping = dict(zip(shift_le.classes_, shift_le.transform(shift_le.classes_)))
print(le_shift_mapping)

{'Automático': 0, 'Manual': 1}


In [19]:
le_province_mapping = dict(zip(province_le.classes_, province_le.transform(province_le.classes_)))
print(le_province_mapping)

{'A Coruña': 0, 'Albacete': 1, 'Alicante': 2, 'Almería': 3, 'Asturias': 4, 'Badajoz': 5, 'Baleares': 6, 'Barcelona': 7, 'Burgos': 8, 'Cantabria': 9, 'Castellón': 10, 'Ceuta': 11, 'Ciudad Real': 12, 'Cuenca': 13, 'Cáceres': 14, 'Cádiz': 15, 'Córdoba': 16, 'Girona': 17, 'Granada': 18, 'Guadalajara': 19, 'Guipúzcoa': 20, 'Huelva': 21, 'Huesca': 22, 'Jaén': 23, 'La Rioja': 24, 'Las Palmas': 25, 'León': 26, 'Lleida': 27, 'Lugo': 28, 'Madrid': 29, 'Melilla': 30, 'Murcia': 31, 'Málaga': 32, 'Navarra': 33, 'Orense': 34, 'Palencia': 35, 'Pontevedra': 36, 'Salamanca': 37, 'Segovia': 38, 'Sevilla': 39, 'Soria': 40, 'Tarragona': 41, 'Tenerife': 42, 'Teruel': 43, 'Toledo': 44, 'Valencia': 45, 'Valladolid': 46, 'Vizcaya': 47, 'Zamora': 48, 'Zaragoza': 49, 'Álava': 50, 'Ávila': 51}


In [20]:
le_color_mapping = dict(zip(color_le.classes_, color_le.transform(color_le.classes_)))
print(le_color_mapping)

{'"5CA 5CD 5DN 5DP 5DQ 5DR Pintura metalizada (excepto bronce magnÃ©tico)': 0, '(( FULL, 4X4, NAVI, CALEFACTADOS, HK ))': 1, '(( NACIONAL, FULL EXTRAS, BLU-RAY ))': 2, '(( NACIONAL, IMPECABLE, FULL EXTRAS ))': 3, '(( NACIONAL, LLANTA 21", PANORÁMICO ))': 4, '(( NACIONAL, S-LINE, LLANTA 22", FULL ))': 5, '(( TODO EN FERRARI, IMPECABLE ))': 6, '((( NACIONAL  CON  8.600 KM...  )))': 7, '(--EMBRAGUE ESTRENAR--)': 8, '(12225)': 9, '- Pintura No metalizada (excepto Azul Pacifico, sin coste)': 10, '019 Negro (sÃ³lido)': 11, '02 Plata Perla (metalizado)': 12, '040 Blanco (sÃ³lido)': 13, '040 Blanco Classic (sÃ³lido)': 14, '040 Negro (estÃ¡ndar)': 15, '070 Blanco Perlado (perlado)': 16, '08 Ice Blue (metalizado)': 17, '085 Blanco Sonic (mica)': 18, '09 Blanco (sÃ³lido)': 19, '0A0A Azul Arrecife (metalizado)': 20, '0A0A Azul Arrefice (metalizado)': 21, '0C0C Gis MonzÃ³n (metalizado)': 22, '0C0C Gris MonzÃ³n (metalizado)': 23, '0C0C Gris MozÃ³n (metalizado)': 24, '0C0C Gris Pirineos (metalizado)'

In [21]:
cars.loc[cars['color'] == 'Marrón']

Unnamed: 0,make,model,version,price,fuel,year,kms,power,doors,shift,color,photos,is_professional,province,make_encoded,model_encoded,fuel_encoded,shift_encoded,color_encoded,province_encoded
63,AUDI,A4,AUDI A4 Avant 2.0 TDI 120cv 5p.,11900,Diésel,2014.0,165999,120.0,5,Manual,Marrón,21,True,Málaga,4,89,0,1,2444,32
110,OPEL,Vivaro,OPEL Vivaro 1.6 CDTI SS 92kW L2 2.9t Combi Plu...,20900,Diésel,2018.0,61823,125.0,4,Manual,Marrón,16,True,Alicante,50,802,0,1,2444,2
156,FIAT,Tipo,FIAT Tipo 1.6 Lounge 88kW 120CVdiesel Mjet II ...,12450,Diésel,2018.0,41844,120.0,5,Manual,Marrón,48,True,Valencia,21,755,0,1,2444,45
332,VOLKSWAGEN,Passat,VOLKSWAGEN Passat BlueMotion 1.6 TDI 120CV 4p.,13900,Diésel,2016.0,159000,120.0,4,Manual,Marrón,20,True,Cantabria,69,565,0,1,2444,9
356,VOLKSWAGEN,Multivan,VOLKSWAGEN Multivan 2.0 TDI 140cv Comfortline 4p.,18990,Diésel,2010.0,220000,140.0,4,Manual,Marrón,23,True,Madrid,69,525,0,1,2444,29
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
49703,SEAT,Alhambra,SEAT Alhambra 2.0 TDI 140cv Sport Plus 5p.,8800,Diésel,2007.0,110000,140.0,5,Manual,Marrón,8,False,Asturias,59,106,0,1,2444,4
49767,FORD,C-Max,FORD CMax 1.6 TDCi 90 Trend 5p.,3500,Diésel,2008.0,277003,90.0,5,Manual,Marrón,10,True,La Rioja,22,151,0,1,2444,24
49813,HYUNDAI,i30,HYUNDAI i30 1.6 CRDi 110cv Tecno 5p.,9450,Diésel,2012.0,80000,110.0,5,Manual,Marrón,10,True,Jaén,26,867,0,1,2444,23
49872,KIA,ceed,KIA ceed 1.4 CVVT 100cv Drive 5p.,7990,Gasolina,2015.0,74000,100.0,5,Manual,Marrón,10,True,Las Palmas,33,848,4,1,2444,25


In [22]:

# to_predict_list = ['BMW','Serie 3','Diésel',2004.0,220000.0,150.0,5,'Manual','Azul',7,False,'Valencia']

to_predict_list = [[7, 691, 0, 2004.0, 220000.0, 150.0, 5, 1 , 255 , 7, False, 45]]

# print(to_predict_list)
# to_predict_list = list(map(int, to_predict_list))
# print(to_predict_list)
# to_predict = np.array(to_predict_list).reshape(1, 12)
# print(to_predict)

y_pred = model.predict(to_predict_list)

print("An aproximated fair Price would be: ", y_pred,  "€")

ValueError: training data did not have the following fields: make_encoded, model_encoded, fuel_encoded, year, kms, power, doors, shift_encoded, color_encoded, photos, is_professional, province_encoded

In [None]:
import pickle

pickle.dump(model, open('model.pkl', 'wb'))