Problema tomado de: https://github.com/fferegrino/cf-ml/blob/main/car-prices/car-price.ipynb

# Predicción de precios de automóviles usados

In [3]:
%matplotlib inline
import pandas as pd
import numpy as np
from pandas_profiling import ProfileReport
import os
import seaborn as sns
import matplotlib.pyplot as plt

In [4]:
# Load the raw car listings; the path is relative to the notebook's working directory.
cars = pd.read_csv('cars.csv')

In [5]:
# Peek at the first rows to check the columns that were read.
cars.head()

Unnamed: 0,maker,model,year,transmission,mileage,fuelType,tax,mpg,engineSize,price
0,cclass,C Class,2020,Automatic,1200,Diesel,,,2.0,30495
1,cclass,C Class,2020,Automatic,1000,Petrol,,,1.5,29989
2,cclass,C Class,2020,Automatic,500,Diesel,,,2.0,37899
3,cclass,C Class,2019,Automatic,5000,Diesel,,,2.0,30399
4,cclass,C Class,2019,Automatic,4500,Diesel,,,2.0,29899


## Análisis Exploratorio de Datos

In [6]:
# Build an exploratory profiling report of the raw data and export it as HTML.
report_options = {"title": "Raw Car Dataset Analysis", "explorative": True}
profile = ProfileReport(cars, **report_options)
profile.to_file("cars-report.html")

Summarize dataset: 100%|██████████| 61/61 [00:12<00:00,  4.78it/s, Completed]                     
Generate report structure: 100%|██████████| 1/1 [00:02<00:00,  2.72s/it]
Render HTML: 100%|██████████| 1/1 [00:01<00:00,  1.51s/it]
Export report to file: 100%|██████████| 1/1 [00:00<00:00, 143.44it/s]


#### Eliminar valores duplicados

In [7]:
# Report the row count before and after removing exact duplicate rows
# (the first occurrence of each duplicated row is kept).
rows_before = len(cars)
print(rows_before)
cars = cars.drop_duplicates(keep='first')
rows_after = len(cars)
print(rows_after)

108540
106267


### Dividir Dataset

In [8]:
from sklearn.model_selection import train_test_split

In [9]:
# Fixed seed so the train/val/test partition -- and every metric computed
# from it -- is the same on each Restart & Run All.
SPLIT_SEED = 42

# Hold out 20% of all rows as the final test set ...
rest, test = train_test_split(cars, test_size=0.2, shuffle=True, random_state=SPLIT_SEED)
# ... then take 25% of the remaining 80% (i.e. 20% of the total) for validation.
train, val = train_test_split(rest, test_size=0.25, shuffle=True, random_state=SPLIT_SEED)
distributions = np.array([len(train), len(val), len(test)])

# Absolute sizes, then the fraction of the dataset in each split
# (order: train, val, test).
print(distributions)
print(distributions / len(cars))

[63759 21254 21254]
[0.59998871 0.20000565 0.20000565]


### One-hot encode con variables categóricas

In [10]:
from sklearn.preprocessing import OneHotEncoder
# Encoder with default settings; it is fitted on the training `maker` column in the next cell.
maker_encoder = OneHotEncoder()

In [11]:
# Learn the maker categories from the training split only, then encode that
# same column and densify the result so it can be inspected.
maker_column = train[["maker"]]
maker_encoder.fit(maker_column)
encoded_makers = maker_encoder.transform(maker_column)
mkr = encoded_makers.todense()

# One row per training example, one column per learned category.
print(mkr.shape)

(63759, 11)


In [12]:
# Categories learned during fit: a list with one array per encoded column.
maker_encoder.categories_

[array(['audi', 'bmw', 'cclass', 'focus', 'ford', 'hyundi', 'merc',
        'skoda', 'toyota', 'vauxhall', 'vw'], dtype=object)]

In [13]:
# Label each one-hot column with its category so the encoding can be read
# next to the original value.
# `categories_` is a list with one array per encoded column; take the array
# for `maker` ([0]) so pandas builds a flat column index instead of a
# single-level MultiIndex.
df = pd.DataFrame(mkr, columns=maker_encoder.categories_[0], index=train.index)
# Assign a Series (not a one-column DataFrame) so it aligns on the row index.
df["actual"] = train["maker"]
df.sample(5)

Unnamed: 0,audi,bmw,cclass,focus,ford,hyundi,merc,skoda,toyota,vauxhall,vw,actual
48154,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,ford
47600,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,ford
41252,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,ford
56147,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,vauxhall
47210,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,ford


In [14]:
# A single value to encode, standing in for one new observation.
test_maker = "audi"
# get_dummies builds its columns from the values it is given, so a single
# value produces a single column.
pd.get_dummies([test_maker])

Unnamed: 0,audi
0,1


In [15]:
# The fitted encoder keeps the categories learned from the training data, so
# the same single value is encoded with one column per known maker.
maker_encoder.transform([[test_maker]]).todense()

matrix([[1., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.]])

#### Feature Scaling
Existen algoritmos que basan su entrenamiento únicamente en números, sin contexto alguno. Algunos de ellos tienden a otorgar mayor importancia a aquellos números cuyo valor es más grande. Una apuesta segura es escalar los valores de una característica de tal modo que todos se encuentren en la misma escala, pero preservando las distancias relativas entre ellos.

In [16]:
from sklearn.preprocessing import RobustScaler, StandardScaler, MinMaxScaler, MaxAbsScaler
# Scaler used for the mileage demonstration in the following cells.
scaler = MaxAbsScaler()

In [17]:
# Fit the scaler on the training mileage only, then scale that same column.
mileage_column = train[["mileage"]]
scaler.fit(mileage_column)
scaled = scaler.transform(mileage_column)

In [18]:
# Show raw and scaled mileage side by side for a few random rows.
comparison_columns = {
    "mileage": train["mileage"].values,
    "scaled": scaled.squeeze(),
}
values = pd.DataFrame(comparison_columns)
values.sample(5)

Unnamed: 0,mileage,scaled
28808,24015,0.07435
14682,7500,0.02322
45986,7497,0.023211
26824,40812,0.126353
46472,8952,0.027715


# Artefactos
Hemos visto una diversidad de herramientas que nos sirven para transformar una de nuestras observaciones del mundo real, como el diálogo emitido por una persona o un automóvil, a un grupo de números.

Cosas como el OneHotEncoder, CountVectorizer y MaxAbsScaler forman parte de este conjunto de herramientas que, una vez preparadas con fit, debemos preservar para poder re-usarlas en producción. Estas herramientas son conocidas como artefactos.

In [19]:
import pickle

# Persist the fitted scaler so the exact same transformation can be reused later.
with open("scaler.pickle", "wb") as scaler_file:
    pickle.dump(scaler, scaler_file)

In [20]:
# Restore the scaler written by the previous cell.
# Only unpickle files you created yourself: pickle.load can execute arbitrary code.
with open("scaler.pickle", "rb") as scaler_file:
    scaler_loaded = pickle.load(scaler_file)

In [21]:
# Check that the restored scaler can transform a new mileage value.
scaler_loaded.transform([[40000]])


array([[0.12383901]])

# Pipelines
A lo largo del modelado creamos un montón de artefactos que debemos conservar para asegurarnos de que usaremos los mismos valores, parámetros e hiperparámetros. Una alternativa sería guardar cada uno de los OneHotEncoder, MinMaxScaler y cualquier otro objeto que creamos para entrenar nuestro modelo de ML.

Otra forma de hacerlo, un poco más organizada, es hacer uso de un Pipeline de scikit-learn:

In [1]:
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import RobustScaler, MinMaxScaler
from sklearn.linear_model import LogisticRegression
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import FeatureUnion
from sklearn.impute import SimpleImputer
from sklearn import set_config

In [22]:
# One-hot encode maker, transmission and fuelType
one_hot_encode = ColumnTransformer(
    [
        (
            'one_hot_encode',  # Name of the transformation
            # handle_unknown="ignore": a category that only appears in the
            # validation/test split (or in production) is encoded as all
            # zeros instead of making transform() raise.
            OneHotEncoder(handle_unknown="ignore"),  # Transformation to apply
            ["maker", "transmission", "fuelType"],  # Columns involved
        )
    ],
    # Always return a dense array. This replaces OneHotEncoder(sparse=False):
    # that argument was renamed to `sparse_output` in scikit-learn 1.2 and
    # later removed, while `sparse_threshold` works across versions.
    sparse_threshold=0,
)

In [23]:
# Scale mileage with RobustScaler
robust_encoding = ColumnTransformer(
    transformers=[
        (
            'robust_encoding',  # Name of the transformation
            RobustScaler(),     # Transformation to apply
            ["mileage"],        # Columns involved
        ),
    ]
)

In [25]:
# Fill missing mpg/tax values with the column mean, then min-max scale them.
# Note: the names below say "standard", but the scaler applied is MinMaxScaler.
impute_and_scale = Pipeline(
    steps=[
        ('impute', SimpleImputer(strategy='mean')),
        ('scale', MinMaxScaler()),
    ]
)

standard_scaling = ColumnTransformer(
    transformers=[
        ('standard_scaling', impute_and_scale, ["mpg", "tax"]),
    ]
)

In [26]:
# Forward year and engineSize to the model without transforming them
passthrough = ColumnTransformer(
    transformers=[
        ('passthrough', 'passthrough', ['year', "engineSize"]),
    ]
)

In [27]:
# Assemble the whole feature pipeline: each branch produces its own block of
# columns and FeatureUnion concatenates them side by side, in this order.
feature_branches = [
    ('one_hot_encode', one_hot_encode),
    ('robust_encoding', robust_encoding),
    ('just_passs', passthrough),
    ('scale_and_impute', standard_scaling),
]

pipe = Pipeline(steps=[('features', FeatureUnion(transformer_list=feature_branches))])

In [28]:
from sklearn import set_config

# Render estimators as a diagram, then display the assembled pipeline.
set_config(display="diagram")
pipe

In [31]:
# Fit every transformer on the training split only, then look at the
# resulting numeric feature matrix.
pipe.fit(train)

train_features = pipe.transform(train)
pd.DataFrame(train_features)

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,15,16,17,18,19,20,21,22,23,24
0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,...,0.0,0.0,0.0,0.0,1.0,0.557576,2017.0,1.5,0.124548,0.250000
1,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,1.0,-0.529576,2019.0,1.5,0.082678,0.250000
2,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,...,0.0,0.0,0.0,0.0,1.0,-0.500889,2019.0,1.0,0.110521,0.250000
3,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,1.0,-0.295313,2017.0,1.0,0.127099,0.034483
4,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,...,1.0,0.0,0.0,0.0,0.0,0.927798,2016.0,2.1,0.132837,0.051724
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
63754,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,...,1.0,0.0,0.0,0.0,0.0,-0.035838,2017.0,2.0,0.129862,0.258621
63755,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,1.0,0.298465,2016.0,1.0,0.135813,0.034483
63756,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,...,0.0,0.0,0.0,0.0,1.0,-0.088929,2018.0,1.4,0.117109,0.250000
63757,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,1.0,-0.533333,2019.0,1.0,0.096281,0.250000


In [32]:
# Apply the already-fitted pipeline to the test split (transform only, no re-fitting).
pd.DataFrame(pipe.transform(test))

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,15,16,17,18,19,20,21,22,23,24
0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,...,1.0,0.0,0.0,0.0,0.0,-0.073939,2018.0,1.6,0.145802,0.250000
1,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,1.0,0.0,0.0,0.0,0.0,-0.610626,2020.0,2.0,0.116839,0.206950
2,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,...,1.0,0.0,0.0,0.0,0.0,0.639717,2017.0,1.5,0.116839,0.206950
3,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,1.0,0.0,0.0,-0.711232,2020.0,2.5,0.428268,0.232759
4,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,1.0,0.0,0.0,0.0,0.0,2.194384,2017.0,2.0,0.142402,0.034483
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
21249,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,...,0.0,0.0,0.0,0.0,1.0,0.267232,2017.0,1.4,0.117109,0.250000
21250,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,...,0.0,0.0,0.0,0.0,1.0,-0.711111,2019.0,2.0,0.080553,0.250000
21251,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,...,1.0,0.0,0.0,0.0,0.0,1.854343,2013.0,2.0,0.112646,0.250000
21252,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,1.0,-0.558667,2016.0,2.0,0.086291,0.344828


# Modelado

In [33]:
from sklearn.linear_model import LinearRegression

In [34]:
# Linear regression with default settings; used as the final pipeline step below.
lr = LinearRegression()

In [35]:
# Chain feature extraction and the regressor so a single object goes from a
# raw DataFrame to a price prediction.
prediction_steps = [
    ('feature', pipe),
    ('estimator', lr),
]
predicting_pipeline = Pipeline(steps=prediction_steps)


In [36]:
# Fit features + regressor on the training split, with `price` as the target.
# The whole `train` frame is passed as X; the ColumnTransformers inside `pipe`
# pick only the columns they list (none of them lists `price`).
predicting_pipeline.fit(train, train['price'])

In [37]:
# Predictions on the data the model was fitted on and on the validation split.
train_pred = predicting_pipeline.predict(train)
val_pred = predicting_pipeline.predict(val)

In [38]:
# Real vs. predicted price for the validation rows.
pd.DataFrame({'real':val['price'], 'predicted':val_pred})

Unnamed: 0,real,predicted
11698,48950,38475.25
10774,13898,14678.25
21006,8498,8684.50
63606,14990,15662.00
62080,4000,-1251.75
...,...,...
357,24990,23396.50
16164,31950,29213.00
97917,14286,23751.75
87873,11000,16866.00


# Evaluación de los modelos

In [39]:
from sklearn.metrics import mean_absolute_error
from sklearn.metrics import mean_squared_error

In [44]:
# These values come from mean_absolute_error, so they are MAE (in the same
# units as `price`), not MSE; names and labels now say so.
train_mae = mean_absolute_error(train['price'], train_pred)
val_mae = mean_absolute_error(val['price'], val_pred)

print(f"Entrenamiento MAE: {train_mae:.2f}\n"
      f"Validación MAE:    {val_mae:.2f}")

Entrenamiento MSE: 2933.31
Validación MSE:    2936.39


#### Evaluación de los datos de prueba

In [45]:
# Final check on the held-out test split.
test_pred = predicting_pipeline.predict(test)
# Computed with mean_absolute_error, so this is MAE (not MSE).
test_mae = mean_absolute_error(test['price'], test_pred)

print(f"Prueba MAE: {test_mae:.2f}")

Prueba MSE: 2938.75


# Guardar pipeline

In [46]:
from joblib import dump, load
# Persist the whole fitted pipeline (feature transformers + regressor) as one artifact.
dump(predicting_pipeline, 'car-prices.model') 

['car-prices.model']

#### Probando modelo con nuestros valores

In [47]:
# Reload the artifact written above to check that it works on its own.
saved_pipeline = load('car-prices.model')

Datos propios

In [60]:
# Description of the car we want a price estimate for.
maker, model, year = "ford", "focus", 2020
transmission, fuelType = "Manual", "Petrol"
mileage, tax, mpg = 50, 100, 300
engineSize = 1.5
price = 0  # placeholder so the frame has the same columns as the training data

# Single-row frame with the same column names and order as the training data.
record = {
    "maker": maker,
    "model": model,
    "year": year,
    "transmission": transmission,
    "mileage": mileage,
    "fuelType": fuelType,
    "tax": tax,
    "mpg": mpg,
    "engineSize": engineSize,
    "price": price,
}
mi_automóvil = pd.DataFrame({column: [value] for column, value in record.items()})





In [61]:
# Predict with the reloaded pipeline; squeeze() removes the length-1 axis of the result.
# Note: this overwrites the placeholder `price` defined in the previous cell.
price = saved_pipeline.predict(mi_automóvil).squeeze()

print(price)

7588.0
