In [1]:
from modelbuilder.dataset import Dataset
from modelbuilder.model import Model
from modelbuilder.validation.validate import build_metrics

from api.helpers.gcp import get_df_from_bq_query
from config.config import QUERY_TRAINDATA, KEYS, FEATURES, TARGET

In [2]:
from api.train.train import load_dataset, create_pipeline, get_splitter

In [3]:
dataset = load_dataset()

Index(['ID', 'Price', 'Levy', 'Manufacturer', 'Model', 'Prod__year',
       'Category', 'Leather_interior', 'Fuel_type', 'Cylinders',
       'Gear_box_type', 'Drive_wheels', 'Mileage', 'Engine_volume',
       'int64_field_14'],
      dtype='object')
         ID  Price Levy Manufacturer Model Prod__year   Category  \
0  45427273   8467    -   MITSUBISHI     I       2013  Hatchback   
1  45798484  51746    -          GAZ    20       1953      Sedan   
2  45792773   3200    -          GAZ    20       1953       Jeep   
3  45791976   7840    -          GAZ    21       1964      Sedan   
4  45812939   5331    -          GAZ    24       1990  Universal   

  Leather_interior Fuel_type Cylinders Gear_box_type Drive_wheels Mileage  \
0               No    Petrol       3.0     Automatic         Rear  126400   
1               No    Petrol       4.0        Manual         Rear       0   
2               No    Petrol       4.0        Manual          4x4  100000   
3               No    Petrol     

In [4]:
dataset.data.head()

Unnamed: 0,ID,Price,Levy,Manufacturer,Model,Prod__year,Category,Leather_interior,Fuel_type,Cylinders,Gear_box_type,Drive_wheels,Mileage,Engine_volume,int64_field_14
0,45427273,8467,-,MITSUBISHI,I,2013,Hatchback,No,Petrol,3.0,Automatic,Rear,126400,7,
1,45798484,51746,-,GAZ,20,1953,Sedan,No,Petrol,4.0,Manual,Rear,0,2,
2,45792773,3200,-,GAZ,20,1953,Jeep,No,Petrol,4.0,Manual,4x4,100000,32,
3,45791976,7840,-,GAZ,21,1964,Sedan,No,Petrol,4.0,Manual,Rear,0,24,
4,45812939,5331,-,GAZ,24,1990,Universal,Yes,Petrol,8.0,Manual,4x4,2000,63,


In [5]:
dataset.columns.features

['Manufacturer',
 'Model',
 'Prod__year',
 'Category',
 'Leather_interior',
 'Fuel_type',
 'Engine_volume',
 'Mileage',
 'Cylinders',
 'Gear_box_type']

In [6]:
pipeline = create_pipeline()
splitter = get_splitter()
# Create object model and fit it
model_name = "car_price"
model = Model(model_name, pipeline)

In [7]:
data = dataset.data[dataset.columns.features]
data

Unnamed: 0,Manufacturer,Model,Prod__year,Category,Leather_interior,Fuel_type,Engine_volume,Mileage,Cylinders,Gear_box_type
0,MITSUBISHI,I,2013,Hatchback,No,Petrol,7,126400,3.0,Automatic
1,GAZ,20,1953,Sedan,No,Petrol,2,0,4.0,Manual
2,GAZ,20,1953,Jeep,No,Petrol,32,100000,4.0,Manual
3,GAZ,21,1964,Sedan,No,Petrol,24,0,4.0,Manual
4,GAZ,24,1990,Universal,Yes,Petrol,63,2000,8.0,Manual
...,...,...,...,...,...,...,...,...,...,...
19232,TOYOTA,Alphard,2003,Minivan,Yes,LPG,3,190000,6.0,Automatic
19233,MITSUBISHI,Pajero,2000,Jeep,Yes,CNG,32,210000,6.0,Automatic
19234,MERCEDES-BENZ,E 500 AVG,2005,Sedan,Yes,Petrol,5,56000,12.0,Tiptronic
19235,MERCEDES-BENZ,E 500,2003,Sedan,Yes,Petrol,5,150000,12.0,Tiptronic


In [8]:
from sklearn_pandas import DataFrameMapper
from sklearn.preprocessing import LabelBinarizer
dummies = ["Fuel_type"]

mapper = DataFrameMapper(
    [(d, LabelBinarizer()) for d in dummies]
)

In [21]:
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import OneHotEncoder
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import StandardScaler

# Dividimos las columnas en categóricas y numéricas
categorical_columns = ['Category']
numeric_columns = ['Engine_volume']

# Definimos el preprocesamiento para cada tipo de columna
# Para las categóricas usamos OneHotEncoder
categorical_transformer = Pipeline(steps=[
    ('imputer', SimpleImputer(strategy='most_frequent')),  # Tratamiento de valores faltantes
    ('onehot', OneHotEncoder(handle_unknown='ignore'))])

# Para las numéricas podríamos usar un SimpleImputer para tratar valores faltantes
# y opcionalmente un escalador como StandardScaler
numeric_transformer = Pipeline(steps=[
    ('imputer', SimpleImputer(strategy='median')),  # Tratamiento de valores faltantes
    ('scaler', StandardScaler())])  # Escalado de características

# Utilizamos ColumnTransformer para aplicar transformaciones por tipo de columna
preprocessor = ColumnTransformer(
    transformers=[
        ('cat', categorical_transformer, categorical_columns),
        ('num', numeric_transformer, numeric_columns)])



In [26]:
preprocessor.fit(data[["Engine_volume", 'Category']])

result = preprocessor.transform(data[["Engine_volume", 'Category']])

In [30]:
import numpy as np
result.todense()

matrix([[ 0.        ,  0.        ,  0.        , ...,  0.        ,
          0.        , -0.02255051],
        [ 0.        ,  0.        ,  0.        , ...,  1.        ,
          0.        , -0.02398903],
        [ 0.        ,  0.        ,  0.        , ...,  0.        ,
          0.        , -0.0153579 ],
        ...,
        [ 0.        ,  0.        ,  0.        , ...,  1.        ,
          0.        , -0.02312591],
        [ 0.        ,  0.        ,  0.        , ...,  1.        ,
          0.        , -0.02312591],
        [ 0.        ,  0.        ,  0.        , ...,  1.        ,
          0.        , -0.02312591]])