In [58]:
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split
from feature_engine.imputation import (AddMissingIndicator, 
                                       MeanMedianImputer, 
                                       CategoricalImputer
                                       )


## Leer data gold

In [None]:
# Columns kept for modelling: the 'price' column plus the selected predictors.
features = [
    "price",
    "antiguedad",
    "kilometraje",
    "vehicle_brand",
    "vehicle_line",
    'location_state',
]

# Read the gold-layer workbook and keep only the selected columns, in the order listed above.
data_gold = pd.read_excel("../data/gold/car_gold.xlsx")[features]
data_gold.head()

In [26]:
# Split into train and test sets: every column except 'price' is a predictor,
# and 'price' is the target.
predictors = data_gold.drop(columns=['price'])
target = data_gold['price']

# 30% of the rows are held out for testing; the fixed seed keeps the split reproducible.
X_train, X_test, y_train, y_test = train_test_split(
    predictors,
    target,
    test_size=0.3,
    random_state=0,
)

In [None]:
# Report the dimensions of each split. The targets are summarised by their
# shape as well, instead of printing the whole Series, so the output stays
# compact and directly comparable with the predictor shapes.
print(X_train.shape)
print(X_test.shape)
print(y_train.shape)
print(y_test.shape)

(5075, 5)

## missing indicator

In [None]:
# Predictor columns that contain at least one missing value in the dataset.
# Iterating over X_train.columns (rather than data_gold.columns) keeps the
# target 'price' out of the list: 'price' is not a column of X_train, so
# passing it to the indicator would make fit() fail whenever 'price' has
# missing values.
vars_with_na = [var for var in X_train.columns if data_gold[var].isnull().any()]

# Fit the missing-value indicator on the training split and apply it,
# producing the frame that the later imputation steps continue to transform.
indicator = AddMissingIndicator(variables=vars_with_na)
indicator.fit(X_train)
X_train_transformed = indicator.transform(X_train)
X_train_transformed.head()

## Variables numéricas con NA

In [None]:
# Numeric (int64 / float64) columns of the gold dataset. Note that this is
# taken from data_gold, so it still includes the target 'price'.
num_vars = data_gold.select_dtypes(include=['int64', 'float64']).columns

# Numeric variables with missing values that are actually present in the frame
# being imputed. The membership check on X_train_transformed keeps the target
# (or any other column absent from the training predictors) out of the list,
# since asking the imputer to fit a column that is not in the frame would fail.
num_vars_na = [
    var for var in num_vars
    if var in vars_with_na and var in X_train_transformed.columns
]

# Median imputation, with the medians learned from the training split only.
numerical_imputer = MeanMedianImputer(imputation_method='median',
                                      variables=num_vars_na)

numerical_imputer.fit(X_train_transformed)
X_train_transformed = numerical_imputer.transform(X_train_transformed)


## Variables categóricas con NA

In [None]:
# Object-dtype columns of the gold dataset are treated as the categorical variables.
cat_vars = data_gold.select_dtypes(include=['object']).columns

# Of those, keep the ones previously flagged as containing missing values.
cat_vars_na = [name for name in cat_vars if name in vars_with_na]

# Configure the imputer with the 'missing' method and the fill value 'missing'
# for the selected categorical variables.
categorical_imputer = CategoricalImputer(
    imputation_method='missing',
    fill_value='missing',
    variables=cat_vars_na,
)

# Fit on the current training frame, then replace it with the imputed version.
X_train_transformed = (
    categorical_imputer
    .fit(X_train_transformed)
    .transform(X_train_transformed)
)
X_train_transformed.head()