In [1]:
# importamos las librerías que necesitamos

# Tratamiento de datos
# -----------------------------------------------------------------------
import pandas as pd

# Importación de las clases creadas en nuestro archivo de soporte
# -----------------------------------------------------------------------
# from src import soporte_nulos as sn

# Librerías de visualización
# -----------------------------------------------------------------------
import seaborn as sns
import matplotlib.pyplot as plt

# Métodos estadísticos
# -----------------------------------------------------------------------
from scipy.stats import zscore  # para calcular el z-score
from sklearn.neighbors import LocalOutlierFactor  # para detectar outliers usando el método LOF
from sklearn.ensemble import IsolationForest  # para detectar outliers usando el método IF
from sklearn.neighbors import NearestNeighbors  # para calcular la epsilon
from sklearn.preprocessing import MinMaxScaler, Normalizer, StandardScaler, RobustScaler


# Display configuration
# -----------------------------------------------------------------------
# Lift the column cap so that wide DataFrames are rendered with every column visible.
pd.options.display.max_columns = None

# Importación de las clases creadas en nuestro archivo de soporte
# -----------------------------------------------------------------------
from src import soporte_preprocesamiento as sp
from src import soporte_outliers as so
from src import soporte_encoding as se

In [2]:
import warnings
warnings.filterwarnings("ignore")

In [3]:
# NOTE(review): these four scalers are already imported in the first cell of the notebook,
# so this repeated import is redundant and can be dropped once that is confirmed.
from sklearn.preprocessing import MinMaxScaler, Normalizer, StandardScaler, RobustScaler

In [4]:
# Load the dataset produced by the encoding step; the first CSV column is used as the row index.
df = pd.read_csv(
    "datos/datos_encoding.csv",
    index_col=0,
)

In [5]:
# Preview the first two rows of the loaded frame.
df.head(n=2)

Unnamed: 0,price,size,municipality,distance,hasVideo,status,hasPlan,has3DTour,floor,hasLift,propertyType_chalet,propertyType_countryHouse,propertyType_duplex,propertyType_flat,propertyType_penthouse,propertyType_studio,exterior_False,exterior_True,rooms_0,rooms_1,rooms_2,rooms_3,rooms_4,bathrooms_1,bathrooms_2,bathrooms_3,showAddress_False,showAddress_True,has360_False,has360_True,has360_desconocido
0,750.0,60.0,705.594114,7037,690.308696,695.243827,688.380091,693.661677,697.689843,702.252872,0.0,0.0,0.0,1.0,0.0,0.0,0.0,1.0,0.0,1.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,1.0,1.0,0.0,0.0
1,750.0,70.0,701.418584,16145,702.384032,695.243827,705.075598,693.661677,678.664014,687.633544,0.0,0.0,0.0,1.0,0.0,0.0,0.0,1.0,0.0,0.0,1.0,0.0,0.0,1.0,0.0,0.0,1.0,0.0,1.0,0.0,0.0


In [6]:
# Summarise the frame (column dtypes and non-null counts) before scaling.
df.info()

<class 'pandas.core.frame.DataFrame'>
Index: 353 entries, 0 to 352
Data columns (total 31 columns):
 #   Column                     Non-Null Count  Dtype  
---  ------                     --------------  -----  
 0   price                      353 non-null    float64
 1   size                       353 non-null    float64
 2   municipality               353 non-null    float64
 3   distance                   353 non-null    int64  
 4   hasVideo                   353 non-null    float64
 5   status                     353 non-null    float64
 6   hasPlan                    353 non-null    float64
 7   has3DTour                  353 non-null    float64
 8   floor                      353 non-null    float64
 9   hasLift                    353 non-null    float64
 10  propertyType_chalet        353 non-null    float64
 11  propertyType_countryHouse  353 non-null    float64
 12  propertyType_duplex        353 non-null    float64
 13  propertyType_flat          353 non-null    float64
 14 

### Feature Scaling

Hacemos el escalado de todas las columnas menos la variable respuesta `price`. Podemos hacerlo con Robust y con min_max. Los otros métodos no los contemplamos porque nuestras variables no se asemejan a una distribución normal.

Entre Robust y min_max, en principio nos quedamos con Robust, que es menos sensible a los outliers. Pero es cierto que, como vimos en el EDA, tenemos pocos outliers en nuestros datos, por lo que min_max posiblemente también sea una buena opción. Como vamos a escalar 30 variables, es poco útil pretender ver de un vistazo, con una visualización o un `describe`, qué método es mejor; así que min_max lo dejamos para otro modelo y compararemos si conduce a mejores predicciones que Robust, que es el que nos quedamos de momento.

In [None]:
# Scale every column except the response variable "price" using the helper from our src package.
# NOTE(review): per the original note, the helper falls back to RobustScaler() when no scaler
# is passed — confirm against soporte_preprocesamiento.
scaled_features, scaler = sp.escalar_datos(data=df, cols=df.columns.drop("price"))

# Put the untouched target back in front of the scaled features.
df_scaled = pd.concat([df["price"], scaled_features], axis=1)

In [9]:
# Preview the first five rows of the scaled frame.
df_scaled.head(n=5)

Unnamed: 0,price,size,municipality,distance,hasVideo,status,hasPlan,has3DTour,floor,hasLift,propertyType_chalet,propertyType_countryHouse,propertyType_duplex,propertyType_flat,propertyType_penthouse,propertyType_studio,exterior_False,exterior_True,rooms_0,rooms_1,rooms_2,rooms_3,rooms_4,bathrooms_1,bathrooms_2,bathrooms_3,showAddress_False,showAddress_True,has360_False,has360_True,has360_desconocido
0,750.0,0.12,0.445948,-0.252799,0.0,0.0,0.0,0.0,0.069359,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,-1.0,1.0,0.0,0.0,0.0
1,750.0,0.52,0.038221,0.182949,1.0,0.0,1.0,0.0,-0.756628,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,400.0,0.4,-4.512073,2.043824,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,-1.0,1.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,-1.0,1.0,0.0,0.0,0.0,-1.0,1.0,0.0
3,590.0,0.52,-2.098184,2.118218,0.0,0.0,0.0,0.0,-1.11468,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,-1.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0
4,684.0,-0.48,0.445948,-0.079657,-0.67786,-13.120517,-0.374758,-11.538367,-1.11468,-0.952713,0.0,0.0,0.0,-1.0,0.0,1.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,-1.0,1.0,-1.0,0.0,1.0
