In [30]:
# Data processing  
# -----------------------------------------------------------------------  
import pandas as pd  

# Pandas options  
# -----------------------------------------------------------------------  
# None disables truncation, so long string cells are displayed in full.
pd.options.display.max_colwidth = None

# Path configuration for custom module imports  
# -----------------------------------------------------------------------  
import sys  
sys.path.append('../')  # Adds the parent directory to the path for custom module imports  

# Ignore warnings  
# -----------------------------------------------------------------------  
# NOTE(review): with no category or module filter this silences every warning
# for the rest of the session, including deprecation notices from libraries.
import warnings  
warnings.filterwarnings("ignore") 

# Custom functions
# -----------------------------------------------------------------------
# NOTE(review): the wildcard import hides which names this notebook takes from
# the module; later cells may depend on names that only arrive through it
# -- TODO confirm and replace with explicit imports.
from src.support_preprocess import *

In [31]:
# Read the EDA output using the CSV's first column as the index, then
# discard that index in favour of a fresh 0..n-1 RangeIndex.
df = pd.read_csv('../data/output/api_rent_madrid_eda.csv', index_col=0)
df = df.reset_index(drop=True)

In [32]:
# Preview the first rows of the loaded frame.
df.head()

Unnamed: 0,price,propertyType,size,exterior,rooms,distance,floor,district,hasLift,numPhotos
0,750.0,flat,60.0,True,1,7037,3,Hortaleza,True,12
1,684.0,studio,45.0,True,0,10656,unknown,unknown,True,17
2,550.0,flat,53.0,True,1,4008,1,Puente de Vallecas,False,17
3,700.0,studio,28.0,True,0,5569,1,Ciudad Lineal,False,19
4,700.0,flat,45.0,False,1,2046,5,Moncloa,False,18


### Feature scaling

In [33]:
# List each column's dtype and non-null count.
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 170 entries, 0 to 169
Data columns (total 10 columns):
 #   Column        Non-Null Count  Dtype  
---  ------        --------------  -----  
 0   price         170 non-null    float64
 1   propertyType  170 non-null    object 
 2   size          170 non-null    float64
 3   exterior      170 non-null    bool   
 4   rooms         170 non-null    int64  
 5   distance      170 non-null    int64  
 6   floor         170 non-null    object 
 7   district      170 non-null    object 
 8   hasLift       170 non-null    bool   
 9   numPhotos     170 non-null    int64  
dtypes: bool(2), float64(2), int64(3), object(3)
memory usage: 11.1+ KB


### Column Analysis

| Column         | Data Type   | Observations                                  |
|----------------|-------------|-----------------------------------------------|
| `price`        | `float64`   | Target variable. We do not scale it.          |
| `propertyType` | `object`    | Categorical, requires encoding.               |
| `size`         | `float64`   | Continuous, requires scaling.                 |
| `exterior`     | `bool`      | Binary, can be encoded as 0/1.                |
| `rooms`        | `int64`     | Discrete count, can be scaled.                |
| `distance`     | `int64`     | Continuous, represents distances; requires scaling. |
| `floor`        | `object`    | Mixed categorical and ordinal. Requires special transformation. |
| `district`     | `object`    | Categorical, requires encoding.               |
| `hasLift`      | `bool`      | Binary, can be encoded as 0/1.                |
| `numPhotos`    | `int64`     | Discrete count, can be scaled.                |

### Encoding Strategy

1. Nominal Categorical Variables:
   - `propertyType` and `district` are unordered categorical variables.
   - We use **One-Hot Encoding** for `propertyType` since there are few levels (4).
   - We use **Target Encoding** for `district` since there are many levels (21).

2. Ordinal Categorical Variables:
   - `floor` is ordinal, so we use **Ordinal Encoding** with logical ordering.
   - Since it mixes text labels (such as "bj" or "st") with floor numbers, we define an explicit category order so that each label maps to a specific rank.

3. Binary Variables:
   - `exterior` and `hasLift` are already in boolean format. We convert them to 0/1.

### Scaling Strategy

1. Numerical Variables:
   - `size`, `rooms`, `distance`, and `numPhotos` are numerical and vary in scale.  
   - We can use either **Standard Scaling** or **Min-Max Scaling**, since the data shows no strong outliers.

2. Target Variable (`price`):
   - Typically, this is not scaled so we leave it as it is.


In [None]:
from sklearn.compose import ColumnTransformer
from sklearn.model_selection import train_test_split  # imported explicitly; the cell must not depend on the wildcard import
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler, OneHotEncoder, OrdinalEncoder
from sklearn.linear_model import LinearRegression
from category_encoders import TargetEncoder

# Column groups, one per preprocessing strategy
target_encoding_feature = ['district']
one_hot_features = ['propertyType']
ordinal_features = ['floor']
binary_features = ['exterior', 'hasLift']
numeric_features = ['size', 'rooms', 'distance', 'numPhotos']

# Rank of each `floor` label: its position in this list becomes the encoded value.
# NOTE(review): 'unknown' is ranked above the highest floor -- confirm this is intended.
categories_order = [['ss', 'st', 'bj', 'en', '1', '2', '3', '4', '5', '6', '7', '8', '14', 'unknown']]

# Transformers
# handle_unknown='ignore' keeps predict() from raising when the test split
# contains a propertyType level that was absent from the training split.
categorical_transformer = OneHotEncoder(handle_unknown='ignore')
ordinal_transformer = OrdinalEncoder(categories=categories_order)  # category list tailored to the data
numeric_transformer = StandardScaler()

# Preprocessor applied AFTER target encoding.
# ColumnTransformer drops every column that is not listed (remainder='drop'),
# so the already target-encoded `district` must be passed through explicitly;
# otherwise it never reaches the regressor.
preprocessor = ColumnTransformer(
    transformers=[
        ('onehot', categorical_transformer, one_hot_features),
        ('ord', ordinal_transformer, ordinal_features),
        ('bin', 'passthrough', binary_features),  # binary features pass directly to the model
        ('num', numeric_transformer, numeric_features),
        ('target', 'passthrough', target_encoding_feature)  # numeric after the target-encoding step
    ]
)

# Target encoder for `district`; it is fitted once, as the first pipeline step
target_encoder = TargetEncoder(cols=target_encoding_feature)

# Final pipeline
model = Pipeline(steps=[
    ('target_encoding', target_encoder),  # target encoding for `district`
    ('preprocessor', preprocessor),       # transformations for the remaining columns
    ('regressor', LinearRegression())     # final model
])

# Predictors: every column except the response
X = df.drop("price", axis=1)
# Response variable
y = df[["price"]]

X_train, X_test, y_train, y_test = train_test_split(X, y, train_size=0.7, random_state=42)

# Train on the RAW training data. The pipeline fits the target encoder itself
# (it receives y_train), so the encoder is fitted exactly once and only on the
# training split. Fitting it manually and then again inside the pipeline would
# re-fit the same instance on already-encoded values.
model.fit(X_train, y_train)

# Predict from the raw test data; the fitted pipeline applies every transformation
y_pred = model.predict(X_test)

# Target-encoded views of both splits, kept for inspection and for any later
# cell that expects these names.
X_train_encoded = model.named_steps['target_encoding'].transform(X_train)
X_test_encoded = model.named_steps['target_encoding'].transform(X_test)


'\n# Entrenar el modelo\nX_train_encoded = target_encoder.fit_transform(X_train, y_train)  # Aplicar Target Encoding a los datos de entrenamiento\nmodel.fit(X_train_encoded, y_train)\n\n# Transformar y predecir\nX_test_encoded = target_encoder.transform(X_test)  # Aplicar Target Encoding a los datos de prueba\ny_pred = model.predict(X_test_encoded)\n'

In [35]:
# All columns except the response.
# NOTE(review): this rebinds X_train to the full feature set, replacing the
# 70% training split of the same name -- confirm that is intended.
X_train = df.drop('price', axis=1)

In [36]:
# Logical ordering of the `floor` labels; a label's position in the list is
# the value the encoder assigns to it.
categories_order = [[
    'ss', 'st', 'bj', 'en',
    '1', '2', '3', '4', '5', '6', '7', '8', '14',
    'unknown',
]]

# Fit the ordinal encoder on `floor`, then store the codes in a new column.
ordinal_encoder = OrdinalEncoder(categories=categories_order)
ordinal_encoder.fit(df[['floor']])
df['floor_encoded'] = ordinal_encoder.transform(df[['floor']])