In [24]:
# Data processing  
# -----------------------------------------------------------------------
import pandas as pd
import numpy as np

# Pandas options  
# -----------------------------------------------------------------------
pd.options.display.max_colwidth = None

# Path configuration for custom module imports  
# -----------------------------------------------------------------------
import sys  
sys.path.append('../')  # Adds the parent directory to the path for custom module imports  

# Ignore warnings  
# -----------------------------------------------------------------------
import warnings  
warnings.filterwarnings("ignore") 


# Machine learning imports
# -----------------------------------------------------------------------
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import LinearRegression
from sklearn.model_selection import train_test_split
from sklearn.metrics import r2_score, mean_squared_error, mean_absolute_error

# Custom functions
# -----------------------------------------------------------------------
from src.support_preprocess import Encoding

In [25]:
# Load the EDA output. The first CSV column is read as the index and then
# discarded in favour of a fresh 0..n-1 index.
df = (
    pd.read_csv('../data/output/api_rent_madrid_eda.csv', index_col=0)
    .reset_index(drop=True)
)

In [26]:
# Preview the first five rows to confirm the file loaded as expected.
df.head(n=5)

Unnamed: 0,price,propertyType,size,exterior,rooms,distance,floor,district,hasLift,numPhotos
0,750.0,flat,60.0,True,1,7037,3,Hortaleza,True,12
1,684.0,studio,45.0,True,0,10656,unknown,unknown,True,17
2,550.0,flat,53.0,True,1,4008,1,Puente de Vallecas,False,17
3,700.0,studio,28.0,True,0,5569,1,Ciudad Lineal,False,19
4,700.0,flat,45.0,False,1,2046,5,Moncloa,False,18


### Data overview before scaling and encoding

In [27]:
# Summarise the frame: column names, non-null counts, dtypes and memory usage.
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 170 entries, 0 to 169
Data columns (total 10 columns):
 #   Column        Non-Null Count  Dtype  
---  ------        --------------  -----  
 0   price         170 non-null    float64
 1   propertyType  170 non-null    object 
 2   size          170 non-null    float64
 3   exterior      170 non-null    bool   
 4   rooms         170 non-null    int64  
 5   distance      170 non-null    int64  
 6   floor         170 non-null    object 
 7   district      170 non-null    object 
 8   hasLift       170 non-null    bool   
 9   numPhotos     170 non-null    int64  
dtypes: bool(2), float64(2), int64(3), object(3)
memory usage: 11.1+ KB


### Column Analysis

| Column         | Data Type   | Observations                                  |
|----------------|-------------|-----------------------------------------------|
| `price`        | `float64`   | Target variable. We do not scale it.          |
| `propertyType` | `object`    | Categorical, requires encoding.               |
| `size`         | `float64`   | Continuous, requires scaling.                 |
| `exterior`     | `bool`      | Binary, can be encoded as 0/1.                |
| `rooms`        | `int64`     | Discrete numeric (count), can be scaled.      |
| `distance`     | `int64`     | Continuous, represents distances; requires scaling. |
| `floor`        | `object`    | Mixed categorical and ordinal. Requires special transformation. |
| `district`     | `object`    | Categorical, requires encoding.               |
| `hasLift`      | `bool`      | Binary, can be encoded as 0/1.                |
| `numPhotos`    | `int64`     | Discrete numeric (count), can be scaled.      |

### Encoding Strategy

1. Nominal Categorical Variables:
   - `propertyType` and `district` are unordered categorical variables.
   - We use **One-Hot Encoding** for `propertyType` since there are few levels (4).
   - We use **Target Encoding** for `district` since there are many levels (21).

2. Ordinal Categorical Variables:
   - `floor` is ordinal, so we use **Ordinal Encoding** with logical ordering.
   - Since it mixes text labels (such as "bj" or "st") with floor numbers stored as strings, we give the encoder an explicit category order: the text labels first, then the numeric floors in ascending order, with "unknown" placed last.

3. Binary Variables:
   - `exterior` and `hasLift` are already in boolean format. We convert them to 0/1.

### Scaling Strategy

1. Numeric Variables:
   - `size`, `rooms`, `distance`, and `numPhotos` are numerical and vary in scale.  
   - Either **Standard Scaling** or **Min-Max Scaling** would work, since there are no strong outliers; we apply **Standard Scaling** below.

2. Target Variable (`price`):
   - Typically, this is not scaled so we leave it as it is.


### Scaling

In [28]:
# Standardise the numeric predictors with StandardScaler; `price` is deliberately left out.
numeric_features = ['size', 'rooms', 'distance', 'numPhotos']

# Fit the scaler on the selected columns first, then transform them with the fitted scaler.
numeric_transformer = StandardScaler().fit(df[numeric_features])
scaled_data = numeric_transformer.transform(df[numeric_features])

# Overwrite the original columns in `df` with their scaled values.
df[numeric_features] = scaled_data

### Encoding

In [29]:
# Explicit rank order for `floor`: text labels first, then numeric floors
# (kept as strings), with 'unknown' ranked last.
floor_order = [
    'ss', 'st', 'bj', 'en',
    '1', '2', '3', '4', '5', '6', '7', '8', '14',
    'unknown',
]

# Columns grouped by the encoding method to apply to each of them.
encoding_methods = dict(
    onehot=['propertyType'],
    target=['district'],
    ordinal={'floor': floor_order},
    frequency=[],
)

# 'price' is passed as the third argument.
# NOTE(review): presumably the response variable used by target encoding — confirm in src.support_preprocess.
encoder = Encoding(df, encoding_methods, 'price')

In [30]:
# Run the encodings configured on `encoder` and keep the result as the preprocessed frame.
# NOTE(review): `execute_all_encodings` lives in src.support_preprocess (not shown here);
# presumably it applies every method listed in `encoding_methods` — confirm against the module.
df_preprocessed = encoder.execute_all_encodings()

In [31]:
# Persist the preprocessed frame. The index is written as the first CSV column
# (pandas' default), so a later read can recover it with `index_col=0`.
df_preprocessed.to_csv('../data/output/api_rent_madrid_preprocessed.csv', index=True)

---

In [None]:
# Separate the target from the predictors.
# NOTE(review): assumes `df_preprocessed` still contains the `price` column — confirm.
X = df_preprocessed.drop(columns='price')
y = df_preprocessed['price']

# LinearRegression needs purely numeric input: fail early with a clear message
# if the encoding step left any text column behind.
non_numeric = X.select_dtypes(exclude=['number', 'bool']).columns.tolist()
assert not non_numeric, f"Non-numeric columns remain after encoding: {non_numeric}"

# Hold out 20% of the rows for evaluation; the fixed seed keeps the split
# reproducible across kernel restarts.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42
)

# NOTE(review): the StandardScaler above was fitted on the full dataset before this
# split (and presumably so was the target encoder), so test metrics may be optimistic.
model = LinearRegression()
model.fit(X_train, y_train)

# Predictions for the test set
y_test_pred = model.predict(X_test)

# Predictions for the training set
y_train_pred = model.predict(X_train)

In [None]:
def _regression_metrics(y_true, y_pred):
    """Return R², MAE and RMSE for one set of predictions."""
    return {
        'r2_score': r2_score(y_true, y_pred),
        'MAE': mean_absolute_error(y_true, y_pred),
        'RMSE': np.sqrt(mean_squared_error(y_true, y_pred)),
    }


# Same three metrics for each split, so train and test can be compared side by side.
metricas = {
    'train': _regression_metrics(y_train, y_train_pred),
    'test': _regression_metrics(y_test, y_test_pred),
}

In [None]:
# One row per split (train / test), one column per metric, rounded to 4 decimals.
pd.DataFrame.from_dict(metricas, orient='index').round(4)

---