In [1]:
# Data processing  
# -----------------------------------------------------------------------
import pandas as pd
import numpy as np
import pickle

# Pandas options  
# -----------------------------------------------------------------------
pd.options.display.max_colwidth = None

# Path configuration for custom module imports  
# -----------------------------------------------------------------------
import sys  
sys.path.append('../')  # Adds the parent directory to the path for custom module imports  

# Ignore warnings  
# -----------------------------------------------------------------------
import warnings  
warnings.filterwarnings("ignore") 

# Machine learning imports
# -----------------------------------------------------------------------
from sklearn.preprocessing import StandardScaler

# Custom functions and classes
# -----------------------------------------------------------------------
from src.support_preprocess import Encoding

In [2]:
# Read the EDA output using the first CSV column as the index, then discard
# that index in favour of a fresh 0..n-1 RangeIndex.
df = (
    pd.read_csv('../data/output/api_rent_madrid_eda.csv', index_col=0)
    .reset_index(drop=True)
)

In [3]:
# Preview the first five rows of the loaded frame.
df.head()

Unnamed: 0,price,propertyType,size,exterior,rooms,bathrooms,distance,floor,municipality,province,hasLift,numPhotos
0,550.0,chalet,371.0,False,6,3,40116,unknown,Numancia de la Sagra,Toledo,unknown,33
1,750.0,flat,60.0,True,1,1,7037,3,Madrid,Madrid,True,12
2,750.0,flat,70.0,True,2,1,16145,bj,San Sebastián de los Reyes,Madrid,False,21
3,400.0,penthouse,67.0,True,2,2,55041,2,Villamanrique de Tajo,Madrid,False,28
4,450.0,flat,89.0,False,2,1,47186,2,Recas,Toledo,True,22


### Feature scaling

In [4]:
# Show column dtypes and non-null counts to decide which columns need scaling or encoding.
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 440 entries, 0 to 439
Data columns (total 12 columns):
 #   Column        Non-Null Count  Dtype  
---  ------        --------------  -----  
 0   price         440 non-null    float64
 1   propertyType  440 non-null    object 
 2   size          440 non-null    float64
 3   exterior      440 non-null    bool   
 4   rooms         440 non-null    int64  
 5   bathrooms     440 non-null    int64  
 6   distance      440 non-null    int64  
 7   floor         440 non-null    object 
 8   municipality  440 non-null    object 
 9   province      440 non-null    object 
 10  hasLift       440 non-null    object 
 11  numPhotos     440 non-null    int64  
dtypes: bool(1), float64(2), int64(4), object(5)
memory usage: 38.4+ KB


### Column Analysis

| Column         | Data Type   | Observations                                  |
|----------------|-------------|-----------------------------------------------|
| `price`        | `float64`   | Target variable. We do not scale it.          |
| `propertyType` | `object`    | Categorical, requires encoding.               |
| `size`         | `float64`   | Continuous, requires scaling.                 |
| `exterior`     | `bool`      | Binary, can be encoded as 0/1.                |
| `rooms`        | `int64`     | Discrete count, can be scaled.                |
| `bathrooms`    | `int64`     | Discrete count, kept as-is (not in the scaled feature list below). |
| `distance`     | `int64`     | Continuous, represents distances; requires scaling. |
| `floor`        | `object`    | Mixed categorical and ordinal. Requires special transformation. |
| `municipality` | `object`    | Categorical, requires encoding.               |
| `province`     | `object`    | Categorical, requires encoding.               |
| `hasLift`      | `object`    | Categorical, requires encoding.               |
| `numPhotos`    | `int64`     | Discrete count, can be scaled.                |

### Encoding Strategy

1. Nominal Categorical Variables:
   - `propertyType`, `municipality`, `province` are unordered categorical variables.
   - We use **One-Hot Encoding** for `propertyType` since there are few levels.
   - We use **Target Encoding** for `municipality`, `province` and `hasLift` because they have many levels, so a single target-encoded column per variable makes more sense than one-hot encoding.

2. Ordinal Categorical Variables:
   - `floor` is ordinal, so we use **Ordinal Encoding** with logical ordering.
   - Since it contains mixed text and numbers, we first transform the numbers and assign specific values to categories like "bj" or "st".

3. Binary Variables:
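   - `exterior` is already in boolean format, so it can be treated as 0/1. `hasLift` is not purely boolean: it is stored as `object` and contains `unknown` values, which is why it is target-encoded as listed above.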

### Scaling Strategy

1. Continuous Variables:
   - `size`, `rooms`, `distance`, and `numPhotos` are numerical and vary in scale.  
   - Either **Standard Scaling** or **Min-Max Scaling** would work because the data has no strong outliers; this notebook uses **Standard Scaling**.

2. Target Variable (`price`):
   - Typically, this is not scaled so we leave it as it is.


### Scaling

In [5]:
# Standardise the numeric predictors with StandardScaler.
# Only the columns listed here are rescaled; `price` (the target) is not among them.
numeric_features = ['size', 'rooms', 'distance', 'numPhotos']
numeric_transformer = StandardScaler()

# Learn the per-column statistics first, then apply them to the same columns.
numeric_transformer.fit(df[numeric_features])
scaled_data = numeric_transformer.transform(df[numeric_features])

# Write the scaled values back into `df`, replacing the original columns.
df[numeric_features] = scaled_data

### Encoding

In [6]:
# Category order handed to the ordinal encoder for `floor`.
# Presumably list position becomes the ordinal rank — confirm in Encoding.
# NOTE(review): 'ss' is listed before 'st'; if these are the semi-basement and
# basement codes, the physical order would be 'st' < 'ss' — confirm.
floor_order = ['ss', 'st', 'bj', 'en'] + [str(level) for level in range(1, 9)] + ['14', 'unknown']

# Columns grouped by the encoding technique requested for each of them.
encoding_methods = {
    "onehot": ['propertyType'],
    "target": ['municipality', 'province', 'hasLift'],
    "ordinal": {'floor': floor_order},
    "frequency": [],
}

# 'price' is passed as the third argument (the target column used by this notebook).
encoder = Encoding(df, encoding_methods, 'price')

In [7]:
# Run the encodings configured on `encoder` and keep the returned result;
# this is the object written to CSV afterwards.
df_preprocessed = encoder.execute_all_encodings()

In [8]:
# Persist the preprocessed data. With pandas' default settings the index is
# written as the first CSV column, so readers need `index_col=0` to recover it.
df_preprocessed.to_csv('../data/output/api_rent_madrid_preprocessed.csv')

### Save pickles

In [14]:
# Persist the distinct values of each categorical column, one pickle per column,
# as plain Python lists in order of first appearance.
# NOTE(review): this reads from `df`, not `df_preprocessed`; it assumes the
# encoding step left these columns of `df` untouched — confirm.
option_pickles = {
    '../models/options/propertyType.pkl': "propertyType",
    '../models/options/municipality.pkl': "municipality",
    '../models/options/provinces.pkl': "province",
}

for pickle_path, column in option_pickles.items():
    with open(pickle_path, 'wb') as out_file:
        pickle.dump(df[column].unique().tolist(), out_file)