In [32]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

# For models, model selection, and pipelines
from sklearn.linear_model import LinearRegression
from sklearn.ensemble import RandomForestRegressor, GradientBoostingRegressor
from sklearn.model_selection import train_test_split, cross_val_score
from sklearn.pipeline import Pipeline

from sklearn.compose import ColumnTransformer
from sklearn.discriminant_analysis import StandardScaler
from sklearn.preprocessing import OneHotEncoder


# For modeling
from sklearn.ensemble import RandomForestRegressor

# For evaluation
from sklearn.metrics import mean_absolute_error, mean_squared_error, r2_score


# To ignore warnings
import warnings
warnings.filterwarnings('ignore')

%matplotlib inline


In [33]:
# Load the cleaned rental listings (path is relative to the working directory).
data = pd.read_csv('data/cleaned_rental_data.csv')

# Preview the first five rows as a sanity check on the load.
data.head(5)

Unnamed: 0,city,area_sqm,num_rooms,num_bathrooms,num_parking_spaces,floor_level,allows_animals,is_furnished,hoa_fee_brl,monthly_rent_brl,property_tax_brl,fire_insurance_brl,total_monthly_cost_brl
0,São Paulo,70,2,1,1,7.0,acept,furnished,2065,3300,211,42,5618
1,São Paulo,320,4,4,0,20.0,acept,not furnished,1200,4960,1750,63,7973
2,Porto Alegre,80,1,1,1,6.0,acept,not furnished,1000,2800,0,41,3841
3,Porto Alegre,51,2,1,0,2.0,acept,not furnished,270,1112,22,17,1421
4,São Paulo,25,1,1,0,1.0,not acept,not furnished,0,800,25,11,836


### Preprocessing

#### 1. Feature Engineering

In [34]:
# Binary flag: 1 when a listing's HOA fee is strictly above the median HOA fee, else 0.
# The median is computed once and compared with a vectorized operation; the
# previous per-row lambda re-evaluated the median for every single row.
hoa_fee_median = data['hoa_fee_brl'].median()
data['high_hoa_fee'] = (data['hoa_fee_brl'] > hoa_fee_median).astype('int64')
# NOTE(review): the median is taken over the full dataset, i.e. before the
# train/test split performed in the modeling cell -- confirm this is acceptable.

#### 2. Separation of data

In [38]:
# Feature columns handled as categorical by the preprocessing step.
categorical_features = ['city', 'allows_animals', 'is_furnished', 'high_hoa_fee']

# Feature columns handled as numerical by the preprocessing step.
# NOTE(review): in the rows previewed after loading, hoa_fee_brl + monthly_rent_brl
# + property_tax_brl + fire_insurance_brl equals total_monthly_cost_brl exactly.
# If that holds for the whole dataset, the target is a deterministic sum of these
# features (target leakage) -- confirm the intended prediction task.
numerical_features = [
    'area_sqm',           # Area in square meters
    'num_rooms',          # Number of rooms
    'num_bathrooms',      # Number of bathrooms
    'num_parking_spaces', # Number of parking spaces
    'floor_level',        # Floor level of the apartment
    'hoa_fee_brl',        # HOA fee in Brazilian reals
    'fire_insurance_brl', # Fire insurance cost in BRL
    'property_tax_brl',   # Property tax in BRL
    'monthly_rent_brl',   # Rent amount in BRL
]

print(data.columns)

# Validate the feature lists against `data` itself. `X` is only created in the
# later modeling cell, so referencing it here fails on a fresh kernel
# (Restart & Run All) and otherwise silently depends on leftover kernel state.
missing_columns = set(numerical_features + categorical_features) - set(data.columns)
if missing_columns:
    print(f"Missing columns: {missing_columns}")

Index(['city', 'area_sqm', 'num_rooms', 'num_bathrooms', 'num_parking_spaces',
       'floor_level', 'allows_animals', 'is_furnished', 'hoa_fee_brl',
       'monthly_rent_brl', 'property_tax_brl', 'fire_insurance_brl',
       'total_monthly_cost_brl', 'high_hoa_fee'],
      dtype='object')
Missing columns: {'total_monthly_cost_brl'}


In [36]:

# Numerical columns go through a StandardScaler.
# NOTE(review): the imports cell pulls StandardScaler from
# sklearn.discriminant_analysis; its public location is sklearn.preprocessing.
numerical_transformer = Pipeline([('scaler', StandardScaler())])

# Categorical columns are one-hot encoded, dropping the first level of each feature.
categorical_transformer = Pipeline([('onehot', OneHotEncoder(drop='first'))])

# Route each group of columns to its transformer.
column_routes = [
    ('num', numerical_transformer, numerical_features),
    ('cat', categorical_transformer, categorical_features),
]
preprocessor = ColumnTransformer(transformers=column_routes)

In [37]:
# Target vector and feature frame. X keeps every non-target column; the
# preprocessor's ColumnTransformer only picks up the columns it was given.
y = data['total_monthly_cost_brl']
X = data.drop(columns='total_monthly_cost_brl')

# Hold out 20% of the rows as a test set (fixed seed for reproducibility).
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42
)

# Candidate estimators.
linear_model = LinearRegression()
random_forest_model = RandomForestRegressor(random_state=42)
gradient_boosting_model = GradientBoostingRegressor(random_state=42)


def _with_preprocessing(estimator):
    """Return a Pipeline that runs `preprocessor` and then `estimator`."""
    return Pipeline(steps=[('preprocessor', preprocessor), ('model', estimator)])


def _cv_neg_mae(pipeline):
    """Return 5-fold cross-validated negated-MAE scores on the training split."""
    return cross_val_score(
        pipeline, X_train, y_train, cv=5, scoring='neg_mean_absolute_error'
    )


# One pipeline per candidate estimator.
linear_pipeline = _with_preprocessing(linear_model)
random_forest_pipeline = _with_preprocessing(random_forest_model)
gradient_boosting_pipeline = _with_preprocessing(gradient_boosting_model)

# Cross-validate each pipeline on the training data only.
linear_scores = _cv_neg_mae(linear_pipeline)
random_forest_scores = _cv_neg_mae(random_forest_pipeline)
gradient_boosting_scores = _cv_neg_mae(gradient_boosting_pipeline)

# The scorer returns negated MAE, so flip the sign when reporting.
print(f"Linear Regression MAE: {-linear_scores.mean():.4f}")
print(f"Random Forest MAE: {-random_forest_scores.mean():.4f}")
print(f"Gradient Boosting MAE: {-gradient_boosting_scores.mean():.4f}")

ValueError: 
All the 5 fits failed.
It is very likely that your model is misconfigured.
You can try to debug the error by setting error_score='raise'.

Below are more details about the failures:
--------------------------------------------------------------------------------
5 fits failed with the following error:
Traceback (most recent call last):
  File "/Users/adrian/.pyenv/versions/3.10.14/lib/python3.10/site-packages/pandas/core/indexes/base.py", line 3805, in get_loc
    return self._engine.get_loc(casted_key)
  File "index.pyx", line 167, in pandas._libs.index.IndexEngine.get_loc
  File "index.pyx", line 196, in pandas._libs.index.IndexEngine.get_loc
  File "pandas/_libs/hashtable_class_helper.pxi", line 7081, in pandas._libs.hashtable.PyObjectHashTable.get_item
  File "pandas/_libs/hashtable_class_helper.pxi", line 7089, in pandas._libs.hashtable.PyObjectHashTable.get_item
KeyError: 'total_monthly_cost_brl'

The above exception was the direct cause of the following exception:

Traceback (most recent call last):
  File "/Users/adrian/.pyenv/versions/3.10.14/lib/python3.10/site-packages/sklearn/utils/_indexing.py", line 361, in _get_column_indices
    col_idx = all_columns.get_loc(col)
  File "/Users/adrian/.pyenv/versions/3.10.14/lib/python3.10/site-packages/pandas/core/indexes/base.py", line 3812, in get_loc
    raise KeyError(key) from err
KeyError: 'total_monthly_cost_brl'

The above exception was the direct cause of the following exception:

Traceback (most recent call last):
  File "/Users/adrian/.pyenv/versions/3.10.14/lib/python3.10/site-packages/sklearn/model_selection/_validation.py", line 888, in _fit_and_score
    estimator.fit(X_train, y_train, **fit_params)
  File "/Users/adrian/.pyenv/versions/3.10.14/lib/python3.10/site-packages/sklearn/base.py", line 1473, in wrapper
    return fit_method(estimator, *args, **kwargs)
  File "/Users/adrian/.pyenv/versions/3.10.14/lib/python3.10/site-packages/sklearn/pipeline.py", line 469, in fit
    Xt = self._fit(X, y, routed_params)
  File "/Users/adrian/.pyenv/versions/3.10.14/lib/python3.10/site-packages/sklearn/pipeline.py", line 406, in _fit
    X, fitted_transformer = fit_transform_one_cached(
  File "/Users/adrian/.pyenv/versions/3.10.14/lib/python3.10/site-packages/joblib/memory.py", line 312, in __call__
    return self.func(*args, **kwargs)
  File "/Users/adrian/.pyenv/versions/3.10.14/lib/python3.10/site-packages/sklearn/pipeline.py", line 1310, in _fit_transform_one
    res = transformer.fit_transform(X, y, **params.get("fit_transform", {}))
  File "/Users/adrian/.pyenv/versions/3.10.14/lib/python3.10/site-packages/sklearn/utils/_set_output.py", line 313, in wrapped
    data_to_wrap = f(self, X, *args, **kwargs)
  File "/Users/adrian/.pyenv/versions/3.10.14/lib/python3.10/site-packages/sklearn/base.py", line 1473, in wrapper
    return fit_method(estimator, *args, **kwargs)
  File "/Users/adrian/.pyenv/versions/3.10.14/lib/python3.10/site-packages/sklearn/compose/_column_transformer.py", line 968, in fit_transform
    self._validate_column_callables(X)
  File "/Users/adrian/.pyenv/versions/3.10.14/lib/python3.10/site-packages/sklearn/compose/_column_transformer.py", line 536, in _validate_column_callables
    transformer_to_input_indices[name] = _get_column_indices(X, columns)
  File "/Users/adrian/.pyenv/versions/3.10.14/lib/python3.10/site-packages/sklearn/utils/_indexing.py", line 369, in _get_column_indices
    raise ValueError("A given column is not a column of the dataframe") from e
ValueError: A given column is not a column of the dataframe
