In [1]:
from __future__ import annotations

import os
import os.path as P
import typing
from unicodedata import normalize

import cloudpickle
import numpy as np
import pandas as pd
import sklearn
from lightgbm import LGBMRegressor
from sklearn.base import BaseEstimator, TransformerMixin
from sklearn.compose import make_column_transformer
from sklearn.ensemble import VotingRegressor
from sklearn.impute import SimpleImputer
from sklearn.metrics import mean_absolute_error, mean_squared_error
from sklearn.pipeline import make_pipeline, make_union
from sklearn.preprocessing import (
    FunctionTransformer,
    MinMaxScaler,
    OneHotEncoder,
    StandardScaler,
)
from xgboost import XGBRegressor


### Dataset Loading

In [2]:
dataset_root_dir = os.path.join(os.path.dirname(os.path.abspath("")), "data", "raw")
if not os.path.isdir(dataset_root_dir):
    os.makedirs(dataset_root_dir)
    print("Data folder created!")

if "housing_sp_city.csv" not in os.listdir(dataset_root_dir):
    kaggle_cmd = f"kaggle datasets download -d ex0ticone/house-prices-of-sao-paulo-city -p {dataset_root_dir}"
    os.system(kaggle_cmd)

    zip_file = os.path.join(dataset_root_dir, "house-prices-of-sao-paulo-city.zip")
    unzip_cmd = f"unzip {zip_file} -d {dataset_root_dir}"
    os.system(unzip_cmd)

    rm_cmd = f"rm {zip_file}"
    os.system(rm_cmd)

    print("Dataset downloaded")
else:
    print("Dataset already exists!")

Dataset already exists!


In [3]:
dataset_file = os.path.join(dataset_root_dir, "housing_sp_city.csv")
sp_house_price = pd.read_csv(dataset_file, encoding="utf-8", encoding_errors="ignore")
sp_house_price

Unnamed: 0,logradouro,numero,bairro,cep,cidade,tipo_imovel,area_util,banheiros,suites,quartos,vagas_garagem,anuncio_criado,tipo_anuncio,preco_venda,taxa_condominio,periodicidade,preco_aluguel,iptu_ano
0,Rua Juvenal Galeno,53,Jardim da Saúde,4290030.0,São Paulo,Casa de dois andares,388.0,3.0,1.0,4.0,6.0,2017-02-07,Venda,700000,,,,
1,Rua Juruaba,16,Vila Santa Teresa (Zona Sul),4187320.0,São Paulo,Casa,129.0,2.0,1.0,3.0,2.0,2016-03-21,Venda,336000,,,,
2,Avenida Paulista,402,Bela Vista,1311000.0,São Paulo,Comercial,396.0,4.0,0.0,0.0,5.0,2018-12-18,Locação,24929,4900.0,MONTHLY,29829.0,4040.0
3,Rua Alvorada,1190,Vila Olímpia,4550004.0,São Paulo,Apartamento,80.0,2.0,1.0,3.0,2.0,2018-10-26,Venda,739643,686.0,,,1610.0
4,Rua Curitiba,380,Paraíso,4005030.0,São Paulo,Apartamento,3322.0,5.0,4.0,4.0,5.0,2018-12-14,Venda,7520099,6230.0,,,18900.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
133959,Rua Glicério,255,Liberdade,1514000.0,São Paulo,Apartamento,53.0,2.0,1.0,2.0,1.0,2018-11-28,Venda,249782,210.0,,,0.0
133960,Rua Laboriosa,,Jardim das Bandeiras,5434060.0,São Paulo,Escritório,450.0,3.0,1.0,3.0,4.0,2018-08-08,Venda,1085000,,,,507.0
133961,Rua José Pereira de Carvalho,10,Vila Lageado,5337090.0,São Paulo,Apartamento,20.0,3.0,2.0,3.0,2.0,2019-02-06,Venda,623000,,,,
133962,Rua Evangelista Rodrigues,234,Alto de Pinheiros,5463000.0,São Paulo,Casa de dois andares,357.0,4.0,1.0,4.0,4.0,2018-04-14,Venda,1820000,0.0,,,665.0


### Initial cleaning

In [4]:
sp_house_price = (
    sp_house_price.drop(
        labels=["logradouro", "numero", "cep", "cidade"], axis="columns"
    )
    .dropna(subset=["bairro"])
    .copy()
)

sp_house_price

Unnamed: 0,bairro,tipo_imovel,area_util,banheiros,suites,quartos,vagas_garagem,anuncio_criado,tipo_anuncio,preco_venda,taxa_condominio,periodicidade,preco_aluguel,iptu_ano
0,Jardim da Saúde,Casa de dois andares,388.0,3.0,1.0,4.0,6.0,2017-02-07,Venda,700000,,,,
1,Vila Santa Teresa (Zona Sul),Casa,129.0,2.0,1.0,3.0,2.0,2016-03-21,Venda,336000,,,,
2,Bela Vista,Comercial,396.0,4.0,0.0,0.0,5.0,2018-12-18,Locação,24929,4900.0,MONTHLY,29829.0,4040.0
3,Vila Olímpia,Apartamento,80.0,2.0,1.0,3.0,2.0,2018-10-26,Venda,739643,686.0,,,1610.0
4,Paraíso,Apartamento,3322.0,5.0,4.0,4.0,5.0,2018-12-14,Venda,7520099,6230.0,,,18900.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
133959,Liberdade,Apartamento,53.0,2.0,1.0,2.0,1.0,2018-11-28,Venda,249782,210.0,,,0.0
133960,Jardim das Bandeiras,Escritório,450.0,3.0,1.0,3.0,4.0,2018-08-08,Venda,1085000,,,,507.0
133961,Vila Lageado,Apartamento,20.0,3.0,2.0,3.0,2.0,2019-02-06,Venda,623000,,,,
133962,Alto de Pinheiros,Casa de dois andares,357.0,4.0,1.0,4.0,4.0,2018-04-14,Venda,1820000,0.0,,,665.0


In [5]:
def normalize_str(str_value: str) -> str:
    return normalize("NFKD", str_value).encode("ASCII", "ignore").decode("ASCII")

for col in ("bairro", "tipo_imovel", "tipo_anuncio"):
    sp_house_price[col] = sp_house_price[col].apply(
        lambda str_value: normalize("NFKD", str_value)
        .encode("ASCII", "ignore")
        .decode("ASCII")
    )

sp_house_price

Unnamed: 0,bairro,tipo_imovel,area_util,banheiros,suites,quartos,vagas_garagem,anuncio_criado,tipo_anuncio,preco_venda,taxa_condominio,periodicidade,preco_aluguel,iptu_ano
0,Jardim da Saude,Casa de dois andares,388.0,3.0,1.0,4.0,6.0,2017-02-07,Venda,700000,,,,
1,Vila Santa Teresa (Zona Sul),Casa,129.0,2.0,1.0,3.0,2.0,2016-03-21,Venda,336000,,,,
2,Bela Vista,Comercial,396.0,4.0,0.0,0.0,5.0,2018-12-18,Locacao,24929,4900.0,MONTHLY,29829.0,4040.0
3,Vila Olimpia,Apartamento,80.0,2.0,1.0,3.0,2.0,2018-10-26,Venda,739643,686.0,,,1610.0
4,Paraiso,Apartamento,3322.0,5.0,4.0,4.0,5.0,2018-12-14,Venda,7520099,6230.0,,,18900.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
133959,Liberdade,Apartamento,53.0,2.0,1.0,2.0,1.0,2018-11-28,Venda,249782,210.0,,,0.0
133960,Jardim das Bandeiras,Escritorio,450.0,3.0,1.0,3.0,4.0,2018-08-08,Venda,1085000,,,,507.0
133961,Vila Lageado,Apartamento,20.0,3.0,2.0,3.0,2.0,2019-02-06,Venda,623000,,,,
133962,Alto de Pinheiros,Casa de dois andares,357.0,4.0,1.0,4.0,4.0,2018-04-14,Venda,1820000,0.0,,,665.0


In [6]:
sp_house_price["anuncio_criado"] = pd.to_datetime(sp_house_price["anuncio_criado"])

sp_house_price

Unnamed: 0,bairro,tipo_imovel,area_util,banheiros,suites,quartos,vagas_garagem,anuncio_criado,tipo_anuncio,preco_venda,taxa_condominio,periodicidade,preco_aluguel,iptu_ano
0,Jardim da Saude,Casa de dois andares,388.0,3.0,1.0,4.0,6.0,2017-02-07,Venda,700000,,,,
1,Vila Santa Teresa (Zona Sul),Casa,129.0,2.0,1.0,3.0,2.0,2016-03-21,Venda,336000,,,,
2,Bela Vista,Comercial,396.0,4.0,0.0,0.0,5.0,2018-12-18,Locacao,24929,4900.0,MONTHLY,29829.0,4040.0
3,Vila Olimpia,Apartamento,80.0,2.0,1.0,3.0,2.0,2018-10-26,Venda,739643,686.0,,,1610.0
4,Paraiso,Apartamento,3322.0,5.0,4.0,4.0,5.0,2018-12-14,Venda,7520099,6230.0,,,18900.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
133959,Liberdade,Apartamento,53.0,2.0,1.0,2.0,1.0,2018-11-28,Venda,249782,210.0,,,0.0
133960,Jardim das Bandeiras,Escritorio,450.0,3.0,1.0,3.0,4.0,2018-08-08,Venda,1085000,,,,507.0
133961,Vila Lageado,Apartamento,20.0,3.0,2.0,3.0,2.0,2019-02-06,Venda,623000,,,,
133962,Alto de Pinheiros,Casa de dois andares,357.0,4.0,1.0,4.0,4.0,2018-04-14,Venda,1820000,0.0,,,665.0


### Select Sale rows only

In [7]:
sp_house_price = (
    sp_house_price[sp_house_price.tipo_anuncio == "Venda"]
    .drop(labels=["tipo_anuncio", "periodicidade", "preco_aluguel"], axis="columns")
    .reset_index(drop=True)
)

sp_house_price

Unnamed: 0,bairro,tipo_imovel,area_util,banheiros,suites,quartos,vagas_garagem,anuncio_criado,preco_venda,taxa_condominio,iptu_ano
0,Jardim da Saude,Casa de dois andares,388.0,3.0,1.0,4.0,6.0,2017-02-07,700000,,
1,Vila Santa Teresa (Zona Sul),Casa,129.0,2.0,1.0,3.0,2.0,2016-03-21,336000,,
2,Vila Olimpia,Apartamento,80.0,2.0,1.0,3.0,2.0,2018-10-26,739643,686.0,1610.0
3,Paraiso,Apartamento,3322.0,5.0,4.0,4.0,5.0,2018-12-14,7520099,6230.0,18900.0
4,Pinheiros,Apartamento,94.0,1.0,0.0,3.0,2.0,2018-05-29,630700,1120.0,489.0
...,...,...,...,...,...,...,...,...,...,...,...
105312,Bela Vista,Apartamento,60.0,1.0,,1.0,1.0,2017-12-13,251999,273.0,86.0
105313,Liberdade,Apartamento,53.0,2.0,1.0,2.0,1.0,2018-11-28,249782,210.0,0.0
105314,Jardim das Bandeiras,Escritorio,450.0,3.0,1.0,3.0,4.0,2018-08-08,1085000,,507.0
105315,Vila Lageado,Apartamento,20.0,3.0,2.0,3.0,2.0,2019-02-06,623000,,


### Excluding comercial properties

In [8]:
sp_house_price = sp_house_price[
    sp_house_price.tipo_imovel.isin(
        [
            "Casa",
            "Casa de dois andares",
            "Apartamento",
            "Condominio",
            "Flat",
            "Cobertura",
            "Kitnet",
            "Predio Residencial",
        ]
    )
].copy()

sp_house_price.tipo_imovel.unique()


array(['Casa de dois andares', 'Casa', 'Apartamento', 'Condominio',
       'Flat', 'Cobertura', 'Predio Residencial', 'Kitnet'], dtype=object)

### Remove outliers

In [9]:
def detect_outlier(df: pd.DataFrame, col: str) -> pd.DataFrame:
    q1, q3 = df[col].quantile([0.25, 0.75])
    iqr = q3 - q1

    return df[col].apply(lambda v: v > (q3 + 2.0 * iqr))


sp_house_price = sp_house_price[
    ~(
        detect_outlier(sp_house_price, "preco_venda")
        | detect_outlier(sp_house_price, "area_util")
        | detect_outlier(sp_house_price, "banheiros")
        | detect_outlier(sp_house_price, "quartos")
        | detect_outlier(sp_house_price, "suites")
        | detect_outlier(sp_house_price, "vagas_garagem")
    )
].reset_index(drop=True)
sp_house_price

Unnamed: 0,bairro,tipo_imovel,area_util,banheiros,suites,quartos,vagas_garagem,anuncio_criado,preco_venda,taxa_condominio,iptu_ano
0,Jardim da Saude,Casa de dois andares,388.0,3.0,1.0,4.0,6.0,2017-02-07,700000,,
1,Vila Santa Teresa (Zona Sul),Casa,129.0,2.0,1.0,3.0,2.0,2016-03-21,336000,,
2,Vila Olimpia,Apartamento,80.0,2.0,1.0,3.0,2.0,2018-10-26,739643,686.0,1610.0
3,Pinheiros,Apartamento,94.0,1.0,0.0,3.0,2.0,2018-05-29,630700,1120.0,489.0
4,Vila Santa Clara,Condominio,110.0,1.0,1.0,3.0,2.0,2018-04-16,385000,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...
88742,Vila Carmosina,Apartamento,48.0,1.0,0.0,2.0,1.0,2017-10-07,171150,244.0,0.0
88743,Bela Vista,Apartamento,60.0,1.0,,1.0,1.0,2017-12-13,251999,273.0,86.0
88744,Liberdade,Apartamento,53.0,2.0,1.0,2.0,1.0,2018-11-28,249782,210.0,0.0
88745,Vila Lageado,Apartamento,20.0,3.0,2.0,3.0,2.0,2019-02-06,623000,,


### Changing Features Names

In [10]:
sp_house_price.columns = [
    "neighborhood",
    "property_type",
    "usable_area",
    "bathrooms",
    "suites",
    "bedrooms",
    "parking_spots",
    "ad_date",
    "sale_price",
    "condominium_fee",
    "annual_iptu_tax",
]

In [11]:
property_type_mapping = {
    "Apartamento": "Apartament",
    "Casa de dois andares": "Two-story House",
    "Casa": "House",
    "Condominio": "Condominium",
    "Flat": "Flat",
    "Cobertura": "Penthouse",
    "Kitnet": "Studio Apartament",
    "Predio Residencial": "Residential Building"
}

sp_house_price["property_type"] = sp_house_price["property_type"].map(property_type_mapping)

sp_house_price["property_type"].unique()

array(['Two-story House', 'House', 'Apartament', 'Condominium', 'Flat',
       'Residential Building', 'Penthouse', 'Studio Apartament'],
      dtype=object)

## Final Cleaning

In [12]:
sp_house_price = sp_house_price[
    ~(sp_house_price["property_type"].isin(("Studio Apartament", "Residential Building"))) &
    ~(sp_house_price["bedrooms"].isin((0, 5))) &
    ~(sp_house_price["bathrooms"].isin((0, 7))) &
    ~(sp_house_price["suites"].isin((5, 6))) &
    ~(sp_house_price["parking_spots"].isin(list(range(5, 8))))
].reset_index(drop=True)

In [13]:
features = sp_house_price.copy()
features

Unnamed: 0,neighborhood,property_type,usable_area,bathrooms,suites,bedrooms,parking_spots,ad_date,sale_price,condominium_fee,annual_iptu_tax
0,Vila Santa Teresa (Zona Sul),House,129.0,2.0,1.0,3.0,2.0,2016-03-21,336000,,
1,Vila Olimpia,Apartament,80.0,2.0,1.0,3.0,2.0,2018-10-26,739643,686.0,1610.0
2,Pinheiros,Apartament,94.0,1.0,0.0,3.0,2.0,2018-05-29,630700,1120.0,489.0
3,Vila Santa Clara,Condominium,110.0,1.0,1.0,3.0,2.0,2018-04-16,385000,0.0,0.0
4,Aclimacao,Apartament,141.0,4.0,3.0,4.0,2.0,2019-01-09,1106000,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...
82072,Vila Carmosina,Apartament,48.0,1.0,0.0,2.0,1.0,2017-10-07,171150,244.0,0.0
82073,Bela Vista,Apartament,60.0,1.0,,1.0,1.0,2017-12-13,251999,273.0,86.0
82074,Liberdade,Apartament,53.0,2.0,1.0,2.0,1.0,2018-11-28,249782,210.0,0.0
82075,Vila Lageado,Apartament,20.0,3.0,2.0,3.0,2.0,2019-02-06,623000,,


## Data Preprocessing

In [14]:
prices = features.pop("sale_price")

display(features)
display(prices)

Unnamed: 0,neighborhood,property_type,usable_area,bathrooms,suites,bedrooms,parking_spots,ad_date,condominium_fee,annual_iptu_tax
0,Vila Santa Teresa (Zona Sul),House,129.0,2.0,1.0,3.0,2.0,2016-03-21,,
1,Vila Olimpia,Apartament,80.0,2.0,1.0,3.0,2.0,2018-10-26,686.0,1610.0
2,Pinheiros,Apartament,94.0,1.0,0.0,3.0,2.0,2018-05-29,1120.0,489.0
3,Vila Santa Clara,Condominium,110.0,1.0,1.0,3.0,2.0,2018-04-16,0.0,0.0
4,Aclimacao,Apartament,141.0,4.0,3.0,4.0,2.0,2019-01-09,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...
82072,Vila Carmosina,Apartament,48.0,1.0,0.0,2.0,1.0,2017-10-07,244.0,0.0
82073,Bela Vista,Apartament,60.0,1.0,,1.0,1.0,2017-12-13,273.0,86.0
82074,Liberdade,Apartament,53.0,2.0,1.0,2.0,1.0,2018-11-28,210.0,0.0
82075,Vila Lageado,Apartament,20.0,3.0,2.0,3.0,2.0,2019-02-06,,


0         336000
1         739643
2         630700
3         385000
4        1106000
          ...   
82072     171150
82073     251999
82074     249782
82075     623000
82076    1820000
Name: sale_price, Length: 82077, dtype: int64

## The Sklearn's set_output API

In [15]:
sklearn.set_config(transform_output="pandas")

In [16]:
imputing_transformer = make_column_transformer(
    (
        SimpleImputer(fill_value=0.0),
        [
            "condominium_fee",
            "suites",
            "parking_spots",
            "bedrooms",
            "bathrooms"
        ]
    ), (
        SimpleImputer(strategy="mean"),
        ["annual_iptu_tax", "usable_area"]
    ),
    remainder="passthrough",
    verbose_feature_names_out=False
)

## Feature Engineering

### General tax

In [17]:
def add_general_tax(X: pd.DataFrame) -> pd.DataFrame:
    general_tax = pd.DataFrame()
    
    general_tax["general_tax"] = X["condominium_fee"] + X["annual_iptu_tax"] / 12
    
    return general_tax

### Area Score

In [18]:
def add_area_score(X: pd.DataFrame) -> pd.DataFrame:
    area_scores = pd.DataFrame()
    
    area_scores["general_tax_score"] = X["general_tax"] / X["usable_area"]
    
    return area_scores

### Number of Property Features

In [19]:
def add_number_of_features(X: pd.DataFrame) -> pd.DataFrame:
    n_features = pd.DataFrame()
    
    features = ["suites", "parking_spots", "bedrooms", "bathrooms"]
    
    for feat in ["suites", "parking_spots", "bedrooms", "bathrooms"]:
        n_features["has_" + feat] = (X[feat] > 0).astype(int)
    
    n_features["n_features"] = n_features[["has_" + feat for feat in features]].sum(axis="columns")
    
    return n_features

### Neighborhoodhood Scores

In [20]:
class NeighborhoodScores(BaseEstimator, TransformerMixin):
    def __init__(self) -> None:
        super().__init__()

        self.neighborhood_metrics = None
        self.general_tax_score_mean = None

    def fit(
        self, X: pd.DataFrame, y: typing.Optional[typing.Any] = None
    ) -> NeighborhoodScores:
        required_cols = X[["neighborhood", "general_tax_score"]].copy()
        required_cols["neighborhood"] = required_cols["neighborhood"].apply(self._normalize_str)
        self.neighborhood_metrics = required_cols.groupby("neighborhood").mean()
        self.neighborhood_metrics.columns = ["neighborhood_general_tax_score"]

        self.general_tax_score_mean = self.neighborhood_metrics.mean()[0]

        return self

    def transform(
        self, X: pd.DataFrame, y: typing.Optional[typing.Any] = None
    ) -> pd.DataFrame:
        neighs_scores = X[["neighborhood", "general_tax_score"]].copy()
        neighs_scores["neighborhood"] = neighs_scores["neighborhood"].apply(self._normalize_str)

        joinded_df = neighs_scores.join(
            self.neighborhood_metrics, on="neighborhood", how="left"
        )
        
        result = pd.DataFrame()
        result["neighborhood_general_tax_score"] = joinded_df["neighborhood_general_tax_score"].fillna(self.general_tax_score_mean)
        
        return result

    def _normalize_str(self, string: str) -> str:
        return string.lower().strip()

## Putting Everything Together

In [21]:
feature_engineering_pipeline = make_pipeline(
    make_column_transformer(
        (
            SimpleImputer(fill_value=0.0),
            ["condominium_fee", "suites", "parking_spots", "bedrooms", "bathrooms"],
        ),
        (SimpleImputer(strategy="mean"), ["annual_iptu_tax", "usable_area"]),
        remainder="passthrough",
        verbose_feature_names_out=False,
    ),
    make_union("passthrough", FunctionTransformer(add_general_tax)),
    make_union("passthrough", FunctionTransformer(add_area_score)),
    make_union("passthrough", NeighborhoodScores()),
    make_union("passthrough", FunctionTransformer(add_number_of_features)),
    FunctionTransformer(
        lambda X: X.drop(
            [
                "neighborhood",
                "ad_date",
                "condominium_fee",
                "annual_iptu_tax",
                "general_tax",
                "general_tax_score",
            ],
            axis="columns",
        )
    ),
    make_column_transformer(
        (OneHotEncoder(sparse_output=False), ["property_type"]),
        (
            StandardScaler(),
            [
                "usable_area",
                "neighborhood_general_tax_score",
            ],
        ),
        remainder=MinMaxScaler(),
        verbose_feature_names_out=False,
    ),
)

In [22]:
transformed_features = feature_engineering_pipeline.fit_transform(features)

transformed_features

Unnamed: 0,property_type_Apartament,property_type_Condominium,property_type_Flat,property_type_House,property_type_Penthouse,property_type_Two-story House,usable_area,neighborhood_general_tax_score,suites,parking_spots,bedrooms,bathrooms,has_suites,has_parking_spots,has_bedrooms,has_bathrooms,n_features
0,0.0,0.0,0.0,1.0,0.0,0.0,0.311599,-0.208425,0.250000,0.50,0.666667,0.2,1.0,1.0,0.0,0.0,1.0
1,1.0,0.0,0.0,0.0,0.0,0.0,-0.462635,0.947172,0.250000,0.50,0.666667,0.2,1.0,1.0,0.0,0.0,1.0
2,1.0,0.0,0.0,0.0,0.0,0.0,-0.241425,0.575735,0.000000,0.50,0.666667,0.0,0.0,1.0,0.0,0.0,0.5
3,0.0,1.0,0.0,0.0,0.0,0.0,0.011386,-0.332291,0.250000,0.50,0.666667,0.0,1.0,1.0,0.0,0.0,1.0
4,1.0,0.0,0.0,0.0,0.0,0.0,0.501207,0.477077,0.750000,0.50,1.000000,0.6,1.0,1.0,0.0,0.0,1.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
82072,1.0,0.0,0.0,0.0,0.0,0.0,-0.968257,-0.343076,0.000000,0.25,0.333333,0.0,0.0,1.0,0.0,0.0,0.5
82073,1.0,0.0,0.0,0.0,0.0,0.0,-0.778649,-0.170300,0.260507,0.25,0.000000,0.0,1.0,1.0,0.0,0.0,1.0
82074,1.0,0.0,0.0,0.0,0.0,0.0,-0.889253,-0.192438,0.250000,0.25,0.333333,0.2,1.0,1.0,0.0,0.0,1.0
82075,1.0,0.0,0.0,0.0,0.0,0.0,-1.410676,-0.184797,0.500000,0.50,0.666667,0.4,1.0,1.0,0.0,0.0,1.0


In [23]:
target_transform = FunctionTransformer(np.log, inverse_func=np.exp)
transformed_target = target_transform.transform(prices.to_frame())
transformed_target

Unnamed: 0,sale_price
0,12.724866
1,13.513923
2,13.354586
3,12.860999
4,13.916260
...,...
82072,12.050296
82073,12.437180
82074,12.428344
82075,13.342302


In [24]:
artifacts_root_dir = P.join(P.dirname(P.abspath("")), "artifacts")

if not P.isdir(artifacts_root_dir):
    os.makedirs(artifacts_root_dir)

In [25]:
preprocessing_pipeline_path = P.join(
    artifacts_root_dir, "preprocessing_pipeline.pickle"
)

with open(preprocessing_pipeline_path, "wb") as f:
    cloudpickle.dump(feature_engineering_pipeline, f)

In [26]:
target_transform_path = P.join(
    artifacts_root_dir, "target_transform.pickle"
)

with open(target_transform_path, "wb") as f:
    cloudpickle.dump(target_transform, f)

In [27]:
artifacts_root_dir = P.join(P.dirname(P.abspath("")), "artifacts")

In [28]:
random_state = 42

## Voting Regressor Training

In [29]:
# catboost_model = CatBoostRegressor(
#     **{
#         "iterations": 2443,
#         "learning_rate": 0.14189955530903206,
#         "depth": 8,
#         "subsample": 0.819617389353853,
#         "colsample_bylevel": 0.9271334384759783,
#         "min_data_in_leaf": 117,
#     },
#     allow_writing_files=False,
#     silent=True,
#     random_seed=random_state
# )

xgboost_model = XGBRegressor(
    **{
        "n_estimators": 2094,
        "learning_rate": 0.09783310789549944,
        "max_depth": 6,
        "subsample": 0.894317697746841,
        "min_child_weight": 1,
        "colsample_bylevel": 0.920743729299481,
    },
    random_state=random_state
)

# random_forest_model = RandomForestRegressor(
#     **{
#         "n_estimators": 624,
#         "max_depth": 23,
#         "min_samples_leaf": 2,
#         "min_samples_split": 2,
#     },
#     n_jobs=-1,
#     random_state=random_state
# )

lightgbm_model = LGBMRegressor(
    **{
        "n_estimators": 2122,
        "max_depth": 16,
        "num_leaves": 30,
        "subsample": 0.984665661176,
        "colsample_bytree": 0.7896896099223678,
    },
    n_jobs=-1,
    random_state=random_state
)

In [30]:
voting_regressor = VotingRegressor(
    estimators=[
        (model.__class__.__name__, model)
        for model in (
            # catboost_model,
            xgboost_model,
            # random_forest_model,
            lightgbm_model,
        )
    ],
)

voting_regressor.fit(transformed_features, transformed_target)

  y = column_or_1d(y, warn=True)


In [31]:
all_predictions = voting_regressor.predict(transformed_features)
mean_squared_error(transformed_target, all_predictions)

0.02313474709153107

In [32]:
transformed_predictions = target_transform.inverse_transform(all_predictions)
mean_absolute_error(prices, transformed_predictions)

61984.98599881703

## Exporting the Final Model

In [33]:
model_path = P.join(
    artifacts_root_dir, "regression_model.pickle"
)

with open(model_path, "wb") as f:
    cloudpickle.dump(voting_regressor, f)

## Exporting API data

In [34]:
property_types_list = features["property_type"].unique().tolist()

property_types_path = P.join(artifacts_root_dir, "property_types.pickle")

with open(property_types_path, "wb") as f:
    cloudpickle.dump(property_types_list, f)

In [35]:
neighborhood_list = features["neighborhood"].unique().tolist()

neighborhood_path = P.join(artifacts_root_dir, "neighborhoods.pickle")

with open(neighborhood_path, "wb") as f:
    cloudpickle.dump(neighborhood_list, f)