In [1]:
import os
import os.path as P
import pickle

import numpy as np
import pandas as pd
import sklearn
from sklearn.model_selection import cross_validate

# Model Selection

In the last notebook we created our preprocessing pipeline and exported it as a reusable artifact. Now it's time to use our transformed features to dive deep into the Machine Learning part: training a bunch of regression models, checking their performance, and selecting some of them to be used in our future final model.

## Using our preprocessing pipeline

Let's retrieve our preprocessing pipeline.

In [2]:
# `P.abspath("")` resolves to the current working directory; the artifacts
# folder is looked up one level above it.
working_dir = P.abspath("")
artifacts_root_dir = P.join(P.dirname(working_dir), "artifacts")

In [3]:
# Path of the pickled preprocessing pipeline inside the artifacts folder.
preprocessing_pipeline_path = P.join(artifacts_root_dir, "preprocessing_pipeline.pickle")

# NOTE: `pickle.load` can execute arbitrary code while deserializing, so only
# load artifacts that come from a trusted source.
with open(preprocessing_pipeline_path, mode="rb") as pipeline_file:
    preprocessing_pipeline = pickle.load(pipeline_file)

# Bare last expression: the notebook renders the loaded object.
preprocessing_pipeline

In [4]:
# Path of the pickled target transform inside the artifacts folder.
target_transform_path = P.join(artifacts_root_dir, "target_transform.pickle")

# NOTE: `pickle.load` can execute arbitrary code while deserializing, so only
# load artifacts that come from a trusted source.
with open(target_transform_path, mode="rb") as target_transform_file:
    target_transform = pickle.load(target_transform_file)

# Bare last expression: the notebook renders the loaded object.
target_transform

Note that this time we didn't even need to use the [cloudpickle](https://github.com/cloudpipe/cloudpickle) package: pickle can deserialize the artifact right away.

## Don't Forget to Configure the Sklearn API
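We **must** execute the following line before using the pipeline, so that its transformers output pandas DataFrames rather than NumPy arrays: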

In [5]:
# Global scikit-learn setting: make transformers return pandas DataFrames
# (with column names) instead of NumPy arrays. Run this before calling
# `transform` on the loaded pipeline.
sklearn.set_config(transform_output="pandas")

## Load and Transform Data

With our transforms loaded, we can proceed to load and process the dataset.

In [6]:
# The processed dataset lives under `data/processed`, one level above the
# current working directory (`P.abspath("")`).
parent_dir = P.dirname(P.abspath(""))
preprocessed_dataset_root_dir = P.join(parent_dir, "data", "processed")

In [7]:
# Read the processed sales dataset from its parquet file.
df_file = P.join(preprocessed_dataset_root_dir, "sp_sales_data.parquet")
features = pd.read_parquet(df_file)

# `pop` removes the target column from `features` in place and returns it,
# leaving only the predictor columns behind.
target = features.pop("sale_price")

# Render both objects, in order, with their rich notebook representation.
display(features, target)

Unnamed: 0,neighborhood,property_type,usable_area,bathrooms,suites,bedrooms,parking_spots,ad_date,condominium_fee,annual_iptu_tax
0,Jardim da Saude,Two-story House,388.0,3.0,1.0,4.0,6.0,2017-02-07,,
1,Vila Santa Teresa (Zona Sul),House,129.0,2.0,1.0,3.0,2.0,2016-03-21,,
2,Vila Olimpia,Apartament,80.0,2.0,1.0,3.0,2.0,2018-10-26,686.0,1610.0
3,Pinheiros,Apartament,94.0,1.0,0.0,3.0,2.0,2018-05-29,1120.0,489.0
4,Vila Santa Clara,Condominium,110.0,1.0,1.0,3.0,2.0,2018-04-16,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...
88742,Vila Carmosina,Apartament,48.0,1.0,0.0,2.0,1.0,2017-10-07,244.0,0.0
88743,Bela Vista,Apartament,60.0,1.0,,1.0,1.0,2017-12-13,273.0,86.0
88744,Liberdade,Apartament,53.0,2.0,1.0,2.0,1.0,2018-11-28,210.0,0.0
88745,Vila Lageado,Apartament,20.0,3.0,2.0,3.0,2.0,2019-02-06,,


0         700000
1         336000
2         739643
3         630700
4         385000
          ...   
88742     171150
88743     251999
88744     249782
88745     623000
88746    1820000
Name: sale_price, Length: 88747, dtype: int64

In [8]:
# Apply the already-fitted artifacts; only `transform` is called here, so
# nothing is re-fitted on this data.
transformed_features = preprocessing_pipeline.transform(features)

# NOTE(review): the displayed values suggest a log-like transform of the sale
# price — confirm against the notebook that created `target_transform`.
transformed_target = target_transform.transform(target)

# Render both results, in order, with their rich notebook representation.
display(transformed_features, transformed_target)

Unnamed: 0,property_type_Apartament,property_type_Condominium,property_type_Flat,property_type_House,property_type_Penthouse,property_type_Residential Building,property_type_Studio Apartament,property_type_Two-story House,usable_area,condominium_fee,annual_iptu_tax,condominium_per_area,iptu_per_area,neighborhood_condominium_per_area,neighborhood_iptu_per_area,suites,parking_spots,bedrooms,bathrooms
0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,4.039322,4.616727e-18,0.000000,-0.031827,-0.015712,3.180273,0.073782,0.166667,0.857143,0.8,0.428571
1,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.229577,4.616727e-18,0.000000,-0.010645,-0.000007,-0.216482,0.135286,0.166667,0.285714,0.6,0.285714
2,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,-0.491186,-1.225603e-02,0.019209,-0.006842,0.059671,1.011119,-0.096446,0.166667,0.285714,0.6,0.285714
3,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,-0.285254,5.368350e-03,-0.005375,0.006999,-0.002027,0.616780,-0.083665,0.000000,0.285714,0.6,0.142857
4,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,-0.049903,-4.011391e-02,-0.016099,-0.042378,-0.023535,-0.325976,-0.147722,0.166667,0.285714,0.6,0.142857
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
88742,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,-0.961888,-3.020527e-02,-0.016099,-0.021312,-0.023535,-0.338419,-0.162822,0.000000,0.142857,0.4,0.142857
88743,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,-0.785375,-2.902761e-02,-0.014213,-0.023522,-0.017609,-0.139360,-0.105350,0.177136,0.142857,0.2,0.142857
88744,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,-0.888341,-3.158599e-02,-0.016099,-0.025958,-0.023535,-0.189627,-0.062737,0.166667,0.142857,0.4,0.285714
88745,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,-1.373752,4.616727e-18,0.000000,0.162300,0.128216,-0.181446,-0.077305,0.333333,0.285714,0.6,0.428571


0        13.458836
1        12.724866
2        13.513923
3        13.354586
4        12.860999
           ...    
88742    12.050296
88743    12.437180
88744    12.428344
88745    13.342302
88746    14.414347
Name: sale_price, Length: 88747, dtype: float64