# Introduction
This notebook presents the scenario of a data scientist creating a `price prediction model` for the `House Pricing in Belo Horizonte` dataset, building on our `Exploratory Data Analysis`.

This is a research phase where `mlflow` is used to manually track experiments and decide on a model to run in `production`.

In [1]:
import logging
from os import getenv

import awswrangler as wr
from hyperopt import fmin, tpe, hp, STATUS_OK, Trials
from hyperopt.pyll import scope
import mlflow
import numpy as np
import pandas as pd
from sklearn.compose import ColumnTransformer, TransformedTargetRegressor
from sklearn.discriminant_analysis import StandardScaler
from sklearn.feature_extraction import DictVectorizer
from sklearn.linear_model import ElasticNet, Lasso, LinearRegression, Ridge
from sklearn.metrics import root_mean_squared_error
from sklearn.model_selection import train_test_split
from sklearn.pipeline import FunctionTransformer, Pipeline, make_pipeline

from library.dataset import get_dataset, split_test_dataset, prepare_features
from library.serve import predict
from library.train import train_simple_linear_regression

# S3-compatible endpoint used by awswrangler. Read from the environment (like
# TRACKING_SERVER_URI below) so the notebook can run outside a local setup;
# falls back to the original localhost endpoint when the variable is unset.
S3_ENDPOINT_URL = getenv('S3_ENDPOINT_URL', 'http://localhost:9000')
wr.config.s3_endpoint_url = S3_ENDPOINT_URL

In [2]:
# --- Experiment configuration -------------------------------------------------
# Everything a reader may want to tune lives in this cell.

# Location of the raw CSV read by `get_dataset`.
DATASET_PATH = 's3://mlops-datasets/data_kaggle_2021.csv'

# Name of the mlflow experiment, plus the free-text note attached to it.
EXPERIMENT_NAME = 'belo-horizonte-estate-pricing'
EXPERIMENT_DESCRIPTION = (
    'Regression model to predict the price of a real estate property in Belo Horizonte, Brasil '
    'using the Kaggle Dataset from 2021.'
)

# Seed passed to the train/test split so reruns are reproducible.
RANDOM_STATE = 42

# mlflow tracking server; overridable through the environment.
TRACKING_SERVER_URI = getenv('TRACKING_SERVER_URI', 'http://localhost:5000')

# Surface INFO-level messages from the helper library in the cell outputs.
logging.basicConfig(level=logging.INFO)

## Download the dataset and clean it for training

In [3]:
# Read the CSV at DATASET_PATH through the project helper. Per its log output it
# also cleans the data (duplicates, column names, numeric parsing, outliers).
# NOTE(review): `variables` is presumably the feature/target column spec used
# for training -- confirm in library.dataset.
df, variables = get_dataset(DATASET_PATH)

# Preview the first five cleaned rows.
df.head(5)

INFO:library.dataset:Reading file s3://mlops-datasets/data_kaggle_2021.csv
INFO:botocore.credentials:Found credentials in environment variables.
INFO:library.dataset:The dataset s3://mlops-datasets/data_kaggle_2021.csv contains 5981 rows.
INFO:library.dataset:Cleaning the dataset:
INFO:library.dataset:Dropping duplicates...
INFO:library.dataset:   Dropped 0 duplicated rows.
INFO:library.dataset:Standardizing the column names...
INFO:library.dataset:Converting the numeric variables received as text...
INFO:library.dataset:Removing outliers...
INFO:library.dataset:  Removed 793 rows containing outliers.
INFO:library.dataset:Number of rows after cleaning the dataset: 5188


Unnamed: 0,address,adm_fees,garage_places,price,rooms,square_foot,neighborhood,city,latitude,longitude
0,"Avenida Raja Gabaglia, 1583",470.0,1,330000.0,1,40,Luxemburgo,Belo Horizonte,-19.936415,-43.953396
1,"Rua Espírito Santo, 1171",0.0,1,480000.0,2,55,Centro,Belo Horizonte,-18.864776,-41.121777
2,"Rua dos Expedicionários, 1082",0.0,5,1190000.0,4,411,Santa Amélia,Belo Horizonte,-20.225241,-44.39778
3,"Rua Marechal Hermes, 81",750.0,4,1590000.0,4,158,Gutierrez,Belo Horizonte,-19.937155,-43.958694
4,"Rua Juruna, 110",0.0,6,550000.0,3,278,Pindorama,Belo Horizonte,-19.912982,-44.019944


## Create an experiment in mlflow

In [4]:
# Point the mlflow client at the tracking server, then select the experiment
# (mlflow creates it on first use, as the log output below shows).
mlflow.set_tracking_uri(uri=TRACKING_SERVER_URI)
mlflow.set_experiment(experiment_name=EXPERIMENT_NAME)

# Attach the human-readable description to the experiment as its note tag.
mlflow.set_experiment_tag(
    key='mlflow.note.content',
    value=EXPERIMENT_DESCRIPTION,
)

2024/08/19 17:31:49 INFO mlflow.tracking.fluent: Experiment with name 'belo-horizonte-estate-pricing' does not exist. Creating a new experiment.


## Split our data into train and test datasets
We will hold out 20% of our dataset to test and compare future models.

In [5]:
# Fraction of rows held out for testing; the remainder is used for training.
TEST_FRACTION = 0.2

# RANDOM_STATE is passed through so reruns produce the same split.
df_train, df_test = split_test_dataset(df, TEST_FRACTION, RANDOM_STATE)

INFO:library.dataset:Splitting the dataset.
INFO:library.dataset:Split the dataset into datasets of 4150 and 1038 rows.


## Train our base model
Train a simple linear regression for future reference.

In [6]:
# Fit the baseline linear regression on the training split.
# NOTE(review): the return value is indexed as `model_data[0]` further down to
# build a `runs:/...` URI, so its first element is presumably the mlflow run id
# -- confirm in library.train.
model_data = train_simple_linear_regression(
    df_train,
    variables,
)

## Add the model to the model registry
We will register our base model in the model registry and give it the `@base` alias. Since it is our only model so far, it is also our de facto `@champion`. Future challengers will have to outperform it to take that alias.

In [7]:
# The registered model shares its name with the experiment.
MODEL_NAME = EXPERIMENT_NAME

# Build the artifact URI of the model logged by the baseline training run.
run_id = model_data[0]
model_uri = f'runs:/{run_id}/model'

# Create a new version of the model in the registry from that run's artifact.
registered_model = mlflow.register_model(
    model_uri=model_uri,
    name=MODEL_NAME,
    tags={'mlflow.user': 'Quentin El Guay'},
)

# Point both aliases at the freshly created version: `base` marks the reference
# baseline and `champion` marks the version that serving code should load.
client = mlflow.MlflowClient()
for alias in ('base', 'champion'):
    client.set_registered_model_alias(
        name=registered_model.name,
        alias=alias,
        version=registered_model.version,
    )

Successfully registered model 'belo-horizonte-estate-pricing'.
2024/08/19 17:31:54 INFO mlflow.store.model_registry.abstract_store: Waiting up to 300 seconds for model version to finish creation. Model name: belo-horizonte-estate-pricing, version 1
Created version '1' of model 'belo-horizonte-estate-pricing'.


## Use the registered model to predict a price

In [8]:
# Example request payload. Every value is passed as text, including an empty
# string and a '--' placeholder for two of the numeric fields.
# NOTE(review): presumably this exercises the text-to-numeric conversion that
# `prepare_features` logs -- confirm in library.dataset.
inputs = {
    'adm_fees': '',
    'neighborhood': 'Miramar',
    'square_foot': '79',
    'rooms': '2',
    'garage_places': '--',
}

# Turn the raw payload into the feature structure passed to `model.predict`.
X = prepare_features(inputs)

INFO:library.dataset:Standardizing the column names...
INFO:library.dataset:Converting the numeric variables received as text...


In [9]:
# `logger` was referenced but never created in this notebook (only the
# `logging` module is imported), so the fallback branch below would have failed
# with a NameError instead of emitting the warning.
logger = logging.getLogger(__name__)

# Resolve whichever registered version currently holds the `champion` alias.
model_uri = f'models:/{MODEL_NAME}@champion'

try:
    model = mlflow.pyfunc.load_model(model_uri)
except ConnectionError:
    logger.warning('Unable to connect to MLFlow.')
    # NOTE(review): `load_model_from_s3` is neither defined nor imported in the
    # visible notebook; import it from the module that provides it, otherwise
    # this fallback still raises NameError.
    # NOTE(review): mlflow may surface connection failures as its own exception
    # type rather than the builtin ConnectionError -- confirm this branch is
    # actually reachable.
    model = load_model_from_s3(MODEL_NAME)

# `predict` returns one value per input row; a single row was prepared above.
predicted_price = model.predict(X)[0]
print(f'Predicted value: BRL {round(predicted_price, 2)}')

Downloading artifacts:   0%|          | 0/5 [00:00<?, ?it/s]

Predicted value: BRL 224973.66
