# House Price Prediction with Gradient Boosting Regressor (scikit-learn)

<a href="https://colab.research.google.com/github/VertaAI/examples/blob/main/deployment/sklearn/sklearn-housing-regression-gbr.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

## 1. Dependencies

This notebook has been tested with **Python 3.8.16** and the following package versions:

In [None]:
%%capture
!pip install verta==0.21.1
!pip install wget==3.2

## 2. Imports

In [None]:
import cloudpickle
import os
import pandas as pd
import wget

from sklearn.model_selection import train_test_split
from sklearn.ensemble import GradientBoostingRegressor
from sklearn.metrics import mean_absolute_error, mean_squared_error
from verta import Client
from verta.environment import Python
from verta.registry import VertaModelBase, verify_io
from verta.utils import ModelAPI

## 3. Model Training

### 3.1 Load Training Data

In [None]:
# Download the housing dataset once; later runs reuse the local copy.
file_name = 'melb-data.csv'

if not os.path.isfile(file_name):
    # Fetch over HTTPS so the training data cannot be altered in transit.
    wget.download(f"https://s3.amazonaws.com/verta-starter/{file_name}")

df = pd.read_csv(file_name)

In [None]:
# Discard every row that contains a missing value.
df = df.dropna(axis='index')

In [None]:
# Column the model learns to predict.
target_col = 'Price'

# Input columns, in the order the model receives them.
feature_cols = [
    'Rooms',
    'Bathroom',
    'Landsize',
    'BuildingArea',
    'Distance',
    'YearBuilt',
    'Car',
    'Propertycount',
]

In [None]:
# Feature matrix (all rows, selected feature columns) and target series.
X = df[feature_cols]
y = df[target_col]

In [None]:
# Hold out part of the data for evaluation; the fixed random_state keeps the
# split identical across runs.
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=0)

### 3.2 Train/Test Code

In [None]:
# Gradient boosting regressor configuration used for this example.
model = GradientBoostingRegressor(
    loss='squared_error',
    learning_rate=0.25,
    n_estimators=5000,
    max_depth=5,
)

In [None]:
# Fit the regressor on the training split.
model.fit(X_train, y_train)

In [None]:
# Predict prices for the held-out test rows.
predictions = model.predict(X_test)

In [None]:
# Error metrics on the held-out test set.
MAE = mean_absolute_error(y_test, predictions)
MSE = mean_squared_error(y_test, predictions)
# RMSE is derived from MSE directly: the `squared=False` flag of
# mean_squared_error is deprecated since scikit-learn 1.4 and removed in 1.6,
# while the square root of MSE works on every version.
RMSE = MSE ** 0.5

In [None]:
# Report each metric on its own line, rounded to three decimals.
print(
    f"MAE: {round(MAE, 3)}",
    f"MSE: {round(MSE, 3)}",
    f"RMSE: {round(RMSE, 3)}",
    sep="\n",
)

## 4. Verta Setup

In [None]:
# Use local env vars or uncomment and fill out the lines below:
# os.environ['VERTA_EMAIL'] = ''
# os.environ['VERTA_DEV_KEY'] = ''
# os.environ['VERTA_HOST'] = ''

In [None]:
# Create the Verta client. No arguments are passed here: the connection
# details are expected in the VERTA_EMAIL, VERTA_DEV_KEY and VERTA_HOST
# environment variables.
client = Client()

In [None]:
# Name of the registered model that groups all versions of this example.
MODEL_NAME = 'House Price Prediction (Example)'
# Name given to the model version created from this notebook.
VERSION = 'sklearn-gbm'
# Name of the endpoint the model version is deployed to.
ENDPOINT_NAME = 'house-price-prediction-sklearn-gbm'

## 5. Model Class

In [None]:
class HousePricePredictor(VertaModelBase):
    """Verta model wrapper that serves the pickled house-price regressor.

    The estimator is loaded from the ``serialized_model`` artifact when the
    class is instantiated and is exposed through ``predict``.
    """

    def __init__(self, artifacts):
        """Load the serialized estimator.

        Args:
            artifacts: Mapping from artifact key to local file path. The
                ``serialized_model`` entry must point to a cloudpickle file.
        """
        # NOTE: unpickling executes arbitrary code, so only artifacts produced
        # by a trusted training run should be loaded here.
        # The context manager closes the file handle as soon as the model is
        # loaded instead of leaving it open until garbage collection.
        with open(artifacts['serialized_model'], 'rb') as model_file:
            self.model = cloudpickle.load(model_file)

    @verify_io
    def predict(self, data):
        """Return the predicted prices for ``data`` as a plain Python list.

        Args:
            data: List of rows, each a list of numeric feature values shaped
                like the row returned by ``example``.

        Returns:
            A list with one predicted price per input row.
        """
        return self.model.predict(data).tolist()

    def describe(self):
        """Return a dictionary describing the ``predict`` method."""
        return {
            'method': 'predict',
            'args': f"{self.example()}",
            'returns': '[0]',
            'description': "Predicts house prices based on scikit-learn's GradientBoostingRegressor trained model.",
            'input_description': 'List of house information, such as number of rooms, building area, land size, etc.',
            'output_description': 'House price prediction.'
        }

    def example(self):
        """Return a sample ``predict`` input: one row of eight feature values."""
        return [[3.0, 1.0, 206.0, 110.0, 8.4, 1980.0, 1.0, 8801.0]]

## 6. Model Registration

In [None]:
# Serialize the trained estimator to disk so it can be uploaded as an artifact.
with open('model.pkl', 'wb') as pickle_file:
    cloudpickle.dump(model, pickle_file)

In [None]:
# Maps the artifact key read by HousePricePredictor.__init__ to the pickled
# model file written above.
artifacts = {'serialized_model': 'model.pkl'}

In [None]:
# Look up the registered model called MODEL_NAME, creating it if needed.
registered_model = client.get_or_create_registered_model(name=MODEL_NAME)

In [None]:
model = registered_model.create_standard_model(
    name = VERSION,
    model_cls = HousePricePredictor,
    environment = Python(requirements=['scikit-learn']),
    model_api = ModelAPI(X_train, y_train),
    artifacts = artifacts
)

## 7. Model Endpoint

In [None]:
endpoint = client.get_or_create_endpoint(ENDPOINT_NAME)

In [None]:
endpoint.update(model, wait=True)

In [None]:
# Handle used below to send prediction requests to the endpoint.
deployed_model = endpoint.get_deployed_model()

## 8. Predictions

In [None]:
# Slice the first five test rows before converting, so only those rows
# (not the whole test set) are turned into Python lists.
sample_data = X_test.iloc[:5].values.tolist()

In [None]:
# Send the sample rows to the deployed model; the returned values are used
# below as one prediction per row.
results = deployed_model.predict(sample_data)

In [None]:
df = pd.DataFrame(data=sample_data, columns=feature_cols)

In [None]:
df['PricePrediction'] = [round(i, 2) for i in results]

In [None]:
df

In [None]:
# Uncomment the line below if you want to delete the created endpoint:
# endpoint.delete()