# House Price Prediction with Gradient Boosting Regressor (scikit-learn)

## 1. Dependencies

This notebook has been tested with **Python 3.8.16** and the following package versions:

In [None]:
%%capture
!pip install verta==0.21.1
!pip install wget==3.2

## 2. Imports

In [None]:
import cloudpickle
import os
import pandas as pd
import wget

from sklearn.model_selection import train_test_split
from sklearn.ensemble import GradientBoostingRegressor
from sklearn.metrics import mean_absolute_error, mean_squared_error
from verta import Client
from verta.environment import Python
from verta.registry import VertaModelBase, verify_io
from verta.utils import ModelAPI

## 3. Model Training

### 3.1 Load Training Data

In [None]:
# Download the dataset CSV only when no local copy exists, then load it.
file_name = 'melb-data.csv'
data_url = f"http://s3.amazonaws.com/verta-starter/{file_name}"

if not os.path.isfile(file_name):
    wget.download(data_url)

df = pd.read_csv(file_name)

In [None]:
# Remove every row that has a missing value in any column.
# NOTE(review): this also drops rows whose only gaps are in columns that are
# not used as features below - confirm that is intended.
df = df.dropna(axis='index', how='any')

In [None]:
# Column the model learns to predict.
target_col = 'Price'

# Columns used as model inputs.
feature_cols = [
    'Rooms',
    'Bathroom',
    'Landsize',
    'BuildingArea',
    'Distance',
    'YearBuilt',
    'Car',
    'Propertycount',
]

In [None]:
# Split the cleaned frame into the feature matrix and the target series.
X = df[feature_cols]
y = df[target_col]

In [None]:
# Seeded split so the same rows land in train/test on every run;
# the test-set size is left at the library default.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    random_state=0,
)

### 3.2 Train/Test Code

In [None]:
# Gradient boosting regressor trained on squared error.
# random_state is fixed (matching the seeded train/test split) so that a
# Restart & Run All reproduces the same fitted model and metrics.
model = GradientBoostingRegressor(
    n_estimators=5000,
    learning_rate=0.25,
    max_depth=5,
    loss='squared_error',
    random_state=0,
)

In [None]:
# Fit the regressor on the training split.
model.fit(X=X_train, y=y_train)

In [None]:
# Score the held-out rows with the fitted model.
predictions = model.predict(X=X_test)

In [None]:
# Error metrics on the held-out split.
MAE = mean_absolute_error(y_test, predictions)
MSE = mean_squared_error(y_test, predictions)
# Derive RMSE from the MSE computed above instead of passing `squared=False`,
# which is deprecated/removed in newer scikit-learn releases. For this
# single-target regression the two are the same quantity.
RMSE = MSE ** 0.5

In [None]:
# Report each metric rounded to three decimal places.
print("MAE:", round(MAE, 3))
print("MSE:", round(MSE, 3))
print("RMSE:", round(RMSE, 3))

## 4. Verta Setup

In [None]:
# Placeholders for the Verta connection settings: fill these in before running.
# Do not commit real credentials with the notebook.
os.environ.update({
    'VERTA_HOST': '',
    'VERTA_EMAIL': '',
    'VERTA_DEV_KEY': '',
})

In [None]:
# Connect to the Verta instance named by the VERTA_HOST environment variable.
client = Client(host=os.environ['VERTA_HOST'])

## 5. Model Registration

In [None]:
# Persist the fitted estimator to disk so it can be registered as an artifact.
with open('model.pkl', 'wb') as model_file:
    cloudpickle.dump(model, model_file)

In [None]:
class HousePricePredictor(VertaModelBase):
    """Verta model wrapper that serves predictions from a pickled regressor."""

    def __init__(self, artifacts):
        """Load the serialized model.

        Parameters
        ----------
        artifacts : dict
            Mapping that must contain the key ``'serialized_model'`` whose
            value is the path of the cloudpickle file to load.
        """
        # Open inside a `with` block so the file handle is closed promptly
        # instead of being left to the garbage collector.
        # NOTE: unpickling executes arbitrary code; only load trusted artifacts.
        with open(artifacts['serialized_model'], 'rb') as model_file:
            self.model = cloudpickle.load(model_file)

    @verify_io
    def predict(self, data):
        """Predict for each batch in ``data``.

        Parameters
        ----------
        data : iterable
            Each item is passed as-is to the underlying model's ``predict``
            (so each item is itself a batch of feature rows).

        Returns
        -------
        list
            One entry per item in ``data``: that batch's predictions
            converted with ``.tolist()``.
        """
        return [self.model.predict(item).tolist() for item in data]

In [None]:
# Artifact key -> local file path; the key is the one HousePricePredictor reads.
artifacts = dict(serialized_model='model.pkl')

In [None]:
# Fetch the registry entry for this model, creating it if it does not exist yet.
registered_model = client.get_or_create_registered_model(
    name='House Price Prediction',
    labels=['GB Regression'],
)

In [None]:
# Register version 'v0' of the model, bundling the wrapper class, its Python
# environment, the input/output API description and the pickled artifact.
# NOTE(review): this rebinds `model`, which previously held the fitted
# scikit-learn estimator; re-running earlier cells that use `model` after this
# one will see the registered version instead. Restart & Run All is unaffected.
model = registered_model.create_standard_model(
    name='v0',
    model_cls=HousePricePredictor,
    environment=Python(requirements=['scikit-learn']),
    model_api=ModelAPI(X_train, y_train),
    artifacts=artifacts,
)

## 6. Model Endpoint

In [None]:
# Fetch the serving endpoint at this path, creating it if necessary.
endpoint = client.get_or_create_endpoint(path='house-price-prediction')

In [None]:
# Point the endpoint at the model version created by create_standard_model
# (bound to `model` at this point, not the scikit-learn estimator).
# wait=True presumably blocks until the update has finished - TODO confirm.
endpoint.update(model, wait=True)

In [None]:
# Obtain a handle on the model served by the endpoint, used below to send a
# sample prediction request.
deployed_model = endpoint.get_deployed_model()

In [None]:
# Smoke-test the deployment with one batch made of the first five training rows.
# Slice before converting so only five rows are turned into Python lists,
# rather than the whole training matrix.
deployed_model.predict([X_train.values[:5].tolist()])