In [1]:
import numpy as np
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
import json
import pickle
import ipywidgets as widgets

from google.colab import drive 
# Mount Google Drive at /content/gdrive; the app files copied in a later cell
# are read from this mount point.
drive.mount('/content/gdrive')

Mounted at /content/gdrive


In [2]:
!pip install treeinterpreter
!pip install waterfallcharts

from treeinterpreter import treeinterpreter
from waterfall_chart import plot as waterfall

Collecting treeinterpreter
  Downloading https://files.pythonhosted.org/packages/af/19/fa8556093f6b8c7374825118e05cf5a99c71262392382c3642ab1fd8a742/treeinterpreter-0.2.3-py2.py3-none-any.whl
Installing collected packages: treeinterpreter
Successfully installed treeinterpreter-0.2.3
Collecting waterfallcharts
  Downloading https://files.pythonhosted.org/packages/4a/f2/990e1040124bb804d75635f439379f9a712c9418e59a7161b4f47a6e467e/waterfallcharts-3.8.tar.gz
Building wheels for collected packages: waterfallcharts
  Building wheel for waterfallcharts (setup.py) ... [?25l[?25hdone
  Created wheel for waterfallcharts: filename=waterfallcharts-3.8-cp37-none-any.whl size=3415 sha256=a109bf5fb2ec6f128f69ce5994eb2b40b6261bf4e9d5cab82222e5b4932ebee1
  Stored in directory: /root/.cache/pip/wheels/f9/be/d0/59ccb7fc5d874cbea7cbfcf89e42feccfe8fa18a90a1bf4370
Successfully built waterfallcharts
Installing collected packages: waterfallcharts
Successfully installed waterfallcharts-3.8


In [3]:
# Copy the pickled model and the two JSON lookup files from the mounted Drive
# folder into the current working directory, where the following cells open
# them by bare file name.
!cp /content/gdrive/MyDrive/Colab\ Notebooks/BluebookForBulldozers/bfb_app/bfb_app_rf_model.pkl .
!cp /content/gdrive/MyDrive/Colab\ Notebooks/BluebookForBulldozers/bfb_app/bfb_categorizer_app.json .
!cp /content/gdrive/MyDrive/Colab\ Notebooks/BluebookForBulldozers/bfb_app/bfb_model_dict_app.json .

In [4]:
def _load_json(path):
  """Parse the JSON document stored at ``path`` and return the result.

  The file is opened explicitly as UTF-8 so that parsing does not depend on
  the locale's default text encoding.
  """
  with open(path, "r", encoding="utf-8") as fin:
    return json.load(fin)


# Feature name -> list of category values; used below to populate dropdowns
# and to turn a selected value into its position in that list.
bfb_categorizer = _load_json("bfb_categorizer_app.json")

# Base model name -> dict holding the "fiSecondaryDesc" and
# "fiModelDescriptor" option lists offered for that base model.
equip_model_dict = _load_json("bfb_model_dict_app.json")

In [5]:
# Names of the eight input features of the model. The order here is the same
# order in which create_model_inputs() emits its values.
# NOTE(review): this list is not referenced by the other cells visible here;
# presumably it documents the column order used at training time — confirm.
features = ['YearMade', 'ProductSize', 'Enclosure', 'year',
            'fiProductClassDesc', 'fiSecondaryDesc', 'fiBaseModel',
            'fiModelDescriptor']
# SECURITY: pickle.load can execute arbitrary code while unpickling. Only load
# a model file that comes from a trusted source.
with open("bfb_app_rf_model.pkl", "rb") as fin:
  model = pickle.load(fin)

In [6]:
# Group the "<product group> - <descriptor>" strings of fiProductClassDesc by
# product group, giving {product group: [descriptor, ...]}. The class dropdown
# offers the keys and the descriptor dropdown offers the matching list.
product_class_dict = {}
for item in bfb_categorizer["fiProductClassDesc"]:
  # Split on the first separator only, so a descriptor that itself contains
  # ' - ' stays intact instead of breaking the two-way unpacking. An entry
  # without any separator still raises ValueError, as before.
  product_group, desc = item.split(' - ', 1)
  product_class_dict.setdefault(product_group, []).append(desc)

In [49]:
# Year of manufacture. np.arange excludes its stop value, so the options run
# from 1950 through 2012; 2012 is preselected.
year_drop = widgets.Dropdown(
    options=list(np.arange(1950, 2013, 1)),
    value=2012,
    description='Year Made:',
    disabled=False,
)

# Equipment size; options are the "ProductSize" entries of bfb_categorizer.
size_drop = widgets.Dropdown(
    options=list(bfb_categorizer["ProductSize"]),
    value='Large',
    description='Size:',
    disabled=False,
)

# Cab enclosure type; options are the "Enclosure" entries of bfb_categorizer.
enclosure_drop = widgets.Dropdown(
    options=list(bfb_categorizer["Enclosure"]),
    value='None or Unspecified',
    description='Enclosure:',
    disabled=False,
)

# Hydraulics flow; options are the "Hydraulics_Flow" entries of bfb_categorizer.
# NOTE(review): in the cells visible here this widget is neither placed in the
# VBox layout nor read by create_model_inputs() — confirm whether it is needed.
hydraulics_drop = widgets.Dropdown(
    options=list(bfb_categorizer["Hydraulics_Flow"]),
    value='None or Unspecified',
    description='Hydraulics Flow:',
    disabled=False,
)

# Equipment class (product group); options are the keys of product_class_dict.
product_class_drop = widgets.Dropdown(
    options=list(product_class_dict.keys()),
    value='Backhoe Loader',
    description='Class:',
    disabled=False,
)

# Descriptor within the selected class. The initial options are the
# descriptors of the class preselected above; update_desc_dropdown() replaces
# them whenever the class changes.
# `continuous_update` was dropped: it is a trait of slider/text widgets, not of
# Dropdown, so passing it here was an unrecognized constructor argument.
product_desc_drop = widgets.Dropdown(
    options=list(product_class_dict[product_class_drop.value]),
    value='Unidentified',
    description='Descriptor:',
    disabled=False,
)

# Base model; options are the keys of equip_model_dict. No explicit `value` is
# given, and the two dropdowns below immediately read base_model_drop.value to
# pick their initial option lists.
# NOTE(review): presumably ipywidgets preselects the first option here — confirm.
base_model_drop = widgets.Dropdown(
    options=list(equip_model_dict.keys()),
    description='Base Model:',
    disabled=False,
)

# Secondary description; options are the "fiSecondaryDesc" list of the
# currently selected base model (kept in sync by update_sec_model_drop()).
sec_model_drop = widgets.Dropdown(
    options=list(equip_model_dict[base_model_drop.value]["fiSecondaryDesc"]),
    description='Secondary Desc:',
    disabled=False,
)

# Model descriptor; options are the "fiModelDescriptor" list of the currently
# selected base model (kept in sync by update_model_desc_dropdown()).
model_desc_drop = widgets.Dropdown(
    options=list(equip_model_dict[base_model_drop.value]["fiModelDescriptor"]),
    description='Model Desc:',
    disabled=False,
)

def update_desc_dropdown(*args):
  """Point the descriptor dropdown at the descriptors of the selected class.

  Registered as an observer of ``product_class_drop``; whatever arguments the
  observer machinery passes are accepted and ignored.
  """
  selected_class = product_class_drop.value
  product_desc_drop.options = product_class_dict[selected_class]

def update_sec_model_drop(*args):
  """Point the secondary-description dropdown at the selected base model.

  Registered as an observer of ``base_model_drop``; any arguments passed by
  the observer machinery are accepted and ignored.
  """
  model_entry = equip_model_dict[base_model_drop.value]
  sec_model_drop.options = model_entry["fiSecondaryDesc"]

def update_model_desc_dropdown(*args):
  """Point the model-descriptor dropdown at the selected base model.

  Registered as an observer of ``base_model_drop``; any arguments passed by
  the observer machinery are accepted and ignored.
  """
  model_entry = equip_model_dict[base_model_drop.value]
  model_desc_drop.options = model_entry["fiModelDescriptor"]

# Keep the dependent dropdowns in sync: a new class refreshes the descriptor
# list, and a new base model refreshes both of its sub-description lists.
product_class_drop.observe(update_desc_dropdown, names='value')
base_model_drop.observe(update_sec_model_drop, names='value')
base_model_drop.observe(update_model_desc_dropdown, names='value')

In [50]:
# Render matplotlib figures inline in the notebook output.
%matplotlib inline

# Output area that predict_on_click() clears and draws the waterfall chart into.
output_plot = widgets.Output()

def create_model_inputs():
  """Encode the current dropdown selections as one row of model inputs.

  Reads ``.value`` from the module-level dropdown widgets. A feature that has
  a list of categories in ``bfb_categorizer`` is replaced by the position of
  the selected value in that list; a feature without such a list is passed
  through unchanged.

  Returns:
    list: one value per feature, in the insertion order of the selections
    dict below (the same order as the module-level ``features`` list).

  Raises:
    ValueError: if a selected value is not in its category list, for example
      when a dependent dropdown has no options and therefore reports ``None``.
  """
  class_desc = f"{product_class_drop.value} - {product_desc_drop.value}"
  selections = {'YearMade': year_drop.value,
                'ProductSize': size_drop.value,
                'Enclosure': enclosure_drop.value,
                # Sale year is fixed; the app only predicts 2012 prices.
                'year': 2012,
                'fiProductClassDesc': class_desc,
                'fiSecondaryDesc': sec_model_drop.value,
                'fiBaseModel': base_model_drop.value,
                'fiModelDescriptor': model_desc_drop.value}
  encoded = []
  for name, selected in selections.items():
    categories = bfb_categorizer.get(name)
    if not isinstance(categories, list):
      # No category list for this feature: use the raw value as-is.
      encoded.append(selected)
      continue
    try:
      encoded.append(categories.index(selected))
    except ValueError:
      # Same exception type as list.index, but naming the offending feature.
      raise ValueError(
          f"{selected!r} is not a known category for feature {name!r}"
      ) from None
  return encoded

def get_predicted_price():
  """Return the model's prediction for the current dropdown selections.

  The encoded selections are reshaped into a single-row 2-D array before being
  handed to ``model.predict``; the first (only) prediction is returned.
  """
  row = np.array(create_model_inputs()).reshape(1, -1)
  predictions = model.predict(row)
  return predictions[0]

def get_predicted_breakdown():
  """Decompose the prediction for the current selections with treeinterpreter.

  Returns:
    tuple: ``(prediction, bias, contributions)`` for the single input row,
    i.e. the first entry of each array returned by ``treeinterpreter.predict``.

  The values are returned exactly as treeinterpreter reports them; an earlier
  exponentiation step that converted them to USD is no longer applied.
  """
  row = np.array(create_model_inputs()).reshape(1, -1)
  prediction, bias, contributions = treeinterpreter.predict(model, row)
  return prediction[0, 0], bias[0], contributions[0]

def plot_predicted_breakdown(c_in_usd=None):
  """Draw the waterfall chart of the price breakdown.

  Args:
    c_in_usd: sequence of nine values matching the bar labels below: the base
      value first, followed by the eight per-feature contributions. When
      ``None``, the values are computed from ``get_predicted_breakdown()``.

  Returns:
    Whatever ``waterfall_chart.plot`` returns for the drawn chart.
  """
  if c_in_usd is None:
    # The labels start with 'Base Price', so the bias has to be prepended to
    # the eight contributions; using the contributions alone would leave the
    # values one short of the labels.
    _, bias, contributions = get_predicted_breakdown()
    c_in_usd = [bias] + list(contributions)
  feature_names = ['Base Price', 'Year Made', 'Size', 'Enclosure', 'year',
                   'Class', 'SecondaryDesc', 'Base', 'Descriptor']
  plt.figure(figsize=(13, 8))
  wf_plot = waterfall(feature_names, c_in_usd, rotation_value=75)
  return wf_plot

# Three initially empty text labels; total_prediction() fills them in.
label_pred1, label_pred2, label_pred3 = (widgets.Label() for _ in range(3))

def total_prediction():
  """Update the result labels and draw the breakdown chart.

  Writes the headline text, the formatted predicted price and the chart
  caption into the three label widgets, then plots the base value followed by
  the per-feature contributions.

  Returns:
    The value returned by ``plot_predicted_breakdown``.
  """
  price, base_value, contributions = get_predicted_breakdown()
  label_pred1.value = "Predicted price for this piece of equipment is"
  label_pred2.value = f"    $ {price:,.2f}"
  label_pred3.value = "Breakdown on the price by attributes:"
  bars = [base_value]
  bars.extend(contributions)
  return plot_predicted_breakdown(bars)


def predict_on_click(change):
  """Button callback: recompute the prediction and redraw the chart.

  Args:
    change: positional argument supplied by the button's click handler; unused.
  """
  # Updates the labels and creates the waterfall figure.
  total_prediction()
  output_plot.clear_output()
  with output_plot:
    # pyplot.show() renders the open figures and takes no figure argument, so
    # the chart object is not passed to it.
    plt.show()

# Button that triggers a fresh prediction and chart when clicked.
button = widgets.Button(description='Predict Price')
button.on_click(predict_on_click)

This Bluebook For Bulldozers app is based on the <a href="https://www.kaggle.com/c/bluebook-for-bulldozers/overview">Kaggle competition</a> of the same name. The goal was to predict the auction price of heavy construction equipment based on function, size, model, and configuration described in a dataset of 53 fields.

However, the goal of this app is to create a simplified user interface with only 8 inputs to predict an accurate price. Even with so few inputs, a random forest model was trained and achieved a validation root mean squared log error (RMSLE) of 0.240, which would place it just inside the top 20 of the competition's <a href="https://www.kaggle.com/c/bluebook-for-bulldozers/leaderboard">final results</a>. Not bad for a model using only about a seventh of the original dataset.

To use the app, make selections from the dropdowns below. Because the dataset is from 2012, that is the latest year that can be selected for year made, and all prices are predicted for a sale year of 2012 as well. In other words, the app predicts what the equipment would have sold for in 2012. This is important because random forests are not good at extrapolating outside of the domain of the data.

Enter the general type of the equipment in the class field, and a more specific physical description in the descriptor field. The full model name is split into three distinct parts: the base model, secondary description, and model description. So a "D3G XL" would be entered as "D3" for the base model, "G" for the secondary description, and "XL" for the model description. Secondary and model description options are dependent on the base model selected.

When the price is predicted, a waterfall chart is plotted below that describes how each option contributes to the final prediction. The base price is the average price of all pieces of equipment, and from this starting point the plot shows how much each field adds or subtracts to reach the final price. A year field (not just the year made) is included in the price prediction because this was a key feature in the model, but it is fixed at 2012 to predict prices at a time relevant to the dataset.

In [51]:
# Stack the input dropdowns, the predict button, the three result labels and
# the chart output area vertically, in that order.
widgets.VBox(children=[
    year_drop, product_class_drop, product_desc_drop,
    base_model_drop, sec_model_drop, model_desc_drop,
    size_drop, enclosure_drop,
    button,
    label_pred1, label_pred2, label_pred3,
    output_plot,
])

VBox(children=(Dropdown(description='Year Made:', index=62, options=(1950, 1951, 1952, 1953, 1954, 1955, 1956,…