In [None]:
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import xgboost as xgb
from matplotlib.ticker import AutoMinorLocator, MultipleLocator
from sklearn.metrics import mean_squared_error

from airbnb_prices import DataPipeline
from airbnb_prices.eval import train_eval

### DataPipeline

The `DataPipeline` object offers a high-level wrapper to manage the entire data engineering pipeline. It is controlled by a configuration file, in this example `./config.json`.

In [None]:
# Input locations: the training CSV and the pipeline's configuration file.
TRAIN_CSV = "../data/train_airbnb_berlin.csv"
PIPELINE_CONFIG = "./config.json"

pipeline = DataPipeline.from_file(TRAIN_CSV, PIPELINE_CONFIG)

`pipeline.run()` is in charge of data preprocessing. It will: 
* Replace NaNs
* Engineer new features
* One-hot encode categoricals
* Standardize selected features
* Drop unused features

In [None]:
# Run the preprocessing steps listed above on the loaded data.
pipeline.run()

Training, validation and test data can be easily accessed:

In [None]:
# Unpack the training and validation splits exposed by the pipeline.
(X_train, y_train), (X_val, y_val) = pipeline.train_data, pipeline.val_data

# Print the shapes of each split's inputs and targets side by side.
for split_inputs, split_target in ((X_train, y_train), (X_val, y_val)):
    print(split_inputs.shape, split_target.shape)

X_val

# Model

In [None]:
# XGBoost regressor created with the library's default hyperparameters.
model = xgb.XGBRegressor()

## Training

In [None]:
# Train on the training split and evaluate against the validation split.
# NOTE(review): the returned scores are reported as RMSE below — confirm
# against train_eval.train_eval_once.
split_kwargs = dict(x_train=X_train, y_train=y_train, x_test=X_val, y_test=y_val)
fit_result = train_eval.train_eval_once(model=model, **split_kwargs)
model, train_score, val_score = fit_result

print("RMSE score on train set: {}".format(train_score))
print("RMSE score on val set: {}".format(val_score))

## Visualization

### Feature Importance

In [None]:
# Rank features by gain; bar value labels and the background grid are
# turned off to keep the chart uncluttered.
importance_ax = xgb.plot_importance(model, importance_type="gain", show_values=False)
importance_ax.grid(False)

### Tree Visualization

In [None]:
# Trees are indexed from zero, so index 3 is the fourth tree ("Tree #4").
TREE_INDEX = 3

fig, ax = plt.subplots()
ax.set_title("Tree #4")
# rankdir="LR" lays the tree out left-to-right instead of top-down.
xgb.plot_tree(model, num_trees=TREE_INDEX, ax=ax, rankdir="LR")

# Testing

In [None]:
# Predict on the test split exposed by the pipeline.
x_test, y_test = pipeline.test_data
y_pred = model.predict(x_test)

# RMSE is the square root of the mean squared error.
test_mse = mean_squared_error(y_test, y_pred)
test_score = np.sqrt(test_mse)
print(" RMSE score on test set: {}".format(test_score))

### Visualize relative error

In [None]:
# Replace the original index with a 0..n-1 range so the real prices line up
# row-by-row with the prediction array.
y_test = y_test.reset_index()["Price"]

# One row per test sample: predicted price, real price, and the absolute
# error expressed as a percentage of the real price.
comparaison = pd.DataFrame({"Prediction": y_pred, "Real Value": y_test})
absolute_error = (comparaison["Prediction"] - comparaison["Real Value"]).abs()
comparaison["Error (%)"] = absolute_error / comparaison["Real Value"] * 100

# Order rows from the most to the least accurate prediction.
comparaison = comparaison.sort_values(by="Error (%)")

comparaison

In [None]:
# Relative-error distribution on the test set.
#   Left axis:  number of predictions per relative-error bin.
#   Right axis: cumulative share of predictions up to each error level.
N_BINS = 105  # same binning for both histograms so the two curves line up

fig, ax_count = plt.subplots(figsize=(15, 7.5))
fig.suptitle("Relative and Cumulative Error of XGBoost on the Testing set")

comparaison["Error (%)"].hist(bins=N_BINS, ax=ax_count)
ax_count.set_xlabel("Error (%)")
ax_count.set_ylabel("Number of predictions")

# Major ticks every 25, each split into 5 minor intervals.
ax_count.set_xlim(0, 525)
ax_count.xaxis.set_major_locator(MultipleLocator(25))
ax_count.xaxis.set_minor_locator(AutoMinorLocator(5))

ax_count.set_ylim(0, 250)
ax_count.yaxis.set_major_locator(MultipleLocator(25))
ax_count.yaxis.set_minor_locator(AutoMinorLocator(5))

ax_count.xaxis.grid(which="major", color="#CCCCCC", linestyle="-")
ax_count.xaxis.grid(which="minor", color="#CCCCCC", linestyle="--")

# Second y-axis sharing the same x-axis. The twin's own x-axis is hidden,
# so the x label set on the primary axes is the one that is displayed.
ax_cum = ax_count.twinx()
comparaison["Error (%)"].hist(
    bins=N_BINS, ax=ax_cum, cumulative=True, density=True, histtype="step", color="orange"
)
ax_cum.set_ylabel("Cumulative density")

ax_cum.set_ylim(0, 1)
ax_cum.yaxis.set_major_locator(MultipleLocator(0.1))
# AutoMinorLocator takes the NUMBER of subdivisions per major interval, not a
# step size: 2 subdivisions of the 0.1 major step give minor ticks every 0.05.
ax_cum.yaxis.set_minor_locator(AutoMinorLocator(2))