# Test PDPbox with Regression problem
## Kaggle Rossmann store with RandomForest regressor
This demo is based on **Rossmann Store Sales**  
https://www.kaggle.com/c/rossmann-store-sales

In [None]:
%load_ext autoreload
%autoreload 2

import sys
# NOTE(review): machine-specific absolute path. Inserting it at index 0 makes
# imports search this directory first (presumably a local PDPbox checkout —
# confirm); adjust or remove it when running elsewhere.
sys.path.insert(0, "/Users/sosuneko/Desktop/develop/PDPbox")

In [None]:
import pandas as pd
from pdpbox import pdp, get_example, info_plots

In [None]:
import warnings

# Install a blanket "ignore" filter so no warnings appear in the cell outputs.
warnings.simplefilter("ignore")

# versions
- `matplotlib`: 3.1.1
- `sklearn`: 0.23.1

In [None]:
import matplotlib
print(matplotlib.__version__)

In [None]:
import sklearn
print(sklearn.__version__)

from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestRegressor

# train a simple RandomForestRegressor
We assume there is a `rossmann-store-sales` data folder in the same directory as this notebook. The dataset was downloaded directly from Kaggle with the command `kaggle competitions download -c rossmann-store-sales`.

In [None]:
! ls rossmann-store-sales/

In [None]:
# Load the daily sales records and the per-store attributes.
# NOTE(review): in the Kaggle train.csv the StateHoliday column holds both the
# number 0 and letter codes, so pandas infers a mixed-type column (and the
# DtypeWarning is hidden by the blanket warnings filter). Reading it as str
# keeps the column uniform; the later `== 'a'` style comparisons are unaffected.
train_df = pd.read_csv(
    'rossmann-store-sales/train.csv',
    dtype={'StateHoliday': str},
)
store_df = pd.read_csv('rossmann-store-sales/store.csv')
print(train_df.shape, store_df.shape)

In [None]:
train_df.head()

In [None]:
store_df.head()

In [None]:
# Left-join the store attributes onto every sales row, keyed on 'Store'.
train_df = pd.merge(train_df, store_df, how='left', on='Store')
print(train_df.shape)

## simple feature engineering

In [None]:
# Work on a copy so train_df itself is left untouched.
ross_data = train_df.copy()
ross_data['Date'] = pd.to_datetime(ross_data['Date'])
ross_data['month'] = ross_data['Date'].dt.month

# Series.dt.weekofyear was deprecated in pandas 1.1 and removed in 2.0;
# dt.isocalendar().week gives the same ISO week number. The old accessor is
# only used as a fallback on pandas versions that predate isocalendar().
# The int64 cast keeps the dtype the old accessor produced (assumes there are
# no missing dates — TODO confirm).
date_accessor = ross_data['Date'].dt
if hasattr(date_accessor, 'isocalendar'):
    ross_data['weekofyear'] = date_accessor.isocalendar().week.astype('int64')
else:
    ross_data['weekofyear'] = date_accessor.weekofyear

# Categorical columns and the category codes to expand into 0/1 indicator columns.
uni_values = {
    'StateHoliday': ['a', 'b', 'c'],
    'StoreType': ['a', 'b', 'c', 'd'],
    'Assortment': ['a', 'b', 'c'],
}

# One indicator column per (column, code) pair, named '<column>_<code>'.
for col, values in uni_values.items():
    for value in values:
        ross_data['{}_{}'.format(col, value)] = (ross_data[col] == value).astype('int64')

In [None]:
target = 'Sales'

# Model inputs in training column order: calendar fields first, then the
# indicator columns for each categorical variable.
calendar_columns = ['DayOfWeek', 'SchoolHoliday', 'month', 'weekofyear']
state_holiday_columns = ['StateHoliday_a', 'StateHoliday_b', 'StateHoliday_c']
store_type_columns = ['StoreType_a', 'StoreType_b', 'StoreType_c', 'StoreType_d']
assortment_columns = ['Assortment_a', 'Assortment_b', 'Assortment_c']
features = (
    calendar_columns
    + state_holiday_columns
    + store_type_columns
    + assortment_columns
)

# Hold out 20% of the rows; the fixed seed keeps the split reproducible.
X_train, X_test, y_train, y_test = train_test_split(
    ross_data[features],
    ross_data[target],
    random_state=42,
    test_size=0.2,
)

In [None]:
%%time
# criterion is left at its default (mean squared error). The explicit 'mse'
# spelling was renamed to 'squared_error' in scikit-learn 1.0 and is rejected
# from 1.2 on, while the default selects the same criterion on every version.
regressor = RandomForestRegressor(
    n_estimators=50,
    n_jobs=4,
    random_state=42,
)
regressor.fit(X_train, y_train)

In [None]:
regressor.score(X_test, y_test)

In [None]:
# Expose the trained artefacts under the ross_* names the plotting cells use.
ross_features, ross_model, ross_target = features, regressor, target

# [for internal usage] save for unit test and tutorial

# or you can directly load the data and model

In [None]:
# Load the Rossmann example through pdpbox's get_example helper and show
# which keys the returned mapping provides.
test_ross = get_example.ross()
test_ross.keys()

In [None]:
# Pull the dataset, feature list, model and target name out of the example mapping.
ross_data, ross_features, ross_model, ross_target = (
    test_ross[key] for key in ('data', 'features', 'rf_model', 'target')
)

In [None]:
# Show whether the loaded model exposes predict_proba (the cells below all
# pass n_classes=0).
hasattr(ross_model, "predict_proba")

In [None]:
ross_data.shape

In [None]:
ross_data.head()

# 1. Binary feature: SchoolHoliday

## 1.1 target distribution through feature 'SchoolHoliday'

In [None]:
# Target-distribution plot object for the SchoolHoliday column.
target_school_holiday = info_plots.TargetPlot(
    target=ross_target,
    feature_name="SchoolHoliday",
    feature="SchoolHoliday",
    df=ross_data,
)

In [None]:
# Draw the SchoolHoliday target plot with the plotly engine; the bare `fig`
# at the end is what makes the notebook display it.
fig, axes, summary_df = target_school_holiday.plot(
    engine='plotly',
    template='plotly_white',
    ncols=2,
    figsize=None,
    plot_params=None,
)
fig

In [None]:
summary_df

In [None]:
# Same SchoolHoliday target plot, this time with the matplotlib engine.
fig, axes, summary_df = target_school_holiday.plot(
    engine='matplotlib',
    dpi=300,
    ncols=2,
    figsize=None,
    plot_params=None,
)

## 1.2 check prediction distribution through feature 'SchoolHoliday'

In [None]:
# Prediction-distribution plot object for SchoolHoliday; no custom pred_func
# or extra predict keyword arguments are supplied.
predict_school_holiday = info_plots.PredictPlot(
    df=ross_data,
    feature="SchoolHoliday",
    feature_name="SchoolHoliday",
    model=ross_model,
    model_features=ross_features,
    n_classes=0,
    pred_func=None,
    predict_kwds={},
    chunk_size=-1,
)

In [None]:
# plotly rendering of the SchoolHoliday prediction plot, overriding the
# "gaps" -> "inner_y" plot parameter.
fig, axes, summary_df = predict_school_holiday.plot(
    engine='plotly',
    template='plotly_white',
    ncols=2,
    figsize=None,
    plot_params={"gaps": {"inner_y": 0.05}},
)
fig

In [None]:
summary_df

In [None]:
# matplotlib rendering of the SchoolHoliday prediction plot.
fig, axes, summary_df = predict_school_holiday.plot(
    engine='matplotlib',
    dpi=300,
    ncols=2,
    figsize=None,
    plot_params=None,
)

## 1.3 pdp for feature 'SchoolHoliday'

In [None]:
# Single-feature PDP object for SchoolHoliday.
pdp_SchoolHoliday = pdp.PDPIsolate(
    df=ross_data,
    feature='SchoolHoliday',
    feature_name="SchoolHoliday",
    model=ross_model,
    model_features=ross_features,
    n_classes=0,
)

In [None]:
# plotly rendering of the SchoolHoliday PDP (plot_lines switched off).
fig, axes = pdp_SchoolHoliday.plot(
    # engine and figure layout
    engine='plotly',
    template='plotly_white',
    figsize=None,
    dpi=300,
    ncols=2,
    plot_params={"pdp_hl": True},
    # curve options
    center=True,
    plot_lines=False,
    frac_to_plot=100,
    which_classes=None,
    # clustering options
    cluster=False,
    n_cluster_centers=None,
    cluster_method='accurate',
    # x-axis / distribution options
    plot_pts_dist=False,
    to_bins=False,
    show_percentile=True,
)
fig

In [None]:
# matplotlib rendering of the SchoolHoliday PDP (plot_lines switched on).
fig, axes = pdp_SchoolHoliday.plot(
    # engine and figure layout
    engine='matplotlib',
    template='plotly_white',
    figsize=None,
    dpi=300,
    ncols=2,
    plot_params={"pdp_hl": True},
    # curve options
    center=True,
    plot_lines=True,
    frac_to_plot=100,
    which_classes=None,
    # clustering options
    cluster=False,
    n_cluster_centers=None,
    cluster_method='accurate',
    # x-axis / distribution options
    plot_pts_dist=False,
    to_bins=False,
    show_percentile=True,
)

# 2. one-hot encoding feature: StoreType

## 2.1 target distribution through feature 'StoreType'

In [None]:
# Target-distribution plot object for StoreType, passed as its four
# indicator columns under a single display name.
target_store_type = info_plots.TargetPlot(
    target=ross_target,
    feature_name="StoreType",
    feature=['StoreType_a', 'StoreType_b', 'StoreType_c', 'StoreType_d'],
    df=ross_data,
)

In [None]:
# plotly rendering of the StoreType target plot; `fig` displays it.
fig, axes, summary_df = target_store_type.plot(
    engine='plotly',
    template='plotly_white',
    ncols=2,
    figsize=None,
    plot_params=None,
)
fig

In [None]:
summary_df

In [None]:
# matplotlib rendering of the StoreType target plot.
fig, axes, summary_df = target_store_type.plot(
    engine='matplotlib',
    dpi=300,
    ncols=2,
    figsize=None,
    plot_params=None,
)

## 2.2 check prediction distribution through feature 'StoreType'

In [None]:
# Prediction-distribution plot object for StoreType (four indicator columns);
# no custom pred_func is supplied.
predict_store_type = info_plots.PredictPlot(
    df=ross_data,
    feature=['StoreType_a', 'StoreType_b', 'StoreType_c', 'StoreType_d'],
    feature_name="StoreType",
    model=ross_model,
    model_features=ross_features,
    n_classes=0,
    pred_func=None,
    predict_kwds={},
    chunk_size=-1,
)

In [None]:
# plotly rendering of the StoreType prediction plot; `fig` displays it.
fig, axes, summary_df = predict_store_type.plot(
    engine='plotly',
    template='plotly_white',
    ncols=2,
    figsize=None,
    plot_params=None,
)
fig

In [None]:
summary_df

In [None]:
# matplotlib rendering of the StoreType prediction plot.
fig, axes, summary_df = predict_store_type.plot(
    engine='matplotlib',
    dpi=300,
    ncols=2,
    figsize=None,
    plot_params=None,
)

In [None]:
def ross_predict(model, X, predict_kwds=None):
    """Return the model's predictions for ``X`` shifted up by 100.

    Args:
        model: Object exposing ``predict(X, **kwargs)``.
        X: Input passed straight through to ``model.predict``.
        predict_kwds: Optional mapping of extra keyword arguments for
            ``model.predict``; ``None`` (the default) means no extras.

    Returns:
        Whatever ``model.predict`` returns, plus 100.
    """
    # None replaces the former ``{}`` default: a mutable default is created
    # once and shared by every call.
    extra_kwargs = {} if predict_kwds is None else predict_kwds
    return model.predict(X, **extra_kwargs) + 100

# Rebuild the StoreType prediction plot, this time passing the custom
# ross_predict function as pred_func.
predict_store_type = info_plots.PredictPlot(
    df=ross_data,
    feature=['StoreType_a', 'StoreType_b', 'StoreType_c', 'StoreType_d'],
    feature_name="StoreType",
    model=ross_model,
    model_features=ross_features,
    n_classes=0,
    pred_func=ross_predict,
    predict_kwds={},
    chunk_size=-1,
)

# plotly rendering; `fig` displays it.
fig, axes, summary_df = predict_store_type.plot(
    engine='plotly',
    template='plotly_white',
    ncols=2,
    figsize=None,
    plot_params=None,
)
fig

## 2.3 pdp for feature 'StoreType'

In [None]:
# Single-feature PDP object for StoreType (four indicator columns).
pdp_StoreType = pdp.PDPIsolate(
    df=ross_data,
    feature=['StoreType_a', 'StoreType_b', 'StoreType_c', 'StoreType_d'],
    feature_name="StoreType",
    model=ross_model,
    model_features=ross_features,
    n_classes=0,
)

In [None]:
# matplotlib rendering of the StoreType PDP.
fig, axes = pdp_StoreType.plot(
    # engine and figure layout
    engine='matplotlib',
    template='plotly_white',
    figsize=None,
    dpi=300,
    ncols=2,
    plot_params={"pdp_hl": True},
    # curve options
    center=True,
    plot_lines=True,
    frac_to_plot=100,
    which_classes=None,
    # clustering options
    cluster=False,
    n_cluster_centers=None,
    cluster_method='accurate',
    # x-axis / distribution options
    plot_pts_dist=False,
    to_bins=False,
    show_percentile=True,
)

In [None]:
# plotly rendering of the StoreType PDP; `fig` displays it.
fig, axes = pdp_StoreType.plot(
    # engine and figure layout
    engine='plotly',
    template='plotly_white',
    figsize=None,
    dpi=300,
    ncols=2,
    plot_params={"pdp_hl": True},
    # curve options
    center=True,
    plot_lines=True,
    frac_to_plot=100,
    which_classes=None,
    # clustering options
    cluster=False,
    n_cluster_centers=None,
    cluster_method='accurate',
    # x-axis / distribution options
    plot_pts_dist=False,
    to_bins=False,
    show_percentile=True,
)
fig

# 3. numeric feature: weekofyear

## 3.1 target distribution through feature 'weekofyear'

In [None]:
# Target-distribution plot object for the numeric weekofyear column.
target_weekofyear = info_plots.TargetPlot(
    df=ross_data,
    target=ross_target,
    feature="weekofyear",
    feature_name="weekofyear",
    # grid construction
    grid_type='percentile',
    num_grid_points=10,
    percentile_range=None,
    grid_range=None,
    cust_grid_points=None,
    endpoint=True,
    show_outliers=False,
)

In [None]:
# plotly rendering of the weekofyear target plot; `fig` displays it.
fig, axes, summary_df = target_weekofyear.plot(
    engine='plotly',
    template='plotly_white',
    ncols=2,
    figsize=None,
    plot_params=None,
    show_percentile=True,
)
fig

In [None]:
summary_df

In [None]:
# matplotlib rendering of the weekofyear target plot.
fig, axes, summary_df = target_weekofyear.plot(
    engine='matplotlib',
    dpi=300,
    ncols=2,
    figsize=None,
    plot_params=None,
    show_percentile=True,
)

## 3.2 check prediction distribution through feature 'weekofyear'

In [None]:
# Prediction-distribution plot object for the numeric weekofyear column.
predict_weekofyear = info_plots.PredictPlot(
    df=ross_data,
    feature="weekofyear",
    feature_name="weekofyear",
    # model and prediction settings
    model=ross_model,
    model_features=ross_features,
    n_classes=0,
    pred_func=None,
    predict_kwds={},
    chunk_size=-1,
    # grid construction
    grid_type='percentile',
    num_grid_points=10,
    percentile_range=None,
    grid_range=None,
    cust_grid_points=None,
    endpoint=True,
    show_outliers=False,
)

In [None]:
# plotly rendering of the weekofyear prediction plot; `fig` displays it.
fig, axes, summary_df = predict_weekofyear.plot(
    engine='plotly',
    template='plotly_white',
    ncols=2,
    figsize=None,
    plot_params=None,
    show_percentile=True,
)
fig

In [None]:
summary_df

In [None]:
# matplotlib rendering of the weekofyear prediction plot.
fig, axes, summary_df = predict_weekofyear.plot(
    engine='matplotlib',
    dpi=300,
    ncols=2,
    figsize=None,
    plot_params=None,
    show_percentile=True,
)

## 3.3 pdp for feature 'weekofyear'

In [None]:
# Single-feature PDP object for weekofyear.
pdp_weekofyear = pdp.PDPIsolate(
    df=ross_data,
    feature="weekofyear",
    feature_name="weekofyear",
    model=ross_model,
    model_features=ross_features,
    n_classes=0,
)

In [None]:
# matplotlib rendering of the weekofyear PDP, with plot_pts_dist and to_bins
# switched on.
fig, axes = pdp_weekofyear.plot(
    # engine and figure layout
    engine='matplotlib',
    template='plotly_white',
    figsize=None,
    dpi=300,
    ncols=2,
    plot_params={"pdp_hl": True},
    # curve options
    center=True,
    plot_lines=True,
    frac_to_plot=100,
    which_classes=None,
    # clustering options
    cluster=False,
    n_cluster_centers=None,
    cluster_method='accurate',
    # x-axis / distribution options
    plot_pts_dist=True,
    to_bins=True,
    show_percentile=True,
)

In [None]:
# plotly rendering of the weekofyear PDP, with plot_pts_dist and to_bins
# switched on; `fig` displays it.
fig, axes = pdp_weekofyear.plot(
    # engine and figure layout
    engine='plotly',
    template='plotly_white',
    figsize=None,
    dpi=300,
    ncols=2,
    plot_params={"pdp_hl": True},
    # curve options
    center=True,
    plot_lines=True,
    frac_to_plot=100,
    which_classes=None,
    # clustering options
    cluster=False,
    n_cluster_centers=None,
    cluster_method='accurate',
    # x-axis / distribution options
    plot_pts_dist=True,
    to_bins=True,
    show_percentile=True,
)
fig

# 4. Interaction between two variables: weekofyear and StoreType

## 4.1 target distribution through feature combination of 'weekofyear' and 'StoreType'

In [None]:
# Two-feature target plot object: weekofyear crossed with StoreType
# (the latter given as its four indicator columns).
target_weekofyear_storetype = info_plots.InterectTargetPlot(
    df=ross_data,
    target=ross_target,
    features=['weekofyear', ['StoreType_a', 'StoreType_b', 'StoreType_c', 'StoreType_d']],
    feature_names=['weekofyear', 'StoreType'],
    # grid construction
    grid_types='percentile',
    num_grid_points=10,
    percentile_ranges=None,
    grid_ranges=None,
    cust_grid_points=None,
    endpoints=True,
    show_outliers=False,
)

In [None]:
# plotly rendering of the interaction target plot with custom subplot ratio
# and inner gap; `fig` displays it.
fig, axes, summary_df = target_weekofyear_storetype.plot(
    engine='plotly',
    template='plotly_white',
    ncols=2,
    figsize=None,
    plot_params={"subplot_ratio": {"y": [7, 0.8]}, "gaps": {"inner_y": 0.2}},
    annotate=True,
    show_percentile=True,
)
fig

In [None]:
summary_df.head()

In [None]:
# matplotlib rendering of the interaction target plot.
fig, axes, summary_df = target_weekofyear_storetype.plot(
    engine='matplotlib',
    ncols=2,
    figsize=None,
    plot_params=None,
    annotate=True,
    show_percentile=True,
)

## 4.2 prediction distribution through feature combination of 'weekofyear' and 'StoreType'

In [None]:
# Two-feature prediction plot object: weekofyear crossed with StoreType.
predict_weekofyear_storetype = info_plots.InterectPredictPlot(
    df=ross_data,
    features=['weekofyear', ['StoreType_a', 'StoreType_b', 'StoreType_c', 'StoreType_d']],
    feature_names=['weekofyear', 'StoreType'],
    # model settings
    model=ross_model,
    model_features=ross_features,
    n_classes=0,
    # grid construction
    grid_types='percentile',
    num_grid_points=10,
    percentile_ranges=None,
    grid_ranges=None,
    cust_grid_points=None,
    endpoints=True,
    show_outliers=False,
)

In [None]:
# plotly rendering of the interaction prediction plot; `fig` displays it.
fig, axes, summary_df = predict_weekofyear_storetype.plot(
    engine='plotly',
    template='plotly_white',
    figsize=None,
    plot_params=None,
    annotate=True,
    show_percentile=True,
)
fig

In [None]:
summary_df.head()

In [None]:
# matplotlib rendering of the interaction prediction plot.
fig, axes, summary_df = predict_weekofyear_storetype.plot(
    engine='matplotlib',
    ncols=2,
    figsize=None,
    plot_params=None,
    annotate=True,
    show_percentile=True,
)

## 4.3 pdp for feature combination of 'weekofyear' and 'StoreType'

In [None]:
# Two-feature PDP object: weekofyear crossed with StoreType (four indicator
# columns). The display name is spelled 'StoreType' so it matches the label
# the other StoreType cells in this notebook pass.
pdp_inter = pdp.PDPInteract(
    model=ross_model,
    df=ross_data,
    model_features=ross_features,
    n_classes=0,
    features=['weekofyear', ['StoreType_a', 'StoreType_b', 'StoreType_c', 'StoreType_d']],
    feature_names=['weekofyear', 'StoreType'],
)

In [None]:
# matplotlib contour rendering of the interaction PDP (show_percentile off).
fig, axes = pdp_inter.plot(
    # engine and figure layout
    engine="matplotlib",
    template="plotly_white",
    figsize=None,
    dpi=300,
    ncols=2,
    plot_params=None,
    # what to draw
    plot_type="contour",
    plot_pdp=True,
    which_classes=None,
    # axis options
    to_bins=True,
    show_percentile=False,
)

In [None]:
# plotly contour rendering of the interaction PDP (show_percentile on);
# `fig` displays it.
fig, axes = pdp_inter.plot(
    # engine and figure layout
    engine="plotly",
    template="plotly_white",
    figsize=None,
    dpi=300,
    ncols=2,
    plot_params=None,
    # what to draw
    plot_type="contour",
    plot_pdp=True,
    which_classes=None,
    # axis options
    to_bins=True,
    show_percentile=True,
)
fig