# Test PDPbox with a multiclass classification problem
## Kaggle Otto Group with RandomForest
This demo is based on the **Otto Group Product Classification Challenge**:  
https://www.kaggle.com/c/otto-group-product-classification-challenge

In [None]:
# Enable IPython's autoreload extension; mode 2 re-imports all modules before
# each cell runs, so edits to the library source are picked up without a
# kernel restart.
%load_ext autoreload
%autoreload 2

import sys
# Put a PDPbox source checkout at the front of the import path so that
# `import pdpbox` resolves to it before any installed copy.
# NOTE(review): this is an absolute path on the original author's machine --
# point it at your own checkout, or remove it if pdpbox is pip-installed.
sys.path.insert(0, "/Users/sosuneko/Desktop/develop/PDPbox")

In [None]:
import pandas as pd
from pdpbox import pdp, get_example, info_plots

In [None]:
import warnings
# Suppress every Python warning for the rest of the session (presumably to
# keep the notebook output uncluttered). Note this also hides deprecation
# warnings from the libraries used below.
warnings.filterwarnings("ignore")

# versions
- `matplotlib`: 3.1.1
- `sklearn`: 0.23.1

In [None]:
import matplotlib
# Print the installed matplotlib version so it can be compared with the
# version listed in the "versions" section above.
print(matplotlib.__version__)

In [None]:
import sklearn
# Print the installed scikit-learn version so it can be compared with the
# version listed in the "versions" section above.
print(sklearn.__version__)

# Tools used further down to split the data and train the model.
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier

# train a simple RandomForestClassifier
We assume there is an `otto-group-product-classification-challenge` data folder in the same directory as this notebook. The dataset is downloaded directly from Kaggle with the command `kaggle competitions download -c otto-group-product-classification-challenge`.

In [None]:
# Shell escape: list the dataset folder to confirm the files are in place.
! ls otto-group-product-classification-challenge

In [None]:
# Load the Kaggle training file from the local dataset folder.
train_df = pd.read_csv('otto-group-product-classification-challenge/train.csv')
# Report (number of rows, number of columns) of the loaded frame.
print(train_df.shape)

In [None]:
# Preview the first rows of the raw training data.
train_df.head()

In [None]:
# Names of the model input columns: 'feat_1', 'feat_2', ..., 'feat_93'.
# range(1, 94) is used because the upper bound is exclusive.
features = ['feat_{}'.format(number) for number in range(1, 94)]
# Work on a copy so the raw frame loaded from disk stays untouched.
otto_data = train_df.copy()
uni_targets = sorted(otto_data['target'].unique())

# otto_target starts with the raw label column and then collects one 0/1
# indicator column name per class: 'target_0', 'target_1', ...
otto_target = ['target']
for idx, value in enumerate(uni_targets):
    # Register the new column name first, then fill that column in the frame.
    otto_target.append('target_{}'.format(idx))
    otto_data[otto_target[-1]] = (otto_data['target'] == value).map(int)
print(otto_target)

# Hold out 20% of the rows for evaluation; the fixed seed keeps the split
# reproducible between runs.
X_train, X_test, y_train, y_test = train_test_split(
    otto_data[features],
    otto_data['target'],
    test_size=0.2,
    random_state=42,
)

In [None]:
%%time
# `%%time` (which must stay on the first line of the cell) reports how long
# the training below takes.
# Small random forest: 30 trees, trained with 4 parallel jobs, with a fixed
# seed so repeated runs give the same model.
classifier = RandomForestClassifier(
    n_estimators=30, 
    n_jobs=4,
    random_state=42
)
# Fit on the training split produced by train_test_split above.
classifier.fit(X_train, y_train)

In [None]:
# Mean accuracy of the fitted forest on the held-out test split.
classifier.score(X_test, y_test)

In [None]:
# Expose the feature list and the fitted model under the `otto_*` names that
# the plotting cells refer to.
otto_features, otto_model = features, classifier

# [for internal usage] save for unit test and tutorial

# or you can directly load the data and model

In [None]:
# Load the Otto example bundle through pdpbox's `get_example` helper, as an
# alternative to the training cells above.
test_otto = get_example.otto()
# Show which entries the bundle provides.
test_otto.keys()

In [None]:
# Pull the four pieces used by the rest of the notebook out of the example
# bundle. This overrides any `otto_*` values produced by the training cells.
otto_data, otto_features, otto_model, otto_target = (
    test_otto[key] for key in ('data', 'features', 'rf_model', 'target')
)

In [None]:
# Preview the first rows of the data frame that the plots will use.
otto_data.head()

# 1. single numeric feature: 'feat_67'

## 1.1 target distribution through feature 'feat_67'

In [None]:
# Build the target-distribution plot object for the single feature 'feat_67'.
# Plotting happens in the following cells via `target_67.plot(...)`.
target_67 = info_plots.TargetPlot(
    df=otto_data,
    feature="feat_67",
    feature_name="feat_67",
    # Drop the first entry of otto_target. In the list built by the training
    # cells that entry is the raw 'target' label and the remaining entries are
    # the per-class indicator columns; presumably the packaged example uses
    # the same layout -- confirm with `otto_target`.
    target=otto_target[1:],
    # Bucket the feature into a percentile-based grid of 10 points.
    num_grid_points=10,
    grid_type='percentile',
    percentile_range=None,
    grid_range=None,
    cust_grid_points=None,
    show_outliers=False,
    endpoint=True,
)

In [None]:
# Render the target plot with the plotly engine, two subplots per row.
# which_classes=None presumably means "plot every class" -- confirm in the
# PDPbox documentation.
fig, axes, summary_df = target_67.plot(
    which_classes=None,
    show_percentile=True,
    figsize=(1200, 400),
    ncols=2,
    plot_params={"gaps": {"outer_y": 0.05}},
    engine='plotly',
    template='plotly_white',
)
# A bare `fig` as the last expression makes the notebook display the figure.
fig

In [None]:
# Same plotly rendering as the previous cell, but restricted via
# `which_classes` to classes 1 and 2, with an extra "top" gap setting.
fig, axes, summary_df = target_67.plot(
    which_classes=[1, 2],
    show_percentile=True,
    figsize=(1200, 400),
    ncols=2,
    plot_params={"gaps": {"outer_y": 0.05, "top": 0.1}},
    engine='plotly',
    template='plotly_white',
)
# A bare `fig` as the last expression makes the notebook display the figure.
fig

In [None]:
# Display the summary table returned by the most recent plot() call.
summary_df

In [None]:
# Render the same target plot with the matplotlib engine instead of plotly.
# NOTE(review): figsize here is (16, 6) while the plotly cells use
# (1200, 400) -- presumably inches vs. pixels; confirm in the PDPbox docs.
fig, axes, summary_df = target_67.plot(
    show_percentile=True,
    figsize=(16, 6),
    dpi=300,
    ncols=2,
    plot_params={"gaps": {"outer_y": 0.4}, "title": {"subplot_title": {"fontsize": 11}}},
    engine='matplotlib',
)

## 1.2 check prediction distribution through feature 'feat_67'

In [None]:
# Build the prediction-distribution plot object for 'feat_67', using the
# random forest's predictions over `otto_data`.
predict_67 = info_plots.PredictPlot(
    model=otto_model,
    df=otto_data,
    # Columns the model was trained on.
    model_features=otto_features,
    feature="feat_67",
    feature_name="feat_67",
    # Left as None -- presumably PDPbox then infers the prediction function
    # and the number of classes from the model; confirm in the PDPbox docs.
    pred_func=None,
    n_classes=None,
    # Same percentile-based grid of 10 points as the target plot above.
    num_grid_points=10,
    grid_type='percentile',
    percentile_range=None,
    grid_range=None,
    cust_grid_points=None,
    show_outliers=False,
    endpoint=True,
    predict_kwds={},
    chunk_size=-1,
)

In [None]:
# Render the prediction plot with the plotly engine. figsize and plot_params
# are passed as None, so the library's own defaults apply.
fig, axes, summary_df = predict_67.plot(
    show_percentile=True,
    figsize=None,
    ncols=2,
    plot_params=None,
    engine='plotly',
    template='plotly_white',
)
# A bare `fig` as the last expression makes the notebook display the figure.
fig

In [None]:
# Display the summary table returned by the most recent plot() call.
summary_df

In [None]:
# Render the prediction plot with the matplotlib engine, restricted via
# `which_classes` to classes 1, 3, 5 and 8.
fig, axes, summary_df = predict_67.plot(
    which_classes=[1, 3, 5, 8],
    show_percentile=True,
    figsize=None,
    dpi=300,
    ncols=2,
    plot_params={"title": {"subplot_title": {"fontsize": 10}}, "gaps": {"outer_y": 0.25}},
    engine='matplotlib',
)

## 1.3 pdp for feature 'feat_67'

In [None]:
# Build the partial dependence (PDP) object for the single feature 'feat_67'.
# Plotting happens in the following cells via `pdp_feat_67_rf.plot(...)`.
pdp_feat_67_rf = pdp.PDPIsolate(
    model=otto_model,
    df=otto_data,
    # Columns the model was trained on.
    model_features=otto_features,
    feature="feat_67",
    feature_name="feat_67",
    # Presumably the number of rows sent to the model per prediction batch --
    # confirm in the PDPbox docs.
    chunk_size=10000,
)

In [None]:
# Render the PDP with the plotly engine, three subplots per row.
# Unlike the info plots above, this plot() returns only (fig, axes).
fig, axes = pdp_feat_67_rf.plot(
    plot_lines=True,
    frac_to_plot=100,
    plot_pts_dist=True,
    # to_bins=False here; the matplotlib cell below uses to_bins=True.
    to_bins=False,
    ncols=3,
    figsize=(1200, 400),
    show_percentile=True,
    plot_params={"pdp_hl": True, "gaps": {"outer_x": 0.06, "inner_y": 0.02, "outer_y": 0.1}},
    engine='plotly',
    template='plotly_white',
)
# A bare `fig` as the last expression makes the notebook display the figure.
fig

In [None]:
# Render the same PDP with the matplotlib engine, this time with
# to_bins=True (the plotly cell above uses to_bins=False).
fig, axes = pdp_feat_67_rf.plot(
    plot_lines=True,
    frac_to_plot=100,
    plot_pts_dist=True,
    to_bins=True,
    figsize=(16, 6.5),
    show_percentile=True,
    ncols=3,
    plot_params={"pdp_hl": True, "title": {"subplot_title": {"fontsize": 10}}, "gaps": {"outer_y": 0.4}},
    engine='matplotlib',
)

# 2. Interaction between two variables: feat_67 and feat_25

## 2.1 target distribution through feature combination of 'feat_67' and 'feat_25'

In [None]:
# Build the target-distribution plot object for the feature pair
# ('feat_67', 'feat_25'). This is the two-feature counterpart of TargetPlot:
# its arguments are the plural forms (features, grid_types, ...).
# NOTE(review): the class name is spelled "Interect" here; the notebook
# relies on this exact attribute name, so do not "fix" it without checking
# the installed pdpbox.
target_67_25 = info_plots.InterectTargetPlot(
    df=otto_data,
    features=['feat_67', 'feat_25'],
    feature_names=['feat_67', 'feat_25'],
    # Drop the first entry of otto_target (the raw 'target' label in the list
    # built by the training cells; presumably the same in the packaged
    # example) and keep the per-class indicator columns.
    target=otto_target[1:],
    num_grid_points=10,
    grid_types='percentile',
    percentile_ranges=None,
    grid_ranges=None,
    cust_grid_points=None,
    show_outliers=False,
    endpoints=True,
)

In [None]:
# Render the interaction target plot with the plotly engine, restricted via
# `which_classes` to classes 0-3, two subplots per row, annotations off.
fig, axes, summary_df = target_67_25.plot(
    which_classes=[0, 1, 2, 3],
    show_percentile=False,
    figsize=None,
    ncols=2,
    annotate=False,
    plot_params={"gaps": {"inner_y": 0.06}},
    engine='plotly',
    template='plotly_white',
)
# A bare `fig` as the last expression makes the notebook display the figure.
fig

In [None]:
# Render the same interaction target plot (classes 0-3) with the matplotlib
# engine, with a smaller subplot-title font.
fig, axes, summary_df = target_67_25.plot(
    which_classes=[0, 1, 2, 3],
    show_percentile=False,
    figsize=None,
    dpi=300,
    ncols=2,
    annotate=False,
    plot_params={"title": {"subplot_title": {"fontsize": 10}}},
    engine='matplotlib',
)

## 2.2 prediction distribution through feature combination of 'feat_67' and 'feat_25'

In [None]:
# Build the prediction-distribution plot object for the feature pair
# ('feat_67', 'feat_25'), using the random forest's predictions.
# NOTE(review): the class name is spelled "Interect" here; the notebook
# relies on this exact attribute name, so do not "fix" it without checking
# the installed pdpbox.
predict_67_25 = info_plots.InterectPredictPlot(
    model=otto_model,
    df=otto_data,
    # Columns the model was trained on.
    model_features=otto_features,
    features=['feat_67', 'feat_25'], 
    feature_names=['feat_67', 'feat_25'],
    # Left as None -- presumably PDPbox then infers the prediction function
    # and the number of classes from the model; confirm in the PDPbox docs.
    pred_func=None,
    n_classes=None,
    num_grid_points=10,
    grid_types='percentile',
    percentile_ranges=None,
    grid_ranges=None,
    cust_grid_points=None,
    show_outliers=False,
    endpoints=True,
    predict_kwds={},
    chunk_size=-1,
)

In [None]:
# Render the interaction prediction plot with the plotly engine, restricted
# via `which_classes` to classes 1, 3, 5 and 6, annotations off.
fig, axes, summary_df = predict_67_25.plot(
    which_classes=[1, 3, 5, 6],
    show_percentile=False,
    figsize=None,
    ncols=2,
    annotate=False,
    plot_params={"gaps": {"inner_y": 0.06}},
    engine='plotly',
    template='plotly_white',
)
# A bare `fig` as the last expression makes the notebook display the figure.
fig

In [None]:
# Preview the first rows of the summary table returned by the most recent
# plot() call.
summary_df.head()

In [None]:
# Render the same interaction prediction plot (classes 1, 3, 5, 6) with the
# matplotlib engine, using the library's default plot_params.
fig, axes, summary_df = predict_67_25.plot(
    which_classes=[1, 3, 5, 6],
    show_percentile=False,
    figsize=None,
    dpi=300,
    ncols=2,
    annotate=False,
    plot_params=None,
    engine='matplotlib',
)

## 2.3 pdp for feature combination of 'feat_67' and 'feat_25'

In [None]:
# Build the two-feature partial dependence (interaction) object for
# 'feat_67' and 'feat_25'. Plotting happens in the following cells.
pdp_67_25 = pdp.PDPInteract(
    model=otto_model,
    df=otto_data,
    # Columns the model was trained on.
    model_features=otto_features,
    features=['feat_67', 'feat_25'],
    feature_names=['feat_67', 'feat_25'],
)

In [None]:
# Render the interaction PDP as a "grid" plot with the matplotlib engine,
# restricted via `which_classes` to classes 2, 4, 6 and 8.
# NOTE(review): `template="plotly_white"` is passed alongside
# engine="matplotlib"; the other matplotlib cells in this notebook omit it,
# so it is presumably only used by the plotly engine -- confirm.
fig, axes = pdp_67_25.plot(
    plot_type="grid",
    to_bins=True,
    plot_pdp=False,
    show_percentile=True,
    which_classes=[2, 4, 6, 8],
    figsize=None,
    dpi=300,
    ncols=2,
    plot_params={"gaps": {"outer_y": 0.2}},
    engine="matplotlib",
    template="plotly_white",
)

In [None]:
# Render the interaction PDP as a "contour" plot with the plotly engine, for
# the same classes (2, 4, 6, 8), this time with plot_pdp=True.
# NOTE(review): `dpi=300` is passed alongside engine="plotly"; the other
# plotly cells in this notebook omit it, so it is presumably only used by
# the matplotlib engine -- confirm.
fig, axes = pdp_67_25.plot(
    plot_type="contour",
    to_bins=True,
    plot_pdp=True,
    show_percentile=False,
    which_classes=[2, 4, 6, 8],
    figsize=None,
    dpi=300,
    ncols=2,
    plot_params=None,
    engine="plotly",
    template="plotly_white",
)
# A bare `fig` as the last expression makes the notebook display the figure.
fig