In [None]:
!pip install torch==1.6.0+cu101 torchvision==0.7.0+cu101 -f https://download.pytorch.org/whl/torch_stable.html -q
!pip install --upgrade kornia -q
!pip install allennlp==1.1.0.rc4 -q
!pip install --upgrade fastai -q

In [None]:
!pip install -Uqq fastbook
import fastbook
fastbook.setup_book()

In [None]:
!pip install dtreeviz -q

In [None]:
#hide
# from fastbook import *
# from kaggle import api
from pandas.api.types import is_string_dtype, is_numeric_dtype, is_categorical_dtype
from fastai.tabular.all import *
from sklearn.ensemble import RandomForestRegressor
from sklearn.tree import DecisionTreeRegressor
# from dtreeviz.trees import *
from IPython.display import Image, display_svg, SVG

pd.options.display.max_rows = 20
pd.options.display.max_columns = 8

In [None]:
from fastai.tabular.all import *

In [None]:
from sklearn.tree import DecisionTreeRegressor

In [None]:
# Folder holding the competition files; every later read is relative to it.
path = Path('../input/bluebook-for-bulldozers')
# NOTE(review): presumably makes fastai display paths relative to this folder — confirm.
Path.BASE_PATH = path
# Show what the folder contains.
path.ls()

In [None]:
os.listdir(path)

In [None]:
from zipfile import ZipFile
# Unpack the combined training/validation archive into the local `data/` folder;
# the context manager closes the archive once extraction finishes.
with ZipFile(path/'TrainAndValid.zip') as archive:
    archive.extractall('data/')

In [None]:
df = pd.read_csv('data/TrainAndValid.csv', low_memory=False)

In [None]:
df.shape

In [None]:
df.info()

**Ordinal Variable**

In [None]:
df['ProductSize'] = df['ProductSize'].astype('category')

In [None]:
# Ordered category labels for `ProductSize`, largest first. The strings must match
# the raw column values exactly (case-sensitive): any value that is not listed here
# is replaced by NaN when these categories are applied.
sizes = 'Large','Large / Medium','Medium','Small','Mini','Compact'

In [None]:
# Assign the result back to the column: the `inplace=True` form of `set_categories`
# was deprecated in pandas 1.3 and removed in pandas 2.0, while reassignment works
# on every version.
df['ProductSize'] = df['ProductSize'].cat.set_categories(sizes, ordered=True)

**Dependent Variable**

In [None]:
dep_var = 'SalePrice'

In [None]:
df[dep_var].head()

In [None]:
# Replace the dependent variable with its natural log, so everything downstream
# (targets, predictions, RMSE) is on the log scale.
# NOTE(review): not idempotent — re-running this cell takes the log a second time.
df[dep_var] = np.log(df[dep_var])

**Handling Date**

In [None]:
df[['saledate']].head()

In [None]:
df.shape

In [None]:
df = add_datepart(df, 'saledate')

In [None]:
df.shape

In [None]:
# Names of all columns that start with 'sale' after `add_datepart` replaced `saledate`,
# joined into one space-separated string for compact display.
' '.join(a for a in df.columns if a.startswith('sale'))

In [None]:
df[[a for a in df.columns if a.startswith('sale')]].head()

In [None]:
# Test set: read `Test.csv` directly from the input folder, then expand its
# `saledate` column with `add_datepart`, the same call applied to `df` earlier.
df_test = pd.read_csv(path/'Test.csv', low_memory=False)
df_test = add_datepart(df_test, 'saledate')

# TabularPandas and TabularProc

In [None]:
procs = [Categorify, FillMissing]

In [None]:
# Time-based split: rows with saleYear >= 2011 and saleMonth >= 10 are held out
# for validation; every other row is used for training.
cond = (df.saleYear<2011) | (df.saleMonth<10)
train_idx, valid_idx = np.where(cond)[0], np.where(~cond)[0]
# `splits` is the (train positions, validation positions) pair handed to TabularPandas.
splits = (list(train_idx), list(valid_idx))

In [None]:
cont, cat = cont_cat_split(df, 1, dep_var=dep_var)

In [None]:
to = TabularPandas(df, procs=procs, cat_names=cat, cont_names=cont, splits=splits, y_names=dep_var)

In [None]:
len(to.train), len(to.valid)

# Decision Tree

In [None]:
xs,y = to.train.xs,to.train.y
valid_xs,valid_y = to.valid.xs,to.valid.y

In [None]:
m = DecisionTreeRegressor()
m.fit(xs, y);

In [None]:
def r_mse(pred,y):
    "Root mean squared error between `pred` and `y`, rounded to 6 decimal places."
    squared_error = (pred - y) ** 2
    return round(math.sqrt(squared_error.mean()), 6)
def m_rmse(m, xs, y):
    "RMSE (via `r_mse`) of the predictions model `m` makes on `xs`, measured against `y`."
    predictions = m.predict(xs)
    return r_mse(predictions, y)

In [None]:
m_rmse(m, xs, y)

In [None]:
m_rmse(m, valid_xs, valid_y)

In [None]:
m.get_n_leaves(), len(xs)

In [None]:
m = DecisionTreeRegressor(min_samples_leaf=25)
m.fit(to.train.xs, to.train.y)
m_rmse(m, xs, y), m_rmse(m, valid_xs, valid_y)

In [None]:
m.get_n_leaves()

In [None]:
from sklearn.ensemble import RandomForestRegressor

def rf(xs, y, n_estimators=40, max_samples=200_000,
       max_features=0.5, min_samples_leaf=5, **kwargs):
    """Fit and return a `RandomForestRegressor` on `xs`/`y`.

    `n_estimators`, `max_samples`, `max_features` and `min_samples_leaf` are passed
    straight to the estimator. Any extra keyword arguments are forwarded as well
    (previously they were accepted but silently ignored). `n_jobs=-1` and
    `oob_score=True` are the defaults and can be overridden through `kwargs`.
    """
    # Defaults first, so an explicit `n_jobs=` / `oob_score=` from the caller wins
    # instead of raising a duplicate-keyword error.
    extra = dict(n_jobs=-1, oob_score=True)
    extra.update(kwargs)
    return RandomForestRegressor(n_estimators=n_estimators,
        max_samples=max_samples, max_features=max_features,
        min_samples_leaf=min_samples_leaf, **extra).fit(xs, y)

In [None]:
m = rf(xs, y);

In [None]:
m_rmse(m, xs, y), m_rmse(m, valid_xs, valid_y)

In [None]:
r_mse(m.oob_prediction_, y)

In [None]:
preds = np.stack([t.predict(valid_xs) for t in m.estimators_])

In [None]:
preds.shape

In [None]:
preds_std = preds.std(0);preds_std[:5]

# Feature Importance

In [None]:
def rf_feat_importance(m, df):
    "Table of `df`'s column names (`cols`) with `m.feature_importances_` (`imp`), highest importance first."
    importances = pd.DataFrame({'cols': df.columns, 'imp': m.feature_importances_})
    return importances.sort_values('imp', ascending=False)

In [None]:
fi = rf_feat_importance(m, xs)
fi[:10]

In [None]:
def plot_fi(fi):
    "Horizontal bar chart of the `imp` column against the `cols` column of `fi`."
    chart_opts = dict(figsize=(12,7), legend=False)
    return fi.plot('cols', 'imp', 'barh', **chart_opts)

plot_fi(fi[:30]);

In [None]:
to_keep = fi[fi.imp>0.005].cols
len(to_keep)

In [None]:
xs_imp = xs[to_keep]
valid_xs_imp = valid_xs[to_keep]

In [None]:
%time m = rf(xs_imp, y)

In [None]:
m_rmse(m, xs_imp, y), m_rmse(m, valid_xs_imp, valid_y)

In [None]:
len(xs.columns), len(xs_imp.columns)

In [None]:
plot_fi(rf_feat_importance(m, xs_imp));


In [None]:

# cluster_columns(xs_imp)

# Import the submodules explicitly: a bare `import scipy` does not guarantee that
# `scipy.stats` is loaded, and `squareform` lives in `scipy.spatial.distance`
# (reaching it through `hc.distance` relies on an incidental re-export).
import scipy.stats
from scipy.cluster import hierarchy as hc
from scipy.spatial.distance import squareform

# Spearman rank correlation between every pair of columns, rounded to 4 decimals.
corr = np.round(scipy.stats.spearmanr(xs_imp).correlation, 4)
# Turn correlation into a distance (1 - corr) in the condensed form `linkage` expects.
corr_condensed = squareform(1-corr)
z = hc.linkage(corr_condensed, method='average')
fig = plt.figure(figsize=(16,10))
dendrogram = hc.dendrogram(z, labels=xs_imp.columns,
      orientation='left', leaf_font_size=16)
plt.show()

In [None]:
def get_oob(df):
    "Fit a 40-tree random forest on `df` against the notebook-level `y` and return its `oob_score_`."
    forest_params = dict(n_estimators=40, min_samples_leaf=15,
        max_samples=50000, max_features=0.5, n_jobs=-1, oob_score=True)
    forest = RandomForestRegressor(**forest_params)
    return forest.fit(df, y).oob_score_

In [None]:
get_oob(xs_imp)

In [None]:
{c:get_oob(xs_imp.drop(c, axis=1)) for c in (
    'saleYear', 'saleElapsed', 'ProductGroupDesc','ProductGroup',
    'fiModelDesc', 'fiBaseModel',
    'Hydraulics_Flow','Grouser_Tracks', 'Coupler_System')}

In [None]:
to_drop = ['saleYear', 'ProductGroupDesc', 'fiBaseModel', 'Grouser_Tracks']
get_oob(xs_imp.drop(to_drop, axis=1))

In [None]:
xs_final = xs_imp.drop(to_drop, axis=1)
valid_xs_final = valid_xs_imp.drop(to_drop, axis=1)

In [None]:
m = rf(xs_final, y)
m_rmse(m, xs_final, y), m_rmse(m, valid_xs_final, valid_y)

# Partial Dependence

In [None]:
p = valid_xs_final['ProductSize'].value_counts(sort=False).plot.barh()
c = to.classes['ProductSize']
plt.yticks(range(len(c)), c);

In [None]:
ax = valid_xs_final['YearMade'].hist()

In [None]:
import sklearn.inspection as sk_inspection

fig,ax = plt.subplots(figsize=(12, 4))
pdp_features = ['YearMade','ProductSize']
# `plot_partial_dependence` was deprecated in scikit-learn 1.0 and removed in 1.2 in
# favour of `PartialDependenceDisplay.from_estimator`. Pick whichever API the installed
# version provides so the cell runs on both old and new releases.
if hasattr(getattr(sk_inspection, 'PartialDependenceDisplay', None), 'from_estimator'):
    sk_inspection.PartialDependenceDisplay.from_estimator(
        m, valid_xs_final, pdp_features, grid_resolution=20, ax=ax)
else:
    sk_inspection.plot_partial_dependence(
        m, valid_xs_final, pdp_features, grid_resolution=20, ax=ax)

In [None]:
!pip install treeinterpreter -q
!pip install waterfallcharts -q

In [None]:
import warnings
warnings.simplefilter('ignore', FutureWarning)

from treeinterpreter import treeinterpreter
from waterfall_chart import plot as waterfall

In [None]:
row = valid_xs_final.iloc[:5]


In [None]:
prediction,bias,contributions = treeinterpreter.predict(m, row.values)

In [None]:
prediction[0], bias[0], contributions[0].sum()


In [None]:
waterfall(valid_xs_final.columns, contributions[0], threshold=0.08, 
          rotation_value=45,formatting='{:,.3f}');
