In [1]:
import fastbook

In [2]:
import pandas as pd
import numpy as np
import matplotlib as mpl
import matplotlib.pyplot as plt

from sklearn.model_selection import StratifiedShuffleSplit
from sklearn.base import BaseEstimator, TransformerMixin
from sklearn.preprocessing import PolynomialFeatures
#from sklearn.preprocessing import StandardScaler
from sklearn.pipeline import Pipeline

#from sklearn.linear_model import SGDRegressor
from sklearn.ensemble import RandomForestRegressor
#from sklearn.ensemble import GradientBoostingRegressor
from sklearn.model_selection import GridSearchCV

from pathlib import Path

In [3]:
from sklearn.compose import ColumnTransformer

In [4]:
from sklearn.preprocessing import OrdinalEncoder

In [5]:
import dtreeviz
from sklearn.tree import DecisionTreeRegressor

In [6]:
DATA_PATH = Path.cwd()/'Data'

In [7]:
N_JOBS=-1
N_CV=5
RAND_STATE = 0

In [8]:
full_df = pd.read_csv(DATA_PATH/'Final_data.csv', low_memory=False)
full_df.drop(labels='AlkPhos_UL', inplace=True, axis=1, errors='ignore')

#### Splitting

In [10]:
#Adding categorical age column for proportional/stratified train/test split
AGE_GROUP_AMOUNT = 8
full_df['AGE_GROUP'] = pd.cut(full_df['AGE'], bins=AGE_GROUP_AMOUNT, labels=range(AGE_GROUP_AMOUNT))

#Make train test split with proportional age groups
split = StratifiedShuffleSplit(n_splits=1, test_size=0.1, random_state=RAND_STATE)
for train_index, test_index in split.split(full_df, full_df['AGE_GROUP']):
    strat_train_set = full_df.iloc[train_index]
    strat_test_set = full_df.iloc[test_index]

##### EDA

In [10]:
# df.AGE.hist(bins=50, figsize=(10,10)), strat_test_set.AGE.hist(bins=50, figsize=(10,10))





# features = df.drop(labels=["SEQN", "GENDER", "AGE_GROUP"], axis=1).columns
# corr_mat = df.drop(labels=["SEQN", "GENDER", "AGE_GROUP"], axis=1).corr().to_numpy()

# fig, ax = plt.subplots()
# im = ax.imshow(corr_mat)
# fig.set_size_inches(28, 28, forward=True)
# font = {'size': 28, 'family': 'serif'}

# ax.set_xticks(np.arange(len(features)))
# ax.set_yticks(np.arange(len(features)))
# ax.set_xticklabels(features)
# ax.set_yticklabels(features)

# plt.setp(ax.get_xticklabels(), rotation=45, ha="right", rotation_mode="anchor")

# for i in range(len(features)):
#     for j in range(len(features)):
#         text = ax.text(j, i, round(corr_mat[i, j], 2), ha="center", va="center",
#                        color="w", fontdict=font)

# ax.set_title("Heatmap of the correlation matrix")
# fig.tight_layout()
# plt.show()

##### Cleaning

In [12]:
full_transform = ColumnTransformer([
    ('encoder', OrdinalEncoder(), ['GENDER']),
    ('drop', 'drop', ['SEQN', 'AGE_GROUP', 'AGE']),
    ('passthrough', 'passthrough', list(strat_train_set.columns)[2:-2]),
])

##### Decision Tree + Viz

In [13]:
# x = pd.DataFrame(full_transform.fit_transform(strat_train_set), columns=strat_train_set.columns[1:-2])
# y = strat_train_set['AGE']

# m = DecisionTreeRegressor(max_leaf_nodes=15)
# m.fit(x,y)
# fastbook.draw_tree(m, x, size=18, leaves_parallel=True, precision=2)

In [14]:
# dtreeviz.trees.dtreeviz(m, x, y, x.columns, 'AGE',
#         fontname='DejaVu Sans', scale=1.3, label_fontsize=8,
#         orientation='TD')

##### Random Forest

In [15]:
rfr_pipe = Pipeline([
    ('processing', full_transform),
    ('kernel', PolynomialFeatures()),
    ('rfr', RandomForestRegressor(random_state=RAND_STATE, n_jobs=N_JOBS, n_estimators=2, oob_score=True, max_samples=0.7))
])    

In [16]:
rfr_grid = {
     'kernel__degree': [1, 2],
     'rfr__min_samples_leaf': list(range(4, 7)),
     'rfr__max_features': ['sqrt', 'auto', 'log2'],
     'rfr__max_samples': np.linspace(0.6, 0.9, 4),
}

In [17]:
rfr_grid_search = GridSearchCV(rfr_pipe, rfr_grid, n_jobs=N_JOBS, cv=N_CV)
rfr_grid_search.fit(strat_train_set, strat_train_set['AGE'])
rfr_grid_search.best_score_

  warn("Some inputs do not have OOB scores. "
  warn("Some inputs do not have OOB scores. "
  warn("Some inputs do not have OOB scores. "
  warn("Some inputs do not have OOB scores. "
  warn("Some inputs do not have OOB scores. "
  warn("Some inputs do not have OOB scores. "
  warn("Some inputs do not have OOB scores. "
  warn("Some inputs do not have OOB scores. "
  warn("Some inputs do not have OOB scores. "
  warn("Some inputs do not have OOB scores. "
  warn("Some inputs do not have OOB scores. "
  warn("Some inputs do not have OOB scores. "
  warn("Some inputs do not have OOB scores. "
  warn("Some inputs do not have OOB scores. "
  warn("Some inputs do not have OOB scores. "
  warn("Some inputs do not have OOB scores. "
  warn("Some inputs do not have OOB scores. "
  warn("Some inputs do not have OOB scores. "
  warn("Some inputs do not have OOB scores. "
  warn("Some inputs do not have OOB scores. "
  warn("Some inputs do not have OOB scores. "
  warn("Some inputs do not have OO

0.3062604160158102

In [18]:
def mae(model, x):
    t = model.predict(x)
    return (abs(t - x.AGE.to_numpy())).sum()/t.shape[0]

In [19]:
mae(rfr_grid_search.best_estimator_, strat_test_set)

11.911119928273878

In [30]:
yhat = rfr_grid_search.best_estimator_[2].oob_prediction_

In [35]:
(abs(strat_train_set.AGE.to_numpy() - yhat)).sum() / yhat.shape[0]

25.95109957920934

  warn("Some inputs do not have OOB scores. "
  warn("Some inputs do not have OOB scores. "
  warn("Some inputs do not have OOB scores. "
  warn("Some inputs do not have OOB scores. "
  warn("Some inputs do not have OOB scores. "
  warn("Some inputs do not have OOB scores. "
  warn("Some inputs do not have OOB scores. "
  warn("Some inputs do not have OOB scores. "
  warn("Some inputs do not have OOB scores. "
  warn("Some inputs do not have OOB scores. "
  warn("Some inputs do not have OOB scores. "
  warn("Some inputs do not have OOB scores. "
  warn("Some inputs do not have OOB scores. "
  warn("Some inputs do not have OOB scores. "
  warn("Some inputs do not have OOB scores. "
  warn("Some inputs do not have OOB scores. "
  warn("Some inputs do not have OOB scores. "
  warn("Some inputs do not have OOB scores. "
  warn("Some inputs do not have OOB scores. "
  warn("Some inputs do not have OOB scores. "
  warn("Some inputs do not have OOB scores. "
  warn("Some inputs do not have OO

In [None]:
rfr_grid_search.best_params_

{'kernel__degree': 1,
 'rfr__max_features': 'sqrt',
 'rfr__max_samples': 0.9,
 'rfr__min_samples_leaf': 6}

In [39]:
rfr = rfr_grid_search.best_estimator_[2]
[dt.predict(strat_test_set) for dt in rfr.estimators_]

[DecisionTreeRegressor(max_features='sqrt', min_samples_leaf=6,
                       random_state=209652396),
 DecisionTreeRegressor(max_features='sqrt', min_samples_leaf=6,
                       random_state=398764591)]