In [1]:
import fastbook

In [2]:
import pandas as pd
import numpy as np
import matplotlib as mpl
import matplotlib.pyplot as plt

from sklearn.model_selection import StratifiedShuffleSplit
from sklearn.base import BaseEstimator, TransformerMixin
from sklearn.preprocessing import PolynomialFeatures
#from sklearn.preprocessing import StandardScaler
from sklearn.pipeline import Pipeline

#from sklearn.linear_model import SGDRegressor
from sklearn.ensemble import RandomForestRegressor
#from sklearn.ensemble import GradientBoostingRegressor
from sklearn.model_selection import GridSearchCV

from pathlib import Path

In [3]:
from sklearn.compose import ColumnTransformer

In [4]:
from sklearn.preprocessing import OrdinalEncoder

In [5]:
import dtreeviz
from sklearn.tree import DecisionTreeRegressor

In [6]:
DATA_PATH = Path.cwd()/'Data'

In [7]:
N_JOBS=-1
N_CV=5

In [8]:
full_df = pd.read_csv(DATA_PATH/'Final_data.csv', low_memory=False)
full_df.drop(labels='AlkPhos_UL', inplace=True, axis=1, errors='ignore')

#### Splitting

In [9]:
#Adding categorical age column for proportional/stratified train/test split
AGE_GROUP_AMOUNT = 8
full_df['AGE_GROUP'] = pd.cut(full_df['AGE'], bins=AGE_GROUP_AMOUNT, labels=range(AGE_GROUP_AMOUNT))

#Make train test split with proportional age groups
split = StratifiedShuffleSplit(n_splits=1, test_size=0.1, random_state=0)
for train_index, test_index in split.split(full_df, full_df['AGE_GROUP']):
    strat_train_set = full_df.iloc[train_index]
    strat_test_set = full_df.iloc[test_index]

##### EDA

In [10]:
# df.AGE.hist(bins=50, figsize=(10,10)), strat_test_set.AGE.hist(bins=50, figsize=(10,10))





# features = df.drop(labels=["SEQN", "GENDER", "AGE_GROUP"], axis=1).columns
# corr_mat = df.drop(labels=["SEQN", "GENDER", "AGE_GROUP"], axis=1).corr().to_numpy()

# fig, ax = plt.subplots()
# im = ax.imshow(corr_mat)
# fig.set_size_inches(28, 28, forward=True)
# font = {'size': 28, 'family': 'serif'}

# ax.set_xticks(np.arange(len(features)))
# ax.set_yticks(np.arange(len(features)))
# ax.set_xticklabels(features)
# ax.set_yticklabels(features)

# plt.setp(ax.get_xticklabels(), rotation=45, ha="right", rotation_mode="anchor")

# for i in range(len(features)):
#     for j in range(len(features)):
#         text = ax.text(j, i, round(corr_mat[i, j], 2), ha="center", va="center",
#                        color="w", fontdict=font)

# ax.set_title("Heatmap of the correlation matrix")
# fig.tight_layout()
# plt.show()

##### Cleaning

In [11]:
strat_train_set

Unnamed: 0,SEQN,GENDER,Albumin_mgl,Glucose_mmolL,Urea_mmolL,Cholesterol_mmolL,Protein_gdL,Sodium_mmolL,Creatinine_mgdl,Hemoglobin_gdl,Bilirubin_umolL,Triglyceride_mmolL,HDL_mmolL,LDL_mmolL,Calcium_mmolL,Potassium_mmolL,Hematocrit_%,MCHC_gdl,MCV_fL,Platelet_TuL,RBC_MuL,AGE,AGE_GROUP
21880,49895,M,13.2,5.718,5.71,3.70,7.0,141.0,163.0,16.0,27.36,1.231,1.89,1.241,2.325,4.30,44.5,35.9,99.6,279.0,4.47,74,6
3844,59935,M,2.1,7.383,5.00,4.97,6.1,138.0,60.0,13.6,13.68,1.016,1.45,3.051,2.225,3.90,38.3,35.6,89.0,345.0,4.30,55,4
5834,68063,M,2.2,5.384,3.57,4.84,6.7,142.0,53.0,15.4,6.84,1.039,1.71,2.664,2.250,4.00,44.8,34.4,94.7,265.0,4.73,51,3
11000,91903,F,7.7,5.880,5.36,6.23,7.4,138.0,57.0,13.4,10.26,0.869,1.45,4.396,2.300,3.56,39.8,33.6,90.3,278.0,4.40,50,3
16964,27060,M,1.4,8.149,6.78,4.63,7.5,139.0,127.0,15.2,11.97,1.603,1.24,2.664,2.300,3.60,45.5,33.5,92.5,199.0,4.91,57,4
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2217,53505,M,8.5,5.607,4.64,6.28,6.6,139.0,117.0,15.5,15.39,1.264,1.09,4.629,2.350,4.70,45.7,34.0,87.7,215.0,5.20,48,3
14432,13870,M,0.6,4.794,4.64,4.73,7.2,141.0,18.0,15.1,11.97,0.540,1.58,2.900,2.550,4.50,44.8,33.7,94.3,210.0,4.76,45,3
8557,81032,M,5.0,5.662,6.07,6.44,7.1,141.0,60.0,13.8,8.55,1.242,1.66,4.215,2.425,4.50,42.2,32.6,80.6,169.0,5.24,71,6
15779,20999,M,33.3,4.439,4.64,5.38,7.0,141.0,122.0,14.2,15.39,1.940,0.75,3.750,2.500,4.30,42.5,33.5,93.0,227.0,4.56,85,7


In [12]:
full_transform = ColumnTransformer([
    ('encoder', OrdinalEncoder(), ['GENDER']),
    ('drop', 'drop', ['SEQN', 'AGE_GROUP', 'AGE']),
    ('passthrough', 'passthrough', list(strat_train_set.columns)[2:-2]),
])

##### Decision Tree + Viz

In [13]:
# x = pd.DataFrame(full_transform.fit_transform(strat_train_set), columns=strat_train_set.columns[1:-2])
# y = strat_train_set['AGE']

# m = DecisionTreeRegressor(max_leaf_nodes=15)
# m.fit(x,y)
# fastbook.draw_tree(m, x, size=18, leaves_parallel=True, precision=2)

In [14]:
# dtreeviz.trees.dtreeviz(m, x, y, x.columns, 'AGE',
#         fontname='DejaVu Sans', scale=1.3, label_fontsize=8,
#         orientation='TD')

##### Random Forest

In [15]:
rfr_pipe = Pipeline([
    ('processing', full_transform),
    ('kernel', PolynomialFeatures()),
    ('rfr', RandomForestRegressor(random_state=0, n_jobs=N_JOBS, n_estimators=2, oob_score=True, max_samples=0.7))
])    

In [16]:
rfr_grid = {
     'kernel__degree': [1, 2],
     'rfr__min_samples_leaf': list(range(4, 7)),
     'rfr__max_features': ['sqrt', 'auto', 'log2'],
     'rfr__max_samples': np.linspace(0.6, 0.9, 4),
}

In [17]:
rfr_grid_search = GridSearchCV(rfr_pipe, rfr_grid, n_jobs=N_JOBS, cv=N_CV)
rfr_grid_search.fit(strat_train_set, strat_train_set['AGE'])
rfr_grid_search.best_score_

  warn("Some inputs do not have OOB scores. "
  warn("Some inputs do not have OOB scores. "
  warn("Some inputs do not have OOB scores. "
  warn("Some inputs do not have OOB scores. "
  warn("Some inputs do not have OOB scores. "
  warn("Some inputs do not have OOB scores. "
  warn("Some inputs do not have OOB scores. "
  warn("Some inputs do not have OOB scores. "
  warn("Some inputs do not have OOB scores. "
  warn("Some inputs do not have OOB scores. "
  warn("Some inputs do not have OOB scores. "
  warn("Some inputs do not have OOB scores. "
  warn("Some inputs do not have OOB scores. "
  warn("Some inputs do not have OOB scores. "
  warn("Some inputs do not have OOB scores. "
  warn("Some inputs do not have OOB scores. "
  warn("Some inputs do not have OOB scores. "
  warn("Some inputs do not have OOB scores. "
  warn("Some inputs do not have OOB scores. "
  warn("Some inputs do not have OOB scores. "
  warn("Some inputs do not have OOB scores. "
  warn("Some inputs do not have OO

0.3062604160158102

In [18]:
def mae(model, x):
    t = model.predict(x)
    return (abs(t - x.AGE.to_numpy())).sum()/t.shape[0]

In [19]:
mae(rfr_grid_search.best_estimator_, strat_test_set)

11.911119928273878

In [30]:
yhat = rfr_grid_search.best_estimator_[2].oob_prediction_

In [35]:
(abs(strat_train_set.AGE.to_numpy() - yhat)).sum() / yhat.shape[0]

25.95109957920934

  warn("Some inputs do not have OOB scores. "
  warn("Some inputs do not have OOB scores. "
  warn("Some inputs do not have OOB scores. "
  warn("Some inputs do not have OOB scores. "
  warn("Some inputs do not have OOB scores. "
  warn("Some inputs do not have OOB scores. "
  warn("Some inputs do not have OOB scores. "
  warn("Some inputs do not have OOB scores. "
  warn("Some inputs do not have OOB scores. "
  warn("Some inputs do not have OOB scores. "
  warn("Some inputs do not have OOB scores. "
  warn("Some inputs do not have OOB scores. "
  warn("Some inputs do not have OOB scores. "
  warn("Some inputs do not have OOB scores. "
  warn("Some inputs do not have OOB scores. "
  warn("Some inputs do not have OOB scores. "
  warn("Some inputs do not have OOB scores. "
  warn("Some inputs do not have OOB scores. "
  warn("Some inputs do not have OOB scores. "
  warn("Some inputs do not have OOB scores. "
  warn("Some inputs do not have OOB scores. "
  warn("Some inputs do not have OO

In [None]:
rfr_grid_search.best_params_

{'kernel__degree': 1,
 'rfr__max_features': 'sqrt',
 'rfr__max_samples': 0.9,
 'rfr__min_samples_leaf': 6}

In [39]:
rfr = rfr_grid_search.best_estimator_[2]
[dt.predict(strat_test_set) for dt in rfr.estimators_]

[DecisionTreeRegressor(max_features='sqrt', min_samples_leaf=6,
                       random_state=209652396),
 DecisionTreeRegressor(max_features='sqrt', min_samples_leaf=6,
                       random_state=398764591)]