In [1]:
# Render matplotlib figures inline in the cell output.
%matplotlib inline
# (Re)load the autoreload extension; mode 2 reloads imported modules before each cell executes.
%reload_ext autoreload
%autoreload 2

In [2]:
import os
import numpy as np 
import pandas as pd
import matplotlib.pyplot as plt

In [3]:
from time import time

import xgboost as xgb
from sklearn import metrics
from sklearn.ensemble import ExtraTreesClassifier, RandomForestClassifier, RandomForestRegressor
from sklearn.model_selection import GridSearchCV, StratifiedShuffleSplit
from sklearn.multiclass import OneVsRestClassifier
from sklearn.multioutput import MultiOutputClassifier
from sklearn.preprocessing import MultiLabelBinarizer

In [4]:
# Training features and their scored targets, read from the local data/ directory.
# Later cells index both frames with the same row indices, so they are assumed to be row-aligned.
ref_train_x = pd.read_csv('data/train_features.csv')
ref_train_y = pd.read_csv('data/train_targets_scored.csv')
# Non-scored targets are currently not loaded.
# train_y2 = pd.read_csv('data/train_targets_nonscored.csv')

# Test features and the sample submission (the latter is not referenced in the visible cells).
ref_test_x = pd.read_csv('data/test_features.csv')
smplsub = pd.read_csv('data/sample_submission.csv')

In [5]:
def get_label_stratified_val_idxs(df, val_size=0.1, rnd=0):
    """Draw a single stratified train/validation split over the rows of `df`.

    The first column of `df` is handed to the splitter as the sample column
    and all remaining columns together form the label that is stratified on.

    Args:
        df: DataFrame whose first column is an ID and whose other columns are labels.
        val_size: Forwarded to StratifiedShuffleSplit as `test_size`.
        rnd: Forwarded to StratifiedShuffleSplit as `random_state`.

    Returns:
        (trn_idxs, val_idxs): the train and validation row indices produced by the splitter.

    Side effects:
        Prints the label distribution of both parts via `data_report`.
    """
    values = df.to_numpy()
    # Column 0 = ID; everything after it is label data (one or many columns).
    ids, labels = values[:, 0], values[:, 1:]

    splitter = StratifiedShuffleSplit(n_splits=1, test_size=val_size, random_state=rnd)
    # n_splits=1, so the generator yields exactly one (train, validation) pair.
    trn_idxs, val_idxs = next(splitter.split(ids, labels))

    data_report(df, trn_idxs, val_idxs)
    return trn_idxs, val_idxs

def finalize_df(df, targets, as_multi=True):
    """Select the ID column plus `targets` and return them as a fresh DataFrame.

    Args:
        df: Source DataFrame; its first column is used as the ID column.
        targets: Names of the label columns to keep (any iterable of column names).
        as_multi: If True, fuse the selected label columns of each row into one
            space-separated string stored in a single "Target" column.
            If False, keep the label columns separate.

    Returns:
        A DataFrame with a default RangeIndex and columns ["ID", "Target"] when
        `as_multi` is True or exactly one target is given, otherwise ["ID"] + targets.
    """
    targets = list(targets)
    # Select and fuse labels into target column (space separated)
    df_slct = df[[df.columns[0]] + targets]
    if as_multi:
        # Materialise the values once; accessing `.values` per row rebuilds the
        # whole array each time for mixed-dtype frames.
        rows = df_slct.values
        fused = [[row[0], ' '.join(str(x) for x in row[1:])] for row in rows]
        # reshape keeps the (n, 2) layout even when there are no rows at all.
        return pd.DataFrame(np.array(fused).reshape(-1, 2), columns=["ID", "Target"])

    df_out = np.array(df_slct)
    if len(targets) == 1:
        return pd.DataFrame(df_out, columns=["ID", 'Target'])
    return pd.DataFrame(df_out, columns=["ID"] + targets)

def data_report(df, trn_idxs, val_idxs, test_csv=None):
    """Print the label distribution of the train and validation parts of a split.

    Args:
        df: Either a two-column frame that already has a "Target" column, or a
            frame with an ID column followed by several label columns; the
            latter is fused into a single "Target" column via `finalize_df`.
        trn_idxs: Positional row indices of the training rows.
        val_idxs: Positional row indices of the validation rows.
        test_csv: Unused; kept so existing callers keep working.

    Returns:
        None. The report is written to stdout.
    """
    trnval = df
    if len(trnval.columns) != 2:
        print("Multilabel csv with comma-separated labels detected!\n")
        trnval = finalize_df(trnval, targets=list(trnval.columns)[1:])
    labels = trnval['Target']
    # The indices are row positions, so select with .iloc. Plain `[]` would treat
    # them as index labels and fail (or pick wrong rows) on a non-default index.
    print(f"""Train label-distribution:\n"""
          f"""{labels.iloc[trn_idxs].value_counts()}\n"""
          f"""Total: {len(trn_idxs)}\n""")
    print(f"""Val label-distribution:\n"""
          f"""{labels.iloc[val_idxs].value_counts()}\n"""
          f"""Total: {len(val_idxs)}""")

In [6]:
# Stratify on columns 1-2 of the feature frame (column 0 is passed along as the ID); hold out 10% for validation.
trn_idxs, val_idxs = get_label_stratified_val_idxs(ref_train_x.iloc[:,:3], val_size=0.1, rnd=0)

Multilabel csv with comma-separated labels detected!

Train label-distribution:
trt_cp 48         6842
trt_cp 72         6462
trt_cp 24         6449
ctl_vehicle 48     583
ctl_vehicle 72     551
ctl_vehicle 24     545
Name: Target, dtype: int64
Total: 21432

Val label-distribution:
trt_cp 48         760
trt_cp 72         718
trt_cp 24         717
ctl_vehicle 48     65
ctl_vehicle 24     61
ctl_vehicle 72     61
Name: Target, dtype: int64
Total: 2382


### Data prep

In [7]:
# convert_to_numeric
def _encode_two_level(frame, column, base_label, base_code, other_code):
    """Encode `frame[column]` in place: `base_label` -> `base_code`, anything else -> `other_code`.

    A column that is already numeric is left untouched so that re-running this
    cell does not re-encode the codes themselves (which would collapse every
    value to `other_code`).
    """
    if pd.api.types.is_numeric_dtype(frame[column]):
        return
    frame[column] = frame[column].apply(lambda v: base_code if v == base_label else other_code)


for _frame in (ref_train_x, ref_test_x):
    _encode_two_level(_frame, 'cp_type', "trt_cp", 0, 1)
    _encode_two_level(_frame, 'cp_dose', "D1", 1, 2)

# Features and targets are sliced with the same row positions below.
assert len(ref_train_x) == len(ref_train_y), "train features/targets row count mismatch"

# split train/val
# trn_idxs / val_idxs are row positions from the splitter, hence .iloc rather than .loc.
_X_train, _Y_train = ref_train_x.iloc[trn_idxs], ref_train_y.iloc[trn_idxs]
_X_valid, _Y_valid = ref_train_x.iloc[val_idxs], ref_train_y.iloc[val_idxs]

# select features / format dfs:
# The first column of each frame is dropped; every remaining column is a feature / target.
x_fts = list(_X_train.columns[1:])
y_fts = list(_Y_train.columns[1:])
X_train, X_valid, X_test = _X_train[x_fts], _X_valid[x_fts], ref_test_x[x_fts]
Y_train, Y_valid = _Y_train[y_fts], _Y_valid[y_fts]

### XGBoost

In [11]:
def eval_model(forest):
    """Print log-loss of a fitted model and of an all-zeros baseline.

    Reads the notebook-level frames X_train, Y_train, X_valid and Y_valid.

    Args:
        forest: Fitted estimator exposing `predict`.

    Returns:
        Two `[targets, predictions]` lists: one for the training rows and one
        for the validation rows.
    """
    # NOTE(review): scores are computed on the hard labels returned by `predict`;
    # log-loss is normally evaluated on predicted probabilities - confirm this is intended.
    scored = {}
    scored['train preds'] = [Y_train.to_numpy(), forest.predict(X_train)]
    scored['valid preds'] = [Y_valid.to_numpy(), forest.predict(X_valid)]
    scored['train baseline'] = [Y_train.to_numpy(), np.zeros(Y_train.shape)]
    scored['valid baseline'] = [Y_valid.to_numpy(), np.zeros(Y_valid.shape)]

    for label, (y_true, y_pred) in scored.items():
        loss = metrics.log_loss(y_true, y_pred)
        print(f"{label}: {loss}")

    return scored['train preds'], scored['valid preds']

In [17]:
# One-vs-rest wrapper around an XGBoost classifier (n_estimators=10, max_depth=3).
# NOTE(review): no random seed is passed, so repeated fits may not be bit-identical - confirm.
clf = OneVsRestClassifier(xgb.XGBClassifier(n_estimators=10, n_jobs=-1, max_depth=3, verbosity=1))
# Trailing `;` suppresses the estimator repr in the cell output.
clf.fit(X_train, Y_train);

In [14]:
# n_estimators=1, n_jobs=-1, max_depth=3, verbosity=2
# NOTE: the logged output belongs to an earlier fit with the settings above; the fit cell now
# uses n_estimators=10, so re-running this cell will not reproduce these numbers.
train_preds, valid_preds = eval_model(clf)

train preds: 1.2665292408257078
valid preds: 3.33974293220079
train baseline: 3.7761496362408424
valid baseline: 3.699541218798475


In [16]:
# n_estimators=10, n_jobs=-1, max_depth=None, verbosity=1
# NOTE: the logged output belongs to an earlier fit with the settings above; the fit cell now
# uses max_depth=3, so re-running this cell will not reproduce these numbers.
train_preds, valid_preds = eval_model(clf)

train preds: 2.98966674905155
valid preds: 3.4903376369736243
train baseline: 3.7761496362408424
valid baseline: 3.699541218798475


In [18]:
# n_estimators=10, n_jobs=-1, max_depth=3, verbosity=1
# These settings match the current XGBoost fit cell.
train_preds, valid_preds = eval_model(clf)

train preds: 3.3517202616769346
valid preds: 3.3995132701742037
train baseline: 3.7761496362408424
valid baseline: 3.699541218798475


## ExtraTrees

In [9]:
def oob_error(multi_target_forest_obj):
    """Return the mean out-of-bag error (1 - oob_score_) over all fitted sub-estimators.

    Args:
        multi_target_forest_obj: Object with an `estimators_` sequence whose
            items each expose `oob_score_`.
    """
    per_estimator = []
    for est in multi_target_forest_obj.estimators_:
        per_estimator.append(1 - est.oob_score_)
    return np.stack(per_estimator).mean()

def eval_forest(forest):
    """Print log-loss of a fitted multi-output forest, an all-zeros baseline, and OOB error if available.

    Reads the notebook-level frames X_train, Y_train, X_valid and Y_valid.

    Args:
        forest: Fitted estimator exposing `predict` and an `estimators_` sequence.

    Returns:
        Two `[targets, predictions]` lists: one for the training rows and one
        for the validation rows.
    """
    # NOTE(review): scores use the hard labels from `predict`, not probabilities - confirm this is intended.
    report = {}
    report['train preds'] = [Y_train.to_numpy(), forest.predict(X_train)]
    report['valid preds'] = [Y_valid.to_numpy(), forest.predict(X_valid)]
    report['train baseline'] = [Y_train.to_numpy(), np.zeros(Y_train.shape)]
    report['valid baseline'] = [Y_valid.to_numpy(), np.zeros(Y_valid.shape)]

    for name, pair in report.items():
        y_true, y_pred = pair
        print(f"{name}: {metrics.log_loss(y_true, y_pred)}")

    # Sub-estimators only carry `oob_score_` when they were fitted with OOB scoring.
    if not hasattr(forest.estimators_[0], 'oob_score_'):
        return report['train preds'], report['valid preds']

    report['oob_error'] = oob_error(forest)
    print(f"oob_error: {report['oob_error']}")
    return report['train preds'], report['valid preds']

def train_multi_forest(f, n_estimators=10, max_depth=None, bootstrap=True, n_jobs=-1,
                       min_samples_leaf=10, oob_score=True, **forest_kwargs):
    """Fit one forest per target column and report its scores.

    Trains on the notebook-level X_train / Y_train and evaluates with `eval_forest`.

    Args:
        f: Forest class or factory (e.g. ExtraTreesClassifier); called with the
            keyword arguments below.
        n_estimators, max_depth, bootstrap, n_jobs, min_samples_leaf, oob_score:
            Forwarded unchanged to `f`.
        **forest_kwargs: Any further keyword arguments for `f`, e.g.
            `random_state=0` for reproducible runs or `max_features`.

    Returns:
        (multi_target_forest, train_res, valid_res): the fitted
        MultiOutputClassifier and the two `[targets, predictions]` lists
        returned by `eval_forest`.
    """
    forest = f(n_estimators=n_estimators, max_depth=max_depth, bootstrap=bootstrap,
               n_jobs=n_jobs, min_samples_leaf=min_samples_leaf, oob_score=oob_score,
               **forest_kwargs)
    multi_target_forest = MultiOutputClassifier(forest)
    multi_target_forest.fit(X_train, Y_train)
    train_res, valid_res = eval_forest(multi_target_forest)
    return multi_target_forest, train_res, valid_res

In [16]:
# ExtraTrees per target: 40 trees, bootstrap sampling with OOB scoring; %time reports the wall time of fit + evaluation.
%time forest1, train_res, valid_res = train_multi_forest(ExtraTreesClassifier, n_estimators=40, n_jobs=-1, min_samples_leaf=3, bootstrap=True, oob_score=True)

  warn("Some inputs do not have OOB scores. "
  predictions[k].sum(axis=1)[:, np.newaxis])
  warn("Some inputs do not have OOB scores. "
  predictions[k].sum(axis=1)[:, np.newaxis])
  warn("Some inputs do not have OOB scores. "
  predictions[k].sum(axis=1)[:, np.newaxis])
  warn("Some inputs do not have OOB scores. "
  predictions[k].sum(axis=1)[:, np.newaxis])
  warn("Some inputs do not have OOB scores. "
  predictions[k].sum(axis=1)[:, np.newaxis])
  warn("Some inputs do not have OOB scores. "
  predictions[k].sum(axis=1)[:, np.newaxis])
  warn("Some inputs do not have OOB scores. "
  predictions[k].sum(axis=1)[:, np.newaxis])
  warn("Some inputs do not have OOB scores. "
  predictions[k].sum(axis=1)[:, np.newaxis])
  warn("Some inputs do not have OOB scores. "
  predictions[k].sum(axis=1)[:, np.newaxis])
  warn("Some inputs do not have OOB scores. "
  predictions[k].sum(axis=1)[:, np.newaxis])
  warn("Some inputs do not have OOB scores. "
  predictions[k].sum(axis=1)[:, np.newaxis])

  warn("Some inputs do not have OOB scores. "
  predictions[k].sum(axis=1)[:, np.newaxis])
  warn("Some inputs do not have OOB scores. "
  predictions[k].sum(axis=1)[:, np.newaxis])
  warn("Some inputs do not have OOB scores. "
  predictions[k].sum(axis=1)[:, np.newaxis])
  warn("Some inputs do not have OOB scores. "
  predictions[k].sum(axis=1)[:, np.newaxis])
  warn("Some inputs do not have OOB scores. "
  predictions[k].sum(axis=1)[:, np.newaxis])
  warn("Some inputs do not have OOB scores. "
  predictions[k].sum(axis=1)[:, np.newaxis])
  warn("Some inputs do not have OOB scores. "
  predictions[k].sum(axis=1)[:, np.newaxis])
  warn("Some inputs do not have OOB scores. "
  predictions[k].sum(axis=1)[:, np.newaxis])
  warn("Some inputs do not have OOB scores. "
  predictions[k].sum(axis=1)[:, np.newaxis])
  warn("Some inputs do not have OOB scores. "
  predictions[k].sum(axis=1)[:, np.newaxis])
  warn("Some inputs do not have OOB scores. "
  predictions[k].sum(axis=1)[:, np.newaxis])

  warn("Some inputs do not have OOB scores. "
  predictions[k].sum(axis=1)[:, np.newaxis])
  warn("Some inputs do not have OOB scores. "
  predictions[k].sum(axis=1)[:, np.newaxis])
  warn("Some inputs do not have OOB scores. "
  predictions[k].sum(axis=1)[:, np.newaxis])
  warn("Some inputs do not have OOB scores. "
  predictions[k].sum(axis=1)[:, np.newaxis])
  warn("Some inputs do not have OOB scores. "
  predictions[k].sum(axis=1)[:, np.newaxis])
  warn("Some inputs do not have OOB scores. "
  predictions[k].sum(axis=1)[:, np.newaxis])
  warn("Some inputs do not have OOB scores. "
  predictions[k].sum(axis=1)[:, np.newaxis])
  warn("Some inputs do not have OOB scores. "
  predictions[k].sum(axis=1)[:, np.newaxis])
  warn("Some inputs do not have OOB scores. "
  predictions[k].sum(axis=1)[:, np.newaxis])
  warn("Some inputs do not have OOB scores. "
  predictions[k].sum(axis=1)[:, np.newaxis])
  warn("Some inputs do not have OOB scores. "
  predictions[k].sum(axis=1)[:, np.newaxis])

  warn("Some inputs do not have OOB scores. "
  predictions[k].sum(axis=1)[:, np.newaxis])
  warn("Some inputs do not have OOB scores. "
  predictions[k].sum(axis=1)[:, np.newaxis])
  warn("Some inputs do not have OOB scores. "
  predictions[k].sum(axis=1)[:, np.newaxis])
  warn("Some inputs do not have OOB scores. "
  predictions[k].sum(axis=1)[:, np.newaxis])
  warn("Some inputs do not have OOB scores. "
  predictions[k].sum(axis=1)[:, np.newaxis])
  warn("Some inputs do not have OOB scores. "
  predictions[k].sum(axis=1)[:, np.newaxis])
  warn("Some inputs do not have OOB scores. "
  predictions[k].sum(axis=1)[:, np.newaxis])
  warn("Some inputs do not have OOB scores. "
  predictions[k].sum(axis=1)[:, np.newaxis])
  warn("Some inputs do not have OOB scores. "
  predictions[k].sum(axis=1)[:, np.newaxis])
  warn("Some inputs do not have OOB scores. "
  predictions[k].sum(axis=1)[:, np.newaxis])
  warn("Some inputs do not have OOB scores. "
  predictions[k].sum(axis=1)[:, np.newaxis])

  warn("Some inputs do not have OOB scores. "
  predictions[k].sum(axis=1)[:, np.newaxis])
  warn("Some inputs do not have OOB scores. "
  predictions[k].sum(axis=1)[:, np.newaxis])
  warn("Some inputs do not have OOB scores. "
  predictions[k].sum(axis=1)[:, np.newaxis])
  warn("Some inputs do not have OOB scores. "
  predictions[k].sum(axis=1)[:, np.newaxis])
  warn("Some inputs do not have OOB scores. "
  predictions[k].sum(axis=1)[:, np.newaxis])
  warn("Some inputs do not have OOB scores. "
  predictions[k].sum(axis=1)[:, np.newaxis])
  warn("Some inputs do not have OOB scores. "
  predictions[k].sum(axis=1)[:, np.newaxis])
  warn("Some inputs do not have OOB scores. "
  predictions[k].sum(axis=1)[:, np.newaxis])
  warn("Some inputs do not have OOB scores. "
  predictions[k].sum(axis=1)[:, np.newaxis])
  warn("Some inputs do not have OOB scores. "
  predictions[k].sum(axis=1)[:, np.newaxis])
  warn("Some inputs do not have OOB scores. "
  predictions[k].sum(axis=1)[:, np.newaxis])

  warn("Some inputs do not have OOB scores. "
  predictions[k].sum(axis=1)[:, np.newaxis])
  warn("Some inputs do not have OOB scores. "
  predictions[k].sum(axis=1)[:, np.newaxis])
  warn("Some inputs do not have OOB scores. "
  predictions[k].sum(axis=1)[:, np.newaxis])
  warn("Some inputs do not have OOB scores. "
  predictions[k].sum(axis=1)[:, np.newaxis])
  warn("Some inputs do not have OOB scores. "
  predictions[k].sum(axis=1)[:, np.newaxis])
  warn("Some inputs do not have OOB scores. "
  predictions[k].sum(axis=1)[:, np.newaxis])
  warn("Some inputs do not have OOB scores. "
  predictions[k].sum(axis=1)[:, np.newaxis])
  warn("Some inputs do not have OOB scores. "
  predictions[k].sum(axis=1)[:, np.newaxis])
  warn("Some inputs do not have OOB scores. "
  predictions[k].sum(axis=1)[:, np.newaxis])
  warn("Some inputs do not have OOB scores. "
  predictions[k].sum(axis=1)[:, np.newaxis])
  warn("Some inputs do not have OOB scores. "
  predictions[k].sum(axis=1)[:, np.newaxis])

  warn("Some inputs do not have OOB scores. "
  predictions[k].sum(axis=1)[:, np.newaxis])
  warn("Some inputs do not have OOB scores. "
  predictions[k].sum(axis=1)[:, np.newaxis])
  warn("Some inputs do not have OOB scores. "
  predictions[k].sum(axis=1)[:, np.newaxis])
  warn("Some inputs do not have OOB scores. "
  predictions[k].sum(axis=1)[:, np.newaxis])
  warn("Some inputs do not have OOB scores. "
  predictions[k].sum(axis=1)[:, np.newaxis])
  warn("Some inputs do not have OOB scores. "
  predictions[k].sum(axis=1)[:, np.newaxis])
  warn("Some inputs do not have OOB scores. "
  predictions[k].sum(axis=1)[:, np.newaxis])
  warn("Some inputs do not have OOB scores. "
  predictions[k].sum(axis=1)[:, np.newaxis])
  warn("Some inputs do not have OOB scores. "
  predictions[k].sum(axis=1)[:, np.newaxis])
  warn("Some inputs do not have OOB scores. "
  predictions[k].sum(axis=1)[:, np.newaxis])
  warn("Some inputs do not have OOB scores. "
  predictions[k].sum(axis=1)[:, np.newaxis])

  warn("Some inputs do not have OOB scores. "
  predictions[k].sum(axis=1)[:, np.newaxis])
  warn("Some inputs do not have OOB scores. "
  predictions[k].sum(axis=1)[:, np.newaxis])
  warn("Some inputs do not have OOB scores. "
  predictions[k].sum(axis=1)[:, np.newaxis])
  warn("Some inputs do not have OOB scores. "
  predictions[k].sum(axis=1)[:, np.newaxis])
  warn("Some inputs do not have OOB scores. "
  predictions[k].sum(axis=1)[:, np.newaxis])
  warn("Some inputs do not have OOB scores. "
  predictions[k].sum(axis=1)[:, np.newaxis])
  warn("Some inputs do not have OOB scores. "
  predictions[k].sum(axis=1)[:, np.newaxis])
  warn("Some inputs do not have OOB scores. "
  predictions[k].sum(axis=1)[:, np.newaxis])
  warn("Some inputs do not have OOB scores. "
  predictions[k].sum(axis=1)[:, np.newaxis])
  warn("Some inputs do not have OOB scores. "
  predictions[k].sum(axis=1)[:, np.newaxis])
  warn("Some inputs do not have OOB scores. "
  predictions[k].sum(axis=1)[:, np.newaxis])

  warn("Some inputs do not have OOB scores. "
  predictions[k].sum(axis=1)[:, np.newaxis])
  warn("Some inputs do not have OOB scores. "
  predictions[k].sum(axis=1)[:, np.newaxis])
  warn("Some inputs do not have OOB scores. "
  predictions[k].sum(axis=1)[:, np.newaxis])
  warn("Some inputs do not have OOB scores. "
  predictions[k].sum(axis=1)[:, np.newaxis])
  warn("Some inputs do not have OOB scores. "
  predictions[k].sum(axis=1)[:, np.newaxis])
  warn("Some inputs do not have OOB scores. "
  predictions[k].sum(axis=1)[:, np.newaxis])
  warn("Some inputs do not have OOB scores. "
  predictions[k].sum(axis=1)[:, np.newaxis])
  warn("Some inputs do not have OOB scores. "
  predictions[k].sum(axis=1)[:, np.newaxis])
  warn("Some inputs do not have OOB scores. "
  predictions[k].sum(axis=1)[:, np.newaxis])
  warn("Some inputs do not have OOB scores. "
  predictions[k].sum(axis=1)[:, np.newaxis])
  warn("Some inputs do not have OOB scores. "
  predictions[k].sum(axis=1)[:, np.newaxis])

  warn("Some inputs do not have OOB scores. "
  predictions[k].sum(axis=1)[:, np.newaxis])
  warn("Some inputs do not have OOB scores. "
  predictions[k].sum(axis=1)[:, np.newaxis])
  warn("Some inputs do not have OOB scores. "
  predictions[k].sum(axis=1)[:, np.newaxis])
  warn("Some inputs do not have OOB scores. "
  predictions[k].sum(axis=1)[:, np.newaxis])
  warn("Some inputs do not have OOB scores. "
  predictions[k].sum(axis=1)[:, np.newaxis])
  warn("Some inputs do not have OOB scores. "
  predictions[k].sum(axis=1)[:, np.newaxis])
  warn("Some inputs do not have OOB scores. "
  predictions[k].sum(axis=1)[:, np.newaxis])
  warn("Some inputs do not have OOB scores. "
  predictions[k].sum(axis=1)[:, np.newaxis])
  warn("Some inputs do not have OOB scores. "
  predictions[k].sum(axis=1)[:, np.newaxis])
  warn("Some inputs do not have OOB scores. "
  predictions[k].sum(axis=1)[:, np.newaxis])
  warn("Some inputs do not have OOB scores. "
  predictions[k].sum(axis=1)[:, np.newaxis])

  warn("Some inputs do not have OOB scores. "
  predictions[k].sum(axis=1)[:, np.newaxis])
  warn("Some inputs do not have OOB scores. "
  predictions[k].sum(axis=1)[:, np.newaxis])
  warn("Some inputs do not have OOB scores. "
  predictions[k].sum(axis=1)[:, np.newaxis])
  warn("Some inputs do not have OOB scores. "
  predictions[k].sum(axis=1)[:, np.newaxis])
  warn("Some inputs do not have OOB scores. "
  predictions[k].sum(axis=1)[:, np.newaxis])
  warn("Some inputs do not have OOB scores. "
  predictions[k].sum(axis=1)[:, np.newaxis])
  warn("Some inputs do not have OOB scores. "
  predictions[k].sum(axis=1)[:, np.newaxis])
  warn("Some inputs do not have OOB scores. "
  predictions[k].sum(axis=1)[:, np.newaxis])
  warn("Some inputs do not have OOB scores. "
  predictions[k].sum(axis=1)[:, np.newaxis])
  warn("Some inputs do not have OOB scores. "
  predictions[k].sum(axis=1)[:, np.newaxis])
  warn("Some inputs do not have OOB scores. "
  predictions[k].sum(axis=1)[:, np.newaxis])

train preds: 3.450307916909121
valid preds: 3.309763437673323
train baseline: 3.7761496362408424
valid baseline: 3.699541218798475
oob_error: 0.003097853857945834
Wall time: 4min 39s


### Grid-searched ExtraTrees

In [10]:
def Grid_fitter(trn, val, grid, gridsearch=True, cv=5, n_jobs=-1):
    """Fit a multi-output ExtraTrees classifier, optionally with a grid search, and predict on `val`.

    Args:
        trn: `(X_train, Y_train)` pair used for fitting.
        val: `(X_valid, Y_valid)` pair; only `X_valid` is used for inference.
        grid: Parameter grid for GridSearchCV (keys address the wrapped forest
            via the `estimator__` prefix). Ignored when `gridsearch` is False.
        gridsearch: If True, search `grid` with `cv`-fold cross-validation.
            If False, fit the default MultiOutputClassifier(ExtraTreesClassifier())
            directly without searching.
        cv: Number of cross-validation folds for the grid search.
        n_jobs: Parallelism passed to both MultiOutputClassifier and GridSearchCV.

    Returns:
        (clf, pred_valid, Y_valid): the fitted estimator (a GridSearchCV object
        when `gridsearch` is True), its predictions for `X_valid`, and the
        validation targets passed in.
    """
    X_train, Y_train = trn
    X_valid, Y_valid = val

    # Train a multi-output ExtraTrees classification model
    print(f"Train-set: {Y_train.shape}, Test-set: {Y_valid.shape}")
    print("Fitting the classifier to the training set")
    t0 = time()
    clf = MultiOutputClassifier(ExtraTreesClassifier(), n_jobs=n_jobs)
    if gridsearch:
        clf = GridSearchCV(clf, grid, cv=cv, verbose=1, n_jobs=n_jobs)
    clf = clf.fit(X_train, Y_train)
    print("done in %0.3fs" % (time() - t0))
    if gridsearch:
        print("Best estimator found by grid search:")
        print(clf.best_estimator_)

    # Inference runs for both modes, so `clf` and `pred_valid` are always bound.
    print('\nrunning inference...')
    pred_valid = clf.predict(X_valid)
    # Measured from t0, i.e. cumulative time including the fit above.
    print("done in %0.3fs" % (time() - t0))

    return clf, pred_valid, Y_valid

In [21]:
# Only max_features is actually searched (4 candidates); the other entries pin a single value.
# The `estimator__` prefix addresses the ExtraTreesClassifier wrapped by MultiOutputClassifier.
grid = {
    'estimator__min_samples_leaf': [3],
    'estimator__max_depth': [None],
    'estimator__max_features': [1, 0.5, 'log2', 'sqrt'],
    'estimator__n_estimators': [40]
}

# NOTE: this rebinds the notebook-level `clf` (previously the XGBoost one-vs-rest model) and `Y_valid`.
clf, pred_valid, Y_valid = Grid_fitter([X_train, Y_train],
                                       [X_valid, Y_valid],
                                       grid, cv=2,
                                       n_jobs=-1)

Train-set: (21432, 206), Test-set: (2382, 206)
Fitting the classifier to the training set
Fitting 2 folds for each of 4 candidates, totalling 8 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 8 concurrent workers.
[Parallel(n_jobs=-1)]: Done   2 out of   8 | elapsed:  1.7min remaining:  5.1min
[Parallel(n_jobs=-1)]: Done   8 out of   8 | elapsed: 24.4min finished


done in 3437.736s
Best estimator found by grid search:
MultiOutputClassifier(estimator=ExtraTreesClassifier(bootstrap=False,
                                                     ccp_alpha=0.0,
                                                     class_weight=None,
                                                     criterion='gini',
                                                     max_depth=None,
                                                     max_features=0.5,
                                                     max_leaf_nodes=None,
                                                     max_samples=None,
                                                     min_impurity_decrease=0.0,
                                                     min_impurity_split=None,
                                                     min_samples_leaf=3,
                                                     min_samples_split=2,
                                                     min_weight_fraction_l

In [22]:
# Score the best estimator found by the grid search on the notebook-level train/validation frames.
train_res, valid_res = eval_forest(clf.best_estimator_)

train preds: 2.70868197253716
valid preds: 3.367215067968903
train baseline: 3.7761496362408424
valid baseline: 3.699541218798475


In [23]:
# Parameter combination selected by the grid search.
clf.best_params_

{'estimator__max_depth': None,
 'estimator__max_features': 0.5,
 'estimator__min_samples_leaf': 3,
 'estimator__n_estimators': 40}

# Old (archived experiments; several cells reference names that are no longer defined)

In [133]:
# Stale cell: `fts` is not defined in the visible notebook and `multi_target_forest` is only
# assigned in a later cell, so this fails on a fresh top-to-bottom run.
res_train = multi_target_forest.predict(X_train[fts])
res_valid = multi_target_forest.predict(X_valid[fts])

In [134]:
# NOTE(review): the data-prep cell already drops the first column from Y_train / Y_valid, so
# `.iloc[:,1:]` removes one more (target) column here - presumably written against an earlier
# Y_train that still carried the ID column; confirm before reuse.
train_targets = Y_train.iloc[:,1:].to_numpy()
valid_targets = Y_valid.iloc[:,1:].to_numpy()

In [135]:
# All-zero arrays with the same shape as the predictions, used as a baseline.
train_zeros = np.zeros((res_train.shape))
valid_zeros = np.zeros((res_valid.shape))

In [131]:
# n_estimators=1, max_depth=9, bootstrap=True, n_jobs=-1
# print(metrics.log_loss(train_targets, res_train), metrics.log_loss(valid_targets, res_valid))
# print(metrics.log_loss(train_targets, train_zeros), metrics.log_loss(valid_targets, valid_zeros))

4.3575507942684295 6.809625700822502
3.7761496362408424 3.699541218798475


In [137]:
# n_estimators=1, max_depth=9, bootstrap=False, n_jobs=-1
# print(metrics.log_loss(train_targets, res_train), metrics.log_loss(valid_targets, res_valid))
# print(metrics.log_loss(train_targets, train_zeros), metrics.log_loss(valid_targets, valid_zeros))

3.169187228013868 6.156248308422521
3.7761496362408424 3.699541218798475


In [119]:
# n_estimators=1, max_depth=1, bootstrap=False, n_jobs=-1
# print(metrics.log_loss(train_targets, res_train), metrics.log_loss(valid_targets, res_valid))
# print(metrics.log_loss(train_targets, train_zeros))

3.77926097091646 4.00797008187417
3.7761496362408424


In [None]:
# Stale cell: `draw_tree` and `df_trn` are not defined or imported in the visible notebook,
# and `m` is only assigned in the following cell.
draw_tree(m.estimators_[0], df_trn, precision=3)

In [None]:
# Stale cell: `y_train` (lower-case) and `print_score` are not defined in the visible notebook.
m = RandomForestRegressor(n_estimators=1, bootstrap=False, n_jobs=-1)
m.fit(X_train, y_train)
print_score(m)

In [25]:
# Random forest (40 trees, min 10 samples per leaf, no bootstrap) wrapped in a MultiOutputClassifier.
# Both the forest and the wrapper request n_jobs=-1.
forest = RandomForestClassifier(n_estimators=40, max_depth=None, bootstrap=False, n_jobs=-1, min_samples_leaf=10)
multi_target_forest = MultiOutputClassifier(forest, n_jobs=-1)
multi_target_forest.fit(X_train, Y_train);

In [17]:
# n_estimators=1, max_depth=1, bootstrap=False, n_jobs=-1
# Logged output is from an earlier fit with the settings above and an earlier eval_forest that
# printed bare numbers; the current eval_forest prints labelled lines.
eval_forest(multi_target_forest)

3.921241973814986 3.832497240942334
3.7761496362408424 3.699541218798475


In [20]:
# n_estimators=1, max_depth=None, bootstrap=False, n_jobs=-1
# Logged output is from an earlier fit with the settings above and an earlier eval_forest that
# printed bare numbers; the current eval_forest prints labelled lines.
eval_forest(multi_target_forest)

0.15374717940460067 10.75085183554159
3.7761496362408424 3.699541218798475


In [22]:
# n_estimators=10, max_depth=None, bootstrap=False, n_jobs=-1
# Logged output is from an earlier fit with the settings above and an earlier eval_forest that
# printed bare numbers; the current eval_forest prints labelled lines.
eval_forest(multi_target_forest)

0.15374717940460067 3.446434456216206
3.7761496362408424 3.699541218798475


In [24]:
# n_estimators=10, max_depth=None, bootstrap=False, n_jobs=-1, min_samples_leaf=10
# Logged output is from an earlier fit with the settings above and an earlier eval_forest that
# printed bare numbers; the current eval_forest prints labelled lines.
eval_forest(multi_target_forest)

3.519503330847981 3.428846180954989
3.7761496362408424 3.699541218798475


In [32]:
# n_estimators=40, max_depth=None, bootstrap=False, n_jobs=-1, min_samples_leaf=5
# NOTE: eval_forest returns ([train targets, train predictions], [valid targets, valid predictions]),
# so `preds` holds the training pair and `targs` the validation pair despite their names.
preds, targs = eval_forest(multi_target_forest)

3.2090305137811614 3.3468245818470104
3.7761496362408424 3.699541218798475
