In [1]:
%matplotlib inline
%reload_ext autoreload
%autoreload 2

In [2]:
import os
import numpy as np 
import pandas as pd
import matplotlib.pyplot as plt

In [3]:
from sklearn.model_selection import StratifiedShuffleSplit
from sklearn.ensemble import RandomForestRegressor, RandomForestClassifier
from sklearn import metrics

from sklearn.datasets import make_classification
from sklearn.multioutput import MultiOutputClassifier

In [5]:
train_x = pd.read_csv('data/train_features.csv')
train_y = pd.read_csv('data/train_targets_scored.csv')
# train_y2 = pd.read_csv('data/train_targets_nonscored.csv')

# test_x = pd.read_csv('data/test_features.csv')
smplsub = pd.read_csv('data/sample_submission.csv')

In [6]:
train_x.head()

Unnamed: 0,sig_id,cp_type,cp_time,cp_dose,g-0,g-1,g-2,g-3,g-4,g-5,...,c-90,c-91,c-92,c-93,c-94,c-95,c-96,c-97,c-98,c-99
0,id_000644bb2,trt_cp,24,D1,1.062,0.5577,-0.2479,-0.6208,-0.1944,-1.012,...,0.2862,0.2584,0.8076,0.5523,-0.1912,0.6584,-0.3981,0.2139,0.3801,0.4176
1,id_000779bfc,trt_cp,72,D1,0.0743,0.4087,0.2991,0.0604,1.019,0.5207,...,-0.4265,0.7543,0.4708,0.023,0.2957,0.4899,0.1522,0.1241,0.6077,0.7371
2,id_000a6266a,trt_cp,48,D1,0.628,0.5817,1.554,-0.0764,-0.0323,1.239,...,-0.725,-0.6297,0.6103,0.0223,-1.324,-0.3174,-0.6417,-0.2187,-1.408,0.6931
3,id_0015fd391,trt_cp,48,D1,-0.5138,-0.2491,-0.2656,0.5288,4.062,-0.8095,...,-2.099,-0.6441,-5.63,-1.378,-0.8632,-1.288,-1.621,-0.8784,-0.3876,-0.8154
4,id_001626bd3,trt_cp,72,D2,-0.3254,-0.4009,0.97,0.6919,1.418,-0.8244,...,0.0042,0.0048,0.667,1.069,0.5523,-0.3031,0.1094,0.2885,-0.3786,0.7125


In [7]:
def get_label_stratified_val_idxs(df, val_size=0.1, rnd=0):
    """Draw one label-stratified train/validation split over the rows of ``df``.

    The first column of ``df`` is handed to the splitter as the samples and
    every remaining column as the stratification labels. After splitting, the
    label distribution of both parts is printed through ``data_report``.

    Args:
        df: DataFrame; column 0 identifies the sample, columns 1.. are labels.
        val_size: forwarded to ``StratifiedShuffleSplit`` as ``test_size``.
        rnd: forwarded to ``StratifiedShuffleSplit`` as ``random_state``.

    Returns:
        ``(trn_idxs, val_idxs)`` -- the two index arrays yielded by the splitter.
    """
    values = df.to_numpy()
    sample_ids = values[:, 0]
    # Author's note: this works irrespective of whether labels are space- or comma-separated.
    labels = values[:, 1:]

    ### sklearn.model_selection.StratifiedKFold
    splitter = StratifiedShuffleSplit(n_splits=1, test_size=val_size, random_state=rnd)

    # n_splits=1, so the splitter yields exactly one (train, validation) pair.
    trn_idxs, val_idxs = next(iter(splitter.split(sample_ids, labels)))

    data_report(df, trn_idxs, val_idxs)
    return trn_idxs, val_idxs

def finalize_df(df, targets, as_multi=True):
    """Reduce ``df`` to an ID column plus the selected target columns.

    Args:
        df: source DataFrame; its first column is used as the ID.
        targets: names of the columns to keep as targets (any sequence of
            column names; it is copied into a list).
        as_multi: when true, the target values of each row are converted with
            ``str`` and fused into one space-separated ``"Target"`` column.
            When false, the target columns are kept separate.

    Returns:
        A new DataFrame with columns ``["ID", "Target"]`` (``as_multi`` true, or
        exactly one target) or ``["ID"] + targets`` otherwise.
    """
    targets = list(targets)  # accept tuples / Index objects as well as lists
    # Select and fuse labels into target column (space separated)
    df_slct = df[[df.columns[0]] + targets]
    if as_multi:
        # Materialise the array once; asking for ``.values`` inside the loop
        # rebuilds it for every row on mixed-dtype frames.
        values = df_slct.to_numpy()
        rows = [[row[0], ' '.join(str(x) for x in row[1:])] for row in values]
        # reshape keeps the result two-dimensional even when there are no rows.
        df_out = np.array(rows).reshape(-1, 2)
        return pd.DataFrame(df_out, columns=["ID", "Target"])

    df_out = np.array(df_slct)
    if len(targets) == 1:
        return pd.DataFrame(df_out, columns=["ID", 'Target'])
    return pd.DataFrame(df_out, columns=["ID"] + targets)

def data_report(df, trn_idxs, val_idxs, test_csv=None):
    """Print the label distribution of the train and validation parts of ``df``.

    Args:
        df: DataFrame with an ID column first. If it does not have exactly two
            columns, all columns after the first are fused into a single
            ``"Target"`` column via ``finalize_df`` before counting.
        trn_idxs: positional row indices of the training part.
        val_idxs: positional row indices of the validation part.
        test_csv: unused; kept so existing callers keep working.

    Returns:
        None. The report is written to stdout.
    """
    trnval = df
    if len(trnval.columns) != 2:
        print("Multilabel csv with comma-separated labels detected!\n")
        trnval = finalize_df(trnval, targets=list(trnval.columns)[1:])

    # Prefer a column literally named "Target"; for a two-column frame with a
    # differently named label column fall back to the second column.
    target_col = 'Target' if 'Target' in trnval.columns else trnval.columns[1]
    labels = trnval[target_col]

    # The indices are row positions, so use .iloc: label-based lookup only
    # happens to agree with it on a default RangeIndex.
    print(f"""Train label-distribution:\n"""
          f"""{labels.iloc[trn_idxs].value_counts()}\n"""
          f"""Total: {len(trn_idxs)}\n""")
    print(f"""Val label-distribution:\n"""
          f"""{labels.iloc[val_idxs].value_counts()}\n"""
          f"""Total: {len(val_idxs)}""")

In [8]:
trn_idxs, val_idxs = get_label_stratified_val_idxs(train_x.iloc[:,:3])

Multilabel csv with comma-separated labels detected!

Train label-distribution:
trt_cp 48         6842
trt_cp 72         6462
trt_cp 24         6449
ctl_vehicle 48     583
ctl_vehicle 72     551
ctl_vehicle 24     545
Name: Target, dtype: int64
Total: 21432

Val label-distribution:
trt_cp 48         760
trt_cp 72         718
trt_cp 24         717
ctl_vehicle 48     65
ctl_vehicle 72     61
ctl_vehicle 24     61
Name: Target, dtype: int64
Total: 2382


### Data prep

In [9]:
# Integer-encode the two categorical feature columns:
#   cp_type: "trt_cp" -> 0, every other value -> 1
#   cp_dose: "D1"     -> 1, every other value -> 2
# NOTE(review): the columns are overwritten in place, so running this cell a
# second time maps every (already encoded) value to the "other" code.
train_x['cp_type'] = train_x['cp_type'].ne("trt_cp").astype("int64")
train_x['cp_dose'] = train_x['cp_dose'].ne("D1").astype("int64") + 1

In [10]:
# trn_idxs / val_idxs are row positions produced by the splitter, so select by
# position (.iloc); .loc would look them up as index labels instead.
X_train, Y_train = train_x.iloc[trn_idxs], train_y.iloc[trn_idxs]
X_valid, Y_valid = train_x.iloc[val_idxs], train_y.iloc[val_idxs]

In [11]:
# Sanity check: feature and target frames of each split must have matching row counts.
print('train:', X_train.shape, Y_train.shape)
print('valid:', X_valid.shape, Y_valid.shape)

train: (21432, 876) (21432, 207)
train: (2382, 876) (2382, 207)


In [12]:
fts = list(X_train.columns[1:])

In [25]:
forest = RandomForestClassifier(n_estimators=40, max_depth=None, bootstrap=False, n_jobs=-1, min_samples_leaf=10)
multi_target_forest = MultiOutputClassifier(forest, n_jobs=-1)
multi_target_forest.fit(X_train[fts], Y_train.iloc[:,1:]);

In [30]:
def train_multi_forest():
    """Placeholder with no implementation yet; always returns ``None``."""
    return None

def eval_forest(forest):
    """Print train/validation log-loss of ``forest`` next to an all-zeros baseline.

    Reads the notebook-level names ``X_train``, ``X_valid``, ``Y_train``,
    ``Y_valid`` and ``fts``. The first column of the target frames is skipped.

    Two lines are printed: first the (train, validation) log-loss of the
    model's predictions, then the same pair for all-zero predictions of the
    same shape.

    NOTE(review): ``log_loss`` is given the output of ``forest.predict`` rather
    than ``predict_proba`` -- confirm this is intended.

    Returns:
        ``(res_valid, valid_targets)`` -- the validation predictions and the
        validation targets as a NumPy array.
    """
    # Predict on the training rows first, then on the validation rows.
    pred_trn = forest.predict(X_train[fts])
    pred_val = forest.predict(X_valid[fts])

    train_targets = Y_train.iloc[:, 1:].to_numpy()
    valid_targets = Y_valid.iloc[:, 1:].to_numpy()

    # Baseline: predict zero for every target.
    zeros_trn = np.zeros(pred_trn.shape)
    zeros_val = np.zeros(pred_val.shape)

    model_scores = (metrics.log_loss(train_targets, pred_trn),
                    metrics.log_loss(valid_targets, pred_val))
    print(*model_scores)

    baseline_scores = (metrics.log_loss(train_targets, zeros_trn),
                       metrics.log_loss(valid_targets, zeros_val))
    print(*baseline_scores)

    return pred_val, valid_targets

In [17]:
# n_estimators=1, max_depth=1, bootstrap=False, n_jobs=-1
eval_forest(multi_target_forest)

3.921241973814986 3.832497240942334
3.7761496362408424 3.699541218798475


In [20]:
# n_estimators=1, max_depth=None, bootstrap=False, n_jobs=-1
eval_forest(multi_target_forest)

0.15374717940460067 10.75085183554159
3.7761496362408424 3.699541218798475


In [22]:
# n_estimators=10, max_depth=None, bootstrap=False, n_jobs=-1
eval_forest(multi_target_forest)

0.15374717940460067 3.446434456216206
3.7761496362408424 3.699541218798475


In [24]:
# n_estimators=10, max_depth=None, bootstrap=False, n_jobs=-1, min_samples_leaf=10
eval_forest(multi_target_forest)

3.519503330847981 3.428846180954989
3.7761496362408424 3.699541218798475


In [32]:
# n_estimators=40, max_depth=None, bootstrap=False, n_jobs=-1, min_samples_leaf=5
preds, targs = eval_forest(multi_target_forest)

3.2090305137811614 3.3468245818470104
3.7761496362408424 3.699541218798475


## ExTrees 

In [60]:
from sklearn.ensemble import ExtraTreesClassifier
from time import time
from sklearn.model_selection import GridSearchCV

In [69]:
Xforest = ExtraTreesClassifier(n_estimators=40, n_jobs=-1, min_samples_leaf=10, bootstrap=True, oob_score=True)
Xmulti_target_forest = MultiOutputClassifier(Xforest, n_jobs=-1)
Xmulti_target_forest.fit(X_train[fts], Y_train.iloc[:,1:]);

In [48]:
# n_estimators=10, n_jobs=-1, min_samples_leaf=10
preds, targs = eval_forest(Xmulti_target_forest)

3.407395830707845 3.342079536892796
3.7761496362408424 3.699541218798475


### GridSearchedExForests 

In [None]:
def oob_error(Xmulti_target_forest_obj):
    """Return the mean out-of-bag error over the fitted per-target estimators.

    Args:
        Xmulti_target_forest_obj: fitted object exposing ``estimators_``, each
            element of which has an ``oob_score_`` attribute.

    Returns:
        The mean of ``1 - oob_score_`` over all estimators.
    """
    oobs = np.array([1 - est.oob_score_ for est in Xmulti_target_forest_obj.estimators_])
    return np.mean(oobs)

In [83]:
def Grid_fitter(trn, val, gridsearch=True, n_jobs=-1):
    """Fit a multi-output ExtraTrees classifier and predict on the validation set.

    Args:
        trn: pair ``(X_train, Y_train)`` used for fitting.
        val: pair ``(X_valid, Y_valid)``; ``X_valid`` is used for inference and
            ``Y_valid`` is returned unchanged.
        gridsearch: when true, hyper-parameters of the wrapped
            ``ExtraTreesClassifier`` are selected with a 5-fold ``GridSearchCV``;
            otherwise a single model with default hyper-parameters is fitted.
        n_jobs: forwarded to ``MultiOutputClassifier``.

    Returns:
        ``(clf, pred_valid, Y_valid)`` -- the fitted estimator, its predictions
        for ``X_valid`` and the validation targets that were passed in.
    """
    X_train, Y_train = trn
    X_valid, Y_valid = val

    # Train a multi-output ExtraTrees classification model
    print(f"Train-set: {Y_train.shape}, Test-set: {Y_valid.shape}")
    print("Fitting the classifier to the training set")
    t0 = time()
    base_model = MultiOutputClassifier(ExtraTreesClassifier(), n_jobs=n_jobs)
    if gridsearch:
        # The "estimator__" prefix routes each parameter to the classifier
        # wrapped by MultiOutputClassifier.
        param_grid = {'estimator__min_samples_leaf': [1,3,5,10,15,30],
                      'estimator__max_depth': [10,50,100,None],
                      'estimator__max_features': [0.5, 'log2', 'sqrt'],
                      'estimator__n_estimators': [10,40,70]}

        clf = GridSearchCV(base_model, param_grid, cv=5, verbose=1)
        clf = clf.fit(X_train, Y_train)
        print("done in %0.3fs" % (time() - t0))
        print("Best estimator found by grid search:")
        print(clf.best_estimator_)
    else:
        # Without a grid search still return a fitted model instead of
        # failing on unbound names at the end of the function.
        clf = base_model.fit(X_train, Y_train)
        print("done in %0.3fs" % (time() - t0))

    print('\nrunning inference...')
    pred_valid = clf.predict(X_valid)

    # Elapsed time is measured from the start of fitting, not from the start of inference.
    print("done in %0.3fs" % (time() - t0))

    return clf, pred_valid, Y_valid

In [84]:
# Bind the returned validation targets to their own name. Re-assigning the
# notebook-level ``Y_valid`` here would replace it with the sliced frame, so
# re-running this cell (or any later ``Y_valid.iloc[:,1:]``) would drop one
# more target column.
clf, pred_valid, Y_valid_targets = Grid_fitter([X_train[fts], Y_train.iloc[:,1:]],
                                               [X_valid[fts], Y_valid.iloc[:,1:]],
                                               n_jobs=4)

Train-set: (21432, 206), Test-set: (2382, 206)
Fitting the classifier to the training set
Fitting 5 folds for each of 216 candidates, totalling 1080 fits


[Parallel(n_jobs=1)]: Using backend SequentialBackend with 1 concurrent workers.


KeyboardInterrupt: 

# Old

In [133]:
res_train = multi_target_forest.predict(X_train[fts])
res_valid = multi_target_forest.predict(X_valid[fts])

In [134]:
train_targets = Y_train.iloc[:,1:].to_numpy()
valid_targets = Y_valid.iloc[:,1:].to_numpy()

In [135]:
train_zeros = np.zeros((res_train.shape))
valid_zeros = np.zeros((res_valid.shape))

In [131]:
# n_estimators=1, max_depth=9, bootstrap=True, n_jobs=-1
# print(metrics.log_loss(train_targets, res_train), metrics.log_loss(valid_targets, res_valid))
# print(metrics.log_loss(train_targets, train_zeros), metrics.log_loss(valid_targets, valid_zeros))

4.3575507942684295 6.809625700822502
3.7761496362408424 3.699541218798475


In [137]:
# n_estimators=1, max_depth=9, bootstrap=False, n_jobs=-1
# print(metrics.log_loss(train_targets, res_train), metrics.log_loss(valid_targets, res_valid))
# print(metrics.log_loss(train_targets, train_zeros), metrics.log_loss(valid_targets, valid_zeros))

3.169187228013868 6.156248308422521
3.7761496362408424 3.699541218798475


In [119]:
# n_estimators=1, max_depth=1, bootstrap=False, n_jobs=-1
# print(metrics.log_loss(train_targets, res_train), metrics.log_loss(valid_targets, res_valid))
# print(metrics.log_loss(train_targets, train_zeros))

3.77926097091646 4.00797008187417
3.7761496362408424


In [None]:
# NOTE(review): leftover cell -- `draw_tree` and `df_trn` are not defined or
# imported anywhere in the visible notebook, and `m` is only assigned in a
# later cell, so this fails on a fresh kernel.
draw_tree(m.estimators_[0], df_trn, precision=3)

In [None]:
# Fit a single-tree regressor (n_estimators=1, no bootstrapping).
# NOTE(review): leftover cell -- `y_train` (lower-case) and `print_score` are
# not defined in the visible notebook; the notebook's target frame is named
# `Y_train`. Confirm before re-running.
m = RandomForestRegressor(n_estimators=1, bootstrap=False, n_jobs=-1)
m.fit(X_train, y_train)
print_score(m)