In [1]:
%matplotlib inline
%reload_ext autoreload
%autoreload 2

In [15]:
import os
import numpy as np
import pandas as pd 
from sklearn.model_selection import StratifiedShuffleSplit
from sklearn import metrics
import matplotlib.pyplot as plt

import torch, torch.nn.functional as F
from torch import ByteTensor, DoubleTensor, FloatTensor, HalfTensor, LongTensor, ShortTensor, Tensor
from torch import nn, optim, as_tensor
from torch.utils.data import BatchSampler, DataLoader, Dataset, Sampler, TensorDataset
from torch.nn.utils import weight_norm

from collections.abc import Iterable
from tqdm import tqdm_notebook as tqdm

import gc

## Data

In [3]:
ref_train_x = pd.read_csv('data/train_features.csv')
ref_train_y = pd.read_csv('data/train_targets_scored.csv')
ref_train_y2 = pd.read_csv('data/train_targets_nonscored.csv')

ref_test_x = pd.read_csv('data/test_features.csv')
smplsub = pd.read_csv('data/sample_submission.csv')

### Add labels 

In [148]:
train_y_all = pd.merge(ref_train_y, ref_train_y2, on='sig_id')
train_y_all['ctrl'] = (ref_train_x['cp_type'] == 'ctl_vehicle').astype(int)
train_y_all['other'] = (train_y_all.loc[:,ref_train_y2.columns].sum(axis=1) > 0).astype(int)
train_y_all['zero'] = (train_y_all.sum(axis=1) == 0).astype(int)

In [149]:
mod_train_y = train_y_all.loc[:,list(ref_train_y.columns)+['ctrl', 'zero', 'other']]

In [150]:
mod_train_y.sum()

sig_id                                             id_000644bb2id_000779bfcid_000a6266aid_0015fd3...
5-alpha_reductase_inhibitor                                                                       17
11-beta-hsd1_inhibitor                                                                            18
acat_inhibitor                                                                                    24
acetylcholine_receptor_agonist                                                                   190
acetylcholine_receptor_antagonist                                                                301
acetylcholinesterase_inhibitor                                                                    73
adenosine_receptor_agonist                                                                        54
adenosine_receptor_antagonist                                                                     96
adenylyl_cyclase_activator                                                                 

In [4]:
def get_label_stratified_val_idxs(df, val_size=0.1, rnd=0):
    
    arr = df.to_numpy()

    X = arr[:,0]
    y = arr[:,1:] # this works irrespective of whether labels are space- or comma-separated
    
    ### sklearn.model_selection.StratifiedKFold
    sss = StratifiedShuffleSplit(n_splits=1, test_size=val_size, random_state=rnd)
    
    for train_index, val_index in sss.split(X, y):
        trn_idxs = train_index
        val_idxs = val_index

    data_report(df, trn_idxs, val_idxs)
    return trn_idxs, val_idxs

def finalize_df(df, targets, as_multi=True): 
    # Select and fuse labels into target column (space separated)
    df_slct = df[[df.columns[0]] + targets]
    if as_multi:
        df_out = np.array([[df_slct.values[i][0], ' '.join(str(x) for x in df_slct.values[i][1:])] for i in range(len(df_slct))])
        return pd.DataFrame(df_out, columns = ["ID", "Target"])
    else: 
        df_out = np.array(df_slct)
        if len(targets) == 1: return pd.DataFrame(df_out, columns = ["ID", 'Target'])
        else: return pd.DataFrame(df_out, columns = ["ID"] + targets)

def data_report(df, trn_idxs, val_idxs, test_csv=None):
    trnval = df
    if len(trnval.columns) != 2:
        print(f"Multilabel csv with comma-separated labels detected!\n")
        trnval = finalize_df(trnval, targets=list(trnval.columns)[1:])
    print(f"""Train label-distribution:\n"""
          f"""{trnval['Target'][trn_idxs].value_counts()}\n"""
          f"""Total: {len(trn_idxs)}\n""")
    print(f"""Val label-distribution:\n"""
          f"""{trnval['Target'][val_idxs].value_counts()}\n"""
          f"""Total: {len(val_idxs)}""")

In [5]:
trn_idxs, val_idxs = get_label_stratified_val_idxs(ref_train_x.iloc[:,:4], val_size=0.1, rnd=0)

Multilabel csv with comma-separated labels detected!

Train label-distribution:
trt_cp 48 D1         3610
trt_cp 72 D1         3240
trt_cp 48 D2         3232
trt_cp 24 D1         3226
trt_cp 24 D2         3223
trt_cp 72 D2         3222
ctl_vehicle 48 D1     309
ctl_vehicle 72 D1     276
ctl_vehicle 72 D2     275
ctl_vehicle 24 D2     274
ctl_vehicle 48 D2     274
ctl_vehicle 24 D1     271
Name: Target, dtype: int64
Total: 21432

Val label-distribution:
trt_cp 48 D1         401
trt_cp 72 D1         360
trt_cp 48 D2         359
trt_cp 24 D1         359
trt_cp 72 D2         358
trt_cp 24 D2         358
ctl_vehicle 48 D1     34
ctl_vehicle 24 D2     31
ctl_vehicle 72 D1     31
ctl_vehicle 48 D2     31
ctl_vehicle 24 D1     30
ctl_vehicle 72 D2     30
Name: Target, dtype: int64
Total: 2382


In [6]:
def onehot_col(df, col):
    enc = pd.get_dummies(df[col])
    enc.columns = [f"{col}_{n}" for n in enc.columns]
    df = df.drop(col, axis=1)
    df = df.join(enc)
    return df

def prep_data(df, cols, func=onehot_col):
    for i in cols: df = func(df, i)
    return df

In [7]:
_ref_train_x = prep_data(ref_train_x, cols=['cp_type', 'cp_time', 'cp_dose'])
_ref_test_x = prep_data(ref_test_x, cols=['cp_type', 'cp_time', 'cp_dose'])

x_fts = _ref_train_x.columns[1:]
y_fts = ref_train_y.columns[1:]

trnval_df_rdy = pd.merge(_ref_train_x, ref_train_y, on='sig_id')
test_df_rdy = _ref_test_x

In [8]:
test_df_rdy.head()

Unnamed: 0,sig_id,g-0,g-1,g-2,g-3,g-4,g-5,g-6,g-7,g-8,...,c-97,c-98,c-99,cp_type_ctl_vehicle,cp_type_trt_cp,cp_time_24,cp_time_48,cp_time_72,cp_dose_D1,cp_dose_D2
0,id_0004d9e33,-0.5458,0.1306,-0.5135,0.4408,1.55,-0.1644,-0.214,0.2221,-0.326,...,-0.0502,0.151,-0.775,0,1,1,0,0,1,0
1,id_001897cda,-0.1829,0.232,1.208,-0.4522,-0.3652,-0.3319,-1.882,0.4022,-0.3528,...,-0.4764,-1.381,-0.73,0,1,0,0,1,1,0
2,id_002429b5b,0.1852,-0.1404,-0.3911,0.131,-1.438,0.2455,-0.339,-0.3206,0.6944,...,1.016,0.4924,-0.1942,1,0,1,0,0,1,0
3,id_00276f245,0.4828,0.1955,0.3825,0.4244,-0.5855,-1.202,0.5998,-0.1799,0.9365,...,-0.1305,0.5645,-0.5809,0,1,1,0,0,0,1
4,id_0027f1083,-0.3979,-1.268,1.913,0.2057,-0.5864,-0.0166,0.5128,0.6365,0.2611,...,-0.5313,0.9931,1.838,0,1,0,1,0,1,0


## Dataloader 

In [9]:
class MOA_data:
    def __init__(self, x_fts, y_fts, bs=512):
        self.x_fts, self.y_fts, self.bs = x_fts, y_fts, bs
    
    def embed(df):
        return df
    
    def get_weights(targ_df):
        cls_weight = ((df.sum(axis=0) / len(df)).to_numpy())
        res = df.to_numpy() * cls_weight
        weights = res.sum(axis=1)
        assert (weights == 0).any()
        return weights

    def create(self, df, val_idxs, test=None, sampler=None): 
        train = df.drop(val_idxs)
        valid = df.loc[val_idxs]
        for ID in valid['sig_id']: assert ID not in list(train['sig_id']) 
        self.train_ds = MOA_ds(train, self.x_fts, self.y_fts)
        self.valid_ds = MOA_ds(valid, self.x_fts, self.y_fts)
        
        self.train_dl = DataLoader(self.train_ds, batch_size=self.bs, shuffle=True, sampler=sampler)
        self.valid_dl = DataLoader(self.valid_ds, batch_size=self.bs, shuffle=False)
        self.fix_dl = DataLoader(self.train_ds, batch_size=self.bs, shuffle=False)
        
        if test is not None:
            self.test_ds = MOA_ds(test, self.x_fts, y_fts, test=True)
            self.test_dl = DataLoader(self.test_ds, batch_size=self.bs, shuffle=False)

class MOA_ds(Dataset):
    def __init__(self, df, x_fts, y_fts, test=False):
        if test: self.x, self.y = df[x_fts].to_numpy(), np.zeros((df.shape[0], len(y_fts)))
        else: self.x, self.y = df[x_fts].to_numpy(), df[y_fts].to_numpy()
    
    def __len__(self):
        return len(self.x)
    
    def __getitem__(self, idx):
        return [torch.tensor(self.x[idx, :], dtype=torch.float),
                torch.tensor(self.y[idx, :], dtype=torch.float)]

In [10]:
data = MOA_data(x_fts, y_fts)
data.create(trnval_df_rdy, val_idxs, test=test_df_rdy)

## Model 

In [11]:
def ifnone(a,b):
    "`a` if `a` is not None, otherwise `b`."
    return b if a is None else a

def listify(p=None, q=None):
    "Make `p` listy and the same length as `q`."
    if p is None: p=[]
    elif isinstance(p, str):          p = [p]
    elif not isinstance(p, Iterable): p = [p]
    #Rank 0 tensors in PyTorch are Iterable but don't have a length.
    else:
        try: a = len(p)
        except: p = [p]
    n = q if type(q)==int else len(p) if q is None else len(q)
    if len(p)==1: p = p * n
    assert len(p)==n, f'List len mismatch ({len(p)} vs {n})'
    return list(p)

def emb_sz_rule(n_cat:int)->int: return min(600, round(1.6 * n_cat**0.56))

def def_emb_sz(classes, n, sz_dict=None):
    "Pick an embedding size for `n` depending on `classes` if not given in `sz_dict`."
    sz_dict = ifnone(sz_dict, {})
    n_cat = len(classes[n])
    sz = sz_dict.get(n, int(emb_sz_rule(n_cat)))  # rule of thumb
    return n_cat,sz

def get_emb_szs(self, sz_dict=None):
    "Return the default embedding sizes suitable for this data or takes the ones in `sz_dict`."
    return [def_emb_sz(self.classes, n, sz_dict) for n in self.cat_names]

def embedding(ni,nf):
    "Create an embedding layer."
    emb = nn.Embedding(ni, nf)
    with torch.no_grad(): trunc_normal_(emb.weight, std=0.01)
    return emb

In [12]:
def bn_drop_lin(n_in, n_out, bn=True, p=0., actn=None):
    "Sequence of batchnorm (if `bn`), dropout (with `p`) and linear (`n_in`,`n_out`) layers followed by `actn`."
    layers = [nn.BatchNorm1d(n_in)] if bn else []
    if p != 0: layers.append(nn.Dropout(p))
    layers.append(nn.Linear(n_in, n_out))
    if actn is not None: layers.append(actn)
    return layers

class SimpleNet(nn.Module):
    def __init__(self, in_fts, layers, out_sz, ps=None, use_bn=True, bn_final=False):
        super().__init__()
        ps = ifnone(ps, [0]*len(layers))
        ps = listify(ps, layers)
        sizes = [in_fts] + layers + [out_sz]
        actns = [nn.ReLU(inplace=True) for _ in range(len(sizes)-2)] + [None]
        layers = []
        
        for i,(n_in,n_out,dp,act) in enumerate(zip(sizes[:-1],sizes[1:],[0.]+ps,actns)):
            layers += bn_drop_lin(n_in, n_out, bn=use_bn and i!=0, p=dp, actn=act)
        if bn_final: layers.append(nn.BatchNorm1d(sizes[-1]))
        self.layers = nn.Sequential(*layers)
    
    def forward(self, x):
        x = self.layers(x)
        return x

In [13]:
ni = data.train_ds.x.shape[1]
layers = [512, 512, 256, 128]
out_sz = 206

m = SimpleNet(ni, layers, out_sz, ps=0.3)
m

SimpleNet(
  (layers): Sequential(
    (0): Linear(in_features=879, out_features=512, bias=True)
    (1): ReLU(inplace)
    (2): BatchNorm1d(512, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
    (3): Dropout(p=0.3)
    (4): Linear(in_features=512, out_features=512, bias=True)
    (5): ReLU(inplace)
    (6): BatchNorm1d(512, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
    (7): Dropout(p=0.3)
    (8): Linear(in_features=512, out_features=256, bias=True)
    (9): ReLU(inplace)
    (10): BatchNorm1d(256, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
    (11): Dropout(p=0.3)
    (12): Linear(in_features=256, out_features=128, bias=True)
    (13): ReLU(inplace)
    (14): BatchNorm1d(128, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
    (15): Dropout(p=0.3)
    (16): Linear(in_features=128, out_features=206, bias=True)
  )
)

## Train

In [16]:
def to_np(x): 
    return x.data.cpu().numpy()

def loss_batch(model, x, y, loss_func, opt=None): 
    out = model(x)
    if not loss_func: return to_np(out), to_np(y)
    loss = loss_func(out, y)
    if opt is not None:
        loss.backward()
        opt.step()
        opt.zero_grad()
    return loss.detach().cpu()    
    
def validate(model, dl, loss_fn=None, average=True):
    model.eval()
    with torch.no_grad():
        val_losses,nums = [],[]
#         for xb,yb in tqdm(dl, total=len(dl), unit='batches'):
        for xb,yb in dl:
            val_loss = loss_batch(model, xb, yb, loss_fn)
            val_losses.append(val_loss)
            nums.append(xb.shape[0])
        nums = np.array(nums, dtype=np.float32)
        if average: return (to_np(torch.stack(val_losses)) * nums).sum() / nums.sum()
        else:       return val_losses
        
def fit(model, data, loss_fn, opt, epochs, average=True, scd=None):
    nb = 0
    b2 = opt.param_groups[0]['betas'][1]
    for e in tqdm(range(epochs), total=epochs, unit='epochs'):
        model.train()
        train_losses, nums = [], []
#         for xb,yb in tqdm(data.train_dl, total=len(data.train_dl), unit='batches'):
        for xb,yb in data.train_dl:
            if scd is not None:
                for g in opt.param_groups: g['lr'] = scd[0][nb]
                for g in opt.param_groups: g['betas'] = (scd[1][nb], b2) # only beta1 is scaled
                nb += 1
            loss = loss_batch(model, xb, yb, loss_fn, opt)
            train_losses.append(loss)
            nums.append(xb.shape[0])
        nums = np.array(nums, dtype=np.float32)
        train_loss = (np.stack(train_losses) * nums).sum() / nums.sum()
        valid_loss = validate(model, data.valid_dl, loss_fn, average=True)
        print(f"Epoch {e} -- train_loss: {train_loss}, valid_loss: {valid_loss}")
    print('done!')

def annealing_cos(start, end, pct):
    "Cosine anneal from `start` to `end` as pct goes from 0.0 to 1.0."
    cos_out = np.cos(np.pi * pct) + 1
    return end + (start-end)/2 * cos_out
    
class learner():
    def __init__(self, model, data, loss_fn, opt=optim.Adam):
        self.m, self.data, self.loss_fn = model, data, loss_fn
        self.opt = opt
    
    def OneCycleScheduler(self, epochs, lr, pct_start, moms, div=25):
        final_div=div*1e4
        lr_low=lr/div
        n_batches = len(self.data.train_dl)*epochs
        ph1 = int(n_batches * pct_start)
        ph2 = n_batches-ph1
        
        def steps(start, end, ph1, ph2, final_div):
            up = [annealing_cos(start, end, n/ph1) for n in range(ph1)]
            down = [annealing_cos(end, end/final_div, n/ph2) for n in range(ph2)]
            return up+down
        
        lrs = steps(lr_low, lr, ph1, ph2, final_div)
        moms = steps(moms[0], moms[1], ph1, ph2, final_div)
        return [lrs, moms]
    
    def fit(self, epochs, lr=1e-3, wd=0):
        opt = self.opt(self.m.parameters(), lr=lr, weight_decay=wd)
        fit(self.m, self.data, self.loss_fn, opt, epochs)  

    def fit1cycle(self, epochs, wd=0, lr=1e-2, pct_start=0.3, moms=(0.95,0.85), div=25):
        self.scd = self.OneCycleScheduler(epochs, lr, pct_start, moms, div)
        opt = self.opt(self.m.parameters(), lr=lr, weight_decay=wd)
        fit(self.m, self.data, self.loss_fn, opt, epochs, scd=self.scd)  
        
    def plot_scd(self):
        fig, ax = plt.subplot(1,2)
        ax[0,0] = plt.plot(range(len(self.scd[0])), self.scd[0])
        ax[0,1] = plt.plot(range(len(self.scd[1])), self.scd[1])
        
    def predict(self, dl):
        return validate(self.m, dl, loss_fn=None, average=False)

In [None]:
data = MOA_data(x_fts, y_fts)
data.create(trnval_df_rdy, val_idxs, test=test_df_rdy)

In [17]:
ni = data.train_ds.x.shape[1]
layers = [512, 512, 256, 128]
out_sz = 206

m = SimpleNet(ni, layers, out_sz, ps=0.4)

In [18]:
loss_func = nn.BCEWithLogitsLoss()
learn = learner(m, data, loss_func)

In [19]:
learn.fit1cycle(10, lr=1e-1)

HBox(children=(IntProgress(value=0, max=10), HTML(value='')))

Epoch 0 -- train_loss: 0.33201444149017334, valid_loss: 0.04231078177690506
Epoch 1 -- train_loss: 0.03843633830547333, valid_loss: 0.02162717655301094
Epoch 2 -- train_loss: 0.019825752824544907, valid_loss: 0.01784510537981987
Epoch 3 -- train_loss: 0.01827012561261654, valid_loss: 0.016924018040299416
Epoch 4 -- train_loss: 0.017580796033143997, valid_loss: 0.01658945344388485
Epoch 5 -- train_loss: 0.017116185277700424, valid_loss: 0.016308646649122238
Epoch 6 -- train_loss: 0.016701877117156982, valid_loss: 0.01637161336839199
Epoch 7 -- train_loss: 0.016411883756518364, valid_loss: 0.015910614281892776
Epoch 8 -- train_loss: 0.016105279326438904, valid_loss: 0.015780121088027954
Epoch 9 -- train_loss: 0.016014257445931435, valid_loss: 0.01576509326696396

done!


In [20]:
learn.fit1cycle(10, lr=1e-2)

HBox(children=(IntProgress(value=0, max=10), HTML(value='')))

Epoch 0 -- train_loss: 0.015949761494994164, valid_loss: 0.01575574465095997
Epoch 1 -- train_loss: 0.015949904918670654, valid_loss: 0.015779739245772362
Epoch 2 -- train_loss: 0.016075219959020615, valid_loss: 0.016111915931105614
Epoch 3 -- train_loss: 0.01596575602889061, valid_loss: 0.0156718697398901
Epoch 4 -- train_loss: 0.01568703167140484, valid_loss: 0.0158403217792511
Epoch 5 -- train_loss: 0.015408377163112164, valid_loss: 0.01554021518677473
Epoch 6 -- train_loss: 0.015143292024731636, valid_loss: 0.015404722653329372
Epoch 7 -- train_loss: 0.014813438057899475, valid_loss: 0.015429357998073101
Epoch 8 -- train_loss: 0.01453423872590065, valid_loss: 0.015403523109853268
Epoch 9 -- train_loss: 0.014419391751289368, valid_loss: 0.01540491171181202

done!


In [167]:
learn.fit(25, lr=1e-2, wd=0)

HBox(children=(IntProgress(value=0, max=25), HTML(value='')))

Epoch 0 -- train_loss: 0.20949962735176086, valid_loss: 0.02265077643096447
Epoch 1 -- train_loss: 0.021143915131688118, valid_loss: 0.019176403060555458
Epoch 2 -- train_loss: 0.01928352750837803, valid_loss: 0.018533499911427498
Epoch 3 -- train_loss: 0.01886015385389328, valid_loss: 0.01786782406270504
Epoch 4 -- train_loss: 0.018388397991657257, valid_loss: 0.017606500536203384
Epoch 5 -- train_loss: 0.018018893897533417, valid_loss: 0.01731470413506031
Epoch 6 -- train_loss: 0.01776273362338543, valid_loss: 0.017182406038045883
Epoch 7 -- train_loss: 0.01758640445768833, valid_loss: 0.017024202272295952
Epoch 8 -- train_loss: 0.017436543479561806, valid_loss: 0.016882050782442093
Epoch 9 -- train_loss: 0.017361700534820557, valid_loss: 0.01692408137023449
Epoch 10 -- train_loss: 0.017262758687138557, valid_loss: 0.01688043214380741
Epoch 11 -- train_loss: 0.017195701599121094, valid_loss: 0.01683911681175232
Epoch 12 -- train_loss: 0.01717359758913517, valid_loss: 0.01662640087306

In [168]:
learn.fit(10, lr=5e-3, wd=1e-5)

HBox(children=(IntProgress(value=0, max=10), HTML(value='')))

Epoch 0 -- train_loss: 0.016941102221608162, valid_loss: 0.016247620806097984
Epoch 1 -- train_loss: 0.016665706411004066, valid_loss: 0.01618214137852192
Epoch 2 -- train_loss: 0.016668150201439857, valid_loss: 0.016295522451400757
Epoch 3 -- train_loss: 0.016658030450344086, valid_loss: 0.016031434759497643
Epoch 4 -- train_loss: 0.016613837331533432, valid_loss: 0.016173968091607094
Epoch 5 -- train_loss: 0.016593245789408684, valid_loss: 0.016228972002863884
Epoch 6 -- train_loss: 0.0166561771184206, valid_loss: 0.016130512580275536
Epoch 7 -- train_loss: 0.016535378992557526, valid_loss: 0.015919595956802368
Epoch 8 -- train_loss: 0.01656779833137989, valid_loss: 0.016008682548999786
Epoch 9 -- train_loss: 0.01655224896967411, valid_loss: 0.01607232168316841
done!


In [44]:
learn.fit(25, lr=1e-2, wd=0)

HBox(children=(IntProgress(value=0, max=25), HTML(value='')))

Epoch 0 -- train_loss: 0.2182941883802414, valid_loss: 0.022483980283141136
Epoch 1 -- train_loss: 0.0220461618155241, valid_loss: 0.019376760348677635
Epoch 2 -- train_loss: 0.020111022517085075, valid_loss: 0.01855853758752346
Epoch 3 -- train_loss: 0.019594714045524597, valid_loss: 0.01825687848031521
Epoch 4 -- train_loss: 0.019305594265460968, valid_loss: 0.017978398129343987
Epoch 5 -- train_loss: 0.018953988328576088, valid_loss: 0.017496727406978607
Epoch 6 -- train_loss: 0.018582912161946297, valid_loss: 0.017173688858747482
Epoch 7 -- train_loss: 0.01827169395983219, valid_loss: 0.01691191829741001
Epoch 8 -- train_loss: 0.017977101728320122, valid_loss: 0.016689136624336243
Epoch 9 -- train_loss: 0.017696768045425415, valid_loss: 0.01653870940208435
Epoch 10 -- train_loss: 0.017501598224043846, valid_loss: 0.016383958980441093
Epoch 11 -- train_loss: 0.017315631732344627, valid_loss: 0.016210580244660378
Epoch 12 -- train_loss: 0.01715618185698986, valid_loss: 0.016051106154

In [21]:
def unpack(res_list):
    preds = np.vstack([p[0] for p in res_list])
    preds = nn.Sigmoid()(torch.tensor(preds))
    y = np.vstack([p[1] for p in res_list])
    return [preds, y]

def eval_model(learn_obj):
    res = {'train preds': unpack(learn_obj.predict(learn_obj.data.fix_dl)), 
           'valid preds': unpack(learn_obj.predict(learn_obj.data.valid_dl)), 
           'train baseline': [learn_obj.data.train_ds.y, np.zeros(learn_obj.data.train_ds.y.shape)],
           'valid baseline': [learn_obj.data.valid_ds.y, np.zeros(learn_obj.data.valid_ds.y.shape)]}
        
    return res['train preds'], res['valid preds']

In [22]:
train_res, valid_res = eval_model(learn)

In [23]:
train_pred = to_np(train_res[0])
train_y = train_res[1]

valid_pred = to_np(valid_res[0])
valid_y = valid_res[1]

In [24]:
print(f"train log_loss: {metrics.log_loss(train_y, train_pred)}")
print(f"train baseline: {metrics.log_loss(train_y, np.zeros(train_y.shape))}")
print('\n')
print(f"valid log_loss: {metrics.log_loss(valid_y, valid_pred)}")
print(f"valid baseline: {metrics.log_loss(valid_y, np.zeros(valid_y.shape))}")

train log_loss: 2.1209915777456496
train baseline: 3.7746580695642495


valid log_loss: 2.465788588763078
valid baseline: 3.712961561792907


### Analysis

In [25]:
def submission_df(preds, ref_df, smplsub):
    cols = smplsub.columns
    res = pd.DataFrame(preds, columns=cols[1:])
    res['sig_id'] = list(ref_df['sig_id'])
    pred_df = res[cols]
    return pred_df

In [42]:
ref_trn_x = ref_train_x.drop(val_idxs).copy()
ref_trn_y = ref_train_y.drop(val_idxs).copy()

ref_val_x = ref_train_x.loc[val_idxs].copy()
ref_val_y = ref_train_y.loc[val_idxs].copy()

train_pred_df = submission_df(train_pred, ref_trn_x, smplsub)
valid_pred_df = submission_df(valid_pred, ref_val_x, smplsub)

In [53]:
z_trn_pred = train_pred_df.iloc[:,1:].to_numpy()[ref_trn_y.sum(axis=1)==0]
z_trn_y = ref_trn_y.iloc[:,1:].to_numpy()[ref_trn_y.sum(axis=1)==0]
nz_trn_pred = train_pred_df.iloc[:,1:].to_numpy()[ref_trn_y.sum(axis=1)>1]
nz_trn_y = ref_trn_y.iloc[:,1:].to_numpy()[ref_trn_y.sum(axis=1)>1]
print(f"train zeros: {metrics.log_loss(z_trn_y,z_trn_pred)}")
print(f"train non-zeros: {metrics.log_loss(nz_trn_y,nz_trn_pred)}")

z_val_pred = valid_pred_df.iloc[:,1:].to_numpy()[ref_val_y.sum(axis=1)==0]
z_val_y = ref_val_y.iloc[:,1:].to_numpy()[ref_val_y.sum(axis=1)==0]
nz_val_pred = valid_pred_df.iloc[:,1:].to_numpy()[ref_val_y.sum(axis=1)>1]
nz_val_y = ref_val_y.iloc[:,1:].to_numpy()[ref_val_y.sum(axis=1)>1]
print(f"train zeros: {metrics.log_loss(z_val_y,z_val_pred)}")
print(f"train non-zeros: {metrics.log_loss(nz_val_y,nz_val_pred)}")

train zeros: 0.0
train non-zeros: 4.568610033906662
train zeros: 0.0
train non-zeros: 5.31293923433169


In [54]:
# train
trn_true_y = pd.merge(ref_trn_x[['sig_id', 'cp_type']], ref_train_y, on='sig_id')
trn_pred_y = pd.merge(ref_trn_x[['sig_id', 'cp_type']], train_pred_df, on='sig_id')

ctrl_true = trn_true_y.loc[trn_true_y['cp_type'] == 'ctl_vehicle']
ctrl_pred = trn_pred_y.loc[trn_pred_y['cp_type'] == 'ctl_vehicle']
print('ctrls: ',metrics.log_loss(ctrl_true.iloc[:,2:].to_numpy(), ctrl_pred.iloc[:,2:].to_numpy()))

non_ctrl_true = trn_true_y.loc[trn_true_y['cp_type'] != 'ctl_vehicle']
non_ctrl_pred = trn_pred_y.loc[trn_pred_y['cp_type'] != 'ctl_vehicle']
print('treat:', metrics.log_loss(non_ctrl_true.iloc[:,2:].to_numpy(), non_ctrl_pred.iloc[:,2:].to_numpy()))

ctrls:  0.0
treat: 2.301275310561092


In [55]:
# val
val_true_y = pd.merge(ref_val_x[['sig_id', 'cp_type']], ref_train_y, on='sig_id')
val_pred_y = pd.merge(ref_val_x[['sig_id', 'cp_type']], valid_pred_df, on='sig_id')


val_ctrl_true = val_true_y.loc[val_true_y['cp_type'] == 'ctl_vehicle']
val_ctrl_pred = val_pred_y.loc[val_pred_y['cp_type'] == 'ctl_vehicle']
print('ctrls: ',metrics.log_loss(val_ctrl_true.iloc[:,2:].to_numpy(), val_ctrl_pred.iloc[:,2:].to_numpy()))

val_non_ctrl_true = val_true_y.loc[val_true_y['cp_type'] != 'ctl_vehicle']
val_non_ctrl_pred = val_pred_y.loc[val_pred_y['cp_type'] != 'ctl_vehicle']
print('treat:', metrics.log_loss(val_non_ctrl_true.iloc[:,2:].to_numpy(), val_non_ctrl_pred.iloc[:,2:].to_numpy()))

ctrls:  0.0
treat: 2.6758580317124205


## Forest: 

In [56]:
import xgboost as xgb
from sklearn.multiclass import OneVsRestClassifier
from sklearn.preprocessing import MultiLabelBinarizer

In [68]:
def eval_model(forest):
    res = {'train preds': [f_train_y.to_numpy(), forest.predict(f_train_x)], 
           'valid preds': [f_valid_y.to_numpy(), forest.predict(f_valid_x)], 
           'train baseline': [f_train_y.to_numpy(), np.zeros(f_train_y.shape)],
           'valid baseline': [f_valid_y.to_numpy(), np.zeros(f_valid_y.shape)]}
     
    for key, value in res.items(): 
        print(f"{key}: {metrics.log_loss(*value)}")
        
    return res['train preds'], res['valid preds']

In [65]:
f_valid_x = trnval_df_rdy[x_fts].loc[val_idxs]
f_valid_y = trnval_df_rdy[y_fts].loc[val_idxs]

f_train_x = trnval_df_rdy[x_fts].copy().drop(val_idxs)
f_train_y = trnval_df_rdy[y_fts].copy().drop(val_idxs)

In [66]:
clf = OneVsRestClassifier(xgb.XGBClassifier(n_estimators=10, n_jobs=-1, max_depth=3, verbosity=1))
%time clf.fit(f_train_x, f_train_y);

In [69]:
# n_estimators=10, n_jobs=-1, max_depth=3, verbosity=1
train_preds, valid_preds = eval_model(clf)

train preds: 3.3517202616769346
valid preds: 3.3995132701742037
train baseline: 3.776149636240842
valid baseline: 3.699541218798475


## Thresholding

In [None]:
non_ctrl_pred_arr = non_ctrl_pred.iloc[:,2:].to_numpy()
non_ctrl_true_arr = non_ctrl_true.iloc[:,2:].to_numpy()

val_non_ctrl_pred_arr = val_non_ctrl_pred.iloc[:,2:].to_numpy()
val_non_ctrl_true_arr = val_non_ctrl_true.iloc[:,2:].to_numpy()

In [None]:
def opt_th(targs, preds, start=1e-7, end=1e-5, step=2e-7):
    ths = np.arange(start,end,step)
    res = [metrics.log_loss(targs, (preds > th)*preds) for th in ths]
    idx = np.argmin(res)
    return ths[idx], res[idx]

def ths_binarize(arr, ths):
    arr = arr.copy()
    arr[arr < ths] = 0
    arr[arr > ths] = 1
    return arr

def opt_th_binarize(targs, preds, start=5e-7, end=0.1, step=5e-7):
    ths = np.arange(start,end,step)
    res = [metrics.log_loss(targs, ths_binarize(preds, th)) for th in ths]
    idx = np.argmin(res)
    return ths[idx], res[idx]

In [None]:
trn_res = opt_th_binarize(non_ctrl_true_arr, non_ctrl_pred_arr)
print(f"Optimal threshold train: {trn_res}")

val_res = opt_th_binarize(val_non_ctrl_true_arr, val_non_ctrl_pred_arr)
print(f"Optimal threshold valid: {val_res}")

In [None]:
trn_res = opt_th(non_ctrl_true_arr, non_ctrl_pred_arr)
print(f"Optimal threshold train: {trn_res}")

val_res = opt_th(val_non_ctrl_true_arr, val_non_ctrl_pred_arr)
print(f"Optimal threshold valid: {val_res}")