In [1]:
# import sys
# sys.path.append('../input/iterative-stratification/iterative-stratification-master')
from iterstrat.ml_stratifiers import MultilabelStratifiedKFold

import numpy as np
import random
import pandas as pd
import matplotlib.pyplot as plt
import os
import copy
import seaborn as sns

from sklearn import preprocessing
from sklearn.metrics import log_loss
from sklearn.preprocessing import StandardScaler
from sklearn.decomposition import PCA
from sklearn.feature_selection import VarianceThreshold

import torch
import torch.nn as nn
import torch.nn.functional as F
import torch.optim as optim

import warnings
warnings.filterwarnings('ignore')

## Configuration

In [2]:
ROOT = ".."
INPUT = "input"
LISH_MOA = "lish-moa"
NUM_FOLD = 5
NUM_OPTUNA_TRIAL = 30
N_COMP_GENES = 50
N_COMP_CELLS = 15

In [3]:
def seed_everything(seed=334):
    random.seed(seed)
    os.environ['PYTHONHASHSEED'] = str(seed)
    np.random.seed(seed)
    torch.manual_seed(seed)
    torch.cuda.manual_seed(seed)
    torch.backends.cudnn.deterministic = True
    
seed_everything(seed=334)

## Read data

In [4]:
train = pd.read_csv(os.path.join(ROOT, INPUT, LISH_MOA, "train_features.csv"))
test = pd.read_csv(os.path.join(ROOT, INPUT, LISH_MOA, "test_features.csv"))
train_targets_scored = pd.read_csv(os.path.join(ROOT, INPUT, LISH_MOA, "train_targets_scored.csv"))
train_targets_nonscored = pd.read_csv(os.path.join(ROOT, INPUT, LISH_MOA, "train_targets_nonscored.csv"))
sub = pd.read_csv(os.path.join(ROOT, INPUT, LISH_MOA, "sample_submission.csv"))

In [5]:
GENES = [col for col in train.columns if col.startswith('g-')]
CELLS = [col for col in train.columns if col.startswith('c-')]

In [6]:
train_targets_scored.sum()[1:].sort_values()

atp-sensitive_potassium_channel_antagonist      1
erbb2_inhibitor                                 1
diuretic                                        6
autotaxin_inhibitor                             6
protein_phosphatase_inhibitor                   6
                                             ... 
serotonin_receptor_antagonist                 404
dopamine_receptor_antagonist                  424
cyclooxygenase_inhibitor                      435
proteasome_inhibitor                          726
nfkb_inhibitor                                832
Length: 206, dtype: object

## PCA features + Existing features

In [7]:
def make_pca_features(df_train:pd.DataFrame, df_test:pd.DataFrame, n_components:int, use_cols:list, gene_or_cell:str, concat_flg:bool):
    data = pd.concat([pd.DataFrame(df_train[use_cols]), pd.DataFrame(df_test[use_cols])])
    data_pca = PCA(n_components=n_components, random_state=334).fit_transform(data[use_cols])

    train_pca = data_pca[:df_train.shape[0]]
    test_pca = data_pca[-df_test.shape[0]:]

    train_pca = pd.DataFrame(train_pca, columns=['pca_'+gene_or_cell+str(i) for i in range(n_components)])
    test_pca = pd.DataFrame(test_pca, columns=['pca_'+gene_or_cell+str(i) for i in range(n_components)])

    if concat_flg:
        ret_df_train = pd.concat([df_train, train_pca], axis=1)
        ret_df_test = pd.concat([df_test, test_pca], axis=1)
    else:
        ret_df_train = pd.concat([df_train['sig_id'], train_pca], axis=1)
        ret_df_test = pd.concat([df_test['sig_id'], test_pca], axis=1)
    return ret_df_train, ret_df_test

In [8]:
train, test = make_pca_features(train, test, N_COMP_GENES, GENES, 'G', True)
train, test = make_pca_features(train, test, N_COMP_CELLS, GENES, 'C', True)

In [9]:
print('train.shape = {}'.format(train.shape))
print('test.shape = {}'.format(test.shape))

train.shape = (23814, 941)
test.shape = (3982, 941)


## Feature selection using Variance Encoding

In [10]:
def select_features_by_variance_encoding(df_train:pd.DataFrame, df_test:pd.DataFrame, a_threshold:float):
    data = df_train.append(df_test)
    
    # transform only gene and cell features
    var_thresh = VarianceThreshold(threshold=a_threshold)
    data_transformed = var_thresh.fit_transform(data.iloc[:, 4:])

    train_transformed = data_transformed[ : df_train.shape[0]]
    test_transformed = data_transformed[-df_test.shape[0] : ]
    
    ret_df_train = pd.DataFrame(df_train[['sig_id','cp_type','cp_time','cp_dose']].values.reshape(-1, 4),\
                              columns=['sig_id','cp_type','cp_time','cp_dose'])
    ret_df_train = pd.concat([ret_df_train, pd.DataFrame(train_transformed)], axis=1)
    
    ret_df_test = pd.DataFrame(df_test[['sig_id','cp_type','cp_time','cp_dose']].values.reshape(-1, 4),\
                                 columns=['sig_id','cp_type','cp_time','cp_dose'])
    ret_df_test = pd.concat([ret_df_test, pd.DataFrame(test_transformed)], axis=1)
    
    return ret_df_train, ret_df_test

In [11]:
train, test = select_features_by_variance_encoding(train, test, 0.5)

In [16]:
print('train.shape = {}'.format(train.shape))

train.shape = (23814, 931)


In [17]:
train.head()

Unnamed: 0,sig_id,cp_type,cp_time,cp_dose,0,1,2,3,4,5,...,917,918,919,920,921,922,923,924,925,926
0,id_000644bb2,trt_cp,24,D1,1.062,0.5577,-0.2479,-0.6208,-0.1944,-1.012,...,-4.939653,-4.02164,3.108684,2.719949,-2.409827,1.256286,-0.355232,0.009593,-1.128158,1.431649
1,id_000779bfc,trt_cp,72,D1,0.0743,0.4087,0.2991,0.0604,1.019,0.5207,...,4.779338,2.585911,1.996458,0.306471,1.423812,-0.429648,2.387632,-0.273131,-1.76067,-1.770116
2,id_000a6266a,trt_cp,48,D1,0.628,0.5817,1.554,-0.0764,-0.0323,1.239,...,-1.718382,2.841791,-0.774904,-1.731189,-4.941884,0.260169,-1.257023,-0.634996,0.511237,2.669136
3,id_0015fd391,trt_cp,48,D1,-0.5138,-0.2491,-0.2656,0.5288,4.062,-0.8095,...,-10.384501,5.97511,-2.337127,-0.156779,4.973999,-2.901239,0.512712,-2.827272,2.948554,0.214343
4,id_001626bd3,trt_cp,72,D2,-0.3254,-0.4009,0.97,0.6919,1.418,-0.8244,...,-4.333936,-1.741194,0.646695,-6.767676,5.830461,0.549369,2.102754,-1.808177,3.865406,0.797927


In [18]:
print('test.shape = {}'.format(test.shape))

test.shape = (3982, 931)


In [19]:
test.head()

Unnamed: 0,sig_id,cp_type,cp_time,cp_dose,0,1,2,3,4,5,...,917,918,919,920,921,922,923,924,925,926
0,id_0004d9e33,trt_cp,24,D1,-0.5458,0.1306,-0.5135,0.4408,1.55,-0.1644,...,-1.507929,0.21904,-0.169473,-2.251759,1.54195,-1.291751,-1.31075,-0.886952,-0.508273,1.518639
1,id_001897cda,trt_cp,72,D1,-0.1829,0.232,1.208,-0.4522,-0.3652,-0.3319,...,-6.003272,-8.762068,0.946585,-0.632069,1.702798,3.631158,1.07727,-2.277392,-0.398237,1.882794
2,id_002429b5b,ctl_vehicle,24,D1,0.1852,-0.1404,-0.3911,0.131,-1.438,0.2455,...,5.596669,2.191614,-3.977741,2.4656,1.452818,3.445528,-0.96059,0.554596,0.448883,-0.238739
3,id_00276f245,trt_cp,24,D2,0.4828,0.1955,0.3825,0.4244,-0.5855,-1.202,...,1.79841,-0.218794,-1.635691,3.336652,-2.198993,2.186583,-0.987314,-1.250305,0.937356,-1.39048
4,id_0027f1083,trt_cp,48,D1,-0.3979,-1.268,1.913,0.2057,-0.5864,-0.0166,...,4.337147,3.422253,-2.042022,1.083609,1.873949,1.296667,-0.851018,1.404813,-0.195592,0.834647


In [20]:
train = train.merge(train_targets_scored, on='sig_id')
train = train[train['cp_type'] != 'ctl_vehicle'].reset_index(drop=True)

test = test[test['cp_type'] != 'ctl_vehicle'].reset_index(drop=True)

target = train[train_targets_scored.columns]

In [21]:
train = train.drop('cp_type', axis=1)
test = test.drop('cp_type', axis=1)

In [25]:
train.head()

Unnamed: 0,sig_id,cp_time,cp_dose,0,1,2,3,4,5,6,...,tropomyosin_receptor_kinase_inhibitor,trpv_agonist,trpv_antagonist,tubulin_inhibitor,tyrosine_kinase_inhibitor,ubiquitin_specific_protease_inhibitor,vegfr_inhibitor,vitamin_b,vitamin_d_receptor_agonist,wnt_inhibitor
0,id_000644bb2,24,D1,1.062,0.5577,-0.2479,-0.6208,-0.1944,-1.012,-1.022,...,0,0,0,0,0,0,0,0,0,0
1,id_000779bfc,72,D1,0.0743,0.4087,0.2991,0.0604,1.019,0.5207,0.2341,...,0,0,0,0,0,0,0,0,0,0
2,id_000a6266a,48,D1,0.628,0.5817,1.554,-0.0764,-0.0323,1.239,0.1715,...,0,0,0,0,0,0,0,0,0,0
3,id_0015fd391,48,D1,-0.5138,-0.2491,-0.2656,0.5288,4.062,-0.8095,-1.959,...,0,0,0,0,0,0,0,0,0,0
4,id_001626bd3,72,D2,-0.3254,-0.4009,0.97,0.6919,1.418,-0.8244,-0.28,...,0,0,0,0,0,0,0,0,0,0


In [26]:
test.head()

Unnamed: 0,sig_id,cp_time,cp_dose,0,1,2,3,4,5,6,...,917,918,919,920,921,922,923,924,925,926
0,id_0004d9e33,24,D1,-0.5458,0.1306,-0.5135,0.4408,1.55,-0.1644,-0.214,...,-1.507929,0.21904,-0.169473,-2.251759,1.54195,-1.291751,-1.31075,-0.886952,-0.508273,1.518639
1,id_001897cda,72,D1,-0.1829,0.232,1.208,-0.4522,-0.3652,-0.3319,-1.882,...,-6.003272,-8.762068,0.946585,-0.632069,1.702798,3.631158,1.07727,-2.277392,-0.398237,1.882794
2,id_00276f245,24,D2,0.4828,0.1955,0.3825,0.4244,-0.5855,-1.202,0.5998,...,1.79841,-0.218794,-1.635691,3.336652,-2.198993,2.186583,-0.987314,-1.250305,0.937356,-1.39048
3,id_0027f1083,48,D1,-0.3979,-1.268,1.913,0.2057,-0.5864,-0.0166,0.5128,...,4.337147,3.422253,-2.042022,1.083609,1.873949,1.296667,-0.851018,1.404813,-0.195592,0.834647
4,id_006fc47b8,48,D2,0.3658,0.5536,-0.6898,-1.627,0.5239,-0.3832,-0.4653,...,-6.869998,-2.931882,0.705651,2.195715,0.194101,2.911494,-2.635885,-2.040741,-0.340527,-0.445118


In [28]:
target_cols = target.drop('sig_id', axis=1).columns.values.tolist()

# CV folds

In [29]:
folds = train.copy()

mskf = MultilabelStratifiedKFold(n_splits=5)

for f, (t_idx, v_idx) in enumerate(mskf.split(X=train, y=target)):
    folds.loc[v_idx, 'kfold'] = int(f)

folds['kfold'] = folds['kfold'].astype(int)
folds

Unnamed: 0,sig_id,cp_time,cp_dose,0,1,2,3,4,5,6,...,trpv_agonist,trpv_antagonist,tubulin_inhibitor,tyrosine_kinase_inhibitor,ubiquitin_specific_protease_inhibitor,vegfr_inhibitor,vitamin_b,vitamin_d_receptor_agonist,wnt_inhibitor,kfold
0,id_000644bb2,24,D1,1.0620,0.5577,-0.2479,-0.6208,-0.1944,-1.0120,-1.0220,...,0,0,0,0,0,0,0,0,0,0
1,id_000779bfc,72,D1,0.0743,0.4087,0.2991,0.0604,1.0190,0.5207,0.2341,...,0,0,0,0,0,0,0,0,0,1
2,id_000a6266a,48,D1,0.6280,0.5817,1.5540,-0.0764,-0.0323,1.2390,0.1715,...,0,0,0,0,0,0,0,0,0,2
3,id_0015fd391,48,D1,-0.5138,-0.2491,-0.2656,0.5288,4.0620,-0.8095,-1.9590,...,0,0,0,0,0,0,0,0,0,1
4,id_001626bd3,72,D2,-0.3254,-0.4009,0.9700,0.6919,1.4180,-0.8244,-0.2800,...,0,0,0,0,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
21943,id_fff8c2444,72,D1,0.1608,-1.0500,0.2551,-0.2239,-0.2431,0.4256,-0.1166,...,0,0,0,0,0,0,0,0,0,4
21944,id_fffb1ceed,24,D2,0.1394,-0.0636,-0.1112,-0.5080,-0.4713,0.7201,0.5773,...,0,0,0,0,0,0,0,0,0,3
21945,id_fffb70c0c,24,D2,-1.3260,0.3478,-0.3743,0.9905,-0.7178,0.6621,-0.2252,...,0,0,0,0,0,0,0,0,0,0
21946,id_fffcb9e7c,24,D1,0.6660,0.2324,0.4392,0.2044,0.8531,-0.0343,0.0323,...,0,0,0,0,0,0,0,0,0,3


# Dataset Classes

In [33]:
class MoADataset:
    def __init__(self, features, targets):
        self.features = features
        self.targets = targets
        
    def __len__(self):
        return (self.features.shape[0])
    
    def __getitem__(self, idx):
        dct = {
            'x' : torch.tensor(self.features[idx, :], dtype=torch.float),
            'y' : torch.tensor(self.targets[idx, :], dtype=torch.float)            
        }
        return dct
    
class TestDataset:
    def __init__(self, features):
        self.features = features
        
    def __len__(self):
        return (self.features.shape[0])
    
    def __getitem__(self, idx):
        dct = {
            'x' : torch.tensor(self.features[idx, :], dtype=torch.float)
        }
        return dct
    

In [None]:
def train_fn(model, optimizer, scheduler, loss_fn, dataloader, device):
    model.train()
    final_loss = 0
    
    for data in dataloader:
        optimizer.zero_grad()
        inputs, targets = data['x'].to(device), data['y'].to(device)
#         print(inputs.shape)
        outputs = model(inputs)
        loss = loss_fn(outputs, targets)
        loss.backward()
        optimizer.step()
        scheduler.step()
        
        final_loss += loss.item()
        
    final_loss /= len(dataloader)
    
    return final_loss


def valid_fn(model, loss_fn, dataloader, device):
    model.eval()
    final_loss = 0
    valid_preds = []
    
    for data in dataloader:
        inputs, targets = data['x'].to(device), data['y'].to(device)
        outputs = model(inputs)
        loss = loss_fn(outputs, targets)
        
        final_loss += loss.item()
        valid_preds.append(outputs.sigmoid().detach().cpu().numpy())
        
    final_loss /= len(dataloader)
    valid_preds = np.concatenate(valid_preds)
    
    return final_loss, valid_preds

def inference_fn(model, dataloader, device):
    model.eval()
    preds = []
    
    for data in dataloader:
        inputs = data['x'].to(device)

        with torch.no_grad():
            outputs = model(inputs)
        
        preds.append(outputs.sigmoid().detach().cpu().numpy())
        
    preds = np.concatenate(preds)
    
    return preds
   
    

# Model

In [None]:
class Model(nn.Module):
    def __init__(self, num_features, num_targets, hidden_size):
        super(Model, self).__init__()
        self.batch_norm1 = nn.BatchNorm1d(num_features)
        self.dropout1 = nn.Dropout(0.2)
        self.dense1 = nn.utils.weight_norm(nn.Linear(num_features, hidden_size))
        
        self.batch_norm2 = nn.BatchNorm1d(hidden_size)
        self.dropout2 = nn.Dropout(0.5)
        self.dense2 = nn.utils.weight_norm(nn.Linear(hidden_size, hidden_size))
        
        self.batch_norm3 = nn.BatchNorm1d(hidden_size)
        self.dropout3 = nn.Dropout(0.5)
        self.dense3 = nn.utils.weight_norm(nn.Linear(hidden_size, num_targets))
    
    def forward(self, x):
        x = self.batch_norm1(x)
        x = self.dropout1(x)
        x = F.relu(self.dense1(x))
        
        x = self.batch_norm2(x)
        x = self.dropout2(x)
        x = F.relu(self.dense2(x))
        
        x = self.batch_norm3(x)
        x = self.dropout3(x)
        x = self.dense3(x)
        
        return x