In [1]:
import pandas as pd
import numpy as np
import random
import os
import sys
from time import time
import datetime
from scipy.optimize import minimize, fsolve

from collections import Counter

from sklearn.metrics import log_loss
from sklearn.decomposition import PCA
from sklearn.preprocessing import MinMaxScaler
from sklearn.feature_selection import VarianceThreshold
from sklearn.preprocessing import QuantileTransformer, RobustScaler

import torch
import torch.nn as nn
from torch.nn.modules.loss import _WeightedLoss
import torch.nn.functional as F
from transformers import get_cosine_schedule_with_warmup, get_linear_schedule_with_warmup

import copy

import seaborn as sns
import plotly.express as px

import sys
sys.path.append('../input/iterative-stratification/iterative-stratification-master')
from iterstrat.ml_stratifiers import MultilabelStratifiedKFold


In [2]:
RANDOM_SEED=42

def seed_everything(seed=RANDOM_SEED):
    """Seed every random number generator this notebook relies on.

    Seeds Python's ``random`` module, NumPy's legacy global generator and
    PyTorch (CPU and all CUDA devices), and switches cuDNN to deterministic
    behaviour.

    Args:
        seed: Integer seed applied to all generators.
    """
    random.seed(seed)
    # Exported for any interpreter started after this point; it cannot change
    # hash randomisation of the already-running kernel.
    os.environ['PYTHONHASHSEED'] = str(seed)
    np.random.seed(seed)
    torch.manual_seed(seed)
    # Seed every visible GPU, not only the current one (ignored without CUDA).
    torch.cuda.manual_seed_all(seed)
    torch.backends.cudnn.deterministic = True
    # Autotuning may select different kernels from run to run, so pin it off
    # explicitly instead of relying on the library default.
    torch.backends.cudnn.benchmark = False

seed_everything(seed=RANDOM_SEED)

# Loading the data

In [3]:
# Read the three input tables; each one is keyed by `sig_id`.
train_features = pd.read_csv('../input/lish-moa/train_features.csv')
train_drug_ids = pd.read_csv('../input/lish-moa/train_drug.csv')
train_targets = pd.read_csv('../input/lish-moa/train_targets_scored.csv')

# Left-join the targets and then the drug ids onto the feature rows.
train_data = (
    train_features
    .merge(train_targets, on='sig_id', how='left')
    .merge(train_drug_ids, on='sig_id', how='left')
)

In [4]:
NUMBER_OF_MODELS = 4

# Start from the sample ids only, then attach one block of columns per model version.
meta_features = pd.DataFrame({'sig_id': train_data.sig_id.values})

for version in range(1, NUMBER_OF_MODELS + 1):
    df = pd.read_csv(f'../input/moablogstackingdataset/version_{version}.csv')
    # Every column except `sig_id` is renamed to a unique meta-feature name that
    # encodes its position in the file and the model version it came from.
    df = df.rename(columns={
        column: f'meta-f-{position}-model{version}'
        for position, column in enumerate(df.columns)
        if column != 'sig_id'
    })
    meta_features = meta_features.merge(df, on='sig_id')

# Inner join: only samples present in every version file are kept.
train_data = train_data.merge(meta_features, on='sig_id')

In [5]:
# artificial_meta_features = pd.DataFrame(data=np.zeros((len(train_data), 206 * 4)), columns=[f'meta-f{i}' for i in range(206 * 4)])
# artificial_meta_features['sig_id'] = train_data.sig_id.values
# train_data = pd.merge(train_data, artificial_meta_features, on='sig_id')

In [6]:
# Targets: every column of the targets table except the sample id.
target_columns = train_targets.columns.drop('sig_id').tolist()
# Features: the columns whose name contains the 'meta-f-' marker.
feature_columns = train_data.columns[train_data.columns.str.contains('meta-f-', regex=False)].tolist()

# Cross-validation strategy

In [7]:
def create_cross_validation_strategy(data, targets, FOLDS, SEED):
    """Add an integer ``fold`` column to ``data`` using multilabel stratification.

    Drugs with at most 19 rows are assigned to a fold as a whole (all rows of
    the drug share one fold), stratified on the drug's mean target vector.
    Rows of drugs with more than 19 rows are stratified individually on their
    own targets.

    Args:
        data: Frame with ``sig_id``, ``drug_id`` and the ``targets`` columns.
            It is modified in place (a ``fold`` column is written).
        targets: List of target column names used for stratification.
        FOLDS: Number of folds.
        SEED: Random state passed to both stratified splitters.

    Returns:
        The same ``data`` object, with ``fold`` stored as ``int8``.
    """
    drug_counts = data.drug_id.value_counts()

    rare_drugs = drug_counts.loc[drug_counts <= 19].index.sort_values()
    frequent_drugs = drug_counts.loc[drug_counts > 19].index.sort_values()

    # Rare drugs: one row per drug (mean targets), so a drug never straddles folds.
    drug_to_fold = {}
    splitter = MultilabelStratifiedKFold(n_splits=FOLDS, shuffle=True, random_state=SEED)
    drug_profiles = data.groupby('drug_id')[targets].mean().loc[rare_drugs]

    for fold, (_, valid_idx) in enumerate(splitter.split(drug_profiles, drug_profiles[targets])):
        for drug in drug_profiles.index[valid_idx].values:
            drug_to_fold[drug] = fold

    # Frequent drugs: stratify the individual samples instead.
    sample_to_fold = {}
    splitter = MultilabelStratifiedKFold(n_splits=FOLDS, shuffle=True, random_state=SEED)
    frequent_rows = data.loc[data.drug_id.isin(frequent_drugs)].reset_index(drop=True)

    for fold, (_, valid_idx) in enumerate(splitter.split(frequent_rows, frequent_rows[targets])):
        for sig_id in frequent_rows.sig_id[valid_idx].values:
            sample_to_fold[sig_id] = fold

    # Drug-level assignment first; rows still unassigned get their sample-level fold.
    data['fold'] = data.drug_id.map(drug_to_fold)
    unassigned = data.fold.isna()
    data.loc[unassigned, 'fold'] = data.loc[unassigned, 'sig_id'].map(sample_to_fold)
    data.fold = data.fold.astype('int8')

    return data

# Modeling

In [8]:
class MoaMetaDataset:
    """Map-style dataset over a frame of stacked per-model predictions.

    Each item exposes the flat feature row reshaped to
    ``(num_models, targets_per_model)``, the sample's ``sig_id`` and, when
    target columns are given, the target vector.

    Args:
        dataset_df: Frame holding ``sig_id``, the feature columns and
            (optionally) the target columns.
        feature_ids: Feature column names, laid out model after model.
        target_ids: Target column names, or ``None`` when no labels exist.
        targets_per_model: Number of consecutive feature columns contributed
            by each model (defaults to the previously hard-coded 206).
    """

    def __init__(self, dataset_df, feature_ids, target_ids, targets_per_model=206):
        self.dataset_df = dataset_df
        self.feature_ids = feature_ids
        self.target_ids = target_ids
        self.targets_per_model = targets_per_model
        self.num_models = len(feature_ids) // targets_per_model

        # Flat (samples, models * targets_per_model) matrix; reshaped per item.
        self.features = self.dataset_df[feature_ids].values
        self.targets = None
        # Cached so item lookup is positional and does not depend on the
        # frame's index labels.
        self.sig_ids = self.dataset_df['sig_id'].values

        if self.target_ids is not None:
            self.targets = self.dataset_df[target_ids].values

    def __len__(self):
        return len(self.dataset_df)

    def num_of_features(self):
        """Return the number of feature columns."""
        # Use the instance attribute: no global named `feature_ids` exists.
        return len(self.feature_ids)

    def num_of_targets(self):
        """Return the number of target columns, or ``None`` without targets."""
        return None if self.target_ids is None else len(self.target_ids)

    def get_ids(self):
        """Return the ``sig_id`` values of every row, in frame order."""
        return self.dataset_df.sig_id.values

    def __getitem__(self, item):
        return_item = {}

        return_item['x'] = torch.tensor(
            self.features[item, :].reshape(self.num_models, self.targets_per_model),
            dtype=torch.float,
        )
        return_item['sig_id'] = self.sig_ids[item]

        if self.target_ids is not None:
            return_item['y'] = torch.tensor(self.targets[item, :], dtype=torch.float)

        return return_item

In [9]:
class ModelConfig:
    """Plain holder for the input and output sizes a model is built with."""

    def __init__(self, number_of_features, number_of_targets):
        self.number_of_features, self.number_of_targets = number_of_features, number_of_targets

In [10]:
class MoaModelBlock(nn.Module):
    """BatchNorm -> Dropout -> Linear -> PReLU block.

    Args:
        num_in: Input feature size.
        num_out: Output feature size (also the number of PReLU slopes).
        dropout: Dropout probability applied after batch normalisation.
        weight_norm: When true, the linear layer is wrapped with
            ``nn.utils.weight_norm``.
    """

    def __init__(self, num_in, num_out, dropout, weight_norm=False):
        super().__init__()
        self.batch_norm = nn.BatchNorm1d(num_in)
        self.dropout = nn.Dropout(dropout)

        dense = nn.Linear(num_in, num_out)
        self.linear = nn.utils.weight_norm(dense) if weight_norm else dense

        self.activation = nn.PReLU(num_out)

    def forward(self, x):
        normalized = self.batch_norm(x)
        projected = self.linear(self.dropout(normalized))
        return self.activation(projected)

class MoaEncodeBlock(nn.Module):
    """BatchNorm -> Dropout -> Linear projection without a trailing activation.

    Args:
        num_in: Input feature size.
        num_out: Output feature size.
        dropout: Dropout probability applied after batch normalisation.
        weight_norm: When true, the linear layer is wrapped with
            ``nn.utils.weight_norm``.
    """

    def __init__(self, num_in, num_out, dropout, weight_norm=False):
        super().__init__()
        self.batch_norm = nn.BatchNorm1d(num_in)
        self.dropout = nn.Dropout(dropout)

        dense = nn.Linear(num_in, num_out)
        self.linear = nn.utils.weight_norm(dense) if weight_norm else dense

    def forward(self, x):
        normalized = self.batch_norm(x)
        return self.linear(self.dropout(normalized))

In [11]:
class MetaModel(nn.Module):
    """Stacking network over the predictions of several base models.

    Each base model's prediction vector goes through its own
    ``MoaEncodeBlock``; the encodings are summed and fed to a three-layer MLP
    that outputs one logit per target.

    Args:
        model_config: Object exposing ``number_of_features`` and
            ``number_of_targets``; their integer quotient is the number of
            base models.
    """

    def __init__(self, model_config):
        super().__init__()
        self.num_models = model_config.number_of_features // model_config.number_of_targets
        self.model_config = model_config

        dropout = 0.15
        hidden_size = 512
        n_targets = model_config.number_of_targets

        # One independent encoder per base model.
        self.encoders = nn.ModuleList()
        for _ in range(self.num_models):
            self.encoders.append(MoaEncodeBlock(n_targets, 64, dropout))

        head_layers = [
            nn.Linear(64, hidden_size),
            nn.Dropout(dropout),
            nn.ReLU(),
            nn.Linear(hidden_size, hidden_size),
            nn.Dropout(dropout),
            nn.ReLU(),
            nn.Linear(hidden_size, n_targets),
        ]
        self.model = nn.Sequential(*head_layers)

    def forward(self, x): # batch size x models x features
        # Sum the per-model encodings in model order, then apply the shared head.
        encoded_sum = self.encoders[0](x[:, 0, :])
        for model_idx in range(1, self.num_models):
            encoded_sum = encoded_sum + self.encoders[model_idx](x[:, model_idx, :])
        return self.model(encoded_sum)

# Smooth loss function

In [12]:
class SmoothCrossEntropyLoss(_WeightedLoss):
    """Binary cross-entropy on logits with label smoothing.

    Targets are smoothed as ``t * (1 - smoothing) + smoothing / n_classes``
    where ``n_classes`` is the size of dimension 1 of the logits.

    Args:
        weight: Optional tensor multiplied into the logits before the loss.
        reduction: ``'mean'``, ``'sum'`` or ``'none'``; forwarded to
            ``F.binary_cross_entropy_with_logits``.
        smoothing: Smoothing strength in ``[0, 1]``.
    """

    def __init__(self, weight=None, reduction='mean', smoothing=0.0):
        # The parent constructor already stores `weight` and `reduction`.
        super().__init__(weight=weight, reduction=reduction)
        self.smoothing = smoothing

    @staticmethod
    def _smooth(targets, n_classes, smoothing=0.0):
        """Return the smoothed targets on the same device as ``targets``."""
        assert 0 <= smoothing <= 1
        with torch.no_grad():
            # Scalar broadcast keeps the result on `targets`' device, so no
            # global DEVICE lookup is needed.
            targets = targets * (1 - smoothing) + smoothing / n_classes
        return targets

    def forward(self, inputs, targets):
        targets = self._smooth(targets, inputs.shape[1], self.smoothing)

        if self.weight is not None:
            inputs = inputs * self.weight.unsqueeze(0)

        # Honour the configured reduction instead of always averaging.
        return F.binary_cross_entropy_with_logits(inputs, targets, reduction=self.reduction)

# Scaling functions

In [13]:
def quantile_scaling(train_data, valid_data, feature_columns):
    """Quantile-transform the feature columns to a normal distribution.

    The transformer is fitted on ``train_data`` only and then applied to
    both frames; the feature columns of both frames are overwritten.

    Returns:
        The ``(train_data, valid_data)`` pair that was passed in.
    """
    scaler = QuantileTransformer(n_quantiles=100,
                                 random_state=RANDOM_SEED,
                                 output_distribution="normal")

    scaled_train = scaler.fit_transform(train_data[feature_columns])
    scaled_valid = scaler.transform(valid_data[feature_columns])

    train_data[feature_columns] = scaled_train
    valid_data[feature_columns] = scaled_valid

    return train_data, valid_data

# Data preprocessing

In [14]:
def create_dataloader(data, batch_size, shuffle, target_columns=None):
    """Wrap ``data`` in a ``MoaMetaDataset`` and return a ``DataLoader`` over it.

    The feature columns come from the module-level ``feature_columns``;
    ``target_columns`` may be ``None`` for unlabelled data.
    """
    loader_options = {'batch_size': batch_size, 'shuffle': shuffle}
    meta_dataset = MoaMetaDataset(data, feature_ids=feature_columns, target_ids=target_columns)
    return torch.utils.data.DataLoader(meta_dataset, **loader_options)

In [15]:
def preprocess_fold_data(train_data, fold, scaling_func=None):
    """Split ``train_data`` on ``fold`` and build the two data loaders.

    Rows whose ``fold`` equals ``fold`` form the validation set, all others
    the training set. When ``scaling_func`` is given it is called as
    ``scaling_func(train, valid, feature_columns)`` and must return the pair.

    Returns:
        ``(train_dataloader, valid_dataloader)``; only the training loader
        shuffles. Both use the module-level ``BATCH_SIZE``.
    """
    in_validation_fold = train_data.fold == fold

    fold_train_data = train_data[~in_validation_fold].reset_index(drop=True)
    fold_valid_data = train_data[in_validation_fold].reset_index(drop=True)

    if scaling_func is not None:
        fold_train_data, fold_valid_data = scaling_func(fold_train_data, fold_valid_data, feature_columns)

    train_dataloader = create_dataloader(data=fold_train_data,
                                         batch_size=BATCH_SIZE,
                                         shuffle=True,
                                         target_columns=target_columns)
    valid_dataloader = create_dataloader(data=fold_valid_data,
                                         batch_size=BATCH_SIZE,
                                         shuffle=False,
                                         target_columns=target_columns)

    return train_dataloader, valid_dataloader

# Blending functions

In [16]:
# Clipping bound for probabilities and weight of the soft "weights sum to 1"
# penalty; shared so the objective and its gradient always agree.
_BLEND_CLIP_EPS = 1e-15
_BLEND_PENALTY_COEF = 1e-6


def log_loss_numpy(y_pred):
    """Mean column-wise binary log loss of ``y_pred`` against the global ``y_true``.

    Args:
        y_pred: Array of probabilities with the same shape as ``y_true``.

    Returns:
        The per-column log loss averaged over all columns.
    """
    y_pred_clip = np.clip(y_pred, _BLEND_CLIP_EPS, 1 - _BLEND_CLIP_EPS)
    log_likelihood = y_true * np.log(y_pred_clip) + (1 - y_true) * np.log(1 - y_pred_clip)
    # Mean over samples for each column, then mean over columns.
    return -np.mean(np.mean(log_likelihood, axis=0))


def func_numpy_metric(weights):
    """Objective for the blend search: log loss of the weighted blend plus penalty.

    Args:
        weights: One weight per model stored in the global ``oof``.

    Returns:
        Log loss of ``sum_i weights[i] * oof[i]`` plus a small quadratic
        penalty on ``sum(weights) - 1``.
    """
    oof_blend = np.tensordot(weights, oof, axes = ((0), (0)))
    score = log_loss_numpy(oof_blend)

    penalty = _BLEND_PENALTY_COEF * (np.sum(weights) - 1) ** 2
    return score + penalty


def grad_func(weights):
    """Gradient of ``func_numpy_metric`` with respect to ``weights``.

    With blend ``p = sum_j w_j * oof_j`` the log-loss derivative is
    ``mean(oof_i * (p - y) / (p * (1 - p)))``; the penalty adds
    ``2 * coef * (sum(w) - 1)`` to every component. The blend is clipped
    exactly like the objective (the clip itself is treated as identity).

    Args:
        weights: One weight per model stored in the global ``oof``.

    Returns:
        Array with one partial derivative per model.
    """
    weights = np.asarray(weights, dtype=float)
    # Build the blend once instead of re-summing the other models per weight.
    blend = np.clip(np.tensordot(weights, oof, axes=((0), (0))),
                    _BLEND_CLIP_EPS, 1 - _BLEND_CLIP_EPS)
    dloss_dblend = (blend - y_true) / (blend * (1 - blend))
    gradients = np.tensordot(oof, dloss_dblend, axes=([1, 2], [0, 1])) / dloss_dblend.size
    gradients = gradients + 2 * _BLEND_PENALTY_COEF * (np.sum(weights) - 1)
    return gradients


# Module-level state read by the objective and gradient above; it is filled in
# by find_optimal_blend before the optimiser runs.
oof = []
y_true = []
def find_optimal_blend(predictions, train_data, target_columns):
    """Find non-negative blend weights (summing to 1) that minimise OOF log loss.

    Args:
        predictions: Non-empty list of prediction frames, each with ``sig_id``
            and ``target_columns``, covering the same samples as ``train_data``.
        train_data: Frame with ``sig_id`` and the true ``target_columns``.
        target_columns: Names of the target columns.

    Returns:
        Array with one weight per entry of ``predictions``.

    Raises:
        ValueError: If ``predictions`` is empty.
    """
    global oof, y_true

    if len(predictions) == 0:
        raise ValueError('find_optimal_blend needs at least one prediction frame')

    # Rows are aligned by sorting truth and every prediction frame on sig_id.
    y_true = train_data.sort_values(by='sig_id')[target_columns].values
    oof = np.zeros((len(predictions), y_true.shape[0], y_true.shape[1]))

    for i, pred in enumerate(predictions):
        oof[i] = pred.sort_values(by='sig_id')[target_columns].values

    tol = 1e-10
    n_models = oof.shape[0]
    init_guess = [1 / n_models] * n_models
    bnds = [(0, 1) for _ in range(n_models)]
    cons = {'type': 'eq',
            'fun': lambda x: np.sum(x) - 1,
            'jac': lambda x: [1] * len(x)}

    res_scipy = minimize(fun = func_numpy_metric,
                         x0 = init_guess,
                         method = 'SLSQP',
                         jac = grad_func,
                         bounds = bnds,
                         constraints = cons,
                         tol = tol)

    return res_scipy.x

# Utility functions

In [17]:
def inference(model, data_loader, target_columns):
    """Run ``model`` over ``data_loader`` and collect sigmoid probabilities.

    The model is switched to eval mode and gradients are disabled for the
    whole pass. Inputs are moved to the device of the model's parameters
    (CPU when the model has none).

    Args:
        model: Module mapping a batch ``x`` to logits, one per target column.
        data_loader: Iterable of batches, each a mapping with ``'x'`` and
            ``'sig_id'``.
        target_columns: Column names for the predicted probabilities.

    Returns:
        Frame with one row per sample: ``target_columns`` followed by
        ``sig_id``. Empty (with those columns) if the loader yields nothing.
    """
    model.eval()

    first_param = next(model.parameters(), None)
    device = first_param.device if first_param is not None else torch.device('cpu')

    predictions = []

    # No autograd graph is needed for prediction; skipping it saves memory.
    with torch.no_grad():
        for batch in data_loader:
            x = batch['x'].to(device)
            batch_predictions = model(x).sigmoid().cpu().numpy()

            df = pd.DataFrame(batch_predictions, columns=target_columns)
            df['sig_id'] = np.array(batch['sig_id'])
            predictions.append(df)

    if not predictions:
        return pd.DataFrame(columns=list(target_columns) + ['sig_id'])

    return pd.concat(predictions).reset_index(drop=True)

In [18]:
def calculate_log_loss(predicted_df, train_df, target_columns):
    """Mean column-wise log loss between predictions and ground truth.

    Both frames are sorted by ``sig_id`` so rows are compared sample by
    sample; neither input frame is modified.

    Args:
        predicted_df: Frame with ``sig_id`` and predicted probabilities in
            ``target_columns``.
        train_df: Frame with ``sig_id`` and true labels in ``target_columns``.
        target_columns: Names of the target columns.

    Returns:
        ``(score, loss_per_class)``: the average of the per-target log losses
        and the list of those per-target losses.

    Raises:
        ValueError: If the two frames do not contain the same ``sig_id``
            values, because the row-wise comparison would be meaningless.
    """
    selected_columns = list(target_columns) + ['sig_id']

    predicted_sorted = predicted_df[selected_columns].sort_values(by=['sig_id'])
    true_sorted = train_df[selected_columns].sort_values(by=['sig_id'])

    if not np.array_equal(predicted_sorted['sig_id'].values, true_sorted['sig_id'].values):
        raise ValueError('predicted_df and train_df must contain the same sig_id values')

    # `np.float` no longer exists in current NumPy; use an explicit float64.
    # Clipping here replaces the `eps=1e-15` argument that scikit-learn's
    # log_loss no longer accepts, and yields the same clipped probabilities.
    predicted_values = np.clip(
        predicted_sorted.drop('sig_id', axis=1).values.astype(np.float64),
        1e-15, 1 - 1e-15)
    true_values = true_sorted.drop('sig_id', axis=1).values.astype(np.float64)

    score = 0
    loss_per_class = []
    n_classes = predicted_values.shape[1]
    for i in range(n_classes):
        _score = log_loss(true_values[:, i], predicted_values[:, i], labels=[1,0])
        loss_per_class.append(_score)
        score += _score / n_classes

    return score, loss_per_class

def scale_predictions(predictions, target_columns, scale_values=None):
    """Blend several prediction frames into one.

    Every frame is copied, sorted by ``sig_id`` and re-indexed so rows line
    up. Without ``scale_values`` the result is the plain average; otherwise
    frame ``i`` is multiplied by ``scale_values[i]`` and the products are
    summed.

    Returns:
        A copy of the first sorted frame whose ``target_columns`` hold the
        blended values.
    """
    aligned = [
        frame.copy().sort_values(by=['sig_id']).reset_index(drop=True)
        for frame in predictions
    ]
    n_frames = len(aligned)

    blended = np.zeros((aligned[0].shape[0], len(target_columns)))

    for position, frame in enumerate(aligned):
        frame_values = frame[target_columns].values
        if scale_values is not None:
            blended += (frame_values * scale_values[position])
        else:
            blended += frame_values / n_frames

    predictions_df = aligned[0].copy()
    predictions_df.loc[:, target_columns] = blended

    return predictions_df

In [19]:
class TrainFactory:
    """Factories that assemble everything needed to train one model."""

    @classmethod
    def meta_model(cls, train_loader, epochs):
        """Build the stacking model and its training components.

        Reads the module-level ``model_config`` and ``DEVICE``.

        Returns:
            ``(model, best_model, optimizer, scheduler, loss_fn)`` where
            ``best_model`` is a second, independently initialised instance
            used to hold the best weights.
        """
        model = MetaModel(model_config).to(DEVICE)
        best_model = MetaModel(model_config).to(DEVICE)

        optimizer = torch.optim.Adam(model.parameters(), lr=1e-3, weight_decay=1e-5)

        # One-cycle schedule stepped once per batch over the whole run.
        schedule_options = {
            'max_lr': 1e-2,
            'epochs': epochs,
            'steps_per_epoch': len(train_loader),
        }
        scheduler = torch.optim.lr_scheduler.OneCycleLR(optimizer, **schedule_options)

        loss_fn = nn.BCEWithLogitsLoss()

        return model, best_model, optimizer, scheduler, loss_fn
    

In [20]:
def train_model(model, best_model, optimizer, scheduler, loss_fn, train_loader, valid_loader, epochs):
    """Train ``model`` and keep the weights with the lowest validation log loss.

    After every epoch the model is evaluated on ``valid_loader``; whenever
    the validation loss improves, the weights are copied into ``best_model``.
    The scheduler is stepped after every batch. Progress is printed every
    fifth epoch.

    Returns:
        ``(best_model, valid_predictions)`` where the predictions come from
        ``best_model`` on the validation loader.
    """
    n_train_samples = len(train_loader.dataset.dataset_df)
    valid_data = valid_loader.dataset.dataset_df

    best_loss = np.inf

    for epoch in range(epochs):
        model.train()
        train_loss = 0

        for train_batch in train_loader:
            optimizer.zero_grad()

            inputs = train_batch['x'].to(DEVICE)
            labels = train_batch['y'].to(DEVICE)

            batch_loss = loss_fn(model(inputs), labels)
            batch_loss.backward()

            optimizer.step()
            scheduler.step()

            # Weight each batch by its share of the training set so the sum
            # over the epoch is a sample-weighted mean.
            batch_share = len(train_batch['sig_id']) / n_train_samples
            train_loss += ( batch_loss.item() * batch_share)

        valid_predictions = inference(model, valid_loader, target_columns)
        valid_loss, _ = calculate_log_loss(valid_predictions, valid_data, target_columns)

        improved = valid_loss < best_loss
        if improved:
            best_loss = valid_loss
            best_model.load_state_dict(model.state_dict())

        if (epoch + 1) % 5 == 0:
            print(f'Epoch:{epoch} \t train_loss:{train_loss:.10f} \t valid_loss:{valid_loss:.10f}')

    valid_predictions = inference(best_model, valid_loader, target_columns)

    return best_model, valid_predictions

In [21]:
# Hyperparameters
# Train on the GPU when one is available, otherwise on the CPU.
DEVICE = torch.device('cuda') if torch.cuda.is_available() else torch.device('cpu')

FOLDS, EPOCHS, BATCH_SIZE = 5, 30, 128
# One full cross-validation run is performed per seed.
SEEDS = [11, 221, 50]


In [22]:
# Assign every row to a cross-validation fold. The function writes a `fold`
# column into the frame it receives and returns that same frame.
train_data = create_cross_validation_strategy(train_data, target_columns, FOLDS, RANDOM_SEED)



In [23]:
# Keep only the rows whose cp_type is 'trt_cp' and renumber them from zero.
train_data = train_data.loc[train_data['cp_type'] == 'trt_cp'].reset_index(drop=True)

In [24]:
class ModelTrainConfig:
    """Describes one model to train: its name, factory and optional scaler.

    Attributes are stored exactly as passed: ``model_name`` (used in log
    lines and checkpoint file names), ``factory_func`` (called with the
    training loader and epoch count) and ``scaling_func`` (or ``None``).
    """

    def __init__(self, model_name, factory_func, scaling_func):
        self.model_name, self.factory_func, self.scaling_func = (
            model_name, factory_func, scaling_func)

In [25]:
# The stacking model is trained on the meta-features as-is (no scaling).
model_version1 = ModelTrainConfig(
    model_name='meta_model',
    factory_func=TrainFactory.meta_model,
    scaling_func=None,
)

# Every configuration listed here is trained by the loop below.
models_train_configs = [model_version1]

In [26]:
# Out-of-fold (OOF) predictions, one seed-averaged frame per model configuration.
models_valid_predictions = []
# NOTE(review): never appended to in this cell.
models_test_predictions = []

# One OOF log loss per (model configuration, seed) pair.
seed_losses = []

for model_train_config in models_train_configs:
    print(f'Training model:{model_train_config.model_name}')
    
    # One full OOF prediction frame per seed for the current configuration.
    single_model_valid_predictions = []
    # NOTE(review): never appended to in this cell.
    single_model_test_predictions = []
    
    for seed in SEEDS:
        # Re-seed before each cross-validation run; the fold assignment itself
        # was fixed earlier and does not change with the seed.
        seed_everything(seed)

        # Validation predictions of each fold for the current seed.
        model_seed_valid_predictions = []
        # NOTE(review): never appended to in this cell.
        model_seed_test_predictions = []

        for fold in range(FOLDS):
            print(f'Training fold: {fold}')

            train_loader, valid_loader = preprocess_fold_data(train_data=train_data, 
                                                              fold=fold, 
                                                              scaling_func=model_train_config.scaling_func)
            
            

            # Module-level on purpose: TrainFactory.meta_model reads the global
            # `model_config` rather than taking it as an argument.
            model_config = ModelConfig(number_of_features=len(feature_columns),
                                       number_of_targets=len(target_columns))

            model, best_model, optimizer, scheduler, loss_fn = model_train_config.factory_func(train_loader, EPOCHS)

            best_model, valid_predictions = train_model(model=model,
                                                        best_model=best_model,
                                                        optimizer=optimizer, 
                                                        scheduler=scheduler, 
                                                        loss_fn=loss_fn, 
                                                        train_loader=train_loader, 
                                                        valid_loader=valid_loader,
                                                        epochs=EPOCHS)

            # Persist the best weights of this (model, fold, seed) run in the
            # current working directory.
            torch.save(best_model.state_dict(), f'model-{model_train_config.model_name}_fold-{fold}_seed-{seed}')
            
            model_seed_valid_predictions.append(valid_predictions)
        
            print('-' * 100)


        # Stitch the per-fold validation predictions into one OOF frame.
        valid_predictions = pd.concat(model_seed_valid_predictions).reset_index(drop=True)
        
        single_model_valid_predictions.append(valid_predictions)
        
        valid_loss, _ = calculate_log_loss(valid_predictions, train_data, target_columns)

        seed_losses.append(valid_loss)

        print(f'Model:{model_train_config.model_name} \t Seed:{seed} \t oof_loss:{valid_loss:.10f}')

    # No scale_values passed, so this is the plain average over seeds.
    valid_predictions = scale_predictions(single_model_valid_predictions, target_columns)
    
    models_valid_predictions.append(valid_predictions)
    
    
    valid_loss, _ = calculate_log_loss(valid_predictions, train_data, target_columns)

    print(f'Model:{model_train_config.model_name} \t valid_loss:{valid_loss:.10f}')

          

Training model:meta_model
Training fold: 0
Epoch:4 	 train_loss:0.0172933366 	 valid_loss:0.0178684464
Epoch:9 	 train_loss:0.0171932050 	 valid_loss:0.0177406156
Epoch:14 	 train_loss:0.0169966765 	 valid_loss:0.0176684645
Epoch:19 	 train_loss:0.0166763352 	 valid_loss:0.0176385989
Epoch:24 	 train_loss:0.0160084534 	 valid_loss:0.0171542757
Epoch:29 	 train_loss:0.0154196326 	 valid_loss:0.0170687272
----------------------------------------------------------------------------------------------------
Training fold: 1
Epoch:4 	 train_loss:0.0170413376 	 valid_loss:0.0185043888
Epoch:9 	 train_loss:0.0170268316 	 valid_loss:0.0182676448
Epoch:14 	 train_loss:0.0168399262 	 valid_loss:0.0183783528
Epoch:19 	 train_loss:0.0165063952 	 valid_loss:0.0179963072
Epoch:24 	 train_loss:0.0157790861 	 valid_loss:0.0180326938
Epoch:29 	 train_loss:0.0152041844 	 valid_loss:0.0179787932
----------------------------------------------------------------------------------------------------
Training f

In [27]:
# Find the blend weights that minimise the OOF log loss. The optimiser
# constrains the weights to sum to 1, so with a single model configuration
# the only feasible answer is [1.].
blend_weights = find_optimal_blend(models_valid_predictions, train_data, target_columns)

print(f'Optimal blend weights: {blend_weights}')

Optimal blend weights: [1.]


In [28]:
# Combine the per-model OOF predictions as a weighted sum using the blend weights.
valid_predictions = scale_predictions(models_valid_predictions, target_columns, blend_weights)

In [29]:
# Final OOF score of the blended predictions (mean of the per-target log losses).
validation_loss, _ = calculate_log_loss(valid_predictions, train_data, target_columns)
print(f'Validation loss: {validation_loss}')

Validation loss: 0.017249482755610626


In [30]:
# Spread of the OOF loss across the per-seed runs collected in `seed_losses`.
print(f'Seed loss std: {np.array(seed_losses).std():.10f}')

Seed loss std: 0.0000180614
