## **Neural Net training**

In [41]:
%load_ext autoreload
%autoreload 2

import pandas as pd
import numpy as np
import os
import glob
import pickle
import matplotlib.pyplot as plt
import matplotlib as mpl
mpl.rcParams['agg.path.chunksize'] = 100000

import gc
import xgboost as xgb
from tqdm import tqdm
import shutil
import copy

from sklearn.preprocessing import LabelEncoder
from sklearn.linear_model import LogisticRegression

import pickle

import warnings
warnings.simplefilter('ignore')

import torch
import torch.nn as nn
import torch.nn.functional as F
import torch.optim as optim
from torch.nn.modules.loss import _WeightedLoss

The autoreload extension is already loaded. To reload it, use:
  %reload_ext autoreload


In [42]:
def seed_everything(seed=42):
    random.seed(seed)
    os.environ['PYTHONHASHSEED'] = str(seed)
    np.random.seed(seed)
    torch.manual_seed(seed)
    torch.cuda.manual_seed(seed)
    torch.backends.cudnn.deterministic = True

In [43]:
def label_encode(df, cols):
    for col in cols:
        le = LabelEncoder()
        tmp = df[col].fillna("NaN")
        df[col] = pd.Series(le.fit_transform(tmp), index=tmp.index)

    return df


def check_columns(necessary_cols,cols):
    
    cols = set(cols) # make set
    
    lack_cols = [c for c in necessary_cols if c not in cols]
    
    print("-- column check completed --")
    if len(lack_cols) == 0:
        print("  columns are satisfied")
        return True
    else:
        print("  !!columns are lacked!!")
        print("   lacked columns:",lack_cols)
        return False


class FeaturesMaker_v1(object):

    def __init__(self,target_cols):
        self.name = "featuresV1"
        self.feature_exp = "simple features which "

        self.target_cols = target_cols
        self.necessary_col =  ["sig_id",'cp_type',"cp_time","cp_dose","data_part"] + list(target_cols)

    def make_feature(self,df):

        # check existstance of necessary columns
        if check_columns(self.necessary_col,df.columns):

            # label encoding
            cols = ['cp_type',"cp_time","cp_dose"]
            df = label_encode(df, cols=cols)


            # split train and test
            df = df.set_index(["sig_id"],drop=True)

            features = [c for c in df.columns if "g-" in c]
            features = features + [c for c in df.columns if "c-" in c]
            features = features + ['cp_type',"cp_time","cp_dose"]

            print("-- ",self.name," --")
            print("dim:",len(features))
            print("N:",len(df))
            print("-----------------")

            return {sub[0]:(sub[1][features],sub[1][self.target_cols]) for sub in df.groupby(by="data_part")}

        else:
            return False

In [44]:
import torch
import torch.nn as nn
import torch.nn.functional as F
import torch.optim as optim
from torch.nn.modules.loss import _WeightedLoss

Dropout_Model = 0.25
class Model(nn.Module):
    def __init__(self, num_features, num_targets, hidden_size):
        
        super(Model, self).__init__()
        self.batch_norm1 = nn.BatchNorm1d(num_features)
        self.dense1 = nn.utils.weight_norm(nn.Linear(num_features, hidden_size))
        
        self.batch_norm2 = nn.BatchNorm1d(hidden_size)
        self.dropout2 = nn.Dropout(Dropout_Model)
        self.dense2 = nn.utils.weight_norm(nn.Linear(hidden_size, hidden_size))
        
        self.batch_norm3 = nn.BatchNorm1d(hidden_size)
        self.dropout3 = nn.Dropout(Dropout_Model)
        self.dense3 = nn.utils.weight_norm(nn.Linear(hidden_size, num_targets))
    
    def forward(self, x):
        x = self.batch_norm1(x)
        x = F.leaky_relu(self.dense1(x))
        
        x = self.batch_norm2(x)
        x = self.dropout2(x)
        x = F.leaky_relu(self.dense2(x))
        
        x = self.batch_norm3(x)
        x = self.dropout3(x)
        x = self.dense3(x)
        
        return x
    
class LabelSmoothingLoss(nn.Module):
    def __init__(self, classes, smoothing=0.0, dim=-1):
        super(LabelSmoothingLoss, self).__init__()
        self.confidence = 1.0 - smoothing
        self.smoothing = smoothing
        self.cls = classes
        self.dim = dim

    def forward(self, pred, target):
        pred = pred.log_softmax(dim=self.dim)
        with torch.no_grad():
            true_dist = torch.zeros_like(pred)
            true_dist.fill_(self.smoothing / (self.cls - 1))
            true_dist.scatter_(1, target.data.unsqueeze(1), self.confidence)
            
        return torch.mean(torch.sum(-true_dist * pred, dim=self.dim))

In [45]:
class MoADataset_train:
    def __init__(self,features ,targets):
        self.features = features
        self.targets = targets
    
    def __len__(self):
        return(self.features.shape[0])
    
    def __getitem__(self,idx):
        dct = {"x":torch.tensor(self.features[idx,:], dtype=torch.float),
               "y":torch.tensor(self.features[idx,:], dtype=torch.float)}
        
        return dct
        
class MoADataset_test:
    def __init__(self,features ,targets):
        self.features = features
        self.targets = targets
    
    def __len__(self):
        return(self.features.shape[0])
    
    def __getitem__(self,idx):
        dct = {"x":torch.tensor(self.features[idx,:], dtype=torch.float)}
        
        return dct

In [46]:
BATCH_SIZE = 5

def train_model(model, optimizer, scheduler, loss_func, dataloader, device):
    
    model.train()
    final_loss = 0
    
    for data in dataloader:
        optimizer.zero_grad()
        inputs, targets = data["x"].to(device), data["y"].to(device)
        output = model(inputs)
        loss = loss_func(outputs,targets)
        loss.backward()
        optimier.step()
        scheduler.step()
    
        final_loss += loss.item()
    
    final_loss /= len(dataloader)
    
    return final_loss

def valid_model(model, optimizer, scheduler, loss_func, dataloader, device):
    
    model.eval()
    final_loss = 0
    valid_pred = []    
    
    for data in dataloader:
        inputs, targets = data['x'].to(device), data['y'].to(device)
        outputs = model(inputs)
        loss = loss_func(outputs, targets)
        
        final_loss += loss.item()
        valid_preds.append(outputs.sigmoid().detach().cpu().numpy())
        
    final_loss /= len(dataloader)
    valid_preds = np.concatenate(valid_preds)
    
    return final_loss, valid_preds

In [63]:
class SmoothBCEwLogits(_WeightedLoss):
    def __init__(self, weight=None, reduction='mean', smoothing=0.0):
        super().__init__(weight=weight, reduction=reduction)
        self.smoothing = smoothing
        self.weight = weight
        self.reduction = reduction

    @staticmethod
    def _smooth(targets:torch.Tensor, n_labels:int, smoothing=0.0):
        assert 0 <= smoothing < 1
        with torch.no_grad():
            targets = targets * (1.0 - smoothing) + 0.5 * smoothing
        return targets

    def forward(self, inputs, targets):
        targets = SmoothBCEwLogits._smooth(targets, inputs.size(-1),
            self.smoothing)
        loss = F.binary_cross_entropy_with_logits(inputs, targets,self.weight)

        if  self.reduction == 'sum':
            loss = loss.sum()
        elif  self.reduction == 'mean':
            loss = loss.mean()

        return loss

In [47]:
train_targets = pd.read_csv(os.path.join("..","..","input","lish-moa","train_targets_scored.csv"))
train_features = pd.read_csv(os.path.join("..","..","input","lish-moa","train_features.csv"))
test_features =  pd.read_csv(os.path.join("..","..","input","lish-moa","test_features.csv"))

In [48]:
target_cols = train_targets.columns[1:]
feature_cols = train_features.columns[1:]

In [54]:
# traindata and validatoin data
df_train = copy.copy(train_features)
df_train = pd.merge(df_train,train_targets[["sig_id"]+list(target_cols)],on="sig_id",how="right")
train,valid = train_test_split(df_train)
df_train.loc[train.index,"data_part"] = "train"
df_train.loc[valid.index,"data_part"] = "valid"

# test data
df_test = copy.copy(test_features)
for col in target_cols:
    df_test[col] = np.nan
df_test["data_part"] = "test"

# features processing
df = pd.concat([df_train,df_test])

feature_maker = FeaturesMaker_v1(target_cols=target_cols)
df = feature_maker.make_feature(df)

-- column check completed --
  columns are satisfied
--  featuresV1  --
dim: 875
N: 27796
-----------------


In [57]:
train_dataset = MoADataset_train(df["train"][0].values, df["train"][1].values)
valid_dataset = MoADataset_train(df["valid"][0].values, df["valid"][1].values)

trainloader = torch.utils.data.DataLoader(train_dataset, batch_size=BATCH_SIZE, shuffle=True)
validloader = torch.utils.data.DataLoader(valid_dataset, batch_size=BATCH_SIZE, shuffle=False)

In [60]:
EPOCHS = 25
BATCH_SIZE = 128
LEARNING_RATE = 1e-3
WEIGHT_DECAY = 1e-5
NFOLDS = 7
EARLY_STOPPING_STEPS = 10
EARLY_STOP = False

num_features=len(feature_cols)
num_targets=len(target_cols)
hidden_size=1500
DEVICE = "cuda"

In [61]:
model = Model(
    num_features=num_features,
    num_targets=num_targets,
    hidden_size=hidden_size,
)
model.to(DEVICE)

Model(
  (batch_norm1): BatchNorm1d(875, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
  (dense1): Linear(in_features=875, out_features=1500, bias=True)
  (batch_norm2): BatchNorm1d(1500, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
  (dropout2): Dropout(p=0.25, inplace=False)
  (dense2): Linear(in_features=1500, out_features=1500, bias=True)
  (batch_norm3): BatchNorm1d(1500, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
  (dropout3): Dropout(p=0.25, inplace=False)
  (dense3): Linear(in_features=1500, out_features=206, bias=True)
)

In [64]:
optimizer = torch.optim.Adam(model.parameters(), lr=LEARNING_RATE, weight_decay=WEIGHT_DECAY)
scheduler = optim.lr_scheduler.OneCycleLR(optimizer=optimizer, pct_start=0.1, div_factor=1e3, 
                                          max_lr=1e-2, epochs=EPOCHS, steps_per_epoch=len(trainloader))

loss_fn = nn.BCEWithLogitsLoss()
loss_tr = SmoothBCEwLogits(smoothing =0.001)

early_stopping_steps = EARLY_STOPPING_STEPS
early_step = 0

oof = np.zeros((len(train), target.iloc[:, 1:].shape[1]))
best_loss = np.inf

NameError: name 'target' is not defined

In [None]:
for epoch in range(EPOCHS):
        
    train_loss = train_fn(model, optimizer,scheduler, loss_tr, trainloader, DEVICE)
    print(f"FOLD: {fold}, EPOCH: {epoch}, train_loss: {train_loss}")
    valid_loss, valid_preds = valid_fn(model, loss_fn, validloader, DEVICE)
    print(f"FOLD: {fold}, EPOCH: {epoch}, valid_loss: {valid_loss}")

    if valid_loss < best_loss:

        best_loss = valid_loss
        oof[val_idx] = valid_preds
        torch.save(model.state_dict(), f"FOLD{fold}_.pth")

    elif(EARLY_STOP == True):

        early_step += 1
        if (early_step >= early_stopping_steps):
            break