In [1]:
import numpy as np
import pandas as pd
import os
import matplotlib.pyplot as plt

#from sklearn import preprocessing as pp
#from sklearn.metrics import accuracy_score
from sklearn.metrics import f1_score
from sklearn.model_selection import train_test_split
import re
import time
from tqdm import tqdm
import random

import torch
import torch.nn as nn
import torch.nn.functional as F

import nltk
from bs4 import BeautifulSoup
import transformers
from transformers import AdamW

#import torchvision.transforms as transforms
import torch.optim as optimizers
from torch.utils.data import random_split
from torch.utils.data import Dataset, DataLoader
from torch.utils.data import WeightedRandomSampler, BatchSampler

import pytorch_lightning as pl
from pytorch_lightning import Trainer
from pytorch_lightning.callbacks import EarlyStopping

In [2]:
import warnings
warnings.filterwarnings('ignore')

In [3]:
#set random seed
def set_seed(seed: int = 123):
    random.seed(seed)
    np.random.seed(seed)
    os.environ["PYTHONHASHSEED"] = str(seed)
    torch.manual_seed(seed)
    torch.cuda.manual_seed(seed)  # type: ignore
    torch.backends.cudnn.deterministic = True  # type: ignore
    torch.backends.cudnn.benchmark = True  # type: ignore
    
set_seed(1234)

In [4]:
all_data = pd.read_csv('../input/intern-compe4/all_for_nn.csv')
train_set = all_data[all_data["data_type"] == "train"]

In [5]:
target_cols = ["state"]
feature_cols = ['duration', 'goal_min',
                '0', '1', '2', '3', '4', '5', '6',
       '7', '8', '9', '10', '11', '12', '13', '14', '15', '16', '17', '18',
       '19', '20', '21', '22', '23', '24',
       'number_of_figure', 'number_of_paragraph',
       'length_of_text', 'country_4',
       'country_16', 'country_21', 'country_9', 'country_3', 'country_7',
       'country_8', 'country_5', 'country_1', 'country_12', 'country_15',
       'country_18', 'country_17', 'country_6', 'country_10', 'country_20',
       'country_2', 'country_19', 'country_11', 'country_13', 'country_0',
       'country_14',
        'category1_12', 'category1_5', 'category1_7', 'category1_13',
       'category1_11', 'category1_0', 'category1_6', 'category1_3',
       'category1_4', 'category1_8', 'category1_9', 'category1_10',
       'category1_1', 'category1_14', 'category1_2']

In [6]:
class CFWholeDataset(Dataset):
    def __init__(self, csv_file, tokenizer=None):
        self.csv_file = csv_file
        #self.transform = transform
        self.tokenizer = tokenizer
        self.feature_cols = feature_cols
        self.target_cols = target_cols
        
        
    def __len__(self):
        return len(self.csv_file)
    
    
    def __getitem__(self, idx):
        if torch.is_tensor(idx):
            idx = idx.tolist()
            
        features = self.csv_file[self.feature_cols].iloc[idx]
        features = torch.FloatTensor(features)    
        html_text = self.csv_file.html_content.iloc[idx]
        html_text = str(html_text)
        if self.tokenizer:
            html_text = self.tokenizer(html_text)
            
        try:
            target = self.csv_file[self.target_cols].iloc[idx]
            target = torch.tensor(target)
            # train_step
        except:
            target = self.csv_file["id"].iloc[idx]
            # test_step
            
        """
        if self.transform:
            features = self.transform(features)
        """
        
        return (features, html_text), target

In [7]:
class CFWholeDataModule(pl.LightningDataModule):
    def __init__(self, csv_file, transform, split_rate, batch_size, num_workers):
        super().__init__()
        self.csv_file = csv_file
        self.transform = transform
        self.split_rate = split_rate
        self.batch_size = batch_size
        self.num_workers = num_workers
        

    def setup(self, stage=None):
        dataset = self.csv_file
        n_samples = len(dataset)
        n_train = int(n_samples * 0.8)
        n_val = n_samples - n_train
        train_dataset, val_dataset = train_test_split(dataset,  train_size=n_train, test_size=n_val)
        
        self.train_dataset = CFWholeDataset(csv_file=train_dataset, tokenizer=self.transform)
        self.val_dataset = CFWholeDataset(csv_file=val_dataset, tokenizer=self.transform)
        
        
    def train_dataloader(self):
        return DataLoader(self.train_dataset,
                          batch_size=self.batch_size,
                          drop_last=True,
                          num_workers=self.num_workers,
                          pin_memory=True)
    
    
    def val_dataloader(self):
        return DataLoader(self.val_dataset,
                          batch_size=self.batch_size,
                          drop_last=True,
                          num_workers=self.num_workers,
                          pin_memory=True)

In [8]:
#define transform
class BERT_Tokenize(object):
    def __init__(self, model_type, max_len):
        self.max_len = max_len
        
        if model_type == "BERT" or model_type == "TAPTBERT":
            from transformers import BertTokenizer, BertForSequenceClassification
            self.bert_tokenizer = BertTokenizer.from_pretrained("bert-base-uncased")
            
        elif model_type == "ALBERT":
            from transformers import AlbertTokenizer, AlbertForSequenceClassification
            self.bert_tokenizer = AlbertTokenizer.from_pretrained("albert-base-v2")
            
        elif model_type == "XLNET":
            from transformers import XLNetTokenizer, XLNetForSequenceClassification
            self.bert_tokenizer = XLNetTokenizer.from_pretrained("xlnet-base-cased")
        
        elif model_type == "ROBERTA":
            from transformers import RobertaTokenizer, RobertaForSequenceClassification
            self.bert_tokenizer = RobertaTokenizer.from_pretrained("roberta-base")
        
        elif model_type == "csROBERTA":
            from transformers import AutoTokenizer, AutoModel
            self.bert_tokenizer = AutoTokenizer.from_pretrained("allenai/cs_roberta_base")
            
        elif model_type == "XLMROBERTA":
            from transformers import XLMRobertaTokenizer, XLMRobertaForSequenceClassification
            self.bert_tokenizer = XLMRobertaTokenizer.from_pretrained("xlm-roberta-base")
            
        elif model_type == "ELECTRA":
            from transformers import ElectraTokenizer, ElectraForSequenceClassification
            self.bert_tokenizer = ElectraTokenizer.from_pretrained("google/electra-base-discriminator")
            
    
    def __call__(self,text):
        inputs = self.bert_tokenizer.encode_plus(
                        text,                       # Sentence to encode.
                        add_special_tokens = True,  # Add '[CLS]' and '[SEP]'
                        max_length = self.max_len,  # Pad & truncate all sentences.
                        pad_to_max_length = True,
                        return_attention_mask = True,  # Construct attn. masks.
                   )
        
        ids = inputs['input_ids']
        mask = inputs['attention_mask']
        del text, inputs
        return np.array(ids), np.array(mask)

In [9]:
class BertHead(nn.Module):
    def __init__(self, model_type, dropout=0.1, out_dim=None):
        super(BertHead, self).__init__()
        if model_type == "ALBERT":
            from transformers import AlbertTokenizer, AlbertModel
            self.base_model = AlbertModel.from_pretrained("albert-base-v2")
            
        elif model_type == "BERT":
            from transformers import BertTokenizer, BertModel
            self.base_model = BertModel.from_pretrained("bert-base-uncased")
            
        elif model_type == "XLNET":
            from transformers import XLNetTokenizer, XLNetModel
            self.base_model = XLNetModel.from_pretrained("xlnet-base-cased")
            
        elif model_type == "ROBERTA":
            from transformers import RobertaTokenizer, RobertaModel
            self.base_model = RobertaModel.from_pretrained("roberta-base")
        
        elif model_type == "XLMROBERTA":
            from transformers import XLMRobertaTokenizer, XLMRobertaModel
            self.base_model = XLMRobertaModel.from_pretrained("xlm-roberta-base")
        
        elif model_type == "ELECTRA":
            from transformers import ElectraTokenizer, ElectraModel
            self.base_model = ElectraModel.from_pretrained("google/electra-base-discriminator")
       
        elif model_type == "TAPTBERT":
            from transformers import AutoModel, AutoConfig
            config = AutoConfig.from_pretrained("config.json")
            self.base_model = AutoModel.from_pretrained("pytorch_model.bin", config=config)
        
        """
        self.classifier = nn.Sequential(
            nn.Linear(768, 768), nn.ReLU(), nn.Dropout(p=dropout),
            nn.Linear(768, 768), nn.ReLU(), nn.Dropout(p=dropout),
            nn.Linear(768, out_dim))
        """ 
        """
        for param in self.base_model.parameters():
            param.requires_grad = True
        """
    
    def forward(self, x):
        ids, mask = x

        x = self.base_model(input_ids=ids, attention_mask=mask)
        #x = self.classifier(x[1])
        #preds = torch.sigmoid(x)
        del ids, mask
        return x[1]

In [10]:
class TabularNet(nn.Module):
    def __init__(self, input_dim, hidden_sizes, dropout, out_dim):
        super(TabularNet, self).__init__()
        self.batch_norm1 = nn.BatchNorm1d(input_dim)
        self.dropout = nn.Dropout(dropout)
        self.dense1 = nn.utils.weight_norm(nn.Linear(input_dim, hidden_sizes[0]))
        
        self.batch_norm2 = nn.BatchNorm1d(hidden_sizes[0])
        self.dense2 = nn.utils.weight_norm(nn.Linear(hidden_sizes[0], hidden_sizes[1]))
        
        self.batch_norm3 = nn.BatchNorm1d(hidden_sizes[1])
        self.dense3 = nn.utils.weight_norm(nn.Linear(hidden_sizes[1], hidden_sizes[2]))

        self.layers = nn.Linear(hidden_sizes[2], out_dim)
            
    
    def forward(self, x):
        x = self.batch_norm1(x)
        x = self.dropout(x)
        x = F.relu(self.dense1(x))
        
        x = self.batch_norm2(x)
        x = self.dropout(x)
        x = F.relu(self.dense2(x))
        
        x = self.batch_norm3(x)
        x = self.dropout(x)
        x = F.relu(self.dense3(x))
        
        #x = self.layers(x)
        #return torch.sigmoid(x)
        
        return x

In [11]:
class MergeNet(nn.Module):
    def __init__(self, input_dim, hidden_sizes, dropout, out_dim):
        super(MergeNet, self).__init__()
        self.layer1 = nn.Linear(input_dim, hidden_sizes[0])
        self.dropout = nn.Dropout(dropout)
        
        self.layer2 = nn.Linear(hidden_sizes[0], hidden_sizes[1])
        
        self.layer3 = nn.Linear(hidden_sizes[1], hidden_sizes[2])
        
        self.layer4 = nn.Linear(hidden_sizes[2], out_dim)
        
    def forward(self, x):
        x = F.relu(self.layer1(x))
        x = self.dropout(x)
        
        x = F.relu(self.layer2(x))
        x = self.dropout(x)
        
        x = F.relu(self.layer3(x))
        x = self.dropout(x)
        
        x = self.layer4(x)
        
        return torch.sigmoid(x)

In [12]:
#define BERT based model
class BertModule(pl.LightningModule):
    def __init__(self, model_type, tab_input, tab_hidden, tab_out, merge_input, merge_hidden, merge_out, dropout):
        super().__init__()
        self.berthead = BertHead(model_type=model_type)
        self.tabularnet = TabularNet(input_dim=tab_input, hidden_sizes=tab_hidden, dropout=dropout, out_dim=tab_out)
        self.mergenet = MergeNet(input_dim=merge_input, hidden_sizes=merge_hidden, dropout=dropout, out_dim=merge_out)
        
        
    def forward(self, data):
        tab_input, bert_input = data
        bert_prob = self.berthead(bert_input)
        tabular_prob = self.tabularnet(tab_input)
        merge_input = torch.cat([tabular_prob, bert_prob], axis=1)
        preds = self.mergenet(merge_input)
        return preds
    
    
    def training_step(self, batch, batch_idx):
        x, t = batch
        pred = self.forward(x)
        loss = self.criterion(pred, t)
        acc = self.metric(pred, t)
        # you should define log as {"tag_name/log_name"}
        tensorboard_logs = {'train/train_loss': loss, "train/train_acc": acc}
        return {"loss": loss, "acc": acc, "logs": tensorboard_logs, "progress_bar": tensorboard_logs}
    
    
    def validation_step(self, batch, batch_idx):
        x, t = batch
        pred = self.forward(x)
        loss = self.criterion(pred, t)
        acc = self.metric(pred, t)
        logs = {"val_loss": loss, "val_acc": acc}
        return {"val_loss": loss, "val_acc": acc, "progress_bar": logs}

    
    def validation_epoch_end(self, outputs):
        avg_loss = torch.stack([x['val_loss'] for x in outputs]).mean()
        avg_acc = torch.stack([torch.tensor(x['val_acc']) for x in outputs]).mean()
        tensorboard_logs = {'val/avg_loss': avg_loss, "val/avg_acc": avg_acc}
        print(f"val_avg_loss: {avg_loss}, val_avg_acc: {avg_acc}")
        # you should call back as name "val_loss" to using the Early-Stopping
        return {'val_loss': avg_loss, "val_acc": avg_acc, 'log': tensorboard_logs}
    
    
    def configure_optimizers(self):
        optimizer = AdamW(self.parameters(), lr=1e-5)
        """
        scheduler = {"scheduler": 
                     optimizers.lr_scheduler.CosineAnnealingLR(
                        optimizer, T_max=10),
                    "interval": "epoch",
                    "monitor": "val_loss"}
        """
        return optimizer#[optimizer], [scheduler]
    
    
    def criterion(self, pred, t):
        #pred = pred.view(-1)
        pred = pred.float()
        t = t.float()
        return F.binary_cross_entropy(pred,t)
    
    
    def metric(self, pred, t):
        #pred = pred.view(-1)
        pred = pred.float()
        t = t.float()
        pred = torch.where(pred<0.5, 0, 1)    
        t, pred = t.to("cpu"), pred.to("cpu").detach().numpy()
        return f1_score(y_true=t, y_pred=pred, average='binary', sample_weight=None, zero_division='warn')

In [13]:
def main():
    # trainer config
    epochs = 100
    output_path = './'
    
    # data module config
    csv_file = train_set
    model_type = "BERT"
    max_length = 512
    bert_tokenizer = BERT_Tokenize(model_type, max_length)
    transform = bert_tokenizer
    split_rate = 0.8
    batch_size = 16
    num_workers = 4
    
    # model config
    tab_input = len(feature_cols)
    tab_hidden = [120, 30, 70]
    tab_out = 1
    merge_input = 70+768
    merge_hidden = [1200, 300, 700]
    merge_out = 1
    dropout = 0.3
    
    # early stopping config
    patience = 2
    
    cf = CFWholeDataModule(csv_file, transform, split_rate, batch_size, num_workers)
    model = BertModule(model_type, tab_input, tab_hidden, tab_out, merge_input, merge_hidden, merge_out, dropout)
    
    early_stopping = EarlyStopping('val_loss', patience=patience, verbose=True)
    trainer = Trainer(
            max_epochs=epochs,
            weights_save_path=output_path,
            gpus = 1 if torch.cuda.is_available() else None,
            callbacks=[early_stopping]
            #accumulate_grad_batches=1
            # use_amp=False,
        )
        
    trainer.fit(model, cf)
    torch.cuda.empty_cache()
    # TO DO: use model.apply(weights_init) instead of torch.cuda.empty_cache()

In [14]:
main()

HBox(children=(FloatProgress(value=0.0, description='Downloading', max=231508.0, style=ProgressStyle(descripti…




HBox(children=(FloatProgress(value=0.0, description='Downloading', max=433.0, style=ProgressStyle(description_…




HBox(children=(FloatProgress(value=0.0, description='Downloading', max=440473133.0, style=ProgressStyle(descri…




EarlyStopping mode set to min for monitoring val_loss.
GPU available: True, used: True
TPU available: False, using: 0 TPU cores
LOCAL_RANK: 0 - CUDA_VISIBLE_DEVICES: [0]

  | Name       | Type       | Params
------------------------------------------
0 | berthead   | BertHead   | 109 M 
1 | tabularnet | TabularNet | 14.7 K
2 | mergenet   | MergeNet   | 1.6 M 


HBox(children=(FloatProgress(value=1.0, bar_style='info', description='Validation sanity check', layout=Layout…

Truncation was not explicitly activated but `max_length` is provided a specific value, please use `truncation=True` to explicitly truncate examples to max length. Defaulting to 'longest_first' truncation strategy. If you encode pairs of sequences (GLUE-style) with the tokenizer you can select this strategy more precisely by providing a specific strategy to `truncation`.
Truncation was not explicitly activated but `max_length` is provided a specific value, please use `truncation=True` to explicitly truncate examples to max length. Defaulting to 'longest_first' truncation strategy. If you encode pairs of sequences (GLUE-style) with the tokenizer you can select this strategy more precisely by providing a specific strategy to `truncation`.
Truncation was not explicitly activated but `max_length` is provided a specific value, please use `truncation=True` to explicitly truncate examples to max length. Defaulting to 'longest_first' truncation strategy. If you encode pairs of sequences (GLUE-s

val_avg_loss: 0.5703531503677368, val_avg_acc: 0.2


HBox(children=(FloatProgress(value=1.0, bar_style='info', description='Training', layout=Layout(flex='2'), max…

Truncation was not explicitly activated but `max_length` is provided a specific value, please use `truncation=True` to explicitly truncate examples to max length. Defaulting to 'longest_first' truncation strategy. If you encode pairs of sequences (GLUE-style) with the tokenizer you can select this strategy more precisely by providing a specific strategy to `truncation`.
Truncation was not explicitly activated but `max_length` is provided a specific value, please use `truncation=True` to explicitly truncate examples to max length. Defaulting to 'longest_first' truncation strategy. If you encode pairs of sequences (GLUE-style) with the tokenizer you can select this strategy more precisely by providing a specific strategy to `truncation`.
Truncation was not explicitly activated but `max_length` is provided a specific value, please use `truncation=True` to explicitly truncate examples to max length. Defaulting to 'longest_first' truncation strategy. If you encode pairs of sequences (GLUE-s

HBox(children=(FloatProgress(value=1.0, bar_style='info', description='Validating', layout=Layout(flex='2'), m…

Truncation was not explicitly activated but `max_length` is provided a specific value, please use `truncation=True` to explicitly truncate examples to max length. Defaulting to 'longest_first' truncation strategy. If you encode pairs of sequences (GLUE-style) with the tokenizer you can select this strategy more precisely by providing a specific strategy to `truncation`.
Truncation was not explicitly activated but `max_length` is provided a specific value, please use `truncation=True` to explicitly truncate examples to max length. Defaulting to 'longest_first' truncation strategy. If you encode pairs of sequences (GLUE-style) with the tokenizer you can select this strategy more precisely by providing a specific strategy to `truncation`.
Truncation was not explicitly activated but `max_length` is provided a specific value, please use `truncation=True` to explicitly truncate examples to max length. Defaulting to 'longest_first' truncation strategy. If you encode pairs of sequences (GLUE-s

val_avg_loss: 0.5642797946929932, val_avg_acc: 0.7463941222490553


Truncation was not explicitly activated but `max_length` is provided a specific value, please use `truncation=True` to explicitly truncate examples to max length. Defaulting to 'longest_first' truncation strategy. If you encode pairs of sequences (GLUE-style) with the tokenizer you can select this strategy more precisely by providing a specific strategy to `truncation`.
Truncation was not explicitly activated but `max_length` is provided a specific value, please use `truncation=True` to explicitly truncate examples to max length. Defaulting to 'longest_first' truncation strategy. If you encode pairs of sequences (GLUE-style) with the tokenizer you can select this strategy more precisely by providing a specific strategy to `truncation`.
Truncation was not explicitly activated but `max_length` is provided a specific value, please use `truncation=True` to explicitly truncate examples to max length. Defaulting to 'longest_first' truncation strategy. If you encode pairs of sequences (GLUE-s

HBox(children=(FloatProgress(value=1.0, bar_style='info', description='Validating', layout=Layout(flex='2'), m…

Truncation was not explicitly activated but `max_length` is provided a specific value, please use `truncation=True` to explicitly truncate examples to max length. Defaulting to 'longest_first' truncation strategy. If you encode pairs of sequences (GLUE-style) with the tokenizer you can select this strategy more precisely by providing a specific strategy to `truncation`.
Truncation was not explicitly activated but `max_length` is provided a specific value, please use `truncation=True` to explicitly truncate examples to max length. Defaulting to 'longest_first' truncation strategy. If you encode pairs of sequences (GLUE-style) with the tokenizer you can select this strategy more precisely by providing a specific strategy to `truncation`.
Truncation was not explicitly activated but `max_length` is provided a specific value, please use `truncation=True` to explicitly truncate examples to max length. Defaulting to 'longest_first' truncation strategy. If you encode pairs of sequences (GLUE-s

val_avg_loss: 0.5770756602287292, val_avg_acc: 0.7559585840280745


Truncation was not explicitly activated but `max_length` is provided a specific value, please use `truncation=True` to explicitly truncate examples to max length. Defaulting to 'longest_first' truncation strategy. If you encode pairs of sequences (GLUE-style) with the tokenizer you can select this strategy more precisely by providing a specific strategy to `truncation`.
Truncation was not explicitly activated but `max_length` is provided a specific value, please use `truncation=True` to explicitly truncate examples to max length. Defaulting to 'longest_first' truncation strategy. If you encode pairs of sequences (GLUE-style) with the tokenizer you can select this strategy more precisely by providing a specific strategy to `truncation`.
Truncation was not explicitly activated but `max_length` is provided a specific value, please use `truncation=True` to explicitly truncate examples to max length. Defaulting to 'longest_first' truncation strategy. If you encode pairs of sequences (GLUE-s

HBox(children=(FloatProgress(value=1.0, bar_style='info', description='Validating', layout=Layout(flex='2'), m…

Truncation was not explicitly activated but `max_length` is provided a specific value, please use `truncation=True` to explicitly truncate examples to max length. Defaulting to 'longest_first' truncation strategy. If you encode pairs of sequences (GLUE-style) with the tokenizer you can select this strategy more precisely by providing a specific strategy to `truncation`.
Truncation was not explicitly activated but `max_length` is provided a specific value, please use `truncation=True` to explicitly truncate examples to max length. Defaulting to 'longest_first' truncation strategy. If you encode pairs of sequences (GLUE-style) with the tokenizer you can select this strategy more precisely by providing a specific strategy to `truncation`.
Truncation was not explicitly activated but `max_length` is provided a specific value, please use `truncation=True` to explicitly truncate examples to max length. Defaulting to 'longest_first' truncation strategy. If you encode pairs of sequences (GLUE-s

val_avg_loss: 0.6902024745941162, val_avg_acc: 0.7465662967681701

