In [1]:
import numpy as np
import pandas as pd

from tqdm import tqdm, tqdm_notebook

from scipy import stats
from sklearn.model_selection import GroupKFold

import torch
import torch.nn as nn
from torch.nn import functional as F
import torch.utils.data
#from transformers import *
import transformers
from transformers import AutoTokenizer, AutoModel, AutoConfig
import os
import re
import math
import random
from matplotlib import pyplot as plt
import warnings
from math import floor, ceil

# Silence library warnings for the whole notebook (note: this also hides
# deprecation notices from torch/transformers).
warnings.filterwarnings('ignore')
# Prefer the GPU, but fall back to the CPU so the notebook still runs on a
# machine without CUDA instead of failing at the first `.to(device)` call.
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
# Let cuDNN auto-tune its kernels.
torch.backends.cudnn.benchmark=True

%matplotlib inline

In [2]:
# Read the three competition CSVs in order (train, test, sample submission)
# and replace every missing value with a single space.
train, test, sub = (
    pd.read_csv(csv_path).fillna(' ')
    for csv_path in (
        '../input/google-quest-challenge/train.csv',
        '../input/google-quest-challenge/test.csv',
        '../input/google-quest-challenge/sample_submission.csv',
    )
)

In [3]:
# Display the column names of the training frame.
train.columns

Index(['qa_id', 'question_title', 'question_body', 'question_user_name',
       'question_user_page', 'answer', 'answer_user_name', 'answer_user_page',
       'url', 'category', 'host', 'question_asker_intent_understanding',
       'question_body_critical', 'question_conversational',
       'question_expect_short_answer', 'question_fact_seeking',
       'question_has_commonly_accepted_answer',
       'question_interestingness_others', 'question_interestingness_self',
       'question_multi_intent', 'question_not_really_a_question',
       'question_opinion_seeking', 'question_type_choice',
       'question_type_compare', 'question_type_consequence',
       'question_type_definition', 'question_type_entity',
       'question_type_instructions', 'question_type_procedure',
       'question_type_reason_explanation', 'question_type_spelling',
       'question_well_written', 'answer_helpful',
       'answer_level_of_information', 'answer_plausible', 'answer_relevance',
       'answer_satisfa

In [4]:
def _get_segments_xlnet(tokens, max_seq_length):
    """Segments: 0 for the first sequence, 1 for the second"""
    if len(tokens)>max_seq_length:
        raise IndexError("Token length more than max seq length!")
    segments = []
    first_sep = True
    current_segment_id = 0
    for token in tokens:
        segments.append(current_segment_id)
        if token == "<sep>":
            if first_sep:
                first_sep = False 
            else:
                current_segment_id = 1
    return [0] * (max_seq_length - len(tokens)) + segments

def _get_segments_bert(tokens, max_seq_length):
    """Segments: 0 for the first sequence, 1 for the second"""
    if len(tokens)>max_seq_length:
        raise IndexError("Token length more than max seq length!")
    segments = []
    first_sep = True
    current_segment_id = 0
    for token in tokens:
        segments.append(current_segment_id)
        if token == "[SEP]":
            if first_sep:
                first_sep = False 
            else:
                current_segment_id = 1
    return segments + [0] * (max_seq_length - len(tokens))#padding

def _get_ids_xlnet(tokens, tokenizer, max_seq_length):
    """Token ids from Tokenizer vocab"""
    token_ids = tokenizer.convert_tokens_to_ids(tokens)
    input_ids =  [5] * (max_seq_length-len(token_ids)) + token_ids
    return input_ids

def _get_ids_bert(tokens, tokenizer, max_seq_length):
    """Token ids from Tokenizer vocab"""
    token_ids = tokenizer.convert_tokens_to_ids(tokens)
    input_ids = token_ids + [0] * (max_seq_length-len(token_ids))#padding
    return input_ids

def _trim_input(title, question, answer, max_sequence_length=512-1, 
                t_max_len=70-1, q_max_len=219, a_max_len=219):
    """Tokenize title/question/answer and trim them to a shared token budget.

    Uses the module-level ``tokenizer``. If the three token lists plus 4
    reserved slots already fit in ``max_sequence_length`` they are returned
    untouched. Otherwise the per-field budgets are redistributed: budget the
    title does not use is split between answer and question, and budget the
    answer (or else the question) does not use is handed to the other one.
    The title is cut from the right; question and answer keep their head and
    tail via ``norm_token_length``.

    Returns:
        Tuple ``(title_tokens, question_tokens, answer_tokens)``.

    Raises:
        ValueError: if the redistributed budgets do not add up to
            ``max_sequence_length``.
    """
    title_toks = tokenizer.tokenize(title)
    question_toks = tokenizer.tokenize(question)
    answer_toks = tokenizer.tokenize(answer)

    n_title = len(title_toks)
    n_question = len(question_toks)
    n_answer = len(answer_toks)

    # Nothing to do when everything (plus the 4 reserved slots) already fits.
    if n_title + n_question + n_answer + 4 <= max_sequence_length:
        return title_toks, question_toks, answer_toks

    # Step 1: a short title donates its spare budget to answer and question.
    if t_max_len > n_title:
        spare = t_max_len - n_title
        title_keep = n_title
        a_max_len = a_max_len + floor(spare / 2)
        q_max_len = q_max_len + ceil(spare / 2)
    else:
        title_keep = t_max_len

    # Step 2: whichever of answer/question is short donates to the other.
    if a_max_len > n_answer:
        answer_keep = n_answer
        question_keep = q_max_len + (a_max_len - n_answer)
    elif q_max_len > n_question:
        answer_keep = a_max_len + (q_max_len - n_question)
        question_keep = n_question
    else:
        answer_keep = a_max_len
        question_keep = q_max_len

    # Sanity check: the redistributed budgets must use the budget exactly.
    trimmed_total = title_keep + answer_keep + question_keep + 4
    if trimmed_total != max_sequence_length:
        raise ValueError("New sequence length should be %d, but is %d" 
                         % (max_sequence_length, trimmed_total))

    return (
        title_toks[:title_keep],
        norm_token_length(question_toks, question_keep),
        norm_token_length(answer_toks, answer_keep),
    )

def norm_token_length(tokens, l):
    """Shorten ``tokens`` to at most ``l`` items, keeping both head and tail.

    When ``tokens`` is longer than ``l``, the first ``l // 2`` items and the
    last ``l - l // 2`` items are kept and the middle is dropped. Shorter
    inputs are returned as a copy.

    Args:
        tokens: sequence of tokens (anything that supports slicing and ``+``).
        l: maximum number of tokens to keep.

    Returns:
        A new sequence with at most ``l`` tokens; empty when ``l <= 0``.
    """
    # Guard: with l == 0 the tail slice would be tokens[-0:], i.e. the whole
    # sequence, so a zero budget used to return everything instead of nothing.
    if l <= 0:
        return tokens[:0]
    if len(tokens) <= l:
        return tokens[:l]
    head = l // 2
    tail = l - head  # always >= 1 here, so tokens[-tail:] is a true suffix
    return tokens[:head] + tokens[-tail:]

def _convert_to_bert_inputs(title, question, answer, cate, pretrained_weights, max_sequence_length=512):
    """Convert tokenized title/question/answer into padded ids and segment ids.

    Uses the module-level ``tokenizer``.

    Args:
        title, question, answer: lists of tokens (already trimmed).
        cate: category marker inserted as a single token before the title.
        pretrained_weights: model name; any name containing "bert-base" uses
            the BERT layout, "xlnet-base-cased" uses the XLNet layout.
        max_sequence_length: padded output length.

    Returns:
        ``[input_ids, input_segments]``, two lists of length
        ``max_sequence_length``.

    Raises:
        ValueError: if ``pretrained_weights`` is not a supported model name.
        IndexError: if the assembled sequence exceeds ``max_sequence_length``.
    """
    if "bert-base" in pretrained_weights:
        # Layout: [CLS] cate title [SEP] question [SEP] answer [SEP], right-padded.
        stoken = ["[CLS]"] + [cate] + title + ["[SEP]"] + question + ["[SEP]"] + answer + ["[SEP]"]
        input_ids = _get_ids_bert(stoken, tokenizer, max_sequence_length)
        input_segments = _get_segments_bert(stoken, max_sequence_length)

    elif pretrained_weights == "xlnet-base-cased":
        # Layout: cate title <sep> question <sep> answer <sep> <cls>, left-padded,
        # so <cls> is always the last position.
        stoken = [cate] + title + ["<sep>"] + question + ["<sep>"] + answer + ["<sep>", "<cls>"]
        input_ids = _get_ids_xlnet(stoken, tokenizer, max_sequence_length)
        input_segments = _get_segments_xlnet(stoken, max_sequence_length)
        # The previous code searched the segment list for the value 5, which
        # can never occur there (segments are only 0/1), so it always fell
        # back to index -1. Mark the trailing <cls> position directly.
        input_segments[-1] = 2

    else:
        # Previously this fell through and crashed with UnboundLocalError.
        raise ValueError("Unsupported pretrained_weights: %r" % (pretrained_weights,))

    return [input_ids, input_segments]

def convert_row(row, pretrained_weights):
    """Encode one data row into model inputs.

    Args:
        row: mapping/Series with "category", "question_title",
            "question_body" and "answer" string fields.
        pretrained_weights: one of "bert-base-uncased", "bert-base-cased",
            "xlnet-base-cased".

    Returns:
        ``np.ndarray`` built from ``[[ids, segments]]``; with the default
        sequence length this has shape (1, 2, 512).

    Raises:
        ValueError: if ``pretrained_weights`` is not one of the supported names.
    """
    # The category is wrapped in brackets and used as a single marker token;
    # it is lower-cased only for the uncased model.
    if pretrained_weights == "bert-base-uncased":
        c = f"[{row['category'].lower()}]"
    elif pretrained_weights in ("bert-base-cased", "xlnet-base-cased"):
        c = f"[{row['category']}]"
    else:
        # Previously `c` was left unbound and the call crashed further down.
        raise ValueError(f"Unsupported pretrained_weights: {pretrained_weights!r}")

    t, q, a = row["question_title"], row["question_body"], row["answer"]
    t, q, a = _trim_input(t, q, a)
    ids, segments = _convert_to_bert_inputs(t, q, a, c, pretrained_weights)

    return np.array([[ids, segments]])

In [5]:
# Load the tokenizer from a local directory. The helper functions defined in
# the previous cell (_trim_input, _convert_to_bert_inputs) read this
# module-level `tokenizer`, so it must exist before they are called.
#model="bert-base-cased"
tokenizer = AutoTokenizer.from_pretrained("./token_model_config/tokenizer/")
# Disabled experiments: adding extra tokens at load time.
#tokenizer.add_tokens("./bert_based_tokenizer/added_tokens.json")
#tokenizer.add_tokens(categories)


In [6]:
# Tokenizer length; the same value is later passed to
# model.bert.resize_token_embeddings.
len(tokenizer)

29001

In [7]:
def custom_loss(data, targets):
    """Sum of a weighted squared error and a binary cross-entropy.

    Args:
        data: raw model outputs; only the first 30 columns are used.
        targets: first 30 columns are the labels, the remaining columns are
            per-element weights applied to the squared-error term.

    Returns:
        Scalar tensor: ``sum(weights * (sigmoid(logits) - labels)**2)`` plus
        ``sum(BCE-with-logits(logits, labels))``.
    """
    logits = data[:, :30]
    labels = targets[:, :30]
    weights = targets[:, 30:]

    squared_err = F.mse_loss(logits.sigmoid(), labels, reduction="none")
    cross_entropy = F.binary_cross_entropy_with_logits(logits, labels, reduction='none')

    weighted_mse_total = (squared_err * weights).sum()
    return weighted_mse_total + cross_entropy.sum()

class CustomBert(nn.Module):
    """Transformer encoder with a 30-output head on concatenated [CLS] states.

    The first-position vectors of the last ``n_use_layer`` (4) hidden layers
    are concatenated and passed through two linear layers, dropout and a final
    linear classifier.
    """

    def __init__(self, config_path=None):
        """Build the encoder from a config file and create the head layers.

        Args:
            config_path: path handed to ``AutoConfig.from_pretrained``.
        """
        super().__init__()
        cfg = AutoConfig.from_pretrained(config_path)
        cfg.num_labels = 30
        # Hidden states of every layer are needed by forward().
        cfg.output_hidden_states = True
        self.config = cfg

        self.n_use_layer = 4
        self.n_labels = cfg.num_labels

        # Encoder is built from the config only (no weights are loaded here).
        self.bert = AutoModel.from_config(cfg)

        # Width of the concatenated first-position vectors.
        concat_width = cfg.hidden_size * self.n_use_layer
        self.dense1 = nn.Linear(concat_width, concat_width)
        self.dense2 = nn.Linear(concat_width, concat_width)
        self.dropout = nn.Dropout(cfg.hidden_dropout_prob)
        self.classifier = nn.Linear(concat_width, cfg.num_labels)

    def forward(self, input_ids=None, attention_mask=None, token_type_ids=None,
                position_ids=None, head_mask=None, inputs_embeds=None, labels=None):
        """Run the encoder and the head.

        Only ``input_ids``, ``attention_mask`` and ``token_type_ids`` are
        forwarded to the encoder; the other arguments are accepted but unused.

        Returns:
            Tuple whose first item is the logits tensor, followed by the
            encoder outputs from index 2 onwards.
        """
        encoder_out = self.bert(
            input_ids,
            attention_mask=attention_mask,
            token_type_ids=token_type_ids,
        )

        # Index 2 is taken to be the per-layer hidden states (presumably
        # (last_hidden_state, pooler_output, hidden_states) — confirm for the
        # installed transformers version).
        hidden_states = encoder_out[2]
        first_token_per_layer = [
            hidden_states[-depth][:, 0]
            for depth in range(1, self.n_use_layer + 1)
        ]
        features = torch.cat(first_token_per_layer, dim=1)

        features = self.dense2(self.dense1(features))
        features = self.dropout(features)
        logits = self.classifier(features)

        return (logits,) + encoder_out[2:]

In [8]:
# Build the network from the local config file. CustomBert creates its encoder
# with AutoModel.from_config; the per-fold checkpoints are loaded later in the
# inference loop.
model=CustomBert("token_model_config/config/config.json")

In [9]:
# Batch size used by the test DataLoader.
BS=8

In [10]:
pretrained_weights = 'bert-base-cased'

# Encode every test row; each call to convert_row yields one array holding the
# padded token ids and the segment ids for that row.
test_text_columns = test[["question_title", "question_body", "answer", "category"]]
encoded_rows = test_text_columns.apply(convert_row, axis=1, args=(pretrained_weights,)).values

# Stack the per-row arrays and flatten each row to 1024 values:
# columns 0-511 are token ids, columns 512-1023 are segment ids.
X_test = np.vstack(encoded_rows).reshape((len(encoded_rows), 1024))

# Keep the original row order (shuffle=False) so predictions line up with `sub`.
test_dataset = torch.utils.data.TensorDataset(torch.tensor(X_test, dtype=torch.long))
test_loader = torch.utils.data.DataLoader(test_dataset, batch_size=BS, shuffle=False)

Token indices sequence length is longer than the specified maximum sequence length for this model (4324 > 512). Running this sequence through the model will result in indexing errors


In [11]:
# Re-check the tokenizer length after encoding the test set (same expression
# as the earlier cell).
len(tokenizer)

29001

In [12]:
# One row per test example, 1024 columns (512 token ids followed by 512
# segment ids, per the reshape in the previous cell).
X_test.shape

(476, 1024)

In [13]:
model = model.to(device)
# Resize the input embeddings to the tokenizer length BEFORE freezing:
# resizing may replace the embedding matrix with a new parameter, and doing it
# after the freeze (as before) could leave that new parameter trainable.
model.bert.resize_token_embeddings(len(tokenizer))
# Inference only: freeze every parameter so no gradients are tracked.
for param in model.parameters():
    param.requires_grad=False
# Switch to eval mode (disables dropout); kept last so the cell still displays
# the module summary.
model.eval()

CustomBert(
  (bert): BertModel(
    (embeddings): BertEmbeddings(
      (word_embeddings): Embedding(29001, 768)
      (position_embeddings): Embedding(512, 768)
      (token_type_embeddings): Embedding(2, 768)
      (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
      (dropout): Dropout(p=0.1, inplace=False)
    )
    (encoder): BertEncoder(
      (layer): ModuleList(
        (0): BertLayer(
          (attention): BertAttention(
            (self): BertSelfAttention(
              (query): Linear(in_features=768, out_features=768, bias=True)
              (key): Linear(in_features=768, out_features=768, bias=True)
              (value): Linear(in_features=768, out_features=768, bias=True)
              (dropout): Dropout(p=0.1, inplace=False)
            )
            (output): BertSelfOutput(
              (dense): Linear(in_features=768, out_features=768, bias=True)
              (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
              (d

In [None]:
# Directory holding the 10 fold checkpoints.
# NOTE(review): this is an absolute path on one machine — point it at your own
# checkpoint directory before running elsewhere.
model_dir_target = "C:/Users/Lab000/Desktop/kaggle/kaggle_competetion/Google_Quest_LABEL/26th-solution/KaggleQuest-master/work/NoInit/BertBaseCased"

# One (n_test, 30) array of raw logits per fold.
cased_pred_lst = []
for fold in range(10):
    solution_26th_path=f"{model_dir_target}/bert-base-cased_f{fold}_best"
    # map_location lets a checkpoint saved on GPU load on whatever `device` is.
    # NOTE(review): strict=False silently ignores missing/unexpected keys, so a
    # mismatched checkpoint would run with untrained weights — consider
    # inspecting the value returned by load_state_dict.
    # torch.load unpickles the file; only load checkpoints you trust.
    model.load_state_dict(torch.load(solution_26th_path, map_location=device),strict=False)

    lst = []
    # no_grad: inference only, so skip building the autograd graph.
    with torch.no_grad():
        for (x_batch,) in test_loader:
            input_ids = x_batch[:, :512]   # first half of each row: token ids
            token_ids = x_batch[:, 512:]   # second half: segment (token type) ids
            # Attention mask treats id 0 as padding.
            pred = model(input_ids.to(device), attention_mask=(input_ids > 0).to(device), token_type_ids=token_ids.to(device))

            # Keep the batch dimension (no squeeze) so every chunk is 2-D.
            lst.append(pred[0].cpu().numpy())
    test_pred = np.vstack(lst)#shape:(476, 30)

    cased_pred_lst.append(test_pred)

In [15]:
# Shape of the first fold's prediction array.
cased_pred_lst[0].shape

(476, 30)

In [16]:
# Mean of the raw model outputs across folds (axis 0 indexes the fold);
# the sigmoid is applied in a later cell.
np.array(cased_pred_lst).mean(0)

array([[ 2.9296746 ,  0.69095033, -1.6985661 , ..., -3.3332076 ,
         2.0756507 ,  2.4260817 ],
       [ 1.6843027 ,  0.07697463, -6.058629  , ..., -1.9949615 ,
        -2.7619004 ,  1.7309611 ],
       [ 2.5567524 ,  1.1753161 , -4.4454217 , ..., -2.906634  ,
         2.1950786 ,  2.5745742 ],
       ...,
       [ 1.7081602 , -0.39062575, -3.994512  , ..., -1.6929474 ,
        -0.02041219,  2.5661967 ],
       [ 2.6966932 ,  1.3558286 , -3.8374982 , ..., -2.8559844 ,
         1.9818013 ,  2.89408   ],
       [ 2.4720356 ,  0.6669997 , -5.045515  , ..., -2.0027814 ,
        -1.6465935 ,  2.1118333 ]], dtype=float32)

In [17]:
def sigmoid(x):
    """Element-wise logistic function ``1 / (1 + exp(-x))`` for scalars or arrays."""
    denominator = 1.0 + np.exp(-x)
    return 1.0 / denominator
# Average the raw outputs over the folds first, then squash the averaged
# values to (0, 1) with the sigmoid.
fold_mean_logits = np.array(cased_pred_lst).mean(axis=0)
cased_pred = sigmoid(fold_mean_logits)
cased_pred

array([[0.949294  , 0.66617835, 0.15465264, ..., 0.03444938, 0.888514  ,
        0.91879463],
       [0.84347343, 0.5192342 , 0.00233215, ..., 0.11973295, 0.05941806,
        0.8495353 ],
       [0.9280258 , 0.76410455, 0.01159611, ..., 0.05182659, 0.8998067 ,
        0.92920715],
       ...,
       [0.8465975 , 0.4035667 , 0.0180834 , ..., 0.15538862, 0.49489716,
        0.92865413],
       [0.93683124, 0.79508096, 0.02109294, ..., 0.05437279, 0.87887305,
        0.947553  ],
       [0.92215806, 0.66083103, 0.00639696, ..., 0.11891119, 0.16156988,
        0.892048  ]], dtype=float32)

In [18]:
# Every column after the first one in the sample submission is a target
# column; overwrite them with the averaged predictions.
target_columns = sub.columns[1:]
sub[target_columns] = cased_pred

# Write the submission file and preview its first rows.
sub.to_csv("submission.csv", index=False)
sub.head()

Unnamed: 0,qa_id,question_asker_intent_understanding,question_body_critical,question_conversational,question_expect_short_answer,question_fact_seeking,question_has_commonly_accepted_answer,question_interestingness_others,question_interestingness_self,question_multi_intent,...,question_well_written,answer_helpful,answer_level_of_information,answer_plausible,answer_relevance,answer_satisfaction,answer_type_instructions,answer_type_procedure,answer_type_reason_explanation,answer_well_written
0,39,0.949294,0.666178,0.154653,0.553623,0.6045,0.630073,0.679814,0.669111,0.546083,...,0.913388,0.900747,0.517548,0.960436,0.964714,0.811419,0.042888,0.034449,0.888514,0.918795
1,46,0.843473,0.519234,0.002332,0.760515,0.813664,0.948303,0.564785,0.500868,0.103378,...,0.652297,0.947492,0.625975,0.970675,0.981972,0.887246,0.933648,0.119733,0.059418,0.849535
2,70,0.928026,0.764105,0.011596,0.825836,0.892597,0.962454,0.628095,0.482132,0.100511,...,0.906555,0.9106,0.598329,0.96139,0.946694,0.83507,0.065642,0.051827,0.899807,0.929207
3,132,0.916074,0.458491,0.002027,0.729436,0.770467,0.930743,0.594157,0.463071,0.130447,...,0.726541,0.958902,0.683613,0.975063,0.986992,0.908914,0.881619,0.154653,0.62713,0.902189
4,200,0.939911,0.481985,0.040984,0.819952,0.672361,0.84277,0.628446,0.630961,0.253318,...,0.695076,0.918483,0.667006,0.963846,0.962784,0.830125,0.230669,0.139842,0.706315,0.882781
