In [1]:
import numpy as np
import pandas as pd

from tqdm import tqdm, tqdm_notebook

from scipy import stats
from sklearn.model_selection import GroupKFold

import torch
import torch.nn as nn
from torch.nn import functional as F
import torch.utils.data
#from transformers import *
import transformers
from transformers import AutoTokenizer, AutoModel, AutoConfig
import os
import re
import math
import random
from matplotlib import pyplot as plt
import warnings
from math import floor, ceil

# Silence library warnings so they do not flood the notebook output.
warnings.filterwarnings('ignore')
# Prefer the GPU, but fall back to the CPU so the notebook still runs
# (slowly) on a machine without CUDA instead of failing at the first .to(device).
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
# Ask cuDNN to benchmark its convolution algorithms and reuse the fastest one.
torch.backends.cudnn.benchmark=True

%matplotlib inline

In [2]:
# Read the three competition tables, then replace every missing cell with a
# single space (presumably so the tokenizer never receives NaN -- confirm).
train = pd.read_csv('/kaggle/input/google-quest-challenge/train.csv')
train = train.fillna(' ')
test = pd.read_csv('/kaggle/input/google-quest-challenge/test.csv')
test = test.fillna(' ')
sub = pd.read_csv('/kaggle/input/google-quest-challenge/sample_submission.csv')
sub = sub.fillna(' ')

In [3]:
# List the columns available in the training frame.
train.columns

Index(['qa_id', 'question_title', 'question_body', 'question_user_name',
       'question_user_page', 'answer', 'answer_user_name', 'answer_user_page',
       'url', 'category', 'host', 'question_asker_intent_understanding',
       'question_body_critical', 'question_conversational',
       'question_expect_short_answer', 'question_fact_seeking',
       'question_has_commonly_accepted_answer',
       'question_interestingness_others', 'question_interestingness_self',
       'question_multi_intent', 'question_not_really_a_question',
       'question_opinion_seeking', 'question_type_choice',
       'question_type_compare', 'question_type_consequence',
       'question_type_definition', 'question_type_entity',
       'question_type_instructions', 'question_type_procedure',
       'question_type_reason_explanation', 'question_type_spelling',
       'question_well_written', 'answer_helpful',
       'answer_level_of_information', 'answer_plausible', 'answer_relevance',
       'answer_satisfa

In [4]:
def _get_segments(tokens, max_seq_length):
    """Segments: 0 for the first sequence, 1 for the second"""
    if len(tokens)>max_seq_length:
        raise IndexError("Token length more than max seq length!")
    segments = []
    first_sep = True
    current_segment_id = 0
    for token in tokens:
        segments.append(current_segment_id)
        if token == "[SEP]":
            if first_sep:
                first_sep = False
                current_segment_id = 1#新增 
            else:
                current_segment_id = 1
    return segments + [0] * (max_seq_length - len(tokens))

def _get_ids(tokens, tokenizer, max_seq_length):
    """Token ids from Tokenizer vocab"""
    token_ids = tokenizer.convert_tokens_to_ids(tokens)
    input_ids = token_ids + [0] * (max_seq_length-len(token_ids))
    return input_ids

def _trim_input(title, question, answer, max_sequence_length=512-1, 
                t_max_len=70-1, q_max_len=219, a_max_len=219):
    """Tokenize title/question/answer and trim them to a shared token budget.

    Uses the module-level ``tokenizer``. If the three token lists plus 4
    reserved positions already fit in ``max_sequence_length`` they are returned
    untouched. Otherwise:

    * the title is cut to ``t_max_len``; any title budget it does not use is
      split between the answer (floor half) and the question (ceil half);
    * whichever of answer/question is shorter than its budget donates the
      remainder to the other one;
    * question and answer are shortened with ``norm_token_length`` (keeps the
      head and the tail), the title is simply cut at the end.

    Returns:
        (title_tokens, question_tokens, answer_tokens)

    Raises:
        ValueError: if the computed budgets do not add up to
            ``max_sequence_length``.
    """
    t = tokenizer.tokenize(title)
    q = tokenizer.tokenize(question)
    a = tokenizer.tokenize(answer)
    t_len, q_len, a_len = len(t), len(q), len(a)

    # Everything already fits: nothing to trim.
    if t_len + q_len + a_len + 4 <= max_sequence_length:
        return t, q, a

    # Title: keep it whole when short and hand its spare budget to answer/question.
    unused_title = t_max_len - t_len
    if unused_title > 0:
        t_new_len = t_len
        a_max_len = a_max_len + floor(unused_title / 2)
        q_max_len = q_max_len + ceil(unused_title / 2)
    else:
        t_new_len = t_max_len

    # Answer/question: a short answer donates to the question, else a short
    # question donates to the answer, else both are capped at their budgets.
    if a_len < a_max_len:
        a_new_len, q_new_len = a_len, q_max_len + (a_max_len - a_len)
    elif q_len < q_max_len:
        a_new_len, q_new_len = a_max_len + (q_max_len - q_len), q_len
    else:
        a_new_len, q_new_len = a_max_len, q_max_len

    # The budgets must account for exactly the whole sequence.
    total = t_new_len + a_new_len + q_new_len + 4
    if total != max_sequence_length:
        raise ValueError("New sequence length should be %d, but is %d" 
                         % (max_sequence_length, total))

    return t[:t_new_len], norm_token_length(q, q_new_len), norm_token_length(a, a_new_len)

def norm_token_length(tokens, l):
    """Shorten ``tokens`` to at most ``l`` items, keeping both ends.

    When ``tokens`` is longer than ``l`` the first ``l // 2`` items and the
    last ``l - l // 2`` items are kept and the middle is dropped; otherwise a
    copy of ``tokens`` is returned unchanged.

    Args:
        tokens: sequence of tokens (any sliceable sequence that supports ``+``).
        l: maximum number of items to keep; values <= 0 yield an empty result.

    Returns:
        A new sequence of length ``min(len(tokens), max(l, 0))``.
    """
    if l <= 0:
        # Without this guard the tail slice below becomes tokens[-0:], which is
        # the WHOLE sequence, so a zero budget would truncate nothing.
        return tokens[:0]
    if len(tokens) <= l:
        return tokens[:l]
    head = l // 2
    tail = l - head  # >= 1 here because l >= 1
    return tokens[:head] + tokens[-tail:]

def _convert_to_bert_inputs(title, question, answer, cate, max_sequence_length=512):
    """Turn already-tokenized text into ids and segment ids for two sequences.

    Two token sequences are built, both starting with
    ``[CLS] <cate> <title tokens> [SEP]``:

    * sequence 1 continues with the question tokens and a closing ``[SEP]``;
    * sequence 2 continues with the answer tokens and a closing ``[SEP]``.

    Each sequence is converted with the module-level ``tokenizer`` and padded
    to ``max_sequence_length``.

    Returns:
        (input_ids, input_segments, input_ids_2, input_segments_2)
    """
    shared_prefix = ["[CLS]", cate] + title + ["[SEP]"]
    question_tokens = shared_prefix + question + ["[SEP]"]
    answer_tokens = shared_prefix + answer + ["[SEP]"]

    input_ids, input_ids_2 = (
        _get_ids(seq, tokenizer, max_sequence_length)
        for seq in (question_tokens, answer_tokens)
    )
    input_segments, input_segments_2 = (
        _get_segments(seq, max_sequence_length)
        for seq in (question_tokens, answer_tokens)
    )
    return input_ids, input_segments, input_ids_2, input_segments_2

def convert_row(row,pretrained_weights):
    """Convert one data row into the stacked model inputs.

    Args:
        row: mapping/Series with the keys "category", "question_title",
            "question_body" and "answer".
        pretrained_weights: name of the checkpoint family; it decides how the
            category token is cased. Supported: "bert-base-uncased"
            (lower-cased category), "bert-base-cased" and "xlnet-base-cased"
            (category kept as is).

    Returns:
        ``np.ndarray`` holding ``[[ids, segments, ids2, segments2]]`` -- the
        title+question inputs followed by the title+answer inputs.

    Raises:
        ValueError: if ``pretrained_weights`` is not one of the supported
            names (previously this surfaced late as an UnboundLocalError).
    """
    # The category is injected as a single bracketed pseudo-token, e.g. "[CULTURE]".
    if pretrained_weights == "bert-base-uncased":
        c = f"[{row['category'].lower()}]"
    elif pretrained_weights in ("bert-base-cased", "xlnet-base-cased"):
        c = f"[{row['category']}]"
    else:
        raise ValueError(f"Unsupported pretrained_weights: {pretrained_weights!r}")

    t, q, a = _trim_input(row["question_title"], row["question_body"], row["answer"])
    ids, segments, ids2, segments2 = _convert_to_bert_inputs(t, q, a, c)
    return np.array([[ids, segments, ids2, segments2]])

In [5]:
# Load the tokenizer that was saved alongside the fine-tuned models.
# NOTE(review): presumably this saved tokenizer already contains the extra
# category tokens, which is why no add_tokens() call is needed here -- confirm.
tokenizer = AutoTokenizer.from_pretrained("/kaggle/input/token-model-config/token_model_config/tokenizer/")


In [6]:
# Tokenizer size; the same value is later passed to resize_token_embeddings()
# for both encoders.
len(tokenizer)

29001

In [7]:
def custom_loss(data,data2, targets,targets2):
    """Summed MSE + BCE loss over two (logits, targets) pairs.

    For each pair the loss adds
      * the squared error between ``sigmoid(logits)`` and the targets, and
      * the binary cross-entropy computed directly on the logits,
    both summed over every element. The four partial sums are then added.

    Args:
        data, targets: logits and targets of the first head.
        data2, targets2: logits and targets of the second head.

    Returns:
        0-dim tensor with the total loss.
    """
    partial_sums = []
    for logits, labels in ((data, targets), (data2, targets2)):
        squared_error = F.mse_loss(logits.sigmoid(), labels, reduction="none")
        cross_entropy = F.binary_cross_entropy_with_logits(logits, labels, reduction="none")
        partial_sums.append(squared_error.sum())
        partial_sums.append(cross_entropy.sum())

    # Same left-to-right order as mse + bce + mse2 + bce2.
    loss = partial_sums[0] + partial_sums[1] + partial_sums[2] + partial_sums[3]
    return loss

class CustomBert(nn.Module):
    """Two separate encoders feeding one shared projection stack and two heads.

    ``bert`` receives the first input sequence and drives ``classifier``
    (``Q_labels`` = 21 outputs); ``bert2`` receives the second sequence and
    drives ``classifier2`` (``A_labels`` = 9 outputs). Both branches go through
    the SAME ``dense1`` -> ``dense2`` -> ``dropout`` layers.

    The feature vector of each branch is the concatenation of the first-token
    ([CLS]) hidden state taken from the last ``n_use_layer`` layers.
    """

    def __init__(self, config_path=None):
        """Build both encoders (randomly initialised from config) and the heads.

        Args:
            config_path: path or name accepted by ``AutoConfig.from_pretrained``.
        """
        super().__init__()

        cfg = AutoConfig.from_pretrained(config_path)
        cfg.Q_labels = 21
        cfg.A_labels = 9
        # Hidden states of every layer are needed to pool the last few of them.
        cfg.output_hidden_states = True
        self.config = cfg

        self.n_use_layer = 4   # number of final layers whose [CLS] vectors are pooled
        self.double_bert = 1   # width multiplier of the head inputs
        self.n_labels = cfg.num_labels

        # Only the architecture comes from the config; weights are expected to
        # be filled in later through load_state_dict().
        self.bert = AutoModel.from_config(cfg)
        self.bert2 = AutoModel.from_config(cfg)

        feature_dim = self.double_bert * cfg.hidden_size * self.n_use_layer
        self.dense1 = nn.Linear(feature_dim, feature_dim)
        self.dense2 = nn.Linear(feature_dim, feature_dim)
        self.dropout = nn.Dropout(cfg.hidden_dropout_prob)
        self.classifier = nn.Linear(feature_dim, cfg.Q_labels)
        self.classifier2 = nn.Linear(feature_dim, cfg.A_labels)

    def _pool_cls(self, hidden_states):
        """Concatenate the first-token vectors of the last ``n_use_layer`` layers.

        Per the original author's note: with 4 layers, four (8, 768) tensors
        become one (8, 3072) tensor (batch size 8, hidden size 768).
        """
        cls_vectors = [hidden_states[-(k + 1)][:, 0] for k in range(self.n_use_layer)]
        return torch.cat(cls_vectors, dim=1)

    def _project(self, features):
        """Apply the projection stack shared by both branches."""
        return self.dropout(self.dense2(self.dense1(features)))

    def forward(self, input_ids=None, attention_mask=None, token_type_ids=None
                ,input_ids2=None, attention_mask2=None, token_type_ids2=None,position_ids=None, head_mask=None, inputs_embeds=None, labels=None):
        """Run both encoders and return one output tuple per branch.

        ``position_ids``, ``head_mask``, ``inputs_embeds`` and ``labels`` are
        accepted but not used.

        Returns:
            ``(outputs, outputs2)`` where each is ``(logits,)`` followed by the
            encoder outputs from index 2 onward (index 2 holds the hidden
            states that are pooled here).
        """
        outputs = self.bert(input_ids,
                            attention_mask=attention_mask,
                            token_type_ids=token_type_ids)
        outputs2 = self.bert2(input_ids2,
                              attention_mask=attention_mask2,
                              token_type_ids=token_type_ids2)

        pooled_output = self._pool_cls(outputs[2])
        pooled_output2 = self._pool_cls(outputs2[2])

        logits = self.classifier(self._project(pooled_output))
        logits2 = self.classifier2(self._project(pooled_output2))

        return (logits,) + outputs[2:], (logits2,) + outputs2[2:]

In [8]:
# Build the network from the saved config only; its weights are random until a
# fold checkpoint is loaded with load_state_dict() in the inference loop.
model=CustomBert("/kaggle/input/token-model-config/token_model_config/config/config.json")


In [9]:
# Batch size used by the test DataLoader.
BS=8

In [10]:
# Checkpoint family; convert_row() uses it to decide the casing of the category token.
pretrained_weights = 'bert-base-cased'

# One object per test row: an array holding [ids, segments, ids2, segments2].
X_test = test[["question_title", "question_body", "answer", "category"]].apply(
    convert_row, axis=1, args=(pretrained_weights,)
).values

# Stack the rows and flatten the four 512-long vectors of each row into a
# single 2048-long row: ids | segments | ids2 | segments2.
X_test = np.vstack(X_test).reshape(len(X_test), 4 * 512)

# Deterministic (unshuffled) loader so predictions stay aligned with the test rows.
test_dataset = torch.utils.data.TensorDataset(torch.tensor(X_test, dtype=torch.long))
test_loader = torch.utils.data.DataLoader(test_dataset, batch_size=BS, shuffle=False)

Token indices sequence length is longer than the specified maximum sequence length for this model (4324 > 512). Running this sequence through the model will result in indexing errors


In [11]:
# Move the network to the selected device.
model = model.to(device)
# Inference only: switch off gradient tracking for every existing parameter.
for param in model.parameters():
    param.requires_grad=False
# Resize both encoders' token embeddings to the tokenizer size so their shape
# matches the tokenizer used to build the inputs.
model.bert.resize_token_embeddings(len(tokenizer))
model.bert2.resize_token_embeddings(len(tokenizer))
# Evaluation mode (disables dropout); as the last expression it also prints the module tree.
model.eval()

CustomBert(
  (bert): BertModel(
    (embeddings): BertEmbeddings(
      (word_embeddings): Embedding(29001, 768)
      (position_embeddings): Embedding(512, 768)
      (token_type_embeddings): Embedding(2, 768)
      (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
      (dropout): Dropout(p=0.1, inplace=False)
    )
    (encoder): BertEncoder(
      (layer): ModuleList(
        (0): BertLayer(
          (attention): BertAttention(
            (self): BertSelfAttention(
              (query): Linear(in_features=768, out_features=768, bias=True)
              (key): Linear(in_features=768, out_features=768, bias=True)
              (value): Linear(in_features=768, out_features=768, bias=True)
              (dropout): Dropout(p=0.1, inplace=False)
            )
            (output): BertSelfOutput(
              (dense): Linear(in_features=768, out_features=768, bias=True)
              (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
              (d

In [12]:
# Run every fold checkpoint over the test set and keep one logit matrix per
# fold: columns are the first head's logits followed by the second head's.
cased_pred_lst = []
for fold in range(10):
    bert_path=f"/kaggle/input/non-shared-weights-full-fold/double-bert-based-case_f{fold}_best"
    # map_location keeps loading working when the checkpoint was saved on a
    # different device than the one in use now.
    # NOTE(review): strict=False silently ignores missing/unexpected keys, so a
    # mismatched checkpoint would leave randomly initialised weights in place
    # without any error -- confirm the checkpoints match this architecture.
    model.load_state_dict(torch.load(bert_path, map_location=device),strict=False)

    lst = []
    # No autograd bookkeeping is needed for pure inference.
    with torch.no_grad():
        for (x_batch,) in test_loader:
            x_batch = x_batch.to(device)
            # Each row is ids | segments | ids2 | segments2, 512 values apiece.
            input_ids, token_ids, input_ids2, token_ids2 = x_batch.split(512, dim=1)
            # Padding positions hold id 0, so "id > 0" doubles as the attention mask.
            pred, pred2 = model(input_ids, attention_mask=(input_ids > 0), token_type_ids=token_ids,
                                input_ids2=input_ids2, attention_mask2=(input_ids2 > 0), token_type_ids2=token_ids2)
            total_y_pred=torch.cat((pred[0],pred2[0]),dim=1)
            # Keep the batch dimension (no squeeze) so a final batch of size 1
            # is still a proper 2-D row for vstack.
            lst.append(total_y_pred.cpu().numpy())
    train_pred = np.vstack(lst)  # test-set logits for this fold, e.g. (476, 30)

    cased_pred_lst.append(train_pred)

In [13]:
# Sanity check: shape of the first fold's prediction matrix.
cased_pred_lst[0].shape

(476, 30)

In [14]:
# Inspect the raw per-fold logits.
# NOTE(review): this displays the whole list of arrays; a shape or a small
# slice would keep the notebook output short.
cased_pred_lst

[array([[ 2.9148734 ,  0.587064  , -1.0286921 , ..., -3.941702  ,
          3.1086798 ,  2.7866452 ],
        [ 1.9138436 , -0.14367293, -5.5822263 , ..., -2.0265658 ,
         -2.8845265 ,  2.1542585 ],
        [ 2.260144  ,  0.79723394, -4.245135  , ..., -3.3293252 ,
          2.716329  ,  2.222594  ],
        ...,
        [ 1.6526252 , -0.7348393 , -4.6569324 , ..., -1.4699891 ,
         -0.48530442,  1.9625344 ],
        [ 3.1886616 ,  1.65253   , -4.2482276 , ..., -2.487862  ,
          3.2956548 ,  2.9502597 ],
        [ 2.3928847 ,  0.23803632, -3.6111562 , ..., -1.6057041 ,
         -1.436203  ,  2.1055822 ]], dtype=float32),
 array([[ 2.5103562 ,  0.4560998 , -2.0337412 , ..., -3.6385992 ,
          2.5943546 ,  2.7311163 ],
        [ 1.9764315 , -0.17333023, -6.0549254 , ..., -1.9884542 ,
         -2.7503848 ,  2.3761358 ],
        [ 2.3559234 ,  0.88232833, -4.627334  , ..., -3.3722978 ,
          2.693163  ,  2.3779893 ],
        ...,
        [ 1.5696975 , -0.6460703 , -4.4

In [15]:
# Scratch check of the ensembling step: averaging a list of equally shaped
# arrays over axis 0 keeps the (2, 3) shape; with a single array in the list
# the result equals that array (as floats).
a = np.array([[1, 2, 3], [4, 5, 6]])  # shape (2, 3)
c = [a]
np.array(c).mean(0)

array([[1., 2., 3.],
       [4., 5., 6.]])

In [16]:
# Preview the fold-averaged logits: stack the per-fold matrices and average over the fold axis.
np.array(cased_pred_lst).mean(0)

array([[ 2.7984571 ,  0.44034237, -1.4480911 , ..., -3.8564487 ,
         2.7417254 ,  2.5384173 ],
       [ 1.8120759 , -0.12645765, -5.487586  , ..., -2.008405  ,
        -2.9547608 ,  2.2427335 ],
       [ 2.4518032 ,  0.80789244, -4.3384485 , ..., -3.4247983 ,
         2.8845043 ,  2.434679  ],
       ...,
       [ 1.6572117 , -0.67442834, -4.2349787 , ..., -1.5938175 ,
        -0.39036936,  2.1185055 ],
       [ 2.775621  ,  1.6504242 , -3.6823158 , ..., -2.5352693 ,
         3.1531842 ,  2.788747  ],
       [ 2.288085  ,  0.04518995, -3.98935   , ..., -1.9433639 ,
        -1.1310542 ,  2.1625025 ]], dtype=float32)

In [17]:
def sigmoid(x):
    """Element-wise logistic function ``1 / (1 + exp(-x))`` for scalars or arrays."""
    denominator = 1.0 + np.exp(-x)
    return 1.0 / denominator
# Ensemble in logit space: average the fold predictions first, then squash the
# mean logits to probabilities.
cased_pred = sigmoid(np.array(cased_pred_lst).mean(0))
cased_pred

array([[0.94259244, 0.6083406 , 0.1902955 , ..., 0.02070519, 0.9394443 ,
        0.92679155],
       [0.8596125 , 0.46842766, 0.00412077, ..., 0.11832326, 0.04951198,
        0.90402186],
       [0.9206932 , 0.6916602 , 0.01288849, ..., 0.03152938, 0.9470751 ,
        0.91943383],
       ...,
       [0.83986336, 0.33750597, 0.01427344, ..., 0.16884747, 0.40362844,
        0.8926889 ],
       [0.9413441 , 0.83894837, 0.02454691, ..., 0.07342236, 0.95903397,
        0.94206476],
       [0.90788543, 0.5112956 , 0.01817529, ..., 0.12527877, 0.2439666 ,
        0.89683133]], dtype=float32)

#19th PostProcessing

In [18]:
##### 21st postprocessing ############
def postProcessing(x, upper=0.9241, lower=0.0759):
    """Snap near-extreme values to exactly 1.0 / 0.0.

    Args:
        x: NumPy array of values (here: predicted probabilities).
        upper: values >= ``upper`` become 1.0. Defaults to the original
            hard-coded threshold.
        lower: values <= ``lower`` become 0.0. Defaults to the original
            hard-coded threshold.

    Returns:
        New array of the same shape; values strictly between the two
        thresholds are left untouched. The input is not modified.
    """
    x = np.where(x >= upper, 1.0, x)
    x = np.where(x <= lower, 0.0, x)

    return x

# Submission columns whose predictions get snapped to 0/1 by postProcessing();
# all other columns keep their raw probabilities.
targets = ['question_conversational',
           'question_type_compare', 
           'question_type_consequence', 
           'question_type_definition', 
           'question_type_entity', 
           'question_type_choice']

In [19]:
# Overwrite every column after the first with the ensembled probabilities.
# The assignment is positional, so it relies on cased_pred's column order
# matching the submission's column order.
sub[sub.columns[1:]] = cased_pred

In [20]:
# Snap near-0 / near-1 predictions to exact 0 / 1, but only for the columns listed in `targets`.
sub.loc[:, targets] = postProcessing(sub.loc[:, targets].values)

In [21]:
# Write the final submission file (without the DataFrame index) and preview its first rows.
sub.to_csv("submission.csv", index=False)
sub.head()

Unnamed: 0,qa_id,question_asker_intent_understanding,question_body_critical,question_conversational,question_expect_short_answer,question_fact_seeking,question_has_commonly_accepted_answer,question_interestingness_others,question_interestingness_self,question_multi_intent,...,question_well_written,answer_helpful,answer_level_of_information,answer_plausible,answer_relevance,answer_satisfaction,answer_type_instructions,answer_type_procedure,answer_type_reason_explanation,answer_well_written
0,39,0.942592,0.608341,0.190296,0.394527,0.653499,0.488018,0.67872,0.661841,0.806845,...,0.926952,0.949924,0.623096,0.979302,0.984692,0.883128,0.010401,0.020705,0.939444,0.926792
1,46,0.859613,0.468428,0.0,0.806487,0.759366,0.950711,0.550465,0.434654,0.107945,...,0.612518,0.955891,0.639856,0.97643,0.986246,0.885631,0.949764,0.118323,0.049512,0.904022
2,70,0.920693,0.69166,0.0,0.766912,0.928296,0.949284,0.610134,0.523464,0.176267,...,0.902476,0.905701,0.545887,0.960385,0.960636,0.775189,0.035798,0.031529,0.947075,0.919434
3,132,0.921804,0.466582,0.0,0.757888,0.731844,0.927062,0.580946,0.452688,0.052629,...,0.733163,0.96543,0.682734,0.98217,0.989428,0.907438,0.872733,0.138131,0.7851,0.904953
4,200,0.914812,0.473769,0.0,0.831799,0.810358,0.939746,0.625637,0.571752,0.145085,...,0.599233,0.90209,0.64812,0.95716,0.957328,0.809481,0.20699,0.137857,0.643624,0.90788
