In [1]:
import numpy as np
import pandas as pd

from tqdm import tqdm, tqdm_notebook

from scipy import stats
from sklearn.model_selection import GroupKFold

import torch
import torch.nn as nn
from torch.nn import functional as F
import torch.utils.data
from transformers import *

import os
import re
import math
import random
from matplotlib import pyplot as plt
import warnings
from math import floor, ceil

warnings.filterwarnings('ignore')
device = torch.device('cuda')
#device = torch.device('cpu')
torch.backends.cudnn.benchmark=True

%matplotlib inline



In [2]:
output_dir="./token_model_config/"

In [3]:
def seed_everything(seed):
    random.seed(seed)
    os.environ['PYTHONHASHSEED'] = str(seed)
    np.random.seed(seed)
    torch.manual_seed(seed)
    torch.cuda.manual_seed(seed)
    torch.backends.cudnn.deterministic = True

def spearman_corr(y_true, y_pred):
    if np.ndim(y_pred) == 2:
        corr = np.nan_to_num([stats.spearmanr(y_true[:, i], y_pred[:, i])[0] for i in range(y_true.shape[1])]).mean()
    else:
        corr = stats.spearmanr(y_true, y_pred)[0]
    return corr
  
def calc_each_spearman(valid_y, valid_pred):
    lst = []
    for idx in range(30):
        spearman = spearman_corr(valid_y[:,idx], valid_pred[:,idx])
        lst.append(spearman)
    df = pd.DataFrame(lst).T
    df.columns = class_names3
    return df

In [4]:
def _get_segments(tokens, max_seq_length):
    """Segments: 0 for the first sequence, 1 for the second"""
    if len(tokens)>max_seq_length:
        raise IndexError("Token length more than max seq length!")
    segments = []
    first_sep = True
    current_segment_id = 0
    for token in tokens:
        segments.append(current_segment_id)
        if token == "[SEP]":
            if first_sep:
                first_sep = False
                current_segment_id = 1#新增 
            else:
                current_segment_id = 1
    return segments + [0] * (max_seq_length - len(tokens))

def _get_ids(tokens, tokenizer, max_seq_length):
    """Token ids from Tokenizer vocab"""
    token_ids = tokenizer.convert_tokens_to_ids(tokens)
    input_ids = token_ids + [0] * (max_seq_length-len(token_ids))
    return input_ids

def _trim_input(title, question, answer, max_sequence_length=512-1, 
                t_max_len=70-1, q_max_len=219, a_max_len=219):#???

    t = tokenizer.tokenize(title)
    q = tokenizer.tokenize(question)
    a = tokenizer.tokenize(answer)
    
    t_len = len(t)
    q_len = len(q)
    a_len = len(a)

    if (t_len+q_len+a_len+4) > max_sequence_length:
        
        if t_max_len > t_len:
            t_new_len = t_len
            a_max_len = a_max_len + floor((t_max_len - t_len)/2)
            q_max_len = q_max_len + ceil((t_max_len - t_len)/2)
        else:
            t_new_len = t_max_len
      
        if a_max_len > a_len:
            a_new_len = a_len 
            q_new_len = q_max_len + (a_max_len - a_len)
        elif q_max_len > q_len:
            a_new_len = a_max_len + (q_max_len - q_len)
            q_new_len = q_len
        else:
            a_new_len = a_max_len
            q_new_len = q_max_len
            
            
        if t_new_len+a_new_len+q_new_len+4 != max_sequence_length:
            raise ValueError("New sequence length should be %d, but is %d" 
                             % (max_sequence_length, (t_new_len+a_new_len+q_new_len+4)))

        t = t[:t_new_len]
        q = norm_token_length(q, q_new_len)
        a = norm_token_length(a, a_new_len)
    
    return t, q, a

def norm_token_length(tokens, l):
    if len(tokens) > l:
        head = l//2
        tail = l - head
        return tokens[:head] + tokens[-tail:]
    else:
        return tokens[:l]

def _convert_to_bert_inputs(title, question, answer, cate, max_sequence_length=512):
    """Converts tokenized input to ids, masks and segments for BERT"""
    #stoken = ["[CLS]"] + [cate] + title + ["[SEP]"] + question + ["[SEP]"] + answer + ["[SEP]"]
    stoken_1 = ["[CLS]"] + [cate] + title + ["[SEP]"] + question + ["[SEP]"]
    # stoken_2 = ["[CLS]"] + title + ["[SEP]"] + answer + ["[SEP]"]
    # stoken_3 = ["[CLS]"] + question + ["[SEP]"] + answer + ["[SEP]"]
    stoken_2 = ["[CLS]"] + [cate]+title + ["[SEP]"] + answer + ["[SEP]"]
    stoken_3 = ["[CLS]"] + [cate]+question + ["[SEP]"] + answer + ["[SEP]"]
    input_ids = _get_ids(stoken_1, tokenizer, max_sequence_length)
    input_ids_2 = _get_ids(stoken_2, tokenizer, max_sequence_length)
    input_ids_3 = _get_ids(stoken_3, tokenizer, max_sequence_length)
    input_segments = _get_segments(stoken_1, max_sequence_length)
    input_segments_2 = _get_segments(stoken_2, max_sequence_length)
    input_segments_3 = _get_segments(stoken_3, max_sequence_length)
    

    
    #return [input_ids, input_segments]
    return input_ids, input_segments,input_ids_2,input_segments_2, input_ids_3, input_segments_3

def convert_row(row):
    #c = f"[{row['category'].lower()}]"
    c = f"[{row['category']}]"
    t, q, a = row["question_title"], row["question_body"], row["answer"]
    t, q, a = _trim_input(t, q, a)
    #ids, segments = _convert_to_bert_inputs(t, q, a, c)
    ids, segments, ids2, segments2, ids3, segments3 = _convert_to_bert_inputs(t, q, a, c)
    #total_input=[np.array([[ids, segments]]),np.array([[ids2, segments2]])]
    # total_input=[]
    # print(np.array([[ids, segments]]).shape)
    # total_input.append(np.array([[ids, segments]]))
    # total_input.append(np.array([[ids2, segments2]]))
    # return total_input
    return np.array([[ids, segments, ids2, segments2, ids3, segments3]])

In [5]:


train = pd.read_csv('C:/Users/Lab000/Desktop/kaggle/kaggle_competetion/Google_Quest_LABEL/my-solution/input/google-quest-challenge/train.csv').fillna(' ')
sub = pd.read_csv('C:/Users/Lab000/Desktop/kaggle/kaggle_competetion/Google_Quest_LABEL/my-solution/input/google-quest-challenge/sample_submission.csv').fillna(' ')

#model_class, tokenizer_class = transformer_models_dict[pretrained_weights]
#tokenizer = tokenizer_class.from_pretrained(pretrained_weights)

categories = train["category"].unique().tolist()
categories = [f"[{c}]" for c in categories]
#tokenizer.add_tokens(categories)#??

#tokenizer.added_tokens_encoder#??

In [6]:
model="bert-base-cased"
tokenizer = AutoTokenizer.from_pretrained(model)
tokenizer.add_tokens(categories)


loading configuration file config.json from cache at C:\Users\Lab000/.cache\huggingface\hub\models--bert-base-cased\snapshots\5532cc56f74641d4bb33641f5c76a55d11f846e0\config.json
Model config BertConfig {
  "_name_or_path": "bert-base-cased",
  "architectures": [
    "BertForMaskedLM"
  ],
  "attention_probs_dropout_prob": 0.1,
  "classifier_dropout": null,
  "gradient_checkpointing": false,
  "hidden_act": "gelu",
  "hidden_dropout_prob": 0.1,
  "hidden_size": 768,
  "initializer_range": 0.02,
  "intermediate_size": 3072,
  "layer_norm_eps": 1e-12,
  "max_position_embeddings": 512,
  "model_type": "bert",
  "num_attention_heads": 12,
  "num_hidden_layers": 12,
  "pad_token_id": 0,
  "position_embedding_type": "absolute",
  "transformers_version": "4.26.1",
  "type_vocab_size": 2,
  "use_cache": true,
  "vocab_size": 28996
}

loading file vocab.txt from cache at C:\Users\Lab000/.cache\huggingface\hub\models--bert-base-cased\snapshots\5532cc56f74641d4bb33641f5c76a55d11f846e0\vocab.txt
l

5

In [7]:
out_path="./bert_based_tokenizer"
tokenizer.save_pretrained(output_dir+"tokenizer")

tokenizer config file saved in ./token_model_config/tokenizer\tokenizer_config.json
Special tokens file saved in ./token_model_config/tokenizer\special_tokens_map.json


('./token_model_config/tokenizer\\tokenizer_config.json',
 './token_model_config/tokenizer\\special_tokens_map.json',
 './token_model_config/tokenizer\\vocab.txt',
 './token_model_config/tokenizer\\added_tokens.json',
 './token_model_config/tokenizer\\tokenizer.json')

In [8]:
%%time
X = train.apply(convert_row, axis=1).values
X = np.vstack(X).reshape((len(X), 3072))
assert X.shape == (6079, 3072)

Token indices sequence length is longer than the specified maximum sequence length for this model (649 > 512). Running this sequence through the model will result in indexing errors


Wall time: 8.19 s


In [9]:
X.shape

(6079, 3072)

In [10]:
X[0][:512]

array([  101, 28996,  1327,  1821,   146,  3196,  1165,  1606,  4973,
       11182,  1939,  1104,   170, 23639,  2180, 11039,   136,   102,
        1258,  1773,  1213,  1114, 23639,  2180,  6427,  1113,   118,
        1103,   118, 10928,   113,  2373,   131, 11802, 11039,   117,
        1231,  1964,   119, 11039,  5378,  1113,   170,  2632, 11039,
         117, 14403,  4973, 11182,   114,   117,   146,  1156,  1176,
        1106,  1243,  1748,  1114,  1142,   119,  1109,  2645,  1114,
        1103,  4884,   146,  1215,  1110,  1115,  2817,  1110,  9506,
        1105, 22769,  1654,  1110, 20405,  1120,  1436,   119,  1188,
        2609,  1139, 18011,  1106,  1253,  5174,   113,  2373,   131,
        2044,  9895,   114,  1986,   117,  1112,  3450,  1110,  8320,
         117,   146,  1328,  1106,  1129,  1682,  1106,  5211,  1686,
        9895,   119,   146,  2059,  1115,  1111,  1142,   117, 12365,
       14467,  6697,  1105,  1383,  8637, 22769,  1209,  1129,  1104,
        1632,  1494,

In [11]:
X[0][512:1024]

array([0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1,
       1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
       1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
       1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
       1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
       1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
       1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
       1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
       1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
       1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,

In [12]:
X[0][1024:1536]

array([  101, 28996,  1327,  1821,   146,  3196,  1165,  1606,  4973,
       11182,  1939,  1104,   170, 23639,  2180, 11039,   136,   102,
         146,  1198,  1400,  4973, 11182,   117,  1177,  1303,   112,
         188,  1103, 19244,   119,   119,   119,   119,  1184,  1821,
         146,  3196,  1165,  1606, 11182,   119,   119,   119,   136,
         138,  1304,  5602,  2971,  1104,  1609,   106,  3561, 11811,
        4253,  1115,  2462,  1121,  1103,  1322,  1104,  1103, 11039,
        1106,  1103, 15228,  1169,  2195,  1240,  1609,  1317,  6260,
         119, 16544,  1114,  1103,  1864,  1115,  1128,   112,  1325,
        1932,  5211,  2141,  1205,   118,  5363,  1106,  1444,  1106,
        2773,  1240, 11533,  9627,   119,  1109,  1864,  1103, 23639,
        2180,   112,   188,  1132,  1932,  1737,  1304,  1304,  4295,
         117,  1780,   146,  2059,  1115,  3102,   118,  2363,  6262,
         123,   119,   129,  1110,  3155,  1106,  1129,  2385,  4295,
         119,  1109,

In [13]:
X[0][1536:2048]

array([0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1,
       1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
       1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
       1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
       1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
       1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
       1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
       1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
       1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
       1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
       1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,

In [14]:
X[0][2048:2560]

array([  101, 28996,  1258,  1773,  1213,  1114, 23639,  2180,  6427,
        1113,   118,  1103,   118, 10928,   113,  2373,   131, 11802,
       11039,   117,  1231,  1964,   119, 11039,  5378,  1113,   170,
        2632, 11039,   117, 14403,  4973, 11182,   114,   117,   146,
        1156,  1176,  1106,  1243,  1748,  1114,  1142,   119,  1109,
        2645,  1114,  1103,  4884,   146,  1215,  1110,  1115,  2817,
        1110,  9506,  1105, 22769,  1654,  1110, 20405,  1120,  1436,
         119,  1188,  2609,  1139, 18011,  1106,  1253,  5174,   113,
        2373,   131,  2044,  9895,   114,  1986,   117,  1112,  3450,
        1110,  8320,   117,   146,  1328,  1106,  1129,  1682,  1106,
        5211,  1686,  9895,   119,   146,  2059,  1115,  1111,  1142,
         117, 12365, 14467,  6697,  1105,  1383,  8637, 22769,  1209,
        1129,  1104,  1632,  1494,   119,  1573,   117,  1141,  5119,
        1133,  5865,  5146,  1110,   170, 23639,  2180, 11039,   113,
        1474,   117,

In [15]:
X[0][2560:].shape

(512,)

In [16]:
# class_names = list(sub.columns[1:])
# y = train[class_names].values#(6079,30)

# lst = []
# for idx in range(30):
#     t = pd.DataFrame(y[:,idx])[0]
#     # print(len(t))
#     #print(1-t.value_counts())
#     #print(1-t.value_counts()/len(t))
#     w_df = (1-t.value_counts()/len(t)).reset_index()
#     #print(w_df)
#     w_dic = {row["index"]: row[0] for _, row in w_df.iterrows()}
#     # print("=======================")
#     # print(w_dic)
#     w = t.map(w_dic).values#(6079,)
#     #print(w.shape)
#     lst.append(w)
# # print(lst)
# # print("==================================")
# weights = np.vstack(lst).T#轉置, shape:(6079,30)

# import copy
# y_true = copy.deepcopy(y)
# y = np.hstack([y, weights])#(6079,60)


In [17]:
sub.columns[1:22]

Index(['question_asker_intent_understanding', 'question_body_critical',
       'question_conversational', 'question_expect_short_answer',
       'question_fact_seeking', 'question_has_commonly_accepted_answer',
       'question_interestingness_others', 'question_interestingness_self',
       'question_multi_intent', 'question_not_really_a_question',
       'question_opinion_seeking', 'question_type_choice',
       'question_type_compare', 'question_type_consequence',
       'question_type_definition', 'question_type_entity',
       'question_type_instructions', 'question_type_procedure',
       'question_type_reason_explanation', 'question_type_spelling',
       'question_well_written'],
      dtype='object')

In [18]:
class_names = list(sub.columns[1:22])
class_names2 = list(sub.columns[22:])
class_names3= list(sub.columns[1:])
y = train[class_names].values#(6079,21)
y2 = train[class_names2].values
y3 = train[class_names3].values
lst = []
# for idx in range(30):
#     t = pd.DataFrame(y[:,idx])[0]
#     # print(len(t))
#     #print(1-t.value_counts())
#     #print(1-t.value_counts()/len(t))
#     w_df = (1-t.value_counts()/len(t)).reset_index()
#     #print(w_df)
#     w_dic = {row["index"]: row[0] for _, row in w_df.iterrows()}
#     # print("=======================")
#     # print(w_dic)
#     w = t.map(w_dic).values#(6079,)
#     #print(w.shape)
#     lst.append(w)
# print(lst)
# print("==================================")
# weights = np.vstack(lst).T#轉置, shape:(6079,30)

import copy
y_true = copy.deepcopy(y)
y_true2 = copy.deepcopy(y2)
#y = np.hstack([y, weights])#(6079,60)

In [19]:
X[:, 512:].min()

0

In [20]:
def custom_loss(data,data2, data3,targets,targets2,targets3):
    
    # mse = nn.MSELoss(reduction="none")(data[:,:30].sigmoid(), targets[:,:30])
    # bce = nn.BCEWithLogitsLoss(reduction='none')(data[:,:30], targets[:,:30])#??
    mse = nn.MSELoss(reduction="none")(data[:,:].sigmoid(), targets[:,:])
    bce = nn.BCEWithLogitsLoss(reduction='none')(data[:,:], targets[:,:])#??
    
    mse2 = nn.MSELoss(reduction="none")(data2[:,:].sigmoid(), targets2[:,:])
    bce2 = nn.BCEWithLogitsLoss(reduction='none')(data2[:,:], targets2[:,:])#??

    mse3 = nn.MSELoss(reduction="none")(data3[:,:].sigmoid(), targets3[:,:])
    bce3 = nn.BCEWithLogitsLoss(reduction='none')(data3[:,:], targets3[:,:])#??
    #w =  targets[:,30:]
    #loss = (mse*w).sum() + bce.sum()
    loss = (mse).sum()+ bce.sum()+mse2.sum()+bce2.sum()+mse3.sum()+bce3.sum()
    return loss

class CustomBert(nn.Module):
    def __init__(self, model,config_path=None):
        super(CustomBert, self).__init__()
        self.config = AutoConfig.from_pretrained(model) 
        
        self.config.Q_labels = 21
        self.config.A_labels = 9
        self.config.All_labels = 30
        self.config.output_hidden_states = True
        self.n_use_layer = 4 #原本
        #self.n_use_layer = 2
        self.double_bert= 1
        self.n_labels = self.config.num_labels
        #self.config.save_pretrained("bert_based_config")
        self.config.save_pretrained(output_dir+"config")
        #self.bert = BertModel(config)
        self.bert=AutoModel.from_pretrained(model, config=self.config)
        self.bert2=AutoModel.from_pretrained(model, config=self.config)
        self.bert3=AutoModel.from_pretrained(model, config=self.config)
        #self.bert.save_pretrained('bert_based_model')
        self.bert.save_pretrained(output_dir+"model")
        # self.dense1 = nn.Linear(self.config.hidden_size*self.n_use_layer, self.config.hidden_size*self.n_use_layer)
        # self.dense2 = nn.Linear(self.config.hidden_size*self.n_use_layer, self.config.hidden_size*self.n_use_layer)
        # self.dropout = nn.Dropout(self.config.hidden_dropout_prob)
        # self.classifier = nn.Linear(self.config.hidden_size*self.n_use_layer, self.config.num_labels)

        self.dense1 = nn.Linear(self.double_bert*self.config.hidden_size*self.n_use_layer, self.double_bert*self.config.hidden_size*self.n_use_layer)
        self.dense2 = nn.Linear(self.double_bert*self.config.hidden_size*self.n_use_layer, self.double_bert*self.config.hidden_size*self.n_use_layer)
        self.dropout = nn.Dropout(self.config.hidden_dropout_prob)
        self.classifier = nn.Linear(self.double_bert*self.config.hidden_size*self.n_use_layer, self.config.Q_labels)
        self.classifier2 = nn.Linear(self.double_bert*self.config.hidden_size*self.n_use_layer, self.config.A_labels)
        self.classifier3 = nn.Linear(self.double_bert*self.config.hidden_size*self.n_use_layer, self.config.All_labels)
        #self.init_weights()

    def forward(self, input_ids=None, attention_mask=None, token_type_ids=None
                ,input_ids2=None, attention_mask2=None, token_type_ids2=None,
                input_ids3=None, attention_mask3=None, token_type_ids3=None,
                position_ids=None, head_mask=None, inputs_embeds=None, labels=None):

        # outputs = self.bert(input_ids,
        #                     attention_mask=attention_mask,
        #                     token_type_ids=token_type_ids,
        #                     position_ids=position_ids,
        #                     head_mask=head_mask,
        #                     inputs_embeds=inputs_embeds)
        outputs = self.bert(input_ids,
                            attention_mask=attention_mask,
                            token_type_ids=token_type_ids
                            )
        outputs2 = self.bert2(input_ids2,
                            attention_mask=attention_mask2,
                            token_type_ids=token_type_ids2
                            )
        
        outputs3 = self.bert3(input_ids3,
                            attention_mask=attention_mask3,
                            token_type_ids=token_type_ids3
                            )
        #print(outputs[2][-1].shape)
        pooled_output = torch.cat([outputs[2][-1*(i+1)][:,0] for i in range(self.n_use_layer)], dim=1)#把倒數最後4個layer的cls output concat在一起，把4個(8,768) concat，變成(8,3072) #原本
        pooled_output2 = torch.cat([outputs2[2][-1*(i+1)][:,0] for i in range(self.n_use_layer)], dim=1)
        pooled_output3 = torch.cat([outputs3[2][-1*(i+1)][:,0] for i in range(self.n_use_layer)], dim=1)
        #pooled_output = torch.cat([outputs[2][-1*(i+1)][:,0] for i in range(self.n_use_layer)], dim=1)#把倒數最後2個layer的cls output concat在一起,把2個(8,768) concat，變成(8,1536)
        #pooled_output2 = torch.cat([outputs2[2][-1*(i+1)][:,0] for i in range(self.n_use_layer)], dim=1)#同上
        #double_pooled_output=torch.cat([pooled_output,pooled_output],dim=1)#(8,3072)
        
        pooled_output = self.dense1(pooled_output)
        pooled_output = self.dense2(pooled_output)
        pooled_output = self.dropout(pooled_output)
        logits = self.classifier(pooled_output)

        pooled_output2 = self.dense1(pooled_output2)
        pooled_output2 = self.dense2(pooled_output2)
        pooled_output2 = self.dropout(pooled_output2)
        logits2 = self.classifier2(pooled_output2)
        
        pooled_output3 = self.dense1(pooled_output3)
        pooled_output3 = self.dense2(pooled_output3)
        pooled_output3 = self.dropout(pooled_output3)
        logits3 = self.classifier3(pooled_output3)
        # double_pooled_output = self.dense1(double_pooled_output)
        # double_pooled_output = self.dense2(double_pooled_output)
        # double_pooled_output = self.dropout(double_pooled_output)
        # logits = self.classifier(double_pooled_output)
        
        outputs = (logits,) + outputs[2:]
        outputs2 = (logits2,) + outputs2[2:]
        outputs3 = (logits3,) + outputs3[2:]

        return outputs,outputs2,outputs3

In [21]:
#model=CustomBert(model)

In [22]:
# param_optimizer = list(model.named_parameters())
# #print(param_optimizer[0])
# print("==============================")
# print(param_optimizer[0][0])
# print("==============================")
# print(param_optimizer[0][1])

In [23]:
#len(param_optimizer)

In [24]:
# model = model.to(device)
# model.bert.resize_token_embeddings(len(tokenizer))#??
# fold=7
# bert_path=f"./Bce-NoOptbinning/double-bert-based-case_f{fold}_best"
# model.load_state_dict(torch.load(bert_path),strict=False)

In [25]:

N_FOLD=10
N_BERT_LABEL = 30
SEED = 42
#BS = 8
BS = 2
# parameter
n_epoch = 3
learning_rate = 5e-5
max_grad_norm = 1.0

gkf = GroupKFold(n_splits=N_FOLD).split(X=train["question_body"], groups=train["question_body"])#??

spearman_scores = []
best_spearman_lst = []
losses_lst = []
epoch_spearman_lst = []
lr_lst_lst = []
each_speaman_dfs = []
#model=CustomBert(model)
#torch.save(model.config, 'config.pth')
for fold, (train_idx, valid_idx) in enumerate(gkf):#??
  # print(train_idx)
  # print("=======================")
  # print(valid_idx)
  # if fold>=1:
  #   break
  if fold in [0]:
    continue

  seed_everything(SEED)

  # Load Model
#   config = BertConfig.from_pretrained(pretrained_weights)
#   model = CustomBert.from_pretrained(pretrained_weights, config=config)
  model_name="bert-base-cased"
  model=CustomBert(model_name)
  model = model.to(device)
  model.bert.resize_token_embeddings(len(tokenizer))#??
  model.bert2.resize_token_embeddings(len(tokenizer))
  model.bert3.resize_token_embeddings(len(tokenizer))
  model = model.train()
  
  # optimizer setting
  param_optimizer = list(model.named_parameters())
  no_decay = ['bias', 'LayerNorm.weight', 'LayerNorm.bias']
  optimizer_grouped_parameters = []
  max_lrs = []
  for param in param_optimizer:
    if any(n in param[0] for n in no_decay):#weight_decay
      weight_decay = 0.0
    else:
      weight_decay = 0.1
    if param[0].find("bert.encoder.layer") != -1:
      
      n_diff_last = 11 - int(param[0].split(".")[3])
      lr = learning_rate*0.9**n_diff_last
    elif "embeddings" in param[0]:
      lr = learning_rate*0.9**11
    else:
      lr = learning_rate
    max_lrs.append(lr)
    d = {"params": param[1], "weight_decay": weight_decay}
    optimizer_grouped_parameters.append(d)
  optimizer = AdamW(optimizer_grouped_parameters, lr=learning_rate, correct_bias=True)

  # print(train_idx)
  # print("===========================")
  # print(valid_idx)
  
  # train valid split
  train_x = X[train_idx]
  valid_x = X[valid_idx]
  train_y = y[train_idx]
  valid_y = y[valid_idx]

  train_y2 = y2[train_idx]
  train_y3 = y3[train_idx]
  valid_y2 = y2[valid_idx]
  valid_y3 = y3[valid_idx]
  # set loader  
  train_dataset = torch.utils.data.TensorDataset(torch.tensor(train_x, dtype=torch.long), 
                                                 torch.tensor(train_y, dtype=torch.float),torch.tensor(train_y2, dtype=torch.float)
                                                 ,torch.tensor(train_y3, dtype=torch.float))
  train_loader = torch.utils.data.DataLoader(train_dataset, batch_size=BS, shuffle=True)
  valid_dataset = torch.utils.data.TensorDataset(torch.tensor(valid_x, dtype=torch.long), 
                                                 torch.tensor(valid_y, dtype=torch.float),torch.tensor(valid_y2, dtype=torch.float)
                                                 ,torch.tensor(valid_y3, dtype=torch.float))
  valid_loader = torch.utils.data.DataLoader(valid_dataset, batch_size=BS, shuffle=False)

  # set schedueler
  num_training_steps = len(train_loader)*n_epoch
  scheduler = torch.optim.lr_scheduler.OneCycleLR(optimizer, max_lr=max_lrs, total_steps=num_training_steps)

  model.zero_grad()
  optimizer.zero_grad()
   
  best_spearman = 0
  losses = []
  epoch_spearman = []
  lr_lst = []
  for epoch in range(n_epoch):
    lr = np.array([param_group["lr"] for param_group in optimizer.param_groups]).mean()
    tk0 = tqdm_notebook(enumerate(train_loader), total=len(train_loader), leave=False)
    for i, (x_batch, y_batch,y_batch2,y_batch3) in tk0:
      input_ids = x_batch[:, :512]
      token_ids = x_batch[:, 512:1024]
      input_ids2 = x_batch[:, 1024:1536]
      token_ids2 = x_batch[:, 1536:2048]
      input_ids3 = x_batch[:, 2048:2560]
      token_ids3 = x_batch[:, 2560:]
      #print((input_ids > 0))
      #print(token_ids.max())
      #mask=(input_ids > 0).type(torch.uint8)
      #print(mask) 
      y_pred,y_pred2,y_pred3 = model(input_ids.to(device), attention_mask=(input_ids > 0).to(device), token_type_ids=token_ids.to(device),
                     input_ids2=input_ids2.to(device),attention_mask2=(input_ids2 > 0).to(device),token_type_ids2=token_ids2.to(device)
                     ,input_ids3=input_ids3.to(device),attention_mask3=(input_ids3 > 0).to(device),token_type_ids3=token_ids3.to(device))
      #y_pred = model(input_ids.to(device), attention_mask=mask.to(device), token_type_ids=token_ids.to(device))
      loss = custom_loss(y_pred[0], y_pred2[0],y_pred3[0],y_batch.to(device),y_batch2.to(device),y_batch3.to(device))
      loss.backward()
      torch.nn.utils.clip_grad_norm_(model.parameters(), max_grad_norm) 
      optimizer.step()
      optimizer.zero_grad()
      scheduler.step()
      lr_lst.append(np.array([param_group["lr"] for param_group in optimizer.param_groups]).mean())
      losses.append(float(loss))

    # epoch validation
    for param in model.parameters():
      param.requires_grad=False
    model.eval()

    lst = []
    sum_loss = 0
    for i, (x_batch, y_batch,y_batch2,y_batch3)  in enumerate(valid_loader):
      input_ids = x_batch[:, :512]
      token_ids = x_batch[:, 512:1024]
      input_ids2 = x_batch[:, 1024:1536]
      token_ids2 = x_batch[:, 1536:2048]
      input_ids3 = x_batch[:, 2048:2560]
      token_ids3 = x_batch[:, 2560:]

      
      with torch.no_grad():
        #y_pred = model(input_ids.to(device), attention_mask=(input_ids > 0).to(device), token_type_ids=token_ids.to(device))
        y_pred,y_pred2,y_pred3 = model(input_ids.to(device), attention_mask=(input_ids > 0).to(device), token_type_ids=token_ids.to(device),
                     input_ids2=input_ids2.to(device),attention_mask2=(input_ids2 > 0).to(device),token_type_ids2=token_ids2.to(device)
                     ,input_ids3=input_ids3.to(device),attention_mask3=(input_ids3 > 0).to(device),token_type_ids3=token_ids3.to(device))
        loss = custom_loss(y_pred[0], y_pred2[0],y_pred3[0],y_batch.to(device),y_batch2.to(device),y_batch3.to(device))
      #print(y_pred[0].shape)
      
      total_y_pred=torch.cat((y_pred[0],y_pred2[0]),dim=1)
      
      #lst.append(y_pred[0].sigmoid().cpu().squeeze().numpy())
      lst.append(total_y_pred.sigmoid().cpu().squeeze().numpy())
      #lst+(y_pred2[0].sigmoid().cpu().squeeze().numpy())
      sum_loss += loss.cpu().squeeze().numpy()
    valid_pred = np.vstack(lst)#(608,30)
    
    ave_loss = sum_loss/len(valid_loader)

    spearman_score = spearman_corr(valid_y3[:,:N_BERT_LABEL], valid_pred)  
    epoch_spearman.append(spearman_score)
    
    for param in model.parameters():
      param.requires_grad=True
    model.train()
    model_name="triple-bert-based-case"
    # print(f"{model}_f{fold}_best")
    # print("=======================================")
    if best_spearman <= spearman_score:
      #torch.save(model.state_dict(), f"{model_name}_f{fold}_best")
      torch.save(model.state_dict(), f"./triple/Bce-NoOptbinning/{model_name}_f{fold}_best")
      best_spearman = spearman_score
      #print(valid_y3[:,:N_BERT_LABEL])
      each_speaman_df = calc_each_spearman(valid_y3[:,:N_BERT_LABEL], valid_pred)
      display(each_speaman_df)

    print(f"fold-{fold} epoch {epoch}: {spearman_score} / loss avg: {ave_loss}")
    
  best_spearman_lst.append(best_spearman)
  losses_lst.append(losses)
  epoch_spearman_lst.append(epoch_spearman)
  lr_lst_lst.append(lr_lst)
  each_speaman_dfs.append(each_speaman_df)

  torch.cuda.empty_cache()

loading configuration file config.json from cache at C:\Users\Lab000/.cache\huggingface\hub\models--bert-base-cased\snapshots\5532cc56f74641d4bb33641f5c76a55d11f846e0\config.json
Model config BertConfig {
  "_name_or_path": "bert-base-cased",
  "architectures": [
    "BertForMaskedLM"
  ],
  "attention_probs_dropout_prob": 0.1,
  "classifier_dropout": null,
  "gradient_checkpointing": false,
  "hidden_act": "gelu",
  "hidden_dropout_prob": 0.1,
  "hidden_size": 768,
  "initializer_range": 0.02,
  "intermediate_size": 3072,
  "layer_norm_eps": 1e-12,
  "max_position_embeddings": 512,
  "model_type": "bert",
  "num_attention_heads": 12,
  "num_hidden_layers": 12,
  "pad_token_id": 0,
  "position_embedding_type": "absolute",
  "transformers_version": "4.26.1",
  "type_vocab_size": 2,
  "use_cache": true,
  "vocab_size": 28996
}

Configuration saved in ./token_model_config/config\config.json
loading weights file pytorch_model.bin from cache at C:\Users\Lab000/.cache\huggingface\hub\models-

  0%|          | 0/2736 [00:00<?, ?it/s]

Unnamed: 0,question_asker_intent_understanding,question_body_critical,question_conversational,question_expect_short_answer,question_fact_seeking,question_has_commonly_accepted_answer,question_interestingness_others,question_interestingness_self,question_multi_intent,question_not_really_a_question,...,question_well_written,answer_helpful,answer_level_of_information,answer_plausible,answer_relevance,answer_satisfaction,answer_type_instructions,answer_type_procedure,answer_type_reason_explanation,answer_well_written
0,0.453731,0.688572,0.296974,0.209405,0.303619,0.361379,0.318654,0.475862,0.501662,0.062339,...,0.518833,0.234342,0.377539,0.034318,0.141013,0.237259,0.695178,0.218975,0.567409,0.117885


fold-1 epoch 0: 0.3635655513119051 / loss avg: 51.75513201638272


  0%|          | 0/2736 [00:00<?, ?it/s]

Unnamed: 0,question_asker_intent_understanding,question_body_critical,question_conversational,question_expect_short_answer,question_fact_seeking,question_has_commonly_accepted_answer,question_interestingness_others,question_interestingness_self,question_multi_intent,question_not_really_a_question,...,question_well_written,answer_helpful,answer_level_of_information,answer_plausible,answer_relevance,answer_satisfaction,answer_type_instructions,answer_type_procedure,answer_type_reason_explanation,answer_well_written
0,0.415535,0.707705,0.337191,0.259851,0.31279,0.358814,0.36066,0.518947,0.545929,0.071719,...,0.521757,0.208919,0.331187,0.12857,0.1366,0.272252,0.734242,0.251885,0.629254,0.139996


fold-1 epoch 1: 0.3904751859168864 / loss avg: 50.15014406254417


  0%|          | 0/2736 [00:00<?, ?it/s]

Unnamed: 0,question_asker_intent_understanding,question_body_critical,question_conversational,question_expect_short_answer,question_fact_seeking,question_has_commonly_accepted_answer,question_interestingness_others,question_interestingness_self,question_multi_intent,question_not_really_a_question,...,question_well_written,answer_helpful,answer_level_of_information,answer_plausible,answer_relevance,answer_satisfaction,answer_type_instructions,answer_type_procedure,answer_type_reason_explanation,answer_well_written
0,0.448355,0.709077,0.339699,0.322908,0.33578,0.359071,0.386427,0.527935,0.56657,0.062327,...,0.562728,0.220154,0.364288,0.127093,0.162312,0.291691,0.735073,0.259686,0.631185,0.205427


fold-1 epoch 2: 0.40666583651676286 / loss avg: 49.273471662872716


loading configuration file config.json from cache at C:\Users\Lab000/.cache\huggingface\hub\models--bert-base-cased\snapshots\5532cc56f74641d4bb33641f5c76a55d11f846e0\config.json
Model config BertConfig {
  "_name_or_path": "bert-base-cased",
  "architectures": [
    "BertForMaskedLM"
  ],
  "attention_probs_dropout_prob": 0.1,
  "classifier_dropout": null,
  "gradient_checkpointing": false,
  "hidden_act": "gelu",
  "hidden_dropout_prob": 0.1,
  "hidden_size": 768,
  "initializer_range": 0.02,
  "intermediate_size": 3072,
  "layer_norm_eps": 1e-12,
  "max_position_embeddings": 512,
  "model_type": "bert",
  "num_attention_heads": 12,
  "num_hidden_layers": 12,
  "pad_token_id": 0,
  "position_embedding_type": "absolute",
  "transformers_version": "4.26.1",
  "type_vocab_size": 2,
  "use_cache": true,
  "vocab_size": 28996
}

Configuration saved in ./token_model_config/config\config.json
loading weights file pytorch_model.bin from cache at C:\Users\Lab000/.cache\huggingface\hub\models-

  0%|          | 0/2736 [00:00<?, ?it/s]

Unnamed: 0,question_asker_intent_understanding,question_body_critical,question_conversational,question_expect_short_answer,question_fact_seeking,question_has_commonly_accepted_answer,question_interestingness_others,question_interestingness_self,question_multi_intent,question_not_really_a_question,...,question_well_written,answer_helpful,answer_level_of_information,answer_plausible,answer_relevance,answer_satisfaction,answer_type_instructions,answer_type_procedure,answer_type_reason_explanation,answer_well_written
0,0.347619,0.600262,0.336393,0.206833,0.249241,0.416189,0.336893,0.395622,0.497644,0.05455,...,0.514976,0.228704,0.436811,0.177209,0.163432,0.336095,0.675892,0.23915,0.55985,0.163978


fold-2 epoch 0: 0.3579074340072513 / loss avg: 52.12703221722653


  0%|          | 0/2736 [00:00<?, ?it/s]

Unnamed: 0,question_asker_intent_understanding,question_body_critical,question_conversational,question_expect_short_answer,question_fact_seeking,question_has_commonly_accepted_answer,question_interestingness_others,question_interestingness_self,question_multi_intent,question_not_really_a_question,...,question_well_written,answer_helpful,answer_level_of_information,answer_plausible,answer_relevance,answer_satisfaction,answer_type_instructions,answer_type_procedure,answer_type_reason_explanation,answer_well_written
0,0.366546,0.640786,0.365937,0.254477,0.31436,0.448313,0.379643,0.47,0.519452,0.123507,...,0.519777,0.297008,0.4384,0.13608,0.21643,0.351842,0.715551,0.311111,0.62021,0.195609


fold-2 epoch 1: 0.38880654894836775 / loss avg: 50.654271979081


  0%|          | 0/2736 [00:00<?, ?it/s]

Unnamed: 0,question_asker_intent_understanding,question_body_critical,question_conversational,question_expect_short_answer,question_fact_seeking,question_has_commonly_accepted_answer,question_interestingness_others,question_interestingness_self,question_multi_intent,question_not_really_a_question,...,question_well_written,answer_helpful,answer_level_of_information,answer_plausible,answer_relevance,answer_satisfaction,answer_type_instructions,answer_type_procedure,answer_type_reason_explanation,answer_well_written
0,0.363224,0.650295,0.366166,0.282453,0.322366,0.445573,0.395201,0.48462,0.538351,0.102864,...,0.529853,0.261838,0.485719,0.129249,0.179509,0.348739,0.716286,0.302156,0.636145,0.238945


fold-2 epoch 2: 0.3951324570238007 / loss avg: 50.19039057430468


loading configuration file config.json from cache at C:\Users\Lab000/.cache\huggingface\hub\models--bert-base-cased\snapshots\5532cc56f74641d4bb33641f5c76a55d11f846e0\config.json
Model config BertConfig {
  "_name_or_path": "bert-base-cased",
  "architectures": [
    "BertForMaskedLM"
  ],
  "attention_probs_dropout_prob": 0.1,
  "classifier_dropout": null,
  "gradient_checkpointing": false,
  "hidden_act": "gelu",
  "hidden_dropout_prob": 0.1,
  "hidden_size": 768,
  "initializer_range": 0.02,
  "intermediate_size": 3072,
  "layer_norm_eps": 1e-12,
  "max_position_embeddings": 512,
  "model_type": "bert",
  "num_attention_heads": 12,
  "num_hidden_layers": 12,
  "pad_token_id": 0,
  "position_embedding_type": "absolute",
  "transformers_version": "4.26.1",
  "type_vocab_size": 2,
  "use_cache": true,
  "vocab_size": 28996
}

Configuration saved in ./token_model_config/config\config.json
loading weights file pytorch_model.bin from cache at C:\Users\Lab000/.cache\huggingface\hub\models-

  0%|          | 0/2736 [00:00<?, ?it/s]

Unnamed: 0,question_asker_intent_understanding,question_body_critical,question_conversational,question_expect_short_answer,question_fact_seeking,question_has_commonly_accepted_answer,question_interestingness_others,question_interestingness_self,question_multi_intent,question_not_really_a_question,...,question_well_written,answer_helpful,answer_level_of_information,answer_plausible,answer_relevance,answer_satisfaction,answer_type_instructions,answer_type_procedure,answer_type_reason_explanation,answer_well_written
0,0.219817,0.638898,0.31969,0.250034,0.288633,0.339509,0.272235,0.451239,0.55219,0.070207,...,0.50736,0.19159,0.39274,0.100821,0.089356,0.224369,0.725208,0.231683,0.660738,0.248238


fold-3 epoch 0: 0.3603013925531088 / loss avg: 51.79635229236201


  0%|          | 0/2736 [00:00<?, ?it/s]

Unnamed: 0,question_asker_intent_understanding,question_body_critical,question_conversational,question_expect_short_answer,question_fact_seeking,question_has_commonly_accepted_answer,question_interestingness_others,question_interestingness_self,question_multi_intent,question_not_really_a_question,...,question_well_written,answer_helpful,answer_level_of_information,answer_plausible,answer_relevance,answer_satisfaction,answer_type_instructions,answer_type_procedure,answer_type_reason_explanation,answer_well_written
0,0.356663,0.66988,0.345582,0.238474,0.315262,0.38365,0.31927,0.447006,0.511507,0.150768,...,0.536916,0.261052,0.399484,0.124378,0.173476,0.294212,0.739,0.198955,0.675127,0.235387


fold-3 epoch 1: 0.3836680979665319 / loss avg: 50.29585624368567


  0%|          | 0/2736 [00:00<?, ?it/s]

Unnamed: 0,question_asker_intent_understanding,question_body_critical,question_conversational,question_expect_short_answer,question_fact_seeking,question_has_commonly_accepted_answer,question_interestingness_others,question_interestingness_self,question_multi_intent,question_not_really_a_question,...,question_well_written,answer_helpful,answer_level_of_information,answer_plausible,answer_relevance,answer_satisfaction,answer_type_instructions,answer_type_procedure,answer_type_reason_explanation,answer_well_written
0,0.374886,0.69929,0.370348,0.244548,0.316523,0.374951,0.33343,0.469174,0.56193,0.144416,...,0.539495,0.272787,0.43138,0.137485,0.163357,0.283974,0.748957,0.189303,0.685992,0.256928


fold-3 epoch 2: 0.3939932441755045 / loss avg: 49.145515410523664


loading configuration file config.json from cache at C:\Users\Lab000/.cache\huggingface\hub\models--bert-base-cased\snapshots\5532cc56f74641d4bb33641f5c76a55d11f846e0\config.json
Model config BertConfig {
  "_name_or_path": "bert-base-cased",
  "architectures": [
    "BertForMaskedLM"
  ],
  "attention_probs_dropout_prob": 0.1,
  "classifier_dropout": null,
  "gradient_checkpointing": false,
  "hidden_act": "gelu",
  "hidden_dropout_prob": 0.1,
  "hidden_size": 768,
  "initializer_range": 0.02,
  "intermediate_size": 3072,
  "layer_norm_eps": 1e-12,
  "max_position_embeddings": 512,
  "model_type": "bert",
  "num_attention_heads": 12,
  "num_hidden_layers": 12,
  "pad_token_id": 0,
  "position_embedding_type": "absolute",
  "transformers_version": "4.26.1",
  "type_vocab_size": 2,
  "use_cache": true,
  "vocab_size": 28996
}

Configuration saved in ./token_model_config/config\config.json
loading weights file pytorch_model.bin from cache at C:\Users\Lab000/.cache\huggingface\hub\models-

  0%|          | 0/2736 [00:00<?, ?it/s]

Unnamed: 0,question_asker_intent_understanding,question_body_critical,question_conversational,question_expect_short_answer,question_fact_seeking,question_has_commonly_accepted_answer,question_interestingness_others,question_interestingness_self,question_multi_intent,question_not_really_a_question,...,question_well_written,answer_helpful,answer_level_of_information,answer_plausible,answer_relevance,answer_satisfaction,answer_type_instructions,answer_type_procedure,answer_type_reason_explanation,answer_well_written
0,0.382996,0.54864,0.396163,0.35675,0.3184,0.406748,0.236301,0.477085,0.510004,0.014579,...,0.480107,0.180818,0.31184,0.100186,0.16285,0.178163,0.716037,0.222562,0.61912,0.160237


fold-4 epoch 0: 0.3643723006565961 / loss avg: 51.439658786121164


  0%|          | 0/2736 [00:00<?, ?it/s]

Unnamed: 0,question_asker_intent_understanding,question_body_critical,question_conversational,question_expect_short_answer,question_fact_seeking,question_has_commonly_accepted_answer,question_interestingness_others,question_interestingness_self,question_multi_intent,question_not_really_a_question,...,question_well_written,answer_helpful,answer_level_of_information,answer_plausible,answer_relevance,answer_satisfaction,answer_type_instructions,answer_type_procedure,answer_type_reason_explanation,answer_well_written
0,0.37955,0.589564,0.380379,0.32638,0.286301,0.424473,0.301825,0.494189,0.564084,0.046908,...,0.524007,0.219642,0.29829,0.144373,0.190839,0.214932,0.730486,0.231207,0.668948,0.210328


fold-4 epoch 1: 0.3822240544341434 / loss avg: 49.32229473716334


  0%|          | 0/2736 [00:00<?, ?it/s]

Unnamed: 0,question_asker_intent_understanding,question_body_critical,question_conversational,question_expect_short_answer,question_fact_seeking,question_has_commonly_accepted_answer,question_interestingness_others,question_interestingness_self,question_multi_intent,question_not_really_a_question,...,question_well_written,answer_helpful,answer_level_of_information,answer_plausible,answer_relevance,answer_satisfaction,answer_type_instructions,answer_type_procedure,answer_type_reason_explanation,answer_well_written
0,0.39594,0.606558,0.394594,0.364562,0.368293,0.446117,0.30544,0.520516,0.579188,0.02811,...,0.530044,0.264265,0.333902,0.201055,0.189855,0.262759,0.740425,0.242866,0.665772,0.235414


fold-4 epoch 2: 0.40077184113193287 / loss avg: 48.523101474109446


loading configuration file config.json from cache at C:\Users\Lab000/.cache\huggingface\hub\models--bert-base-cased\snapshots\5532cc56f74641d4bb33641f5c76a55d11f846e0\config.json
Model config BertConfig {
  "_name_or_path": "bert-base-cased",
  "architectures": [
    "BertForMaskedLM"
  ],
  "attention_probs_dropout_prob": 0.1,
  "classifier_dropout": null,
  "gradient_checkpointing": false,
  "hidden_act": "gelu",
  "hidden_dropout_prob": 0.1,
  "hidden_size": 768,
  "initializer_range": 0.02,
  "intermediate_size": 3072,
  "layer_norm_eps": 1e-12,
  "max_position_embeddings": 512,
  "model_type": "bert",
  "num_attention_heads": 12,
  "num_hidden_layers": 12,
  "pad_token_id": 0,
  "position_embedding_type": "absolute",
  "transformers_version": "4.26.1",
  "type_vocab_size": 2,
  "use_cache": true,
  "vocab_size": 28996
}

Configuration saved in ./token_model_config/config\config.json
loading weights file pytorch_model.bin from cache at C:\Users\Lab000/.cache\huggingface\hub\models-

  0%|          | 0/2736 [00:00<?, ?it/s]

Unnamed: 0,question_asker_intent_understanding,question_body_critical,question_conversational,question_expect_short_answer,question_fact_seeking,question_has_commonly_accepted_answer,question_interestingness_others,question_interestingness_self,question_multi_intent,question_not_really_a_question,...,question_well_written,answer_helpful,answer_level_of_information,answer_plausible,answer_relevance,answer_satisfaction,answer_type_instructions,answer_type_procedure,answer_type_reason_explanation,answer_well_written
0,0.324427,0.638124,0.371821,0.171209,0.2954,0.286022,0.338005,0.326053,0.575589,0.03066,...,0.426873,0.172664,0.384248,0.158495,0.169053,0.28131,0.713174,0.238116,0.6297,0.220528


fold-5 epoch 0: 0.35965300210658235 / loss avg: 51.790085309430175


  0%|          | 0/2736 [00:00<?, ?it/s]

Unnamed: 0,question_asker_intent_understanding,question_body_critical,question_conversational,question_expect_short_answer,question_fact_seeking,question_has_commonly_accepted_answer,question_interestingness_others,question_interestingness_self,question_multi_intent,question_not_really_a_question,...,question_well_written,answer_helpful,answer_level_of_information,answer_plausible,answer_relevance,answer_satisfaction,answer_type_instructions,answer_type_procedure,answer_type_reason_explanation,answer_well_written
0,0.355329,0.714179,0.379091,0.193842,0.280537,0.367522,0.332348,0.45296,0.618293,0.016866,...,0.473192,0.232677,0.378749,0.189124,0.183507,0.315436,0.720823,0.277548,0.663977,0.233145


fold-5 epoch 1: 0.38697605964005527 / loss avg: 49.327952108885114


  0%|          | 0/2736 [00:00<?, ?it/s]

Unnamed: 0,question_asker_intent_understanding,question_body_critical,question_conversational,question_expect_short_answer,question_fact_seeking,question_has_commonly_accepted_answer,question_interestingness_others,question_interestingness_self,question_multi_intent,question_not_really_a_question,...,question_well_written,answer_helpful,answer_level_of_information,answer_plausible,answer_relevance,answer_satisfaction,answer_type_instructions,answer_type_procedure,answer_type_reason_explanation,answer_well_written
0,0.366159,0.71697,0.383453,0.234259,0.31537,0.362027,0.384236,0.466823,0.628002,0.037499,...,0.469759,0.248507,0.395941,0.195111,0.154449,0.316362,0.716627,0.274158,0.670387,0.197837


fold-5 epoch 2: 0.39486878412826676 / loss avg: 48.871017430958


loading configuration file config.json from cache at C:\Users\Lab000/.cache\huggingface\hub\models--bert-base-cased\snapshots\5532cc56f74641d4bb33641f5c76a55d11f846e0\config.json
Model config BertConfig {
  "_name_or_path": "bert-base-cased",
  "architectures": [
    "BertForMaskedLM"
  ],
  "attention_probs_dropout_prob": 0.1,
  "classifier_dropout": null,
  "gradient_checkpointing": false,
  "hidden_act": "gelu",
  "hidden_dropout_prob": 0.1,
  "hidden_size": 768,
  "initializer_range": 0.02,
  "intermediate_size": 3072,
  "layer_norm_eps": 1e-12,
  "max_position_embeddings": 512,
  "model_type": "bert",
  "num_attention_heads": 12,
  "num_hidden_layers": 12,
  "pad_token_id": 0,
  "position_embedding_type": "absolute",
  "transformers_version": "4.26.1",
  "type_vocab_size": 2,
  "use_cache": true,
  "vocab_size": 28996
}

Configuration saved in ./token_model_config/config\config.json
loading weights file pytorch_model.bin from cache at C:\Users\Lab000/.cache\huggingface\hub\models-

  0%|          | 0/2736 [00:00<?, ?it/s]

Unnamed: 0,question_asker_intent_understanding,question_body_critical,question_conversational,question_expect_short_answer,question_fact_seeking,question_has_commonly_accepted_answer,question_interestingness_others,question_interestingness_self,question_multi_intent,question_not_really_a_question,...,question_well_written,answer_helpful,answer_level_of_information,answer_plausible,answer_relevance,answer_satisfaction,answer_type_instructions,answer_type_procedure,answer_type_reason_explanation,answer_well_written
0,0.257932,0.615891,0.471133,0.287469,0.343784,0.440561,0.299461,0.403643,0.4924,0.0041,...,0.480622,0.179225,0.385034,0.13307,0.176022,0.270132,0.707777,0.256561,0.553739,0.11678


fold-6 epoch 0: 0.35365237429392843 / loss avg: 53.629215503993784


  0%|          | 0/2736 [00:00<?, ?it/s]

Unnamed: 0,question_asker_intent_understanding,question_body_critical,question_conversational,question_expect_short_answer,question_fact_seeking,question_has_commonly_accepted_answer,question_interestingness_others,question_interestingness_self,question_multi_intent,question_not_really_a_question,...,question_well_written,answer_helpful,answer_level_of_information,answer_plausible,answer_relevance,answer_satisfaction,answer_type_instructions,answer_type_procedure,answer_type_reason_explanation,answer_well_written
0,0.364423,0.639267,0.461243,0.306278,0.378362,0.474521,0.33361,0.452942,0.538859,0.080495,...,0.51127,0.217143,0.461215,0.166128,0.163817,0.262839,0.692935,0.218339,0.554148,0.127206


fold-6 epoch 1: 0.38466873787927564 / loss avg: 51.79725502666674


  0%|          | 0/2736 [00:00<?, ?it/s]

Unnamed: 0,question_asker_intent_understanding,question_body_critical,question_conversational,question_expect_short_answer,question_fact_seeking,question_has_commonly_accepted_answer,question_interestingness_others,question_interestingness_self,question_multi_intent,question_not_really_a_question,...,question_well_written,answer_helpful,answer_level_of_information,answer_plausible,answer_relevance,answer_satisfaction,answer_type_instructions,answer_type_procedure,answer_type_reason_explanation,answer_well_written
0,0.382929,0.651042,0.493737,0.314211,0.40714,0.481413,0.304248,0.471,0.57203,0.074927,...,0.548317,0.217177,0.438169,0.1554,0.150492,0.290001,0.721884,0.214275,0.561234,0.114311


fold-6 epoch 2: 0.3953643433454056 / loss avg: 50.6818452508826


loading configuration file config.json from cache at C:\Users\Lab000/.cache\huggingface\hub\models--bert-base-cased\snapshots\5532cc56f74641d4bb33641f5c76a55d11f846e0\config.json
Model config BertConfig {
  "_name_or_path": "bert-base-cased",
  "architectures": [
    "BertForMaskedLM"
  ],
  "attention_probs_dropout_prob": 0.1,
  "classifier_dropout": null,
  "gradient_checkpointing": false,
  "hidden_act": "gelu",
  "hidden_dropout_prob": 0.1,
  "hidden_size": 768,
  "initializer_range": 0.02,
  "intermediate_size": 3072,
  "layer_norm_eps": 1e-12,
  "max_position_embeddings": 512,
  "model_type": "bert",
  "num_attention_heads": 12,
  "num_hidden_layers": 12,
  "pad_token_id": 0,
  "position_embedding_type": "absolute",
  "transformers_version": "4.26.1",
  "type_vocab_size": 2,
  "use_cache": true,
  "vocab_size": 28996
}

Configuration saved in ./token_model_config/config\config.json
loading weights file pytorch_model.bin from cache at C:\Users\Lab000/.cache\huggingface\hub\models-

  0%|          | 0/2736 [00:00<?, ?it/s]

Unnamed: 0,question_asker_intent_understanding,question_body_critical,question_conversational,question_expect_short_answer,question_fact_seeking,question_has_commonly_accepted_answer,question_interestingness_others,question_interestingness_self,question_multi_intent,question_not_really_a_question,...,question_well_written,answer_helpful,answer_level_of_information,answer_plausible,answer_relevance,answer_satisfaction,answer_type_instructions,answer_type_procedure,answer_type_reason_explanation,answer_well_written
0,0.3331,0.587487,0.388509,0.195275,0.251259,0.384145,0.298486,0.469867,0.562054,0.095795,...,0.402184,0.113755,0.318011,0.025492,0.071072,0.241512,0.74989,0.277352,0.631331,0.128723


fold-7 epoch 0: 0.3521414562480321 / loss avg: 52.97065596831472


  0%|          | 0/2736 [00:00<?, ?it/s]

Unnamed: 0,question_asker_intent_understanding,question_body_critical,question_conversational,question_expect_short_answer,question_fact_seeking,question_has_commonly_accepted_answer,question_interestingness_others,question_interestingness_self,question_multi_intent,question_not_really_a_question,...,question_well_written,answer_helpful,answer_level_of_information,answer_plausible,answer_relevance,answer_satisfaction,answer_type_instructions,answer_type_procedure,answer_type_reason_explanation,answer_well_written
0,0.331086,0.626157,0.449936,0.284308,0.340565,0.469894,0.358787,0.530761,0.599947,0.164623,...,0.45651,0.238913,0.260035,0.109817,0.187583,0.33479,0.74318,0.304152,0.648177,0.092573


fold-7 epoch 1: 0.395664750462516 / loss avg: 49.86160350473303


  0%|          | 0/2736 [00:00<?, ?it/s]

KeyboardInterrupt: 

In [None]:
len(tokenizer)

29001