In [1]:
import numpy as np
import pandas as pd

from tqdm import tqdm, tqdm_notebook

from scipy import stats
from sklearn.model_selection import GroupKFold

import torch
import torch.nn as nn
from torch.nn import functional as F
import torch.utils.data
#from transformers import *
import transformers
from transformers import AutoTokenizer, AutoModel, AutoConfig
import os
import re
import math
import random
from matplotlib import pyplot as plt
import warnings
from math import floor, ceil

# Silence all Python warnings for the rest of the notebook.
# NOTE(review): this also hides deprecation warnings from torch/transformers.
warnings.filterwarnings('ignore')
# Use the GPU when one is visible, otherwise fall back to CPU so the notebook
# can still be executed (slowly) on a machine without CUDA.
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
# Let cuDNN auto-tune kernels; this flag has no effect when running on CPU.
torch.backends.cudnn.benchmark=True

%matplotlib inline

In [2]:
# Load the competition tables; missing cells are replaced by a single space
# so that downstream string handling never receives NaN.
train, test, sub = (
    pd.read_csv(csv_path).fillna(' ')
    for csv_path in (
        '../input/google-quest-challenge/train.csv',
        '../input/google-quest-challenge/test.csv',
        '../input/google-quest-challenge/sample_submission.csv',
    )
)

In [3]:
def _get_segments_xlnet(tokens, max_seq_length):
    """Segments: 0 for the first sequence, 1 for the second"""
    if len(tokens)>max_seq_length:
        raise IndexError("Token length more than max seq length!")
    segments = []
    first_sep = True
    current_segment_id = 0
    for token in tokens:
        segments.append(current_segment_id)
        if token == "<sep>":
            if first_sep:
                first_sep = False 
            else:
                current_segment_id = 1
    return [0] * (max_seq_length - len(tokens)) + segments

def _get_segments_bert(tokens, max_seq_length):
    """Segments: 0 for the first sequence, 1 for the second"""
    if len(tokens)>max_seq_length:
        raise IndexError("Token length more than max seq length!")
    segments = []
    first_sep = True
    current_segment_id = 0
    for token in tokens:
        segments.append(current_segment_id)
        if token == "[SEP]":
            if first_sep:
                first_sep = False 
            else:
                current_segment_id = 1
    return segments + [0] * (max_seq_length - len(tokens))#padding

def _get_ids_xlnet(tokens, tokenizer, max_seq_length):
    """Token ids from Tokenizer vocab"""
    token_ids = tokenizer.convert_tokens_to_ids(tokens)
    input_ids =  [5] * (max_seq_length-len(token_ids)) + token_ids
    return input_ids

def _get_ids_bert(tokens, tokenizer, max_seq_length):
    """Token ids from Tokenizer vocab"""
    token_ids = tokenizer.convert_tokens_to_ids(tokens)
    input_ids = token_ids + [0] * (max_seq_length-len(token_ids))#padding
    return input_ids

def _trim_input(title, question, answer, max_sequence_length=512-1,
                t_max_len=70-1, q_max_len=219, a_max_len=219):
    """Tokenize title/question/answer and trim them to a shared token budget.

    Uses the module-level ``tokenizer``. When the three token lists plus 4
    fit into ``max_sequence_length`` they are returned untouched. Otherwise:

    * the title is cut to at most ``t_max_len`` tokens; any unused title
      budget is split between answer (floor half) and question (ceil half);
    * if the answer is shorter than its budget the question inherits the
      slack, else if the question is shorter the answer inherits it;
    * question and answer are shortened with ``norm_token_length``.

    NOTE(review): the ``+ 4`` presumably reserves room for special tokens;
    the default budget of 511 plus the category token would give 512 -
    TODO confirm against the caller.

    Returns:
        (title_tokens, question_tokens, answer_tokens)

    Raises:
        ValueError: if the computed lengths plus 4 do not add up to exactly
            ``max_sequence_length`` (i.e. the per-field budgets are
            inconsistent with the total).
    """
    t = tokenizer.tokenize(title)
    q = tokenizer.tokenize(question)
    a = tokenizer.tokenize(answer)
    t_len, q_len, a_len = len(t), len(q), len(a)

    # Everything already fits: nothing to trim.
    if t_len + q_len + a_len + 4 <= max_sequence_length:
        return t, q, a

    # Title: keep it whole when short, otherwise cap it; hand any spare
    # title budget to answer/question.
    t_new_len = min(t_len, t_max_len)
    spare_title_budget = t_max_len - t_new_len
    a_budget = a_max_len + floor(spare_title_budget / 2)
    q_budget = q_max_len + ceil(spare_title_budget / 2)

    # Answer/question: whichever is under budget donates its slack.
    if a_len < a_budget:
        a_new_len = a_len
        q_new_len = q_budget + (a_budget - a_len)
    elif q_len < q_budget:
        a_new_len = a_budget + (q_budget - q_len)
        q_new_len = q_len
    else:
        a_new_len = a_budget
        q_new_len = q_budget

    total_len = t_new_len + a_new_len + q_new_len + 4
    if total_len != max_sequence_length:
        raise ValueError("New sequence length should be %d, but is %d"
                         % (max_sequence_length, total_len))

    return (
        t[:t_new_len],
        norm_token_length(q, q_new_len),
        norm_token_length(a, a_new_len),
    )

def norm_token_length(tokens, l):
    """Shorten ``tokens`` to at most ``l`` items, keeping both ends.

    When the list is longer than ``l`` the first ``l // 2`` and the last
    ``l - l // 2`` tokens are kept and the middle is dropped. Shorter lists
    are returned as a (copied) slice.

    Args:
        tokens: list of tokens.
        l: maximum number of tokens to keep.

    Returns:
        A new list with at most ``l`` tokens.
    """
    # Guard: with l == 0 the tail slice would be tokens[-0:], which is the
    # WHOLE list rather than an empty one.
    if l <= 0:
        return []
    if len(tokens) <= l:
        return tokens[:l]
    head = l // 2
    tail = l - head  # always >= 1 here, so tokens[-tail:] is well defined
    return tokens[:head] + tokens[-tail:]

def _convert_to_bert_inputs(title, question, answer, cate, pretrained_weights, max_sequence_length=512):
    """Converts tokenized input to ids, masks and segments for BERT"""
    if "bert-base" in pretrained_weights:
        stoken = ["[CLS]"] + [cate] + title + ["[SEP]"] + question + ["[SEP]"] + answer + ["[SEP]"]#type:list
        # print(stoken)
        # print("==================")
        input_ids = _get_ids_bert(stoken, tokenizer, max_sequence_length)
        input_segments = _get_segments_bert(stoken, max_sequence_length)
        # print(input_ids)
        # print("==================")
        # print(input_segments)
        # print("==================")

    elif pretrained_weights == "xlnet-base-cased":
        stoken = [cate] + title + ["<sep>"] + question + ["<sep>"] + answer + ["<sep>", "<cls>"]
        input_ids = _get_ids_xlnet(stoken, tokenizer, max_sequence_length)
        input_segments = _get_segments_xlnet(stoken, max_sequence_length)
        try:
            cls_index = input_segments.index(5) - 1
        except ValueError:
            cls_index = -1
        input_segments[cls_index] = 2
    
    return [input_ids, input_segments]

def convert_row(row, pretrained_weights):
    """Turn one data row into model inputs.

    Args:
        row: mapping with the keys "category", "question_title",
            "question_body" and "answer".
        pretrained_weights: one of "bert-base-uncased", "bert-base-cased"
            or "xlnet-base-cased".

    Returns:
        ``np.array([[ids, segments]])`` - with the default sequence length
        of 512 this has shape (1, 2, 512).

    Raises:
        ValueError: if ``pretrained_weights`` is not one of the supported
            names (previously this surfaced as a NameError on ``c``).
    """
    # The category is wrapped in brackets and later inserted as one token;
    # it is lower-cased only for the uncased model.
    if pretrained_weights == "bert-base-uncased":
        c = f"[{row['category'].lower()}]"
    elif pretrained_weights in ("bert-base-cased", "xlnet-base-cased"):
        c = f"[{row['category']}]"
    else:
        raise ValueError("Unsupported pretrained_weights: %r" % (pretrained_weights,))

    t, q, a = row["question_title"], row["question_body"], row["answer"]
    t, q, a = _trim_input(t, q, a)
    ids, segments = _convert_to_bert_inputs(t, q, a, c, pretrained_weights)

    return np.array([[ids, segments]])  # shape: (1, 2, 512)

In [4]:
#model="bert-base-cased"
tokenizer = AutoTokenizer.from_pretrained("./token_model_config/tokenizer/")
#tokenizer.add_tokens("./bert_based_tokenizer/added_tokens.json")
#tokenizer.add_tokens(categories)


In [5]:
len(tokenizer)

29001

In [6]:
def custom_loss(data, targets):
    """Weighted-MSE plus BCE loss over the first 30 output columns.

    ``data[:, :30]`` are treated as logits. ``targets[:, :30]`` are the
    labels and ``targets[:, 30:]`` are element-wise weights applied to the
    squared-error term only. Both terms are summed (not averaged).
    """
    logits = data[:, :30]
    labels = targets[:, :30]
    weights = targets[:, 30:]

    squared_err = F.mse_loss(logits.sigmoid(), labels, reduction="none")
    cross_entropy = F.binary_cross_entropy_with_logits(logits, labels, reduction='none')

    return (squared_err * weights).sum() + cross_entropy.sum()

class CustomBert(nn.Module):
    """Transformer encoder with a regression head over the last four layers.

    The encoder is built from a config file only (``AutoModel.from_config``),
    so its weights are randomly initialised until a state dict is loaded.
    The head concatenates the first-position vector of the last
    ``n_use_layer`` hidden states, passes it through two linear layers and
    dropout, and projects to ``num_labels`` logits.
    """

    def __init__(self, config_path=None):
        super().__init__()
        cfg = AutoConfig.from_pretrained(config_path)
        cfg.num_labels = 30
        # Hidden states of every layer are needed by forward().
        cfg.output_hidden_states = True
        self.config = cfg
        self.n_use_layer = 4
        self.n_labels = cfg.num_labels

        self.bert = AutoModel.from_config(cfg)

        # Width of the concatenated first-position vectors.
        head_width = cfg.hidden_size * self.n_use_layer
        self.dense1 = nn.Linear(head_width, head_width)
        self.dense2 = nn.Linear(head_width, head_width)
        self.dropout = nn.Dropout(cfg.hidden_dropout_prob)
        self.classifier = nn.Linear(head_width, cfg.num_labels)

    def forward(self, input_ids=None, attention_mask=None, token_type_ids=None,
                position_ids=None, head_mask=None, inputs_embeds=None, labels=None):
        """Return ``(logits,) + encoder_outputs[2:]``.

        Only ``input_ids``, ``attention_mask`` and ``token_type_ids`` are
        forwarded to the encoder; ``position_ids``, ``head_mask``,
        ``inputs_embeds`` and ``labels`` are accepted but ignored.
        """
        encoder_out = self.bert(input_ids,
                                attention_mask=attention_mask,
                                token_type_ids=token_type_ids)

        # NOTE(review): index 2 is assumed to hold the per-layer hidden
        # states (enabled through output_hidden_states) - TODO confirm for
        # the installed transformers version.
        hidden_states = encoder_out[2]
        # First-position vector of the last n_use_layer layers, last layer first.
        first_token_vectors = [
            hidden_states[-depth][:, 0]
            for depth in range(1, self.n_use_layer + 1)
        ]
        features = torch.cat(first_token_vectors, dim=1)
        features = self.dense2(self.dense1(features))
        features = self.dropout(features)
        logits = self.classifier(features)

        return (logits,) + encoder_out[2:]

In [7]:
model=CustomBert("token_model_config/config/config.json")

In [8]:
BS=8

In [9]:
pretrained_weights = 'bert-base-cased'

# Encode every test row; each call yields an array of shape (1, 2, 512)
# holding token ids and segment ids.
text_cols = ["question_title", "question_body", "answer", "category"]
X_test = [convert_row(row, pretrained_weights) for _, row in test[text_cols].iterrows()]
# Stack to (n_rows, 2, 512) and flatten each row to 1024 values:
# the first 512 are token ids, the last 512 are segment ids.
X_test = np.vstack(X_test).reshape((len(X_test), 1024))

test_dataset = torch.utils.data.TensorDataset(torch.tensor(X_test, dtype=torch.long))
test_loader = torch.utils.data.DataLoader(test_dataset, batch_size=BS, shuffle=False)

Token indices sequence length is longer than the specified maximum sequence length for this model (4324 > 512). Running this sequence through the model will result in indexing errors


In [10]:
len(tokenizer)

29001

In [11]:
X_test.shape

(476, 1024)

In [12]:
# Move the model to the target device and freeze every parameter
# (inference only).
model = model.to(device)
for frozen_param in model.parameters():
    frozen_param.requires_grad_(False)
# Grow the word-embedding matrix to the tokenizer's vocabulary size so that
# the tokens added on top of the base vocabulary have an embedding row.
model.bert.resize_token_embeddings(len(tokenizer))
model.eval()

CustomBert(
  (bert): BertModel(
    (embeddings): BertEmbeddings(
      (word_embeddings): Embedding(29001, 768)
      (position_embeddings): Embedding(512, 768)
      (token_type_embeddings): Embedding(2, 768)
      (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
      (dropout): Dropout(p=0.1, inplace=False)
    )
    (encoder): BertEncoder(
      (layer): ModuleList(
        (0): BertLayer(
          (attention): BertAttention(
            (self): BertSelfAttention(
              (query): Linear(in_features=768, out_features=768, bias=True)
              (key): Linear(in_features=768, out_features=768, bias=True)
              (value): Linear(in_features=768, out_features=768, bias=True)
              (dropout): Dropout(p=0.1, inplace=False)
            )
            (output): BertSelfOutput(
              (dense): Linear(in_features=768, out_features=768, bias=True)
              (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
              (d

In [13]:
#model_dir_target = "../input/Optimize binning/bert-base-cased"#

# Only the checkpoint of fold 9 is used (the original loop iterated over
# range(10) and skipped folds 0-8).
folds_to_use = [9]

cased_pred_lst = []
for fold in folds_to_use:
    bert_path=f"./bert_based_case/no_bce_no_opt_binning/bert-based-case_f{fold}_best"
    # Load onto CPU first so a checkpoint saved from a different GPU index
    # still loads; load_state_dict copies the values onto the model's device.
    state_dict = torch.load(bert_path, map_location="cpu")
    load_result = model.load_state_dict(state_dict, strict=False)
    # strict=False ignores key mismatches. The encoder was built from a
    # config only, so any missing key means that part keeps random weights -
    # make that visible instead of silently predicting with it.
    if load_result.missing_keys or load_result.unexpected_keys:
        print(f"[fold {fold}] checkpoint key mismatch - "
              f"missing: {list(load_result.missing_keys)}, "
              f"unexpected: {list(load_result.unexpected_keys)}")

    batch_logits = []
    with torch.no_grad():  # inference only: skip autograd bookkeeping
        for (x_batch,) in test_loader:
            # Each row packs 512 token ids followed by 512 segment ids.
            input_ids = x_batch[:, :512].to(device)
            token_ids = x_batch[:, 512:].to(device)
            # Token id 0 is the padding id used when the inputs were built.
            pred = model(input_ids, attention_mask=(input_ids > 0), token_type_ids=token_ids)
            batch_logits.append(pred[0].cpu().numpy())
    test_pred = np.vstack(batch_logits)  # shape: (n_test_rows, 30)

    cased_pred_lst.append(test_pred)

In [14]:
cased_pred_lst[0].shape

(476, 30)

In [15]:
def sigmoid(x):
    """Element-wise logistic function 1 / (1 + exp(-x))."""
    denominator = 1.0 + np.exp(-x)
    return 1.0 / denominator
# Average the raw logits over the collected fold predictions, then squash
# them into (0, 1).
fold_logits = np.array(cased_pred_lst)
cased_pred = sigmoid(fold_logits.mean(0))
cased_pred

array([[0.9526806 , 0.683275  , 0.05784913, ..., 0.02111472, 0.9475731 ,
        0.9419046 ],
       [0.75497913, 0.42909217, 0.00516468, ..., 0.05026854, 0.02080454,
        0.87417156],
       [0.9288383 , 0.76316243, 0.02282806, ..., 0.06160448, 0.95960975,
        0.9164232 ],
       ...,
       [0.8807903 , 0.43563855, 0.0088164 , ..., 0.19514424, 0.3707464 ,
        0.8879043 ],
       [0.9017531 , 0.769908  , 0.00463597, ..., 0.01312383, 0.9181951 ,
        0.93142515],
       [0.94508386, 0.537904  , 0.02420256, ..., 0.02967194, 0.11720841,
        0.95941395]], dtype=float32)

In [16]:
#test_pred = bert_pred*0.4 + cased_pred*0.6
test_pred=cased_pred

## LGBM

In [17]:
import pickle

# NOTE(review): absolute, machine-specific paths - this cell only runs on the
# original author's workstation. pickle.load executes arbitrary code, so only
# load files from a trusted source.
# Context managers make sure the file handles are closed after loading.
with open("C:/Users/Lab000/Desktop/kaggle/kaggle_competetion/Google_Quest_LABEL/26th-solution/inference/input/quest-lgbm/lgbm_question_type_spelling.pkl", 'rb') as lgbm_file:
    lgbm_models = pickle.load(lgbm_file)
with open("C:/Users/Lab000/Desktop/kaggle/kaggle_competetion/Google_Quest_LABEL/26th-solution/inference/input/quest-lgbm/tfidf_vectorizers.pkl", 'rb') as vectorizer_file:
    count_vectorizers = pickle.load(vectorizer_file)

In [18]:
test["question_title"][0]

'Will leaving corpses lying around upset my prisoners?'

In [19]:
count_vectorizers[0].transform(test["question_title"])# converts the strings into a 604-dimensional sparse matrix

<476x604 sparse matrix of type '<class 'numpy.float64'>'
	with 1666 stored elements in Compressed Sparse Row format>

In [20]:
# Build the feature frame for the LGBM models: one block of vectorizer
# columns per text field, concatenated side by side.
dfs = []
for idx, col_name in enumerate(["question_title", "question_body", "answer"]):
    vectorizer = count_vectorizers[idx]
    X = vectorizer.transform(test[col_name])

    # get_feature_names() was removed in scikit-learn 1.2; prefer the
    # replacement and fall back for older installations.
    if hasattr(vectorizer, "get_feature_names_out"):
        vocab_terms = vectorizer.get_feature_names_out()
    else:
        vocab_terms = vectorizer.get_feature_names()

    # NOTE(review): column names are UTF-8 encoded bytes, presumably to match
    # the names used when the LGBM models were trained - confirm.
    feat = [f"{col_name}_{c}".encode("utf-8") for c in vocab_terms]
    df = pd.DataFrame(X.toarray(), columns=feat)
    dfs.append(df)
test_x = pd.concat(dfs, axis=1)
test_x.shape

(476, 8325)

In [21]:
# Average the predictions of the four LGBM fold models.
n_lgbm_folds = 4
lgbm_pred = np.zeros(len(test_x))
for fold_idx in range(n_lgbm_folds):
    lgbm_pred += lgbm_models[fold_idx].predict(test_x) / n_lgbm_folds
lgbm_pred.shape

(476,)

## Post Processing

In [22]:
sub.columns[1:]

Index(['question_asker_intent_understanding', 'question_body_critical',
       'question_conversational', 'question_expect_short_answer',
       'question_fact_seeking', 'question_has_commonly_accepted_answer',
       'question_interestingness_others', 'question_interestingness_self',
       'question_multi_intent', 'question_not_really_a_question',
       'question_opinion_seeking', 'question_type_choice',
       'question_type_compare', 'question_type_consequence',
       'question_type_definition', 'question_type_entity',
       'question_type_instructions', 'question_type_procedure',
       'question_type_reason_explanation', 'question_type_spelling',
       'question_well_written', 'answer_helpful',
       'answer_level_of_information', 'answer_plausible', 'answer_relevance',
       'answer_satisfaction', 'answer_type_instructions',
       'answer_type_procedure', 'answer_type_reason_explanation',
       'answer_well_written'],
      dtype='object')

In [23]:
unique_values=train["answer_type_procedure"].unique()
print(list(unique_values)+[0,1])
print(set(list(unique_values)+[0,1]))

[0.0, 0.3333333333333333, 0.6666666666666666, 1.0, 0.5, 0, 1]
{0.0, 0.6666666666666666, 0.3333333333333333, 1.0, 0.5}


In [24]:
# For every target column find the smallest denominator d (1..90) such that
# every label value seen in training (plus 0 and 1) lies on the grid
# {0, 1/d, 2/d, ..., 1} after rounding to 8 decimals.
norm_dict = {}
for target_col in sub.columns[1:]:
    observed_levels = list(set(list(train[target_col].unique()) + [0, 1]))
    valid_denoms = []
    for denom in range(90, 0, -1):
        grid = [round((1 / denom) * step, 8) for step in range(denom + 1)]
        if all(round(level, 8) in grid for level in observed_levels):
            valid_denoms.append(denom)
    norm_dict[target_col] = min(valid_denoms)
norm_dict

{'question_asker_intent_understanding': 18,
 'question_body_critical': 18,
 'question_conversational': 6,
 'question_expect_short_answer': 6,
 'question_fact_seeking': 6,
 'question_has_commonly_accepted_answer': 6,
 'question_interestingness_others': 18,
 'question_interestingness_self': 18,
 'question_multi_intent': 6,
 'question_not_really_a_question': 6,
 'question_opinion_seeking': 6,
 'question_type_choice': 6,
 'question_type_compare': 6,
 'question_type_consequence': 6,
 'question_type_definition': 6,
 'question_type_entity': 6,
 'question_type_instructions': 6,
 'question_type_procedure': 6,
 'question_type_reason_explanation': 6,
 'question_type_spelling': 3,
 'question_well_written': 18,
 'answer_helpful': 18,
 'answer_level_of_information': 18,
 'answer_plausible': 18,
 'answer_relevance': 18,
 'answer_satisfaction': 30,
 'answer_type_instructions': 6,
 'answer_type_procedure': 6,
 'answer_type_reason_explanation': 6,
 'answer_well_written': 18}

In [25]:
def spearman_corr(y_true, y_pred):
    """Spearman rank correlation between targets and predictions.

    For 2-D ``y_pred`` the correlation is computed column by column, NaN
    correlations are replaced by 0 and the mean over columns is returned.
    Otherwise the single correlation of the two inputs is returned as is.
    """
    if np.ndim(y_pred) != 2:
        return stats.spearmanr(y_true, y_pred)[0]

    per_column = []
    for col in range(y_true.shape[1]):
        rho = stats.spearmanr(y_true[:, col], y_pred[:, col])[0]
        per_column.append(rho)
    return np.nan_to_num(per_column).mean()


# ref: https://qiita.com/kaggle_master-arai-san/items/d59b2fb7142ec7e270a5
# thank you kaggle masterのアライさん!!
class OptimizedRounder(object):
    """Learns bin thresholds that turn continuous scores into ordinal bins.

    ``coef`` holds ``n_classes - 1`` thresholds used with ``np.digitize``.
    ``fit`` tunes them one at a time with a golden-section style interval
    shrink, minimising the negative of ``self.metric``.

    Note: the ``metric`` argument is only stored in ``metric_str``; the
    metric actually used is always ``spearman_corr``.
    """

    def __init__(self,
                 n_overall: int = 5,
                 n_classwise: int = 5,
                 n_classes: int = 7,
                 metric: str = "qwk"):
        self.n_overall = n_overall      # number of full sweeps over all thresholds
        self.n_classwise = n_classwise  # shrink steps per threshold per sweep
        self.n_classes = n_classes
        # Start from evenly spaced thresholds.
        self.coef = [1.0 / n_classes * i for i in range(1, n_classes)]
        self.metric_str = metric
        self.metric = spearman_corr

    def _loss(self, X: np.ndarray, y: np.ndarray) -> float:
        """Negative metric of ``y`` against ``X`` binned with the current thresholds."""
        binned = np.digitize(X, self.coef)
        return -self.metric(y, binned)

    def _loss_with(self, idx, value, X, y):
        """Set threshold ``idx`` to ``value`` and return the resulting loss."""
        self.coef[idx] = value
        return self._loss(X, y)

    def fit(self, X: np.ndarray, y: np.ndarray):
        """Tune the thresholds in place; returns nothing."""
        golden1 = 0.618
        golden2 = 1 - golden1
        # Initial search interval for each threshold.
        brackets = [(0.01, 1.0 / self.n_classes + 0.05)] + [
            (i * 1.0 / self.n_classes + 0.05,
             (i + 1) * 1.0 / self.n_classes + 0.05)
            for i in range(1, self.n_classes)
        ]
        for _sweep in range(self.n_overall):
            for idx in range(self.n_classes - 1):
                # golden section search
                lo, hi = brackets[idx]
                loss_lo = self._loss_with(idx, lo, X, y)
                loss_hi = self._loss_with(idx, hi, X, y)
                for _step in range(self.n_classwise):
                    # Move the worse endpoint towards the better one; the
                    # threshold is left at the endpoint evaluated last.
                    if loss_lo > loss_hi:
                        lo = hi - (hi - lo) * golden1
                        loss_lo = self._loss_with(idx, lo, X, y)
                    else:
                        hi = hi - (hi - lo) * golden2
                        loss_hi = self._loss_with(idx, hi, X, y)

    def predict(self, X: np.ndarray) -> np.ndarray:
        """Return the bin index of every value in ``X`` under the current thresholds."""
        return np.digitize(X, self.coef)

In [26]:
# Load the pre-fitted rounders (one per target column). The objects are
# instances of __main__.OptimizedRounder, so the class definition must have
# been executed before unpickling.
# NOTE(review): absolute, machine-specific path; only unpickle trusted files.
with open("C:/Users/Lab000/Desktop/kaggle/kaggle_competetion/Google_Quest_LABEL/26th-solution/inference/input/quest-optimizedrounder/optR_lst_10fold_ensemble_v3.pkl", 'rb') as rounder_file:
    optR_lst = pickle.load(rounder_file)

In [32]:
len(optR_lst[25].coef)# number of thresholds stored for target 25 (author's note: "number of classes")

30

In [31]:
optR_lst[0].coef

[0.010000018984111226,
 0.10263158973379526,
 0.15526316868116366,
 0.20789474762853208,
 0.26052632657590047,
 0.3131579055232689,
 0.3657894844706373,
 0.4184210634180057,
 0.47105264236537414,
 0.5236842213127426,
 0.5763158002601111,
 0.6289473792074796,
 0.7199743973515239,
 0.7769234872385886,
 0.8270511143653743,
 0.8669055421005689,
 0.9245306419996713,
 0.9466500749755824]

In [28]:
for optR in optR_lst:
    print(len(optR.coef))

18
18
6
6
6
6
18
18
6
6
6
6
6
6
6
6
6
6
6
3
18
18
18
18
18
30
6
6
6
18


In [29]:
norm_dict

{'question_asker_intent_understanding': 18,
 'question_body_critical': 18,
 'question_conversational': 6,
 'question_expect_short_answer': 6,
 'question_fact_seeking': 6,
 'question_has_commonly_accepted_answer': 6,
 'question_interestingness_others': 18,
 'question_interestingness_self': 18,
 'question_multi_intent': 6,
 'question_not_really_a_question': 6,
 'question_opinion_seeking': 6,
 'question_type_choice': 6,
 'question_type_compare': 6,
 'question_type_consequence': 6,
 'question_type_definition': 6,
 'question_type_entity': 6,
 'question_type_instructions': 6,
 'question_type_procedure': 6,
 'question_type_reason_explanation': 6,
 'question_type_spelling': 3,
 'question_well_written': 18,
 'answer_helpful': 18,
 'answer_level_of_information': 18,
 'answer_plausible': 18,
 'answer_relevance': 18,
 'answer_satisfaction': 30,
 'answer_type_instructions': 6,
 'answer_type_procedure': 6,
 'answer_type_reason_explanation': 6,
 'answer_well_written': 18}

# Testing OptimizedRounder

In [34]:
test_OptimizedRounder=OptimizedRounder()
train_sample=np.array([1,2,3,4,5,6,7,8,9,10])
train_y_sample=np.array([1,2,3,4,5,6,7,8,9,10])
test_OptimizedRounder.fit(train_sample,train_y_sample)
test_OptimizedRounder.coef

[0.026483684940355287,
 0.2057350217167954,
 0.34859216457393827,
 0.4914493074310811,
 0.6343064502882241,
 0.7771635931453669]

In [28]:
optR_lst

[<__main__.OptimizedRounder at 0x1be39ba5388>,
 <__main__.OptimizedRounder at 0x1be3905e148>,
 <__main__.OptimizedRounder at 0x1be108d99c8>,
 <__main__.OptimizedRounder at 0x1be108d9048>,
 <__main__.OptimizedRounder at 0x1be108d9cc8>,
 <__main__.OptimizedRounder at 0x1be108a3a88>,
 <__main__.OptimizedRounder at 0x1be108a3808>,
 <__main__.OptimizedRounder at 0x1be108a3688>,
 <__main__.OptimizedRounder at 0x1be108a3e88>,
 <__main__.OptimizedRounder at 0x1be108a3908>,
 <__main__.OptimizedRounder at 0x1be10946408>,
 <__main__.OptimizedRounder at 0x1be10946988>,
 <__main__.OptimizedRounder at 0x1be0e19ab08>,
 <__main__.OptimizedRounder at 0x1be10948748>,
 <__main__.OptimizedRounder at 0x1be10948948>,
 <__main__.OptimizedRounder at 0x1be10948fc8>,
 <__main__.OptimizedRounder at 0x1be109484c8>,
 <__main__.OptimizedRounder at 0x1be109488c8>,
 <__main__.OptimizedRounder at 0x1be10948d48>,
 <__main__.OptimizedRounder at 0x1be10948dc8>,
 <__main__.OptimizedRounder at 0x1be10948148>,
 <__main__.Op

In [28]:
#optR_lst = pickle.load(open("C:/Users/Lab000/Desktop/kaggle/kaggle_competetion/Google_Quest_LABEL/26th-solution/inference/input/quest-optimizedrounder/optR_lst_10fold_ensemble_v3.pkl", 'rb'))

# Snap every target column onto its discrete label grid: the fitted rounder
# maps each score to a bin index, which is divided by the column's grid
# denominator from norm_dict.
# NOTE: this cell reads test_pred and then overwrites it, so running it twice
# feeds already-binned values back into the rounders.
target_cols = sub.columns[1:]
lst = []
for idx, optR in enumerate(optR_lst):
    bin_index = optR.predict(test_pred[:, idx])  # one bin index per test row
    lst.append(bin_index / norm_dict[target_cols[idx]])
opt_preds = np.array(lst).T
test_pred = opt_preds
# Column 19 keeps the un-binned model output: a column must not be equal to 0
# in every row.
test_pred[:,19]=cased_pred[:,19]
#test_pred[:,19] = lgbm_pred

In [29]:
test_pred

array([[1.        , 0.66666667, 0.        , ..., 0.        , 1.        ,
        0.88888889],
       [0.77777778, 0.44444444, 0.        , ..., 0.        , 0.        ,
        0.83333333],
       [0.94444444, 0.77777778, 0.        , ..., 0.        , 1.        ,
        0.88888889],
       ...,
       [0.83333333, 0.33333333, 0.        , ..., 0.16666667, 0.33333333,
        0.88888889],
       [0.88888889, 0.83333333, 0.        , ..., 0.        , 1.        ,
        0.88888889],
       [0.94444444, 0.5       , 0.        , ..., 0.        , 0.        ,
        0.94444444]])

In [30]:
lgbm_pred.shape

(476,)

In [31]:
test_pred[:,19] = lgbm_pred

In [32]:
# Column 19 is 'question_type_spelling'. Force it to zero for every row that
# is not in the CULTURE category or does not come from the "english" / "ell"
# sites.
# Boolean masks are used instead of DataFrame index labels so the row
# selection stays positional even if `test` does not carry a default
# 0..n-1 index.
not_culture = (test["category"] != "CULTURE").values
test_pred[not_culture, 19] = 0.0

# Site name = host part of the user page URL without ".stackexchange.com".
test["host_info"] = test["question_user_page"].map(lambda x: x.split("/")[2].replace(".stackexchange.com", ""))
not_english_site = (~test["host_info"].isin(["english", "ell"])).values
test_pred[not_english_site, 19] = 0.0

In [33]:
# Write the final predictions into every column after the first (the id
# column) of the sample submission frame.
sub[sub.columns[1:]] = test_pred
# Save the submission to the current working directory, without the index.
sub.to_csv("加上post_posprocessing_submission.csv", index=False)
sub.head()

Unnamed: 0,qa_id,question_asker_intent_understanding,question_body_critical,question_conversational,question_expect_short_answer,question_fact_seeking,question_has_commonly_accepted_answer,question_interestingness_others,question_interestingness_self,question_multi_intent,...,question_well_written,answer_helpful,answer_level_of_information,answer_plausible,answer_relevance,answer_satisfaction,answer_type_instructions,answer_type_procedure,answer_type_reason_explanation,answer_well_written
0,39,1.0,0.666667,0.0,0.833333,0.5,1.0,0.722222,0.611111,0.666667,...,0.944444,0.944444,0.5,1.0,1.0,0.8,0.0,0.0,1.0,0.888889
1,46,0.777778,0.444444,0.0,1.0,0.833333,1.0,0.555556,0.444444,0.166667,...,0.555556,0.944444,0.666667,1.0,1.0,0.9,1.0,0.0,0.0,0.833333
2,70,0.944444,0.777778,0.0,1.0,0.833333,1.0,0.666667,0.555556,0.0,...,0.888889,0.888889,0.5,0.944444,0.944444,0.7,0.0,0.0,1.0,0.888889
3,132,1.0,0.5,0.0,1.0,0.833333,1.0,0.611111,0.444444,0.0,...,0.777778,0.944444,0.666667,1.0,1.0,0.9,0.666667,0.0,1.0,0.944444
4,200,0.944444,0.5,0.0,1.0,0.833333,1.0,0.555556,0.555556,0.166667,...,0.666667,0.944444,0.666667,1.0,0.944444,0.8,0.333333,0.333333,0.666667,0.944444
