In [6]:
from transformers import AutoModelForSequenceClassification, TrainingArguments, Trainer,AutoModelForPreTraining
from transformers import AutoTokenizer,BertTokenizerFast,AutoModel,BertTokenizer
from torch import nn
from torch.optim import AdamW
import pandas as pd
import numpy as np
import torch
import gc
from tqdm import tqdm
from sklearn.model_selection import KFold
from transformers.file_utils import add_start_docstrings, add_start_docstrings_to_model_forward
from model.modeling_nezha import NeZhaForSequenceClassification,NeZhaPreTrainedModel,NeZhaModel,NeZhaForTokenClassification
from model.configuration_nezha import NeZhaConfig
import  torch.nn.functional as F
from torch.nn.utils import clip_grad_norm_
from transformers.models.bert.modeling_bert import (
    BertOutput,
    BertPooler,
    BertSelfOutput,
    BertIntermediate,
    BertOnlyMLMHead,
    BertOnlyNSPHead,
    BertPreTrainingHeads,
    BERT_START_DOCSTRING,
    BERT_INPUTS_DOCSTRING,
)
from transformers import BertConfig,BertTokenizerFast,BertModel

# In[3]:


class NeZhaForSequenceClassification(NeZhaPreTrainedModel):
    def __init__(self, config,model_name,num_labels1,num_labels2):
        super().__init__(config)
        self.num_labels1 = num_labels1
        self.num_labels2=num_labels2
        self.bert = BertModel(config).from_pretrained(model_name)
        self.attn1=Attn(config.hidden_size)
        self.attn2=Attn(config.hidden_size)
        self.dropout = nn.Dropout(config.hidden_dropout_prob)
        self.classifier1 = nn.Linear(config.hidden_size, self.num_labels1)
        self.classifier2 = nn.Linear(config.hidden_size, self.num_labels2)
        self.predict=nn.Sigmoid()
#         self.init_weights()
#         if True:
#             for p in self.bert.parameters(): # 冻结所有bert层
#                 p.requires_grad = False

    @add_start_docstrings_to_model_forward(BERT_INPUTS_DOCSTRING.format("batch_size, sequence_length"))
    def forward(
            self,
            input_ids=None,
            attention_mask=None,
            labels1=None,
        labels2=None,
    ):
        r"""
        labels (:obj:`torch.LongTensor` of shape :obj:`(batch_size,)`, `optional`, defaults to :obj:`None`):
            Labels for computing the sequence classification/regression loss.
            Indices should be in :obj:`[0, ..., config.num_labels - 1]`.
            If :obj:`config.num_labels == 1` a regression loss is computed (Mean-Square loss),
            If :obj:`config.num_labels > 1` a classification loss is computed (Cross-Entropy).

    Returns:
        :obj:`tuple(torch.FloatTensor)` comprising various elements depending on the configuration (:class:`~transformers.BertConfig`) and inputs:
        loss (:obj:`torch.FloatTensor` of shape :obj:`(1,)`, `optional`, returned when :obj:`label` is provided):
            Classification (or regression if config.num_labels==1) loss.
        logits (:obj:`torch.FloatTensor` of shape :obj:`(batch_size, config.num_labels)`):
            Classification (or regression if config.num_labels==1) scores (before SoftMax).
        hidden_states (:obj:`tuple(torch.FloatTensor)`, `optional`, returned when ``config.output_hidden_states=True``):
            Tuple of :obj:`torch.FloatTensor` (one for the output of the embeddings + one for the output of each layer)
            of shape :obj:`(batch_size, sequence_length, hidden_size)`.

            Hidden-states of the model at the output of each layer plus the initial embedding outputs.
        attentions (:obj:`tuple(torch.FloatTensor)`, `optional`, returned when ``config.output_attentions=True``):
            Tuple of :obj:`torch.FloatTensor` (one for each layer) of shape
            :obj:`(batch_size, num_heads, sequence_length, sequence_length)`.

            Attentions weights after the attention softmax, used to compute the weighted average in the self-attention
            heads.

    Examples::

        from transformers import BertTokenizer, BertForSequenceClassification
        import torch

        tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')
        model = BertForSequenceClassification.from_pretrained('bert-base-uncased')

        input_ids = torch.tensor(tokenizer.encode("Hello, my dog is cute", add_special_tokens=True)).unsqueeze(0)  # Batch size 1
        labels = torch.tensor([1]).unsqueeze(0)  # Batch size 1
        outputs = model(input_ids, labels=labels)

        loss, logits = outputs[:2]

        """
        
        outputs = self.bert(
            input_ids,
            attention_mask=attention_mask,
        )

        att1=self.attn1(outputs[0])
        att2=self.attn2(outputs[0])
        pooled_output1 = self.dropout(att1)
        pooled_output2 = self.dropout(att2)
        logits1 = self.classifier1(pooled_output1)
        logits2 = self.classifier2(pooled_output2)
        predict1=self.predict(logits1)
        predict2=self.predict(logits2)
        outputs = (predict1,predict2) + outputs[2:]  # add hidden states and attention if they are here
#         print('label:',labels)
#         print('input_ids:',input_ids)
#         print('attention_mas:',attention_mask)
        if labels1 is not None:
            loss_fct = nn.BCELoss()
#                 print(logits.view(-1, self.num_labels))
#                 print(labels.view(-1, self.num_labels))
            loss1 = loss_fct(predict1.view(-1, self.num_labels1), labels1.view(-1, self.num_labels1))
            loss2 = loss_fct(predict2.view(-1, self.num_labels2), labels2.view(-1, self.num_labels2))
            loss=loss1+loss2
            outputs = (loss,) + outputs
#         print(outputs)
        return outputs  # (loss), predict1,predict2, (hidden_states), (attentions)
    
class Attn(nn.Module):
    def __init__(self,hidden_size):
        super(Attn, self).__init__()
        self.attn = nn.Linear(hidden_size,1)
    def forward(self, x):
        '''
        :param hidden: 
            previous hidden state of the decoder, in shape (layers*directions,B,H)
        :param encoder_outputs:
            encoder outputs from Encoder, in shape (T,B,H)
        :param src_len:
            used for masking. NoneType or tensor in shape (B) indicating sequence length
        :return
            attention energies in shape (B,T)
        '''   
        att=self.attn(x)
        att=F.tanh(att)
        att=F.softmax(att,1)
        att_x=att*x
        return att_x.sum(1)
import torch.utils.data as Data
class CustomDataset(Data.Dataset):
    def __init__(self, data, maxlen,tokenizer,with_labels=True, model_name='bert-base-chinese'):
        self.data = data  # pandas dataframe

        #Initialize the tokenizer
        self.tokenizer = tokenizer#AutoTokenizer.from_pretrained(model_name, use_fast=True)  
        self.maxlen = maxlen
        self.with_labels = with_labels

    def __len__(self):
        return len(self.data)
    def get_label(self,x,num):
        label=[0]*num
       
        x=x.strip().split(' ')

        for l in x:              
            if l and l!='nan':
                label[int(l)]=1
        return label
    def __getitem__(self, index):

        # Selecting sentence1 and sentence2 at the specified index in the data frame
        sent = str(self.data.loc[index, 'sentence'])

        # Tokenize the pair of sentences to get token ids, attention masks and token type ids
        encoded_pair = self.tokenizer(sent,
                                      padding='max_length',  # Pad to max_length
                                      truncation=True,       # Truncate to max_length
                                      max_length=self.maxlen,  
                                      return_tensors='pt')  # Return torch.Tensor objects
#         print(encoded_pair['input_ids'])
        token_ids = encoded_pair['input_ids'].squeeze(0)  # tensor of token ids
        attn_masks = encoded_pair['attention_mask'].squeeze(0)  # binary tensor with "0" for padded values and "1" for the other values
#         token_type_ids = encoded_pair['token_type_ids'].squeeze(0)  # binary tensor with "0" for the 1st sentence tokens & "1" for the 2nd sentence tokens

        if self.with_labels:  # True if the dataset has labels
            label1 = torch.Tensor(self.get_label(str(self.data.loc[index, 'label1']),17))
            label2 = torch.Tensor(self.get_label(str(self.data.loc[index, 'label2']),12))
            return token_ids, attn_masks,label1,label2
        else:
            return token_ids, attn_masks
from sklearn.utils import shuffle as reset
def train_test_split(data_df, test_size=0.2, shuffle=True, random_state=None):
    if shuffle:
        data_df = reset(data_df, random_state=random_state)

    train = data_df[int(len(data_df)*test_size):].reset_index(drop = True)
    test  = data_df[:int(len(data_df)*test_size)].reset_index(drop = True)

    return train, test

from torch.nn.functional import cross_entropy,binary_cross_entropy


def evals(model, optimizer, validation_dataloader,output_model = './train_class/model.pth'):

    model.eval()
    eval_loss, eval_accuracy, nb_eval_steps = 0, 0, 0
    for batch in validation_dataloader:
        batch = tuple(t.to(device) for t in batch)
        with torch.no_grad():
            predict1,predict2 = model(batch[0], batch[1])
            predict1,predict2 = predict1.detach().cpu(),predict2.detach().cpu()
            label_ids1,label_ids2 = batch[2].cpu(),batch[3].cpu()
            
            tmp_eval_accuracy = binary_cross_entropy(predict1, label_ids1.float()).item()+binary_cross_entropy(predict2, label_ids2.float()).item()
            
            eval_accuracy += tmp_eval_accuracy
            nb_eval_steps += 1

    print("Validation mlogloss: {}".format(eval_accuracy / nb_eval_steps))
    global best_score
    if best_score > eval_accuracy / nb_eval_steps:
        best_score = eval_accuracy / nb_eval_steps
        save(model, optimizer,output_model)
        return 0
    return 1
def save(model, optimizer,output_model):
    # save
    torch.save({
        'model_state_dict': model.state_dict(),
        'optimizer_state_dict': optimizer.state_dict()
    }, output_model)
    print('The best model has been saved')
def flat_accuracy(preds, labels):
#     print(preds,labels)
    return -np.mean(labels*np.log(preds+1.e-7)+(1-labels)*np.log(preds+1.e-7))*10

# 对抗训练
class FGM():
    def __init__(self, model):
        self.model = model
        self.backup = {}
    def attack(self, epsilon=1.0, emb_name='word_emb'):
        # emb_name这个参数要换成你模型中embedding的参数名
        for name, param in self.model.named_parameters():
            if param.requires_grad and emb_name in name:
                self.backup[name] = param.data.clone()
                norm = torch.norm(param.grad)
                if norm != 0 and not torch.isnan(norm):
                    r_at = epsilon * param.grad / norm
                    param.data.add_(r_at)
    def restore(self, emb_name='word_emb'):
        # emb_name这个参数要换成你模型中embedding的参数名
        for name, param in self.model.named_parameters():
            if param.requires_grad and emb_name in name: 
                assert name in self.backup
                param.data = self.backup[name]
        self.backup = {}
        
from collections import defaultdict
from torch.optim import Optimizer
import torch


class Lookahead(Optimizer):
    def __init__(self, optimizer, k=5, alpha=0.5):
        self.optimizer = optimizer
        self.k = k
        self.alpha = alpha
        self.param_groups = self.optimizer.param_groups
        self.state = defaultdict(dict)
        self.fast_state = self.optimizer.state
        for group in self.param_groups:
            group["counter"] = 0

    def update(self, group):
        for fast in group["params"]:
            param_state = self.state[fast]
            if "slow_param" not in param_state:
                param_state["slow_param"] = torch.zeros_like(fast.data)
                param_state["slow_param"].copy_(fast.data)
            slow = param_state["slow_param"]
            slow += (fast.data - slow) * self.alpha
            fast.data.copy_(slow)

    def update_lookahead(self):
        for group in self.param_groups:
            self.update(group)

    def step(self, closure=None):
        loss = self.optimizer.step(closure)
        for group in self.param_groups:
            if group["counter"] == 0:
                self.update(group)
            group["counter"] += 1
            if group["counter"] >= self.k:
                group["counter"] = 0
        return loss

    def state_dict(self):
        fast_state_dict = self.optimizer.state_dict()
        slow_state = {
            (id(k) if isinstance(k, torch.Tensor) else k): v
            for k, v in self.state.items()
        }
        fast_state = fast_state_dict["state"]
        param_groups = fast_state_dict["param_groups"]
        return {
            "fast_state": fast_state,
            "slow_state": slow_state,
            "param_groups": param_groups,
        }

    def load_state_dict(self, state_dict):
        slow_state_dict = {
            "state": state_dict["slow_state"],
            "param_groups": state_dict["param_groups"],
        }
        fast_state_dict = {
            "state": state_dict["fast_state"],
            "param_groups": state_dict["param_groups"],
        }
        super(Lookahead, self).load_state_dict(slow_state_dict)
        self.optimizer.load_state_dict(fast_state_dict)
        self.fast_state = self.optimizer.state

    def add_param_group(self, param_group):
        param_group["counter"] = 0
        self.optimizer.add_param_group(param_group)

from torch.optim.lr_scheduler import LambdaLR
class WarmupLinearSchedule(LambdaLR):
    """ Linear warmup and then linear decay.
        Multiplies the learning rate defined in the optimizer by a dynamic variable determined by the current step.
        Linearly increases the multiplicative variable from 0. to 1. over `warmup_steps` training steps.
        Linearly decreases the multiplicative variable from 1. to 0. over remaining `t_total - warmup_steps` steps.
    """
    def __init__(self, optimizer, warmup_steps, t_total, last_epoch=-1):
        self.warmup_steps = warmup_steps
        self.t_total = t_total
        super(WarmupLinearSchedule, self).__init__(optimizer, self.lr_lambda, last_epoch=last_epoch)

    def lr_lambda(self, step):
        if step < self.warmup_steps:
            return float(step) / float(max(1, self.warmup_steps))
        return max(0.0, float(self.t_total - step) / float(max(1.0, self.t_total - self.warmup_steps)))


class AdamW(Optimizer):
    def __init__(self, params, lr=1e-3, betas=(0.9, 0.999), eps=1e-6, weight_decay=0.0, correct_bias=True):
        if lr < 0.0:
            raise ValueError("Invalid learning rate: {} - should be >= 0.0".format(lr))
        if not 0.0 <= betas[0] < 1.0:
            raise ValueError("Invalid beta parameter: {} - should be in [0.0, 1.0[".format(betas[0]))
        if not 0.0 <= betas[1] < 1.0:
            raise ValueError("Invalid beta parameter: {} - should be in [0.0, 1.0[".format(betas[1]))
        if not 0.0 <= eps:
            raise ValueError("Invalid epsilon value: {} - should be >= 0.0".format(eps))
        defaults = dict(lr=lr, betas=betas, eps=eps, weight_decay=weight_decay,
                        correct_bias=correct_bias)
        super(AdamW, self).__init__(params, defaults)

    def step(self, closure=None):
        loss = None
        if closure is not None:
            loss = closure()

        for group in self.param_groups:
            for p in group['params']:
                if p.grad is None:
                    continue
                grad = p.grad.data
                if grad.is_sparse:
                    raise RuntimeError('Adam does not support sparse gradients, please consider SparseAdam instead')
                state = self.state[p]
                if len(state) == 0:
                    state['step'] = 0
                    state['exp_avg'] = torch.zeros_like(p.data)
                    state['exp_avg_sq'] = torch.zeros_like(p.data)
                exp_avg, exp_avg_sq = state['exp_avg'], state['exp_avg_sq']
                beta1, beta2 = group['betas']
                state['step'] += 1
                exp_avg.mul_(beta1).add_(1.0 - beta1, grad)
                exp_avg_sq.mul_(beta2).addcmul_(1.0 - beta2, grad, grad)
                denom = exp_avg_sq.sqrt().add_(group['eps'])
                step_size = group['lr']
                if group['correct_bias']:  # No bias correction for Bert
                    bias_correction1 = 1.0 - beta1 ** state['step']
                    bias_correction2 = 1.0 - beta2 ** state['step']
                    step_size = step_size * math.sqrt(bias_correction2) / bias_correction1
                p.data.addcdiv_(-step_size, exp_avg, denom)
                if group['weight_decay'] > 0.0:
                    p.data.add_(-group['lr'] * group['weight_decay'], p.data)
        return loss
def build_optimizer(model, train_steps, learning_rate):
    param_optimizer = list(model.named_parameters())
    no_decay = ['bias', 'LayerNorm.bias', 'LayerNorm.weight']
    optimizer_grouped_parameters = [
        {'params': [p for n, p in param_optimizer if not any(nd in n for nd in no_decay)], 'weight_decay_rate': 0.01},
        {'params': [p for n, p in param_optimizer if any(nd in n for nd in no_decay)], 'weight_decay_rate': 0.0}
    ]

    optimizer = AdamW(optimizer_grouped_parameters, lr=learning_rate, correct_bias=False, eps=1e-8)
    optimizer = Lookahead(optimizer, 5, 1)
    scheduler = WarmupLinearSchedule(optimizer, warmup_steps=train_steps * 0.1, t_total=train_steps)
    return optimizer, scheduler
def to_predict(model, dataloader,output_model, with_labels=False):
    
    # load model
    checkpoint = torch.load(output_model, map_location='cuda')
#     print(checkpoint)
    model.load_state_dict(checkpoint['model_state_dict'])
    model.to(device)

    print('-----Testing-----')

    pred_label =np.zeros((len(test),29))
    model.eval()
    for i, batch in enumerate(tqdm(dataloader)):
        batch = tuple(t.to(device) for t in batch)
        with torch.no_grad():
            predict1,predict2 = model(batch[0], batch[1])
            predict1 = predict1.detach().cpu().numpy()
            predict2 = predict2.detach().cpu().numpy()
            predict=np.concatenate([predict1,predict2],axis=-1)
            pred_label[i*batch_size:(i+1)*batch_size]=predict
    return pred_label


In [7]:
train=pd.read_csv('../tcdata/train.csv',header=None)
test=pd.read_csv('../tcdata/track1_round1_testB.csv',header=None)

# test=train.iloc[-2000:].copy().reset_index(drop=True)
# train=train.iloc[:-2000]

# test=pd.read_csv('../tcdata/testA.csv',header=None)
model_path='../model_weight/bert/'
output_model='../tmp/bert.pth'
batch_size=32
# 合并训练集与测试集 制作特征
for i in range(1,3):
    train[i]=train[i].apply(lambda x:x.replace('|','').strip())
for i in range(1,2):
    test[i]=test[i].apply(lambda x:x.replace('|','').strip())
train.columns=['idx','sentence','label1','label2']
test.columns=['idx','sentence']
# test.columns=['idx','sentence','label1','label2']

tokenizer=BertTokenizerFast.from_pretrained(model_path)

config=BertConfig.from_pretrained(model_path,num_labels=17,hidden_dropout_prob=0.1) # config.output_attentions=True
config.hidden_dropout_prob=0.1

In [8]:
def train_model(train_df,val_df,test_oof):
    
        ###--------------------
    early_stop=0
    print("Reading training data...")
    train_set = CustomDataset(train_df, maxlen=128,tokenizer=tokenizer)
    train_loader = Data.DataLoader(train_set, batch_size=batch_size, num_workers=5, shuffle=True)

    print("Reading validation data...")
    val_set = CustomDataset(val_df, maxlen=128, tokenizer=tokenizer)
    val_loader = Data.DataLoader(val_set, batch_size=batch_size, num_workers=5, shuffle=True)

    test_set = CustomDataset(test, maxlen=128, tokenizer=tokenizer,with_labels=False)
    test_loader = Data.DataLoader(test_set, batch_size=batch_size, num_workers=5, shuffle=False)
    # 准备模型
    model=NeZhaForSequenceClassification(config=config,model_name=model_path,num_labels1=17,num_labels2=12)
    ### 训练
    model.to(device)
    fgm = FGM(model)
    train_num = len(train_set)
    train_steps = int(train_num * epochs / batch_size) + 1

    optimizer, scheduler = build_optimizer(model, train_steps, learning_rate=6e-5)
    print('-----Training-----')
    for epoch in range(epochs):
        model.train()
        model.zero_grad()
        print('Epoch', epoch)
        for i, batch in enumerate(tqdm(train_loader)):
            batch = tuple(t.to(device) for t in batch)
            loss, predict1,predict2 = model(batch[0], batch[1], batch[2],batch[3])
            if i % 50 == 0:
                print(i, loss.item())
            optimizer.zero_grad()
            loss.backward()

            # 对抗训练
            fgm.attack()
            loss_adv, _,_  = model(batch[0], batch[1], batch[2],batch[3])
            loss_adv.backward()
            fgm.restore()

            optimizer.step()
            scheduler.step()
   
    #         if i % 20 == 0:
    #             eval(model, optimizer, val_loader, output_model='./runs/nezha1.pth')
       
        early_stop+=evals(model, optimizer, val_loader, output_model=output_model)
        if early_stop==2:
            break

    test_oof.append(to_predict(model, test_loader,output_model, with_labels=False))
    torch.cuda.empty_cache()
    gc.collect()
    return test_oof   

In [9]:
n_fold=KFold(8,shuffle=True,random_state=1080)
test_oof=[]
epochs = 5
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
for trn_idx,val_idx in n_fold.split(train):
    train_df=train.iloc[trn_idx].reset_index(drop=True)
    val_df=train.iloc[val_idx].reset_index(drop=True)
    best_score = float('inf')
    train_model(train_df,val_df,test_oof)

Reading training data...
Reading validation data...


Some weights of BertModel were not initialized from the model checkpoint at ../model_weight/bert/ and are newly initialized: ['bert.pooler.dense.weight', 'bert.pooler.dense.bias']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
  0%|          | 0/547 [00:00<?, ?it/s]

-----Training-----
Epoch 0




0 1.3908965587615967


	add_(Number alpha, Tensor other)
Consider using one of the following signatures instead:
	add_(Tensor other, *, Number alpha) (Triggered internally at  /opt/conda/conda-bld/pytorch_1607370117127/work/torch/csrc/utils/python_arg_parser.cpp:882.)
  exp_avg.mul_(beta1).add_(1.0 - beta1, grad)
  9%|▉         | 50/547 [00:24<04:01,  2.06it/s]

50 0.3625543415546417


 18%|█▊        | 100/547 [00:49<03:42,  2.01it/s]

100 0.306037962436676


 27%|██▋       | 150/547 [01:14<03:19,  1.99it/s]

150 0.1908472329378128


 37%|███▋      | 200/547 [01:40<02:55,  1.98it/s]

200 0.21352708339691162


 46%|████▌     | 250/547 [02:06<02:41,  1.84it/s]

250 0.16700202226638794


 55%|█████▍    | 300/547 [02:34<02:21,  1.74it/s]

300 0.12694424390792847


 64%|██████▍   | 350/547 [03:04<02:01,  1.62it/s]

350 0.13708218932151794


 73%|███████▎  | 400/547 [03:36<01:32,  1.60it/s]

400 0.09468851238489151


 82%|████████▏ | 450/547 [04:08<01:06,  1.47it/s]

450 0.07340187579393387


 91%|█████████▏| 500/547 [04:43<00:31,  1.48it/s]

500 0.07787097245454788


100%|██████████| 547/547 [05:22<00:00,  1.70it/s]


Validation mlogloss: 0.07766225971753084


  0%|          | 0/547 [00:00<?, ?it/s]

The best model has been saved
Epoch 1
0 0.06797514855861664


  9%|▉         | 50/547 [00:37<06:04,  1.36it/s]

50 0.05992788448929787


 18%|█▊        | 100/547 [01:23<06:15,  1.19it/s]

100 0.08078087121248245


 27%|██▋       | 150/547 [02:11<07:34,  1.14s/it]

150 0.07217096537351608


 37%|███▋      | 200/547 [02:58<06:01,  1.04s/it]

200 0.10543860495090485


 46%|████▌     | 250/547 [03:45<03:21,  1.48it/s]

250 0.06236641854047775


 55%|█████▍    | 300/547 [04:40<04:59,  1.21s/it]

300 0.05149432271718979


 64%|██████▍   | 350/547 [05:31<03:57,  1.21s/it]

350 0.06978238373994827


 73%|███████▎  | 400/547 [06:23<02:30,  1.03s/it]

400 0.06067626550793648


 82%|████████▏ | 450/547 [07:17<01:52,  1.16s/it]

450 0.06013014540076256


 91%|█████████▏| 500/547 [08:17<00:52,  1.12s/it]

500 0.05723525583744049


100%|██████████| 547/547 [09:09<00:00,  1.01s/it]


Validation mlogloss: 0.05485029469090927


  0%|          | 0/547 [00:00<?, ?it/s]

The best model has been saved
Epoch 2
0 0.04070832207798958


  9%|▉         | 50/547 [00:50<09:23,  1.13s/it]

50 0.03763046860694885


 18%|█▊        | 100/547 [01:48<09:58,  1.34s/it]

100 0.027600131928920746


 27%|██▋       | 150/547 [02:48<08:11,  1.24s/it]

150 0.04832497239112854


 37%|███▋      | 200/547 [03:46<07:49,  1.35s/it]

200 0.0363018698990345


 46%|████▌     | 250/547 [04:43<04:30,  1.10it/s]

250 0.022187354043126106


 55%|█████▍    | 300/547 [05:41<04:03,  1.01it/s]

300 0.042010314762592316


 64%|██████▍   | 350/547 [06:40<03:41,  1.12s/it]

350 0.029389146715402603


 73%|███████▎  | 400/547 [07:36<01:56,  1.26it/s]

400 0.04637901857495308


 82%|████████▏ | 450/547 [08:31<01:14,  1.30it/s]

450 0.03394341096282005


 91%|█████████▏| 500/547 [09:29<00:39,  1.18it/s]

500 0.02750646136701107


100%|██████████| 547/547 [10:26<00:00,  1.14s/it]


Validation mlogloss: 0.04313594408191835


  0%|          | 0/547 [00:00<?, ?it/s]

The best model has been saved
Epoch 3
0 0.014517219737172127


  9%|▉         | 50/547 [00:51<10:11,  1.23s/it]

50 0.020644279196858406


 13%|█▎        | 70/547 [01:16<08:39,  1.09s/it]


KeyboardInterrupt: 

In [4]:
# 0.1085 0.0792  0.076   0.077
                0.052    0.0548
                         0.043

IndentationError: unexpected indent (<ipython-input-4-099d2d28035e>, line 2)

In [5]:
for i in range(len(test_oof)):
    test_oof[i]=test_oof[i]-test_oof[i].min(axis=1,keepdims=True)
test_oof=np.mean(test_oof,axis=0)    
test_oof=test_oof-test_oof.min(axis=1,keepdims=True)

NameError: name 'test_oof' is not defined

In [None]:
sub=pd.DataFrame()
test=pd.read_csv('../tcdata/testA.csv',header=None)
sub['report_ID']=test[0]
sub['Prediction']=[ '|'+' '.join(['%.12f'%j for j in i]) for i in test_oof ]
sub.to_csv('../result.csv',index=False,header=False)