In [7]:
from model.modeling_nezha import NeZhaForSequenceClassification,NeZhaPreTrainedModel,NeZhaModel,NeZhaForTokenClassification
from model.configuration_nezha import NeZhaConfig
from transformers import AutoTokenizer,BertTokenizerFast,AutoModel
import pandas as pd
import torch
from transformers import (
    BertTokenizer,
    Trainer,
    TrainingArguments,
)
import torch.utils.data as Data
from transformers import ReformerTokenizer,ReformerForMaskedLM,ReformerConfig

In [2]:
class CustomDataset(Data.Dataset):
    """Map-style dataset that tokenizes one sentence per DataFrame row.

    Every item is the tokenizer output for the row's ``sentence`` column,
    padded/truncated to ``maxlen`` tokens, with the leading batch dimension
    removed from ``input_ids``, ``attention_mask`` and ``token_type_ids``.

    When ``with_labels`` is true, a ``labels`` entry is added. It is a
    one-element ``torch.long`` tensor holding the *number* of positive
    entries found across the ``label1`` and ``label2`` columns (i.e. a value
    between 0 and ``NUM_LABEL1 + NUM_LABEL2`` inclusive).

    Args:
        data: pandas DataFrame with a ``sentence`` column and, when
            ``with_labels`` is true, ``label1`` and ``label2`` columns.
        maxlen: length every encoded sequence is padded/truncated to.
        tokenizer: callable accepting the Hugging Face tokenizer keyword
            arguments used in ``__getitem__`` and returning a mapping of
            tensors with a leading batch dimension of 1.
        with_labels: whether items should carry a ``labels`` entry.
    """

    # Lengths of the multi-hot vectors decoded from the two label columns.
    NUM_LABEL1 = 17
    NUM_LABEL2 = 12

    def __init__(self, data, maxlen, tokenizer, with_labels=True):
        self.data = data  # pandas dataframe
        self.tokenizer = tokenizer
        self.maxlen = maxlen
        self.with_labels = with_labels

    def __len__(self):
        """Return the number of rows in the underlying DataFrame."""
        return len(self.data)

    def get_label(self, x, num):
        """Decode a label string into a multi-hot list of length ``num``.

        All ``'|'`` characters are removed, the remainder is split on single
        spaces, and every non-empty token other than ``'nan'`` is parsed as
        an integer index whose position is set to 1.

        Raises:
            ValueError: if a token is not an integer.
            IndexError: if an index is outside ``[-num, num)``.
        """
        label = [0] * num
        # NOTE(review): the '|' characters are presumably residue of the
        # corpus field separator -- confirm against the raw CSV.
        tokens = x.replace('|', '').strip().split(' ')
        for token in tokens:
            # 'nan' is what str() yields for a missing (NaN) cell.
            if token and token != 'nan':
                label[int(token)] = 1
        return label

    def _cell(self, index, column):
        """Return the value of ``column`` at *positional* row ``index``.

        Samplers hand out positions 0..len-1, so positional access is used
        instead of a label lookup; for a default RangeIndex both agree.
        """
        return self.data[column].iloc[index]

    def __getitem__(self, index):
        """Return the encoded sentence (and optional label) at row ``index``."""
        sent = str(self._cell(index, 'sentence'))

        # Encode the single sentence to fixed-length tensors.
        encoded = self.tokenizer(sent,
                                 padding='max_length',  # pad to max_length
                                 truncation=True,       # truncate to max_length
                                 max_length=self.maxlen,
                                 add_special_tokens=True,
                                 return_token_type_ids=True,
                                 return_attention_mask=True,
                                 return_tensors='pt')

        # return_tensors='pt' yields a leading batch dimension of 1; drop it
        # so that the DataLoader can stack individual items into a batch.
        for key in ('input_ids', 'attention_mask', 'token_type_ids'):
            encoded[key] = encoded[key].squeeze(0)

        # Only build and attach the target when labels are requested; the
        # unlabeled path must not touch the label columns at all.
        if self.with_labels:
            label1 = self.get_label(str(self._cell(index, 'label1')), self.NUM_LABEL1)
            label2 = self.get_label(str(self._cell(index, 'label2')), self.NUM_LABEL2)
            # The target is the count of positive labels, used as a class id.
            target = [sum(label1) + sum(label2)]
            encoded.update({'labels': torch.tensor(target, dtype=torch.long)})
        return encoded

In [3]:
def main(train_epoch, batch_size, seq_length, lr, corpus_path, vocab_path,
         config_path, pretrain_model_path, output_record_path, model_save_path):
    """Fine-tune ``NeZhaForSequenceClassification`` on a CSV corpus.

    Reads the header-less CSV at ``corpus_path``, wraps it in a
    ``CustomDataset``, loads the NeZha config and checkpoint, trains with
    the Hugging Face ``Trainer`` and finally saves the model.

    Args:
        train_epoch: number of training epochs.
        batch_size: per-device training batch size.
        seq_length: length every tokenized sentence is padded/truncated to.
        lr: learning rate.
        corpus_path: path of the header-less, four-column training CSV.
        vocab_path: vocabulary passed to ``BertTokenizer.from_pretrained``.
        config_path: NeZha configuration passed to ``NeZhaConfig.from_pretrained``.
        pretrain_model_path: checkpoint the model weights are loaded from.
        output_record_path: ``Trainer`` output directory (checkpoints, logs).
        model_save_path: directory the final model is saved to.
    """
    tokenizer = BertTokenizer.from_pretrained(vocab_path)

    # The corpus has no header row; name the four columns explicitly.
    corpus = pd.read_csv(corpus_path, header=None)
    corpus.columns = ['ids', 'sentence', 'label1', 'label2']

    # Honour the caller's seq_length instead of a hard-coded 128.
    train_dataset = CustomDataset(corpus, maxlen=seq_length, tokenizer=tokenizer)

    # NOTE(review): CustomDataset's target is the count of positive labels
    # over 17 + 12 slots, so it can reach 29, which needs 30 classes. Confirm
    # no row has every label set, or raise num_labels (this changes the
    # classifier head shape, so existing checkpoints would not load).
    config = NeZhaConfig.from_pretrained(pretrained_model_name_or_path=config_path, num_labels=29)
    model = NeZhaForSequenceClassification.from_pretrained(pretrain_model_path, config=config)

    training_args = TrainingArguments(
        output_dir=output_record_path,
        overwrite_output_dir=True,
        num_train_epochs=train_epoch,
        learning_rate=lr,
        dataloader_num_workers=8,
        prediction_loss_only=True,
        fp16=True,
        fp16_backend='amp',
        per_device_train_batch_size=batch_size,
        save_steps=2000,
        save_total_limit=50,
    )

    trainer = Trainer(
        model=model,
        args=training_args,
        train_dataset=train_dataset,
    )

    trainer.train()
    trainer.save_model(model_save_path)


In [4]:
# Launch fine-tuning from the local NeZha weights; checkpoints are written
# under output/record and the final model under output/model.
main(
    train_epoch=100,
    batch_size=32,
    seq_length=128,
    lr=6e-5,
    corpus_path='../tcdata/train.csv',
    vocab_path='../model_weight/nezha/vocab.txt',
    config_path='../model_weight/nezha/config.json',
    pretrain_model_path='../model_weight/nezha/pytorch_model.bin',
    output_record_path='output/record',
    model_save_path='output/model',
)


Some weights of the model checkpoint at ../model_weight/nezha/pytorch_model.bin were not used when initializing NeZhaForSequenceClassification: ['cls.predictions.bias', 'cls.predictions.transform.dense.weight', 'cls.predictions.transform.dense.bias', 'cls.predictions.transform.LayerNorm.weight', 'cls.predictions.transform.LayerNorm.bias', 'cls.predictions.decoder.weight', 'cls.predictions.decoder.bias']
- This IS expected if you are initializing NeZhaForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing NeZhaForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of NeZhaForSequenceClassification were not initialized from the model checkpoint at .

Step,Training Loss
500,0.4706
1000,0.1316
1500,0.0565
2000,0.0333
2500,0.0259
3000,0.0205
3500,0.0182
4000,0.0207
4500,0.013
5000,0.0119




KeyboardInterrupt: 