In [1]:
import torch
import random
import numpy as np

# Central settings dictionary; every function below receives it as `config`.
config = {
    # CSV files loaded by read_data() for the 'train' and 'test' modes.
    'train_file_path': 'dataset/train.csv',
    'test_file_path': 'dataset/test.csv',
    # Fraction of the training CSV (taken from the first rows) held out for validation.
    'train_val_ratio': 0.1,
    # Key used by build_model() to select the classification head.
    'head': 'cnn',
    # Directory passed to from_pretrained() for the tokenizer, the model config and the weights.
    'model_path': 'dataset/NeZha_model/',
    # Batch size used for the train, validation and test DataLoaders.
    'batch_size': 16,
    # Number of passes over the training DataLoader.
    'num_epochs': 1,
    # Learning rate handed to AdamW.
    'learning_rate': 2e-5,
    # Run validation and print metrics every this many optimizer steps.
    'logging_step': 500,
    # Seed passed to seed_everything().
    'seed': 2021
}
# Use the GPU when torch reports one, otherwise fall back to the CPU.
config['device'] = 'cuda' if torch.cuda.is_available() else 'cpu'

def seed_everything(seed):
    """Apply one seed to every random number generator used in this notebook.

    Calls ``random.seed``, ``np.random.seed``, ``torch.manual_seed`` and
    ``torch.cuda.manual_seed_all`` with the same value, in that order.

    Args:
        seed: integer seed forwarded to each library.

    Returns:
        The ``seed`` argument, unchanged.
    """
    seeders = (
        random.seed,
        np.random.seed,
        torch.manual_seed,
        torch.cuda.manual_seed_all,
    )
    for apply_seed in seeders:
        apply_seed(seed)
    return seed

# Seed Python, NumPy and torch up front, before any data loading or model setup.
seed_everything(config['seed'])

2021

In [3]:
import pandas as pd
from tqdm.notebook import tqdm as tqdm
from collections import defaultdict

def read_data(config, tokenizer, mode='train'):
    """Load one CSV split and tokenize every row.

    The file is read from ``config[f'{mode}_file_path']``. For each row the
    last column is used as the sentence. In ``'train'`` mode the column at
    position 1 is used as the label; in any other mode a dummy label ``0`` is
    stored instead.

    In ``'train'`` mode the first ``int(len(rows) * config['train_val_ratio'])``
    rows (by position in the file) form the validation split and the rest form
    the training split.

    Args:
        config: dict with ``'<mode>_file_path'`` and, for training,
            ``'train_val_ratio'``.
        tokenizer: object exposing ``encode_plus(sentence, ...)`` that returns
            a mapping with ``'input_ids'``, ``'token_type_ids'`` and
            ``'attention_mask'``.
        mode: ``'train'`` for the train/validation split; anything else is
            handled as an unlabeled split.

    Returns:
        ``'train'`` mode: ``(X_train, y_train, X_val, y_val, label2id, id2label)``
        where the ``X_*`` are dicts of per-example lists and the ``y_*`` are
        ``torch.long`` tensors of label ids.
        Other modes: ``(X_test, y_test)`` with ``y_test`` a ``torch.long``
        tensor of zeros.
    """
    is_train = mode == 'train'
    data_df = pd.read_csv(config[f'{mode}_file_path'], sep=',')

    if is_train:
        X_train, y_train = defaultdict(list), []
        X_val, y_val = defaultdict(list), []
        num_val = int(len(data_df) * config['train_val_ratio'])
    else:
        X_test, y_test = defaultdict(list), []

    rows = tqdm(data_df.iterrows(), desc=f'preprocess {mode} data', total=len(data_df))
    for position, (_, row) in enumerate(rows):
        # Use .iloc for positional access: plain row[1] / row[-1] on a
        # label-indexed Series relies on a deprecated positional fallback.
        label = row.iloc[1] if is_train else 0
        sentence = row.iloc[-1]
        inputs = tokenizer.encode_plus(sentence, add_special_tokens=True, return_token_type_ids=True, return_attention_mask=True)

        # Pick the destination split once, then append in a single place.
        if not is_train:
            features, targets = X_test, y_test
        elif position < num_val:
            features, targets = X_val, y_val
        else:
            features, targets = X_train, y_train

        _append_encoding(features, inputs)
        targets.append(label)

    if is_train:
        # Build the label map from both splits so that a label which only
        # occurs in the validation rows does not raise KeyError below.
        label2id = {label: i for i, label in enumerate(np.unique(y_train + y_val))}
        id2label = {i: label for label, i in label2id.items()}
        y_train = torch.tensor([label2id[label] for label in y_train], dtype=torch.long)
        y_val = torch.tensor([label2id[label] for label in y_val], dtype=torch.long)
        return X_train, y_train, X_val, y_val, label2id, id2label

    y_test = torch.tensor(y_test, dtype=torch.long)
    return X_test, y_test


def _append_encoding(features, inputs):
    """Append one tokenizer output to the per-field lists in ``features``."""
    for field in ('input_ids', 'token_type_ids', 'attention_mask'):
        features[field].append(inputs[field])

In [4]:
from torch.utils.data import Dataset

class TNEWSData(Dataset):
    """Dataset wrapper around tokenized features and their label tensor.

    ``X`` is a mapping with ``'input_ids'``, ``'token_type_ids'`` and
    ``'attention_mask'`` entries, each indexable by example position. ``y`` is
    a tensor whose first dimension gives the number of examples.
    """

    def __init__(self, X, y):
        self.x, self.y = X, y

    def __len__(self):
        # The dataset length is taken from the label tensor, not from X.
        return self.y.size(0)

    def __getitem__(self, idx):
        """Return the un-padded fields of example ``idx`` as a dict."""
        features = self.x
        sample = {'input_ids': features['input_ids'][idx]}
        sample['label'] = self.y[idx]
        sample['token_type_ids'] = features['token_type_ids'][idx]
        sample['attention_mask'] = features['attention_mask'][idx]
        return sample

In [5]:
def collate_fn(examples):
    """Pad a list of examples to a common length and stack them into tensors.

    Each example is a dict with ``'input_ids'``, ``'label'``,
    ``'token_type_ids'`` and ``'attention_mask'``. Every sequence field is
    right-padded with zeros up to the longest ``input_ids`` in the batch.

    Returns:
        dict with ``'input_ids'``, ``'labels'``, ``'token_type_ids'`` and
        ``'attention_mask'``, all ``torch.long`` tensors; the sequence fields
        have shape ``(len(examples), longest_input_ids)``.
    """
    ids_per_example = [example['input_ids'] for example in examples]
    labels = [example['label'] for example in examples]
    segments_per_example = [example['token_type_ids'] for example in examples]
    masks_per_example = [example['attention_mask'] for example in examples]

    longest = max(map(len, ids_per_example))
    padded_shape = (len(labels), longest)
    input_ids_tensor = torch.zeros(padded_shape, dtype=torch.long)
    token_type_ids_tensor = torch.zeros(padded_shape, dtype=torch.long)
    attention_mask_tensor = torch.zeros(padded_shape, dtype=torch.long)

    per_example = zip(ids_per_example, segments_per_example, masks_per_example)
    for row, (ids, segments, mask) in enumerate(per_example):
        # All three fields are written over the span given by len(input_ids).
        width = len(ids)
        input_ids_tensor[row, :width] = torch.tensor(ids, dtype=torch.long)
        token_type_ids_tensor[row, :width] = torch.tensor(segments, dtype=torch.long)
        attention_mask_tensor[row, :width] = torch.tensor(mask, dtype=torch.long)

    return {
        'input_ids': input_ids_tensor,
        'labels': torch.tensor(labels, dtype=torch.long),
        'token_type_ids': token_type_ids_tensor,
        'attention_mask': attention_mask_tensor,
    }

In [6]:
from transformers import BertTokenizer
from torch.utils.data import DataLoader

def build_dataloader(config):
    """Tokenize the train and test CSVs and wrap each split in a DataLoader.

    The tokenizer is loaded from ``config['model_path']``. Only the training
    loader shuffles; all three use ``config['batch_size']``, four worker
    processes and ``collate_fn`` for padding.

    Returns:
        ``(train_dataloader, val_dataloader, test_dataloader, id2label)``.
    """
    tokenizer = BertTokenizer.from_pretrained(config['model_path'])
    X_train, y_train, X_val, y_val, _, id2label = read_data(config, tokenizer, mode='train')
    X_test, y_test = read_data(config, tokenizer, mode='test')

    def make_loader(features, targets, shuffle):
        # One place for the settings shared by all three loaders.
        return DataLoader(
            TNEWSData(features, targets),
            batch_size=config['batch_size'],
            num_workers=4,
            shuffle=shuffle,
            collate_fn=collate_fn,
        )

    train_dataloader = make_loader(X_train, y_train, True)
    val_dataloader = make_loader(X_val, y_val, False)
    test_dataloader = make_loader(X_test, y_test, False)

    return train_dataloader, val_dataloader, test_dataloader, id2label

In [7]:
# Tokenize both CSVs and build the three DataLoaders plus the id -> label map.
train_dataloader, val_dataloader, test_dataloader, id2label = build_dataloader(config)

preprocess train data:   0%|          | 0/53360 [00:00<?, ?it/s]

preprocess test data:   0%|          | 0/10000 [00:00<?, ?it/s]

In [8]:
# Sanity check: print only the first collated training batch, then stop iterating.
for batch in train_dataloader:
    print(batch)
    break

{'input_ids': tensor([[  101,  2682,  1762,  6948,  2336,  1453,  6804,   743,  5018,   753,
          1947,  2791,  2094,  8024,   126,  1283,  2340,  1381,  4638,  8024,
          3300,   784,   720,  1962,  4638,  2972,  5773,  8043,   102,     0,
             0,     0,     0,     0,     0,     0,     0,     0],
        [  101,  8108,  2207,  3198,   704,  1744,  7674,  2168,   131,  2199,
          8208,   674,  2207,   868,  1773,   976,  1168,  2399,  1057, 10194,
          8157,   783,   117,  4706,  4518,  7790,  1213,  1772,  6631,  3330,
          1649,  6411,   102,     0,     0,     0,     0,     0],
        [  101,  3173,  3528,  1059,  1744,  6121,  1266,   776,  4991,   100,
          2207,  4923,  2415,  3173,  6629,  4157,  2199,   715,  1215,   102,
             0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
             0,     0,     0,     0,     0,     0,     0,     0],
        [  101,  5303,   754,  5023,  1168,   872,  8013,  1266,   677,  2408

In [9]:
from NeZha import *
import torch.nn as nn

class NeZhaForTNEWS(NeZhaPreTrainedModel):
    """NeZha encoder combined with a pluggable classification head.

    Args:
        config: model configuration passed to the base class and to
            ``NeZhaModel.from_pretrained``.
        model_path: location of the pretrained encoder weights.
        classifier: module called as ``classifier(hidden_states, input_ids)``
            that returns the classification logits.
    """

    def __init__(self, config, model_path, classifier):
        super().__init__(config)
        self.bert = NeZhaModel.from_pretrained(model_path, config=config)
        self.classifier = classifier

    def forward(self, input_ids, token_type_ids, attention_mask, labels=None):
        """Encode a batch and classify it.

        Args:
            input_ids, token_type_ids, attention_mask: tensors forwarded to
                the encoder by keyword.
            labels: optional target class ids. Defaults to ``None`` so the
                model can be called for inference without dummy labels.

        Returns:
            ``(loss, logits)`` when ``labels`` is given, otherwise
            ``(logits,)``.
        """
        encoder_outputs = self.bert(input_ids=input_ids,
                                    attention_mask=attention_mask,
                                    token_type_ids=token_type_ids)
        # The head indexes this value with [-1], so it is expected to be the
        # per-layer hidden states, which requires output_hidden_states=True.
        # NOTE(review): confirm that index 2 matches NeZhaModel's return layout.
        hidden_states = encoder_outputs[2]
        logits = self.classifier(hidden_states, input_ids)

        if labels is None:
            return (logits,)

        loss = nn.CrossEntropyLoss()(logits, labels.view(-1))
        return (loss, logits)

In [10]:
import torch.nn.functional as F
import torch.nn as nn

class ConvClassifier(nn.Module):
    """Classification head: 1-D convolution, global max pooling, linear layer.

    Reads ``hidden_size``, ``hidden_dropout_prob`` and ``num_labels`` from
    ``config``.
    """

    def __init__(self, config):
        super().__init__()
        width = config.hidden_size
        # Kernel size 3 with no padding: the sequence axis shrinks by 2.
        self.conv = nn.Conv1d(width, width, 3)
        self.global_max_pool = nn.AdaptiveMaxPool1d(1)
        self.dropout = nn.Dropout(config.hidden_dropout_prob)
        self.fc = nn.Linear(width, config.num_labels)

    def forward(self, hidden_states, input_ids):
        """Compute logits from the last entry of ``hidden_states``.

        ``input_ids`` is accepted to match the call made by the model but is
        not used by this head.
        """
        # Take only the last layer.
        last_layer = self.dropout(hidden_states[-1])
        # (bs, seq_len, hidden_size) -> (bs, hidden_size, seq_len) for Conv1d.
        channels_first = last_layer.permute(0, 2, 1)
        # (bs, hidden_size, seq_len_out)
        activated = F.relu(self.conv(channels_first))
        # (bs, hidden_size, 1) -> (bs, hidden_size)
        pooled = self.global_max_pool(activated).squeeze(2)
        return self.fc(pooled)

In [11]:
def build_model(model_path, config, head):
    """Create a ``NeZhaForTNEWS`` model with the requested classification head.

    Args:
        model_path: location of the pretrained encoder weights.
        config: model configuration shared by the encoder and the head.
        head: name of the head to attach; ``'cnn'`` is the only entry.

    Raises:
        KeyError: if ``head`` is not a registered head name.
    """
    head_registry = dict(cnn=ConvClassifier)
    classifier = head_registry[head](config)
    return NeZhaForTNEWS(config, model_path, classifier)

In [13]:
from sklearn.metrics import f1_score
import numpy as np
from tqdm.notebook import tqdm as tqdm

def evaluation(config, model, val_dataloader):
    """Score ``model`` on ``val_dataloader``.

    Puts the model in eval mode (the caller is responsible for switching back
    to train mode), runs every batch without gradient tracking and collects
    the argmax predictions.

    Args:
        config: dict providing ``'device'``.
        model: callable returning ``(loss, logits, ...)`` for a batch that
            includes ``labels``.
        val_dataloader: iterable of batches as produced by ``collate_fn``.

    Returns:
        ``(avg_val_loss, f1)``: the mean per-batch loss and the macro F1 score.
    """
    model.eval()
    total_loss = 0.
    gold_chunks, pred_chunks = [], []
    progress = tqdm(val_dataloader, desc='Evaluation', total=len(val_dataloader))

    with torch.no_grad():
        for batch in progress:
            # Keep the labels from the batch as delivered, before the device move.
            gold_chunks.append(batch['labels'])
            on_device = {name: tensor.to(config['device']) for name, tensor in batch.items()}
            loss, logits = model(**on_device)[:2]

            total_loss += loss.item()
            pred_chunks.append(logits.argmax(dim=-1).detach().cpu())

    avg_val_loss = total_loss / len(val_dataloader)
    gold = torch.cat(gold_chunks, dim=0).numpy()
    predicted = torch.cat(pred_chunks, dim=0).numpy()
    return avg_val_loss, f1_score(gold, predicted, average='macro')

In [17]:
from transformers import BertConfig, BertForSequenceClassification
import torch.optim as optim
from tqdm.notebook import trange as trange


def train(config, id2label, train_dataloader, val_dataloader):
    """Fine-tune the classifier and report validation metrics at intervals.

    Builds the model from ``config['model_path']`` with hidden-state output
    enabled and one output class per entry of ``id2label``, then optimizes it
    with AdamW for ``config['num_epochs']`` epochs. Every
    ``config['logging_step']`` optimizer steps it prints the mean training
    loss since the previous report together with the validation loss and F1.

    Args:
        config: settings dict (``model_path``, ``head``, ``learning_rate``,
            ``device``, ``num_epochs``, ``logging_step``).
        id2label: mapping whose length sets the number of output classes.
        train_dataloader: batches used for optimization.
        val_dataloader: batches passed to ``evaluation``.

    Returns:
        The trained model.
    """
    model_dir = config['model_path']
    bert_config = NeZhaConfig.from_pretrained(model_dir)
    # The classification head consumes the per-layer hidden states.
    bert_config.output_hidden_states = True
    bert_config.num_labels = len(id2label)

    model = build_model(model_dir, bert_config, config['head'])
    optimizer = optim.AdamW(model.parameters(), lr=config['learning_rate'])
    model.to(config['device'])

    epochs = trange(config['num_epochs'])
    global_steps = 0
    running_loss = 0.
    loss_at_last_report = 0.

    for _ in epochs:
        batches = tqdm(train_dataloader, desc='Training', total=len(train_dataloader))
        model.train()
        for batch in batches:
            batch = {name: tensor.to(config['device']) for name, tensor in batch.items()}
            loss = model(**batch)[0]
            # Gradients are cleared after the forward pass and before backward.
            model.zero_grad()
            loss.backward()
            optimizer.step()

            running_loss += loss.item()
            global_steps += 1

            if global_steps % config['logging_step'] != 0:
                continue

            # Mean training loss over the steps since the previous report.
            window_loss = (running_loss - loss_at_last_report) / config['logging_step']
            loss_at_last_report = running_loss

            avg_val_loss, f1 = evaluation(config, model, val_dataloader)

            print(f'>>> training loss: {window_loss:.4f}, valid loss: {avg_val_loss:.4f}, '
                  f'valid f1 score: {f1:.4f}')
            # evaluation() leaves the model in eval mode; switch back.
            model.train()

    return model

In [18]:
# Run fine-tuning with the settings in `config`; the trained model is kept in `model`.
model = train(config, id2label, train_dataloader, val_dataloader)

Some weights of the model checkpoint at dataset/NeZha_model/ were not used when initializing NeZhaModel: ['cls.predictions.decoder.weight', 'cls.predictions.transform.LayerNorm.bias', 'cls.predictions.transform.dense.weight', 'cls.predictions.transform.LayerNorm.weight', 'cls.predictions.transform.dense.bias', 'cls.seq_relationship.bias', 'cls.predictions.bias', 'cls.seq_relationship.weight']
- This IS expected if you are initializing NeZhaModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing NeZhaModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of NeZhaModel were not initialized from the model checkpoint at dataset/NeZha_model/ and are newly initialized: ['bert.encoder.layer.7.a

  0%|          | 0/1 [00:00<?, ?it/s]

Training:   0%|          | 0/3002 [00:00<?, ?it/s]

Evaluation:   0%|          | 0/334 [00:00<?, ?it/s]

>>> training loss: 1.7410, valid loss: 1.4098, valid f1 score: 0.5098


Evaluation:   0%|          | 0/334 [00:00<?, ?it/s]

>>> training loss: 1.3650, valid loss: 1.3225, valid f1 score: 0.5182


Evaluation:   0%|          | 0/334 [00:00<?, ?it/s]

>>> training loss: 1.3186, valid loss: 1.2733, valid f1 score: 0.5097


Evaluation:   0%|          | 0/334 [00:00<?, ?it/s]

>>> training loss: 1.2784, valid loss: 1.2766, valid f1 score: 0.5132


Evaluation:   0%|          | 0/334 [00:00<?, ?it/s]

>>> training loss: 1.2448, valid loss: 1.2599, valid f1 score: 0.5165


Evaluation:   0%|          | 0/334 [00:00<?, ?it/s]

>>> training loss: 1.2555, valid loss: 1.2236, valid f1 score: 0.5495
