In [7]:
import torch
import random
import numpy as np

# Experiment configuration shared by every cell below.
config = {
    # --- data ---
    'train_file_path': 'dataset/train.csv',
    'test_file_path': 'dataset/test.csv',
    # fraction of the training CSV (taken from its first rows) held out for validation
    'train_val_ratio': 0.1,

    # --- model ---
    # directory the tokenizer, model config and pretrained weights are loaded from
    'model_path': 'dataset/NeZha_model/',
    # key of the classification head selected in build_model
    'head': 'cnn',

    # --- optimisation ---
    'batch_size': 16,
    'num_epochs': 1,
    # share of the total training steps spent on learning-rate warmup
    'warmup_ratio': 0.1,
    'learning_rate': 2e-5,
    # run validation and print losses every this many optimisation steps
    'logging_step': 500,

    # --- reproducibility / hardware ---
    'seed': 2021,
    # prefer the GPU whenever PyTorch can see one
    'device': 'cuda' if torch.cuda.is_available() else 'cpu',
}

def seed_everything(seed):
    """Apply ``seed`` to every random number generator this notebook relies on.

    Seeds, in order: Python's ``random`` module, NumPy's global RNG,
    ``torch.manual_seed`` and ``torch.cuda.manual_seed_all``.

    Returns:
        The ``seed`` argument unchanged (so a notebook cell ending in this
        call displays the value that was used).
    """
    seeders = (
        random.seed,
        np.random.seed,
        torch.manual_seed,
        torch.cuda.manual_seed_all,
    )
    for apply_seed in seeders:
        apply_seed(seed)
    return seed

# Apply the configured seed to all RNGs; the returned seed is shown as the cell output.
seed_everything(config['seed'])

2021

In [2]:
import pandas as pd
from collections import defaultdict
from tqdm.notebook import tqdm

def read_data(config, tokenizer, mode='train'):
    """Load one CSV split and tokenize every sentence with ``tokenizer.encode_plus``.

    The sentence is read from the last CSV column. In ``'train'`` mode the label
    is read from the second column and the first
    ``int(n_rows * config['train_val_ratio'])`` rows form the validation split;
    in any other mode every row receives the dummy label ``0``.

    Args:
        config: mapping providing ``f'{mode}_file_path'`` and, in train mode,
            ``'train_val_ratio'``.
        tokenizer: object exposing ``encode_plus``; its result must contain
            ``input_ids``, ``token_type_ids`` and ``attention_mask``.
        mode: ``'train'``, or the prefix of another ``*_file_path`` key
            (e.g. ``'test'``).

    Returns:
        Train mode: ``(X_train, y_train, X_val, y_val, label2id, id2label)``.
        Otherwise: ``(X_test, y_test)``.
        Every ``X_*`` is a ``defaultdict(list)`` holding the un-padded per-row
        lists returned by the tokenizer; every ``y_*`` is a ``torch.long`` tensor.
    """
    data_df = pd.read_csv(config[f'{mode}_file_path'], sep=',')
    is_train = mode == 'train'
    num_rows = len(data_df)

    # Columns are addressed by position through .iloc: integer keys on a
    # string-labelled Series (the old ``row[1]`` / ``row[-1]``) are deprecated
    # positional fallbacks in pandas.
    sentences = data_df.iloc[:, -1].tolist()
    if is_train:
        raw_labels = data_df.iloc[:, 1].tolist()
        num_val = int(num_rows * config['train_val_ratio'])
    else:
        raw_labels = [0] * num_rows
        num_val = 0  # no validation slice outside train mode

    feature_keys = ('input_ids', 'token_type_ids', 'attention_mask')
    X_val, y_val = defaultdict(list), []
    X_rest, y_rest = defaultdict(list), []  # training rows, or all rows in test mode

    progress = tqdm(zip(sentences, raw_labels), desc=f'Preprocessing {mode} data', total=num_rows)
    # The split is decided by row position, not by the DataFrame's index labels.
    for position, (sentence, label) in enumerate(progress):
        inputs = tokenizer.encode_plus(sentence, add_special_tokens=True,
                                       return_token_type_ids=True, return_attention_mask=True)
        X_split, y_split = (X_val, y_val) if position < num_val else (X_rest, y_rest)
        for key in feature_keys:
            X_split[key].append(inputs[key])
        y_split.append(label)

    if not is_train:
        return X_rest, torch.tensor(y_rest, dtype=torch.long)

    # Build the mapping from ALL labels in the file: a class that only occurs in
    # the validation slice would otherwise raise KeyError below.
    label2id = {label: i for i, label in enumerate(np.unique(raw_labels))}
    id2label = {i: label for label, i in label2id.items()}
    y_train = torch.tensor([label2id[label] for label in y_rest], dtype=torch.long)
    y_val = torch.tensor([label2id[label] for label in y_val], dtype=torch.long)
    return X_rest, y_train, X_val, y_val, label2id, id2label

In [3]:
from torch.utils.data import Dataset

class TNEWSDataset(Dataset):
    """Map-style dataset over pre-tokenized features and their labels.

    ``X`` maps ``'input_ids'``, ``'token_type_ids'`` and ``'attention_mask'``
    to indexable per-example sequences; ``y`` holds the labels and must
    provide ``size(0)`` (e.g. a tensor), which defines the dataset length.
    """

    _FEATURE_KEYS = ('input_ids', 'token_type_ids', 'attention_mask')

    def __init__(self, X, y):
        self.x, self.y = X, y

    def __getitem__(self, idx):
        # One un-padded example; padding happens later at batch-collation time.
        example = {key: self.x[key][idx] for key in self._FEATURE_KEYS}
        example['label'] = self.y[idx]
        return example

    def __len__(self):
        return self.y.size(0)

In [4]:
# merge examples into tensor
def collate_fn(examples):
    """Pad a list of examples to a common length and stack them into tensors.

    Each example is a dict with ``input_ids``, ``token_type_ids``,
    ``attention_mask`` (equal-length integer sequences) and ``label``.
    Sequences are right-padded with zeros up to the longest ``input_ids``
    in the batch.

    Returns:
        Dict with ``input_ids``, ``token_type_ids``, ``attention_mask`` of
        shape ``(batch, max_len)`` and ``labels`` of shape ``(batch,)``,
        all ``torch.long``.
    """
    batch = list(examples)
    feature_keys = ('input_ids', 'token_type_ids', 'attention_mask')
    longest = max(len(example['input_ids']) for example in batch)

    # Start from all-zero buffers so untouched tail positions act as padding.
    collated = {key: torch.zeros((len(batch), longest), dtype=torch.long) for key in feature_keys}
    for row, example in enumerate(batch):
        width = len(example['input_ids'])
        for key in feature_keys:
            collated[key][row, :width] = torch.tensor(example[key], dtype=torch.long)

    collated['labels'] = torch.tensor([example['label'] for example in batch], dtype=torch.long)
    return collated

In [5]:
from transformers import BertTokenizer
from torch.utils.data import DataLoader

def build_dataloader(config):
    """Tokenize the train/test CSVs and wrap each split in a ``DataLoader``.

    Args:
        config: dict providing ``'model_path'`` (tokenizer location),
            ``'batch_size'`` and the keys ``read_data`` needs. An optional
            ``'num_workers'`` entry overrides the number of loader worker
            processes (default ``4``, the previously hard-coded value); set it
            to ``0`` to load in the main process.

    Returns:
        ``(id2label, test_dataloader, train_dataloader, val_dataloader)``.
        Only the training loader shuffles.
    """
    tokenizer = BertTokenizer.from_pretrained(config['model_path'])
    X_train, y_train, X_val, y_val, label2id, id2label = read_data(config, tokenizer, mode='train')
    X_test, y_test = read_data(config, tokenizer, mode='test')

    # Worker count used to be a literal 4 repeated three times; keep 4 as the
    # default but let the config override it.
    num_workers = config.get('num_workers', 4)

    def make_loader(features, targets, shuffle):
        # All three loaders share batch size, worker count and collation.
        return DataLoader(dataset=TNEWSDataset(features, targets), batch_size=config['batch_size'],
                          num_workers=num_workers, shuffle=shuffle, collate_fn=collate_fn)

    train_dataloader = make_loader(X_train, y_train, shuffle=True)
    val_dataloader = make_loader(X_val, y_val, shuffle=False)
    test_dataloader = make_loader(X_test, y_test, shuffle=False)
    return id2label, test_dataloader, train_dataloader, val_dataloader

In [6]:
# Tokenize both CSV files and build the train / validation / test loaders plus the id -> label map.
id2label, test_dataloader, train_dataloader, val_dataloader = build_dataloader(config)

Preprocessing train data:   0%|          | 0/53360 [00:00<?, ?it/s]

Preprocessing test data:   0%|          | 0/10000 [00:00<?, ?it/s]

In [8]:
from NeZha import *
from extra_loss import FocalLoss

class NeZhaForTNEWS(NeZhaPreTrainedModel):
    """NeZha encoder followed by a pluggable classification head.

    The encoder is loaded from ``model_path``; ``classifier`` is any module
    called as ``classifier(hidden_states, input_ids)`` that returns logits.
    """

    def __init__(self, config: NeZhaConfig, model_path: str, classifier: nn.Module):
        super().__init__(config)
        self.bert = NeZhaModel.from_pretrained(model_path, config=config)
        self.classifier = classifier
        self.config = config

    def forward(self, input_ids=None, token_type_ids=None, attention_mask=None, labels=None):
        """Return ``(logits,)``, or ``(loss, logits)`` when ``labels`` is given.

        The loss is a ``FocalLoss`` built with ``config.num_labels`` classes.
        """
        encoder_outputs = self.bert(input_ids, attention_mask=attention_mask, token_type_ids=token_type_ids)
        # NOTE(review): index 2 is assumed to hold the per-layer hidden states,
        # which presumably requires config.output_hidden_states=True - confirm
        # against NeZhaModel.
        logits = self.classifier(encoder_outputs[2], input_ids)

        if labels is None:
            return (logits,)

        criterion = FocalLoss(num_classes=self.config.num_labels)
        return (criterion(logits, labels.view(-1)), logits)

In [11]:
import torch.nn.functional as F
import torch.nn as nn
from typing import List

class ConvClassifier(nn.Module):
    """Classification head: dropout -> Conv1d(k=3) -> ReLU -> global max-pool -> linear.

    ``config`` must expose ``hidden_size``, ``hidden_dropout_prob`` and
    ``num_labels``.
    """

    def __init__(self, config):
        super().__init__()
        kernel_size = 3
        hidden_size = config.hidden_size
        # padding of (k - 1) // 2 keeps the sequence length unchanged for an odd kernel
        self.conv = nn.Conv1d(in_channels=hidden_size, out_channels=hidden_size,
                              kernel_size=kernel_size, padding=(kernel_size - 1) // 2)
        self.global_max_pool = nn.AdaptiveMaxPool1d(1)
        self.dropout = nn.Dropout(config.hidden_dropout_prob)
        self.fc = nn.Linear(hidden_size, config.num_labels)

    def forward(self, hidden_states: List[torch.Tensor], input_ids: torch.Tensor):
        """Compute logits of shape ``(batch, num_labels)`` from the last entry of ``hidden_states``.

        That entry is laid out as ``(batch, seq_len, hidden_size)``.
        ``input_ids`` is accepted for interface compatibility but not used.
        """
        last_layer = self.dropout(hidden_states[-1])
        # Conv1d convolves over the last axis, so move hidden_size to the channel axis.
        channels_first = last_layer.transpose(1, 2)
        activated = torch.relu(self.conv(channels_first))
        # Max over every sequence position (padding positions are not masked out).
        pooled = self.global_max_pool(activated).squeeze(-1)
        return self.fc(pooled)

In [12]:
def build_model(model_path, config, head):
    """Build a ``NeZhaForTNEWS`` model with the classification head named ``head``.

    Args:
        model_path: location the pretrained encoder is loaded from.
        config: model configuration, passed to both the head and the model.
        head: key of the head to use; currently only ``'cnn'`` is registered.

    Raises:
        AssertionError: if ``head`` is not a registered head name.
    """
    # Registry of available heads: name -> class constructed as cls(config).
    available_heads = {
        'cnn': ConvClassifier
    }
    assert head in available_heads, "head must have been implemented"
    print(f'>>> You are using {head} head ...')

    classifier = available_heads[head](config)
    return NeZhaForTNEWS(config, model_path, classifier)

In [14]:
from sklearn.metrics import f1_score
import numpy as np

def evaluation(config, model, val_dataloader):
    """Evaluate ``model`` on ``val_dataloader``.

    Puts the model in eval mode (the caller is responsible for switching back
    to train mode) and runs every batch without gradient tracking.

    Returns:
        ``(avg_val_loss, f1)``: the loss averaged over batches and the
        macro-averaged F1 of the argmax predictions.
    """
    model.eval()
    batch_preds, batch_labels = [], []
    running_loss = 0.
    progress = tqdm(val_dataloader, desc='Evaluation', total=len(val_dataloader))

    with torch.no_grad():
        for batch in progress:
            # Keep the labels as delivered by the loader for scoring afterwards.
            batch_labels.append(batch['labels'])
            model_inputs = {name: tensor.to(config['device']) for name, tensor in batch.items()}
            loss, logits = model(**model_inputs)[:2]

            running_loss += loss.item()
            batch_preds.append(logits.argmax(dim=-1).detach().cpu())

    avg_val_loss = running_loss / len(val_dataloader)
    y_true = torch.cat(batch_labels, dim=0).numpy()
    y_pred = torch.cat(batch_preds, dim=0).numpy()
    return avg_val_loss, f1_score(y_true, y_pred, average='macro')

In [15]:
from extra_loss import *
from extra_optim import *
from torch.optim import AdamW
from tqdm.notebook import trange

def train(config, id2label, train_dataloader, val_dataloader):
    """Fine-tune the NeZha classifier and return the trained model.

    Builds the model from ``config['model_path']`` with ``len(id2label)``
    output classes, optimises it with AdamW wrapped in Lookahead under a
    warmup + linear learning-rate schedule, and every
    ``config['logging_step']`` optimisation steps prints the mean training
    loss since the previous report together with validation loss and macro F1.
    """
    bert_config = NeZhaConfig.from_pretrained(config['model_path'])
    bert_config.num_labels = len(id2label)
    # the model consumes the encoder's per-layer hidden states
    bert_config.output_hidden_states = True
    model = build_model(config['model_path'], bert_config, config['head'])

    base_optimizer = AdamW(model.parameters(), lr=config['learning_rate'])
    # Lookahead needs a base optimizer to wrap; k=5, alpha=1
    optimizer = Lookahead(base_optimizer, 5, 1)

    total_steps = config['num_epochs'] * len(train_dataloader)
    warmup_steps = int(config['warmup_ratio'] * total_steps)
    lr_scheduler = WarmupLinearSchedule(optimizer, warmup_steps=warmup_steps, t_total=total_steps)

    device = config['device']
    model.to(device)

    global_steps = 0
    cumulative_loss = 0.   # sum of every batch loss seen so far
    reported_loss = 0.     # value of cumulative_loss at the previous report

    for _ in trange(config['num_epochs']):
        progress = tqdm(train_dataloader, desc='Training', total=len(train_dataloader))
        model.train()

        for batch in progress:
            model_inputs = {name: tensor.to(device) for name, tensor in batch.items()}
            loss = model(**model_inputs)[0]

            # Clear stale gradients right before computing fresh ones.
            model.zero_grad()
            loss.backward()

            optimizer.step()
            lr_scheduler.step()

            cumulative_loss += loss.item()
            global_steps += 1

            if global_steps % config['logging_step'] != 0:
                continue

            window_loss = (cumulative_loss - reported_loss) / config['logging_step']
            reported_loss = cumulative_loss

            avg_val_loss, f1 = evaluation(config, model, val_dataloader)

            print(f'>>> training loss: {window_loss:.4f}, valid loss: {avg_val_loss:.4f}, '
                  f'valid f1 score: {f1:.4f}')
            # evaluation() left the model in eval mode; resume training mode.
            model.train()

    return model

In [16]:
# Fine-tune the model; validation loss and macro F1 are printed every config['logging_step'] steps.
model = train(config, id2label, train_dataloader, val_dataloader)

>>> You are using cnn head ...


Some weights of the model checkpoint at dataset/NeZha_model/ were not used when initializing NeZhaModel: ['cls.predictions.transform.dense.bias', 'cls.predictions.transform.dense.weight', 'cls.seq_relationship.weight', 'cls.predictions.decoder.weight', 'cls.predictions.bias', 'cls.predictions.transform.LayerNorm.bias', 'cls.predictions.transform.LayerNorm.weight', 'cls.seq_relationship.bias']
- This IS expected if you are initializing NeZhaModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing NeZhaModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of NeZhaModel were not initialized from the model checkpoint at dataset/NeZha_model/ and are newly initialized: ['bert.encoder.layer.5.a

  0%|          | 0/1 [00:00<?, ?it/s]

Training:   0%|          | 0/3002 [00:00<?, ?it/s]

Evaluation:   0%|          | 0/334 [00:00<?, ?it/s]

>>> training loss: 1.1973, valid loss: 0.7609, valid f1 score: 0.4783


Evaluation:   0%|          | 0/334 [00:00<?, ?it/s]

>>> training loss: 0.6955, valid loss: 0.6786, valid f1 score: 0.4784


Evaluation:   0%|          | 0/334 [00:00<?, ?it/s]

>>> training loss: 0.6746, valid loss: 0.6422, valid f1 score: 0.4876


Evaluation:   0%|          | 0/334 [00:00<?, ?it/s]

>>> training loss: 0.6293, valid loss: 0.6109, valid f1 score: 0.5091


Evaluation:   0%|          | 0/334 [00:00<?, ?it/s]

>>> training loss: 0.6109, valid loss: 0.5960, valid f1 score: 0.5083


Evaluation:   0%|          | 0/334 [00:00<?, ?it/s]

>>> training loss: 0.5903, valid loss: 0.5886, valid f1 score: 0.5077
