In [18]:
from torch.utils.data import Dataset
import json

class AFQMC(Dataset):
    def __init__(self, data_file):
        self.data = self.load_data(data_file)
    
    def load_data(self, data_file):
        Data = {}
        with open(data_file, 'rt') as f:
            for idx, line in enumerate(f):
                sample = json.loads(line.strip())
                Data[idx] = sample
        return Data
    
    def __len__(self):
        return len(self.data)

    def __getitem__(self, idx):
        return self.data[idx]

train_data = AFQMC('data/afqmc_public/train.json')
valid_data = AFQMC('data/afqmc_public/dev.json')

train_data[0]

{'sentence1': '蚂蚁借呗等额还款可以换成先息后本吗', 'sentence2': '借呗有先息到期还本吗', 'label': '0'}

In [19]:
from torch.utils.data import IterableDataset

class IterableAFQMC(IterableDataset):
    def __init__(self, data_file):
        self.data_file = data_file
    
    def __iter__(self):
        with open(self.data_file, 'rt') as f:
            for line in f:
                sample = json.loads(line.strip())
                yield sample
iter_train_data = IterableAFQMC('data/afqmc_public/train.json')
next(iter(iter_train_data))

{'sentence1': '蚂蚁借呗等额还款可以换成先息后本吗', 'sentence2': '借呗有先息到期还本吗', 'label': '0'}

In [20]:
import torch
from torch.utils.data import DataLoader
from transformers import AutoTokenizer

checkpoint = "bert-base-chinese"
tokenizer = AutoTokenizer.from_pretrained(checkpoint)

def collote_fn(batch_samples):
    batch_sentence_1, batch_sentence_2 = [], []
    batch_label = []
    for sample in batch_samples:
        batch_sentence_1.append(sample['sentence1'])
        batch_sentence_2.append(sample['sentence2'])
        batch_label.append(int(sample['label']))
    X = tokenizer(
        batch_sentence_1,
        batch_sentence_2,
        padding=True,
        truncation=True,
        return_tensors="pt"
    )
    y = torch.tensor(batch_label)
    return X, y

train_dataloader = DataLoader(train_data, batch_size=4, shuffle=True, collate_fn=collote_fn)
test_dataloader = DataLoader(valid_data, batch_size=4, shuffle=False, collate_fn=collote_fn)

batch_X, batch_y = next(iter(train_dataloader))
print("batch_X shape:", {k: v.shape for k, v in batch_X.items()})
print("batch_y shape:", batch_y.shape)
print(batch_X)
print(batch_y)

batch_X shape: {'input_ids': torch.Size([4, 34]), 'token_type_ids': torch.Size([4, 34]), 'attention_mask': torch.Size([4, 34])}
batch_y shape: torch.Size([4])
{'input_ids': tensor([[ 101, 2582,  720, 2199, 1369,  671,  702, 5709, 1446, 6572, 1384, 2940,
         6814, 3341,  102, 2769, 2940,  702, 2797, 3322, 1384, 4772, 2582,  720,
         5709, 1446, 3766, 3300,  749,  102,    0,    0,    0,    0],
        [ 101, 6010, 6009,  955, 1446, 2990, 1184, 6820, 3621, 8024, 1168, 6820,
         3621, 3189, 3309, 6820, 4500, 6820, 1658,  102,  955, 1446, 2972, 6826,
         6820, 3621, 3189,  102,    0,    0,    0,    0,    0,    0],
        [ 101, 2769, 6820, 3612, 1914, 2208, 5709, 1446, 3766, 6820,  102, 5709,
         1446, 6820, 3300, 1914, 2208, 7178, 6206, 6820, 2600, 1066,  102,    0,
            0,    0,    0,    0,    0,    0,    0,    0,    0,    0],
        [ 101, 6010, 6009, 5709, 1446, 6843, 4638,  831, 6999,  833, 1447, 3221,
         1048, 6589, 4638, 1408,  102, 5709, 1446,

In [21]:
from torch import nn
from transformers import AutoModel

device = 'cuda' if torch.cuda.is_available() else 'cpu'
print(f"Using {device} device")

class BertForPairwiseCLS(nn.Module):
    def __init__(self):
        super(BertForPairwiseCLS, self).__init__()
        self.bert_encoder = AutoModel.from_pretrained(checkpoint)
        self.dropout = nn.Dropout(0.1)
        self.classifier = nn.Linear(768, 2)
    
    def forward(self, x):
        bert_output = self.bert_encoder(**x)
        cls_vector = bert_output.last_hidden_state[:, 0, :]
        cls_vector = self.dropout(cls_vector)
        logits = self.classifier(cls_vector)
        return logits

model = BertForPairwiseCLS().to(device)
print(model)

Using cpu device
BertForPairwiseCLS(
  (bert_encoder): BertModel(
    (embeddings): BertEmbeddings(
      (word_embeddings): Embedding(21128, 768, padding_idx=0)
      (position_embeddings): Embedding(512, 768)
      (token_type_embeddings): Embedding(2, 768)
      (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
      (dropout): Dropout(p=0.1, inplace=False)
    )
    (encoder): BertEncoder(
      (layer): ModuleList(
        (0-11): 12 x BertLayer(
          (attention): BertAttention(
            (self): BertSdpaSelfAttention(
              (query): Linear(in_features=768, out_features=768, bias=True)
              (key): Linear(in_features=768, out_features=768, bias=True)
              (value): Linear(in_features=768, out_features=768, bias=True)
              (dropout): Dropout(p=0.1, inplace=False)
            )
            (output): BertSelfOutput(
              (dense): Linear(in_features=768, out_features=768, bias=True)
              (LayerNorm): LayerNorm(

In [22]:
from torch import nn
from transformers import AutoConfig, BertPreTrainedModel, BertModel

device = 'cuda' if torch.cuda.is_available() else 'cpu'
print(f"Using {device} device")

class BertForPairwiseCLS(BertPreTrainedModel):
    def __init__(self, config):
        super().__init__(config)
        self.bert = BertModel(config, add_pooling_layer=False)
        self.dropout = nn.Dropout(config.hidden_dropout_prob)
        self.classifier = nn.Linear(768, 2)
        self.post_init()
    
    def forward(self, x):
        bert_output = self.bert(**x)
        cls_vectors = bert_output.last_hidden_state[:, 0, :]
        cls_vectors = self.dropout(cls_vectors)
        logits = self.classifier(cls_vectors)
        return logits

config = AutoConfig.from_pretrained(checkpoint)
model = BertForPairwiseCLS.from_pretrained(checkpoint, config=config).to(device)
print(model)

Using cpu device


Some weights of BertForPairwiseCLS were not initialized from the model checkpoint at bert-base-chinese and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


BertForPairwiseCLS(
  (bert): BertModel(
    (embeddings): BertEmbeddings(
      (word_embeddings): Embedding(21128, 768, padding_idx=0)
      (position_embeddings): Embedding(512, 768)
      (token_type_embeddings): Embedding(2, 768)
      (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
      (dropout): Dropout(p=0.1, inplace=False)
    )
    (encoder): BertEncoder(
      (layer): ModuleList(
        (0-11): 12 x BertLayer(
          (attention): BertAttention(
            (self): BertSdpaSelfAttention(
              (query): Linear(in_features=768, out_features=768, bias=True)
              (key): Linear(in_features=768, out_features=768, bias=True)
              (value): Linear(in_features=768, out_features=768, bias=True)
              (dropout): Dropout(p=0.1, inplace=False)
            )
            (output): BertSelfOutput(
              (dense): Linear(in_features=768, out_features=768, bias=True)
              (LayerNorm): LayerNorm((768,), eps=1e-12, elemen

In [23]:
outputs = model(batch_X)
outputs.shape

torch.Size([4, 2])

In [24]:
from tqdm.auto import tqdm

def train_loop(dataloader, model, loss_fn, optimizer, lr_scheduler, epoch, total_loss):
    progress_bar = tqdm(range(len(dataloader)))
    progress_bar.set_description(f"loss: {0:>7f}")
    finish_step_num = (epoch - 1) * len(dataloader)

    model.train()
    for step, (X, y) in enumerate(dataloader, start=1):
        X, y = X.to(device), y.to(device)
        pred = model(X)
        loss = loss_fn(pred, y)

        optimizer.zero_grad()
        loss.backward()
        optimizer.step()
        lr_scheduler.step()

        total_loss += loss.item()
        progress_bar.set_description(f"loss: {total_loss/(finish_step_num + step):>7f}")
        progress_bar.update(1)
    return total_loss

def test_loop(dataloader, model, mode='Test'):
    assert mode in ['Valid', 'Test']
    size = len(dataloader.dataset)
    correct = 0

    model.eval()
    with torch.no_grad():
        for X, y in dataloader:
            X, y = X.to(device), y.to(device)
            pred = model(X)
            correct += (pred.argmax(1) == y).type(torch.float).sum().item()
    
    correct /= size
    print(f"{mode} Accuracy: {(100 * correct):>0.1f}%\n")

In [25]:
from transformers import  get_scheduler
from torch.optim import AdamW

optimizer = AdamW(model.parameters(), lr=5e-5)
epochs = 1
num_training_steps = epochs * len(train_dataloader)
lr_scheduler = get_scheduler(
    "linear",
    optimizer=optimizer,
    num_training_steps=0,
    num_warmup_steps=num_training_steps
)
print(num_training_steps)

8584


In [27]:
total_loss = 0.
epoch_num = 1
loss_fn = nn.CrossEntropyLoss()
for t in range(epoch_num):
    print(f"Epoch {t+1}/{epoch_num}\n-------------------------------")
    total_loss = train_loop(train_dataloader, model, loss_fn, optimizer, lr_scheduler, t+1, total_loss)
    test_loop(test_dataloader, model, mode='Valid')
print("Done!")

Epoch 1/1
-------------------------------


loss: 0.612353: 100%|██████████| 8584/8584 [33:14<00:00,  4.30it/s]


Valid Accuracy: 69.0%

Done!
