In [1]:
import os

import pandas as pd
import torch
from sklearn.model_selection import train_test_split
from torch.utils.data import DataLoader, Dataset
from transformers import BertModel, BertTokenizer
from tqdm.notebook import tqdm

from collections import deque, defaultdict
# Run on the first CUDA GPU when one is available, otherwise on the CPU.
device = torch.device("cuda:0") if torch.cuda.is_available() else torch.device("cpu")

# 1. 加载BERT-large模型

In [2]:
# Load the cased BERT-large tokenizer and encoder; both share one checkpoint
# name and one local cache directory.
bert_checkpoint = "bert-large-cased"
bert_cache_dir = "../../../BERT/large"
tokenizer = BertTokenizer.from_pretrained(bert_checkpoint, cache_dir=bert_cache_dir)
model = BertModel.from_pretrained(bert_checkpoint, cache_dir=bert_cache_dir)

In [3]:
# Model size: number of trainable parameters, printed in millions
trainable_params = sum(p.numel() for p in model.parameters() if p.requires_grad)
print(trainable_params / 1000000)

333.579264


# 2. 获取数据

In [4]:
# Read the sample CSV; its "label" and "text" columns hold stringified sequences.
df = pd.read_csv("./relations/sample.csv")
# Label strings: drop the 8-character prefix and 2-character suffix, remove
# newlines and spaces, then parse the comma-separated integers.
df["label"] = df["label"].apply(
    lambda raw: [
        int(piece)
        for piece in raw[8:-2].replace("\n", "").replace(" ", "").split(",")
    ]
)
# Text strings: drop the 2 leading and 2 trailing characters, then split on
# the "', '" separator to recover the token list.
df["text"] = df["text"].apply(lambda raw: raw[2:-2].split("', '"))

In [19]:
# Count the rows whose "entity" column is the literal string "[]".
empty_entity_rows = df[df["entity"].str[:] == "[]"]
len(empty_entity_rows)

1058

# 3. 创建dataset

In [135]:
class NERDataset(Dataset):
    """Token-level dataset backed by a DataFrame with "text" and "label" columns.

    Each row stores a list of token strings ("text") and a list of integer tags
    ("label"). When an item is fetched, the first and last element of both
    lists are dropped.
    NOTE(review): presumably those two slots are sentence-boundary markers that
    the collate function re-adds — confirm against how the CSV was produced.

    Args:
        dataframe: frame providing the "text" and "label" columns.
        tokenizer: stored on the instance; not used when fetching items.
    """

    def __init__(self, dataframe, tokenizer):
        self.data = dataframe
        self.tokenizer = tokenizer

    def __len__(self):
        """Return the number of rows in the backing frame."""
        return len(self.data)

    def __getitem__(self, index):
        """Return ``(tokens, labels)`` for the row at position ``index``.

        Rows are addressed by position (``.iloc``), so the dataset also works
        on frames whose index is not 0..n-1 (for example a train/test split
        that was not re-indexed).
        """
        text = self.data["text"].iloc[index]
        labels = self.data["label"].iloc[index][1:-1]

        # Strip spaces inside each token and drop the first/last token so the
        # token list stays aligned with the trimmed label list.
        tokens = [piece.replace(" ", "") for piece in text][1:-1]

        return tokens, labels

## 3.1. 创建数据整理函数

In [182]:
def batch_tokenizer(input_text, max_len=100, tok=None, target_device=None):
    """Convert pre-split token lists into fixed-length BERT inputs.

    Every sequence becomes ``[CLS] + ids + [SEP]`` and is truncated or padded
    so that each row has exactly ``max_len`` entries.

    Args:
        input_text: iterable of token lists (one list of token strings per sample).
        max_len: total row length, including the [CLS] and [SEP] slots.
        tok: tokenizer to use; defaults to the notebook-level ``tokenizer``.
        target_device: device for the returned tensors; defaults to the
            notebook-level ``device``.

    Returns:
        dict with ``"input_ids"`` and ``"attention_mask"`` LongTensors, both of
        shape ``(len(input_text), max_len)``.

    Raises:
        ValueError: if ``max_len`` leaves no room for [CLS] and [SEP].
    """
    if max_len < 2:
        raise ValueError("max_len must be at least 2 to fit [CLS] and [SEP]")
    tok = tokenizer if tok is None else tok
    target_device = device if target_device is None else target_device

    # Ask the tokenizer for its special-token ids instead of hard-coding them.
    cls_id, sep_id, pad_id = tok.cls_token_id, tok.sep_token_id, tok.pad_token_id
    body_len = max_len - 2  # room left for real tokens

    input_ids = []
    attention_mask = []
    for tokens in input_text:
        ids = tok.convert_tokens_to_ids(tokens)[:body_len]
        n_real = len(ids) + 2  # real tokens plus [CLS] and [SEP]
        n_pad = max_len - n_real
        input_ids.append([cls_id] + ids + [sep_id] + [pad_id] * n_pad)
        # The mask always has max_len entries, including for truncated rows,
        # so it lines up with input_ids position by position.
        attention_mask.append([1] * n_real + [0] * n_pad)

    return {
        "input_ids": torch.tensor(input_ids, dtype=torch.long).to(target_device),
        "attention_mask": torch.tensor(attention_mask, dtype=torch.long).to(
            target_device
        ),
    }


In [183]:
# Collate function for the DataLoader
def collate_fn(data):
    """Turn a list of ``(tokens, labels)`` samples into model inputs and labels.

    Label id 3 is written at the [CLS] slot, the [SEP] slot and every padding
    slot, so each label row has the same length as the tokenized inputs.

    Args:
        data: list of ``(tokens, labels)`` pairs as produced by the dataset.

    Returns:
        ``(inputs, labels)`` where ``inputs`` is the output of
        ``batch_tokenizer`` and ``labels`` is a LongTensor of shape
        ``(batch, lens)``.
    """
    special_label = 3

    tokens = [sample[0] for sample in data]
    inputs = batch_tokenizer(tokens)
    lens = inputs["input_ids"].shape[1]

    padded_labels = []
    for sample in data:
        sample_tokens, sample_labels = sample[0], sample[1]
        # Number of real tokens that survive truncation (two slots are
        # reserved for [CLS] and [SEP]).
        n_real = min(len(sample_tokens), lens - 2)
        body = list(sample_labels[:n_real])
        # If a sample has fewer labels than tokens, fill the gap with the special label.
        body += [special_label] * (n_real - len(body))
        # [CLS] label + token labels + special label for [SEP] and all padding.
        # Cutting the token labels first keeps the [SEP] slot from receiving a
        # real token label when the sequence was truncated.
        padded_labels.append(
            [special_label] + body + [special_label] * (lens - 1 - n_real)
        )

    return inputs, torch.LongTensor(padded_labels)

## 3.2. 拆分数据集

In [186]:
# Set batch size
batch_size = 16

# Split the dataset into training and validation sets (fixed seed for reproducibility)
train_df, val_df = train_test_split(df, test_size=0.2, random_state=42)

# Create the training dataset and dataloader.
# drop=True keeps the old row labels from being added as a stray "index" column.
train_dataset = NERDataset(train_df.reset_index(drop=True), tokenizer)
train_dataloader = DataLoader(
    train_dataset,
    batch_size=batch_size,
    shuffle=True,
    collate_fn=collate_fn,
    drop_last=True,
)

# Create the validation dataset and dataloader.
# Validation must be deterministic and cover every sample, so it is neither
# shuffled nor truncated to whole batches.
val_dataset = NERDataset(val_df.reset_index(drop=True), tokenizer)
val_dataloader = DataLoader(
    val_dataset,
    batch_size=batch_size,
    shuffle=False,
    collate_fn=collate_fn,
    drop_last=False,
)

In [196]:
# Smoke test: push one batch through the bare encoder and print the output shape.
# collate_fn places the batch on `device`, so the encoder has to live there too
# (nn.Module.to moves the parameters in place).
model.to(device)
# No gradients are needed for a shape check; skipping them saves memory.
with torch.no_grad():
    for i, j in train_dataloader:
        print(model(**i).last_hidden_state.shape)
        break

torch.Size([16, 100, 1024])


# 4. 搭建微调模型

In [197]:
# Downstream model
class Model(torch.nn.Module):
    """Token classifier: pretrained encoder -> GRU -> linear layer over 4 tags.

    ``forward`` returns raw, unnormalized logits of shape
    ``(batch, seq_len, 4)``. ``torch.nn.CrossEntropyLoss`` applies log-softmax
    itself, so the logits must not be passed through softmax first; taking
    ``argmax`` over the last dimension gives the predicted tag.

    Args:
        pretrained: encoder module exposing ``config.hidden_size`` and
            returning an object with ``last_hidden_state`` when called with
            the tokenized inputs as keyword arguments.
    """

    def __init__(self, pretrained):
        super().__init__()
        # Whether the encoder weights are updated during training.
        self.tuneing = False
        self.pretrained = pretrained
        self.hidden_size = pretrained.config.hidden_size
        self.rnn = torch.nn.GRU(self.hidden_size, self.hidden_size, batch_first=True)
        self.fc = torch.nn.Linear(self.hidden_size, 4)

    def forward(self, inputs):
        """Return per-token logits for a dict of encoder inputs."""
        if self.tuneing:
            out = self.pretrained(**inputs).last_hidden_state
        else:
            # Frozen encoder: skip building the autograd graph for it.
            with torch.no_grad():
                out = self.pretrained(**inputs).last_hidden_state

        out, _ = self.rnn(out)

        return self.fc(out)

    def fine_tuneing(self, tuneing):
        """Enable (True) or disable (False) training of the encoder weights.

        The encoder stays attached to the module in both modes, so ``forward``
        keeps working after freezing.
        """
        self.tuneing = tuneing
        for param in self.pretrained.parameters():
            param.requires_grad_(tuneing)

        if tuneing:
            self.pretrained.train()
        else:
            self.pretrained.eval()


# Wrap the pretrained encoder, switch on end-to-end fine-tuning, then move the
# whole module onto the selected device.
mymodel = Model(pretrained=model)
mymodel.fine_tuneing(tuneing=True)
mymodel = mymodel.to(device)

In [198]:
# Flatten the model outputs and labels, and drop the padding positions
def reshape_and_remove_pad(outs, labels, attention_mask):
    """Flatten a batch and keep only positions where the attention mask is 1.

    Args:
        outs: model outputs of shape ``(batch, lens, num_classes)``.
        labels: label ids of shape ``(batch, lens)``.
        attention_mask: mask of shape ``(batch, lens)``; 1 marks a real token.

    Returns:
        ``(outs, labels)`` with shapes ``(kept, num_classes)`` and ``(kept,)``,
        where ``kept`` is the number of unmasked positions.
    """
    # [b, lens, num_classes] -> [b*lens, num_classes]; the class count is read
    # from the tensor rather than hard-coded.
    flat_outs = outs.reshape(-1, outs.shape[-1])
    # [b, lens] -> [b*lens]
    flat_labels = labels.reshape(-1)

    # Ignore the results computed for padding: [b*lens] -> [b*lens - pad]
    keep = attention_mask.reshape(-1) == 1

    return flat_outs[keep], flat_labels[keep]

In [199]:
# Count correct predictions and totals
def get_correct_and_total_count(labels, outs):
    """Return ``(correct, total, correct_content, total_content)``.

    ``outs`` holds one score row per position and ``labels`` the matching
    label ids. The first pair counts every position; the second pair counts
    only positions whose label is not 0, because label 0 is so frequent that
    including it inflates the accuracy.
    """
    # [n, num_classes] -> [n]
    predicted = outs.argmax(dim=1)
    matches = predicted == labels

    # Restrict the comparison to positions carrying a non-zero label.
    content_matches = matches[labels != 0]

    return (
        matches.sum().item(),
        len(labels),
        content_matches.sum().item(),
        len(content_matches),
    )

# 5. 训练

In [219]:
# Training
def train(loader, epochs, model=None, optimizer=None):
    """Train ``model`` on ``loader`` and checkpoint it after every epoch.

    Args:
        loader: iterable yielding ``(inputs, labels)`` batches, where
            ``inputs`` is a dict containing ``"attention_mask"``.
        epochs: number of passes over ``loader``.
        model: module to train; defaults to the notebook-level ``mymodel``,
            looked up when the function is called.
        optimizer: optimizer to step; defaults to the notebook-level
            ``optimizer``, looked up when the function is called. Resolving it
            at call time lets this cell be defined before the optimizer exists.

    Raises:
        ValueError: if no optimizer is passed and none is defined globally.

    Side effects:
        Prints loss/accuracy every 50 steps and writes the whole model plus
        the optimizer state dict under ``../../../BERT/NER_FT/``.
    """
    if model is None:
        model = mymodel
    if optimizer is None:
        optimizer = globals().get("optimizer")
        if optimizer is None:
            raise ValueError("train() needs an optimizer: pass one explicitly")

    model_path = "../../../BERT/NER_FT/NER_FT.model"
    optimizer_path = "../../../BERT/NER_FT/NER_FT_optimizer.pkl"

    # CrossEntropyLoss applies log-softmax itself, so `model` should return
    # raw logits rather than probabilities.
    criterion = torch.nn.CrossEntropyLoss()

    model.train()
    for epoch in range(epochs):
        # Wrap the loader itself (not enumerate) so tqdm knows the total length.
        progress = tqdm(loader, desc="Epoch" + str(epoch))
        for step, (inputs, labels) in enumerate(progress):
            labels = labels.to(device)
            # Forward pass: [b, lens] -> [b, lens, num_classes]
            outs = model(inputs)

            # Flatten outs and labels and drop padding positions
            # outs -> [b, lens, num_classes] -> [c, num_classes]
            # labels -> [b, lens] -> [c]
            outs, labels = reshape_and_remove_pad(
                outs, labels, inputs["attention_mask"]
            )

            # Gradient step
            loss = criterion(outs, labels)
            loss.backward()
            optimizer.step()
            optimizer.zero_grad()

            if step % 50 == 0:
                counts = get_correct_and_total_count(labels, outs)

                accuracy = counts[0] / counts[1] if counts[1] else 0.0
                accuracy_content = counts[2] / counts[3] if counts[3] else 0.0

                print(epoch, step, loss.item(), accuracy, accuracy_content)

        # Checkpoint after every epoch; the whole module is pickled because
        # the evaluation cells load it back with torch.load and call it directly.
        os.makedirs(os.path.dirname(model_path), exist_ok=True)
        torch.save(model, model_path)
        torch.save(optimizer.state_dict(), optimizer_path)

In [217]:
# Inspect the training data: for every batch, decode the first sample, then
# decode it again with each id replaced by 0 wherever its label is 0.
for batch_inputs, batch_labels in train_dataloader:
    sample_ids = batch_inputs['input_ids'][0]
    sample_labels = batch_labels[0]
    print(tokenizer.decode(sample_ids))
    labelled_ids = [
        sample_ids[pos] if sample_labels[pos] != 0 else 0
        for pos in range(len(sample_labels))
    ]
    print(tokenizer.decode(labelled_ids))
    print('===================')

[CLS] parameters into a region that would otherwise be inaccessible. Neural network training is [SEP] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD]
[CLS] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [SEP] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD

In [220]:
# Use the smaller learning rate when the encoder itself is being fine-tuned.
lr = 2e-5 if mymodel.tuneing else 5e-4
optimizer = torch.optim.AdamW(mymodel.parameters(), lr=lr)

model_ckpt = "../../../BERT/NER_FT/NER_FT.model"
optimizer_ckpt = "../../../BERT/NER_FT/NER_FT_optimizer.pkl"
if os.path.exists(model_ckpt):
    # torch.load unpickles arbitrary objects: only load checkpoints you trust.
    # NOTE(review): recent PyTorch releases default to weights_only=True, which
    # rejects a fully pickled module — pass weights_only=False there if needed.
    checkpoint = torch.load(model_ckpt, map_location=device)
    # train() pickles the whole module, while load_state_dict needs the
    # parameter mapping, so unwrap it; a plain state dict is accepted as well.
    if isinstance(checkpoint, torch.nn.Module):
        checkpoint = checkpoint.state_dict()
    mymodel.load_state_dict(checkpoint)
    # The optimizer state is optional: resume it only when it was saved too.
    if os.path.exists(optimizer_ckpt):
        optimizer.load_state_dict(torch.load(optimizer_ckpt, map_location=device))

train(epochs=1, loader=train_dataloader, optimizer=optimizer)

0 0 1.3898124694824219 0.16842105263157894 0.23529411764705882


In [223]:
# Evaluation
def test(loader_test=val_dataloader, max_steps=5):
    """Evaluate the saved model on the first batches of ``loader_test``.

    Args:
        loader_test: iterable yielding ``(inputs, labels)`` batches.
        max_steps: number of batches to evaluate; ``None`` evaluates all of them.

    Prints the step index of every evaluated batch, then the overall accuracy
    and the accuracy restricted to positions whose label is not 0. Both values
    are reported as 0.0 when nothing was counted.
    """
    # Loads the whole pickled module; only load checkpoints you trust.
    model_load = torch.load("../../../BERT/NER_FT/NER_FT.model")
    model_load.eval()

    correct = 0
    total = 0

    correct_content = 0
    total_content = 0

    for step, (inputs, labels) in enumerate(loader_test):
        if max_steps is not None and step >= max_steps:
            break
        labels = labels.to(device)
        print(step)

        with torch.no_grad():
            # [b, lens] -> [b, lens, num_classes]
            outs = model_load(inputs)

        # Flatten outs and labels and drop padding positions
        # outs -> [b, lens, num_classes] -> [c, num_classes]
        # labels -> [b, lens] -> [c]
        outs, labels = reshape_and_remove_pad(outs, labels,
                                              inputs['attention_mask'])

        counts = get_correct_and_total_count(labels, outs)
        correct += counts[0]
        total += counts[1]
        correct_content += counts[2]
        total_content += counts[3]

    # Guard the ratios: an empty loader (or max_steps=0) leaves the totals at 0.
    accuracy = correct / total if total else 0.0
    accuracy_content = correct_content / total_content if total_content else 0.0
    print(accuracy, accuracy_content)


test(val_dataloader)

0
1
2
3
4
0.8295454545454546 0.0


In [224]:
# Prediction demo
def predict(loader_test=val_dataloader, num_samples=5):
    """Print gold and predicted tags for samples of the first batch.

    For each sample the decoded sentence is printed, followed by one line for
    the gold labels and one for the predictions: positions tagged 0 are shown
    as '·', every other position shows its decoded token.

    Args:
        loader_test: iterable yielding ``(inputs, labels)`` batches.
        num_samples: how many samples of the first batch to print; capped at
            the batch size.

    Raises:
        ValueError: if ``loader_test`` yields no batch.
    """
    # Loads the whole pickled module; only load checkpoints you trust.
    model_load = torch.load("../../../BERT/NER_FT/NER_FT.model")
    model_load.eval()

    first_batch = next(iter(loader_test), None)
    if first_batch is None:
        raise ValueError("loader_test produced no batches")
    inputs, labels = first_batch

    inputs['input_ids'] = inputs['input_ids'].to(device)
    inputs['attention_mask'] = inputs['attention_mask'].to(device)
    labels = labels.to(device)
    with torch.no_grad():
        # [b, lens] -> [b, lens, num_classes] -> [b, lens]
        outs = model_load(inputs).argmax(dim=2)

    # Never index past the end of a batch that is smaller than num_samples.
    for i in range(min(num_samples, labels.shape[0])):
        # Drop padding positions
        select = inputs['attention_mask'][i] == 1
        input_id = inputs['input_ids'][i, select]
        out = outs[i, select]
        label = labels[i, select]

        # Print the original sentence
        print(tokenizer.decode(input_id))

        # Print the tags: gold labels first, predictions second
        for tag in [label, out]:
            s = ''
            for j in range(len(tag)):
                if tag[j] == 0:
                    s += '·'
                    continue
                s += tokenizer.decode(input_id[j]).replace(' ', '').replace('##', '') + ' '
                s += ' '

            print(s)
        print('==========================')


predict(val_dataloader)

[CLS] network learned in the pretraining phase. No matter what kind of unsupervised learning algorithm or [SEP]
[CLS]  ··············un  su  per  vise  d  learning  algorithm  ·[SEP]  
························
[CLS] of states. 2. It is possible to use the same transition function f with the same parameters at [SEP]
[CLS]  ····················[SEP]  
······················
[CLS] train even fully connected architectures ( Hinton et al., 2006 ; Hinton and Salakhutdinov, 2006 ; [SEP]
[CLS]  ··fully  connected  architecture  s  ·Hi  nton  et  al  .  ···Hi  nton  and  Sal  ak  hu  t  din  ov  ···[SEP]  
·····························
[CLS] RECURRENT AND RECURSIVE NETS physical implementation of the model, such as a biological neural [SEP]
[CLS]  R  EC  UR  RE  NT  AND  R  EC  UR  SI  VE  NE  TS  ···········[SEP]  
··························
[CLS] an unsupervised representation learning algorithm. However it is also called pretraining, because [SEP]
[CLS]  ·un  su  per  vise  d  representation