In [18]:
# 一些全局配置
config = {
    'is_train': True,  # 是否进行训练
    'is_save': True,  # 是否保存模型文件
    'is_load': True,  # 是否加载模型文件
    'is_save_result': True,  # 是否保存结果

    # 路径相关配置
    'cwd': '.',  # 工作路径
    'model_path': '/models/bert',  # 模型保存路径
    'result_data': '/data/bert-result-',  # 结果保存路径
    'train_data': '/data/train_preprocessed.json',  # 训练数据
    'test_data': '/data/test_preprocessed.json',  # 测试数据
    'answer_data': '/data/answer.txt',

    # 训练数据划分相关配置
    'random_seed': 42,  # 随机种子
    'train_set_ratio': 0.99,  # 训练集占训练数据的比重

    # Model 相关配置
    'max_len': 120,
    'train_batch_size': 8,
    'valid_batch_size': 4,
    'test_batch_size': 4,
    
    # 训练相关配置
    'lr': 2e-05,
    'epochs': 6,
}

# 当前是否是 google colab 中
is_colab = True
try:
    from google.colab import drive
except ImportError:
    is_colab = False
# 挂载 google drive
if is_colab:
    drive.mount('/content/drive')

# 为 Colab 更改对应配置
if is_colab:
    config = {
        **config,
        'is_train': True,
        'is_save': True,
        'is_load': False,
        'cwd': '/content/drive/MyDrive/Colab Notebooks/nlp-target-sentiment-analysis',
    }

# 为 Colab 安装对应依赖
if 'is_init' not in locals().keys() and is_colab:
    %pip install -q transformers
is_init = True

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [19]:
from transformers import BertModel, BertTokenizer
import torch
from torch.utils.data import Dataset
import torch.nn as nn
import json

config['device'] = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')

def setup_seed(seed):
     torch.manual_seed(seed)
     torch.cuda.manual_seed_all(seed)
    #  torch.backends.cudnn.deterministic = True

# 设置随机数种子
setup_seed(config['random_seed'])


def pad(l, n, pad=0):
    """
    Pad the list 'l' to have size 'n' using 'padding_element'
    """
    pad_with = max(0, n - len(l))
    return l + pad_with * [pad]

class CustomDataset(Dataset):

    def __init__(self, data_path, tokenizer, max_len):
        self.tokenizer = tokenizer
        self.documents = []
        self.targets = []
        self.labels = []
        self.dists = []
        self.weights = []
        self.max_len = max_len
        with open(data_path, 'r') as f:
            data = json.load(f)
            for item in data:
                self.documents.append(item['document'])
                self.targets.append(item['target'])
                self.labels.append(item['label'])
                self.dists.append(item['dist'])
                self.weights.append(item['weights'])

    def __len__(self):
        return len(self.documents)

    def __getitem__(self, index):
        document = self.documents[index]
        target = self.targets[index]
        label = self.labels[index]
        # document = " ".join(document.split())

        inputs = self.tokenizer.encode_plus(
            document,
            f"What is the sentiment class of {target} in the sentence ?",
            add_special_tokens=True,
            max_length=self.max_len,
            pad_to_max_length=True,
            return_token_type_ids=True
        )

        ids = inputs['input_ids']
        mask = inputs['attention_mask']
        token_type_ids = inputs["token_type_ids"]
        dists = pad(self.dists[index], self.max_len, 0)
        weights = pad(self.weights[index], self.max_len, 0)
        # 给 [CLS] 设置一个更大的权重 2.
        weights[0] = 2.

        # get the position of the target token with tokenizer
        # target_ids = tokenizer.encode(target, add_special_tokens=False)
        # get the position of the target token with ids
        # target_idx = ids.index(target_ids[0])
        # 设置 weights 为 [0, 1, 1, 0, 0, 0, 0, 0, 0, 0]，即只关注 target, 以及 len(target_ids)
        # weights = [1.] + [0.] * (target_idx - 1) + [1.] * len(target_ids) + [0.] * (self.max_len - target_idx - len(target_ids))

        return {
            'input_ids': torch.tensor(ids, dtype=torch.long),
            'token_type_ids': torch.tensor(token_type_ids, dtype=torch.long),
            'attention_mask': torch.tensor(mask, dtype=torch.long),
            # 'target_idx': torch.tensor(target_idx, dtype=torch.long),
            'weights': torch.tensor(weights, dtype=torch.float),
            'label': torch.tensor(int(label) + 1 if label != '' else -1, dtype=torch.float),
        }

dataset = CustomDataset(config['cwd'] + config['train_data'], tokenizer, config['max_len'])
# 划分训练集和验证集
train_size = int(config['train_set_ratio'] * len(dataset))
valid_size = len(dataset) - train_size
train_dataset, valid_dataset = torch.utils.data.random_split(dataset, [train_size, valid_size])
# 构建 DataLoader
train_loader = torch.utils.data.DataLoader(train_dataset, batch_size=config['train_batch_size'], shuffle=True)
valid_loader = torch.utils.data.DataLoader(valid_dataset, batch_size=config['valid_batch_size'], shuffle=True)

In [20]:
class TDBert(nn.Module):

    def __init__(self, num_labels=3):
        super(TDBert, self).__init__()
        self.bert = BertModel.from_pretrained('bert-base-uncased')
        self.classifier = nn.Linear(768, num_labels)


    def forward(self, input_ids, token_type_ids=None, attention_mask=None, weights=None):
        last_hidden_state, pooled_output = self.bert(input_ids=input_ids, attention_mask=attention_mask, token_type_ids=token_type_ids, return_dict=False)
        # 用权重加权平均, 对 shape 为 (batch_size, seq_len, hidden_size) 的 last_hidden_state 进行加权平均
        # weights 的 shape 为 (batch_size, seq_len)
        weighted_output = torch.sum(last_hidden_state * weights.unsqueeze(-1), dim=1) / torch.sum(weights, dim=1).unsqueeze(-1)
        logits = self.classifier(weighted_output)
        return logits

model = TDBert().to(config['device'])

Some weights of the model checkpoint at bert-base-uncased were not used when initializing BertModel: ['cls.predictions.transform.dense.bias', 'cls.predictions.decoder.weight', 'cls.seq_relationship.weight', 'cls.predictions.transform.LayerNorm.bias', 'cls.seq_relationship.bias', 'cls.predictions.bias', 'cls.predictions.transform.LayerNorm.weight', 'cls.predictions.transform.dense.weight']
- This IS expected if you are initializing BertModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


In [None]:
test_dataset = CustomDataset(
    config['cwd'] + config['test_data'], tokenizer, config['max_len'])
test_loader = torch.utils.data.DataLoader(
    test_dataset, batch_size=config['test_batch_size'], shuffle=False)


def predict(model, test_loader, config, is_load=False):
    model = model.to(config['device'])
    if is_load:
        model.load_state_dict(torch.load(
            config['cwd'] + config['model_path'] + '/model.pth'))
    model.eval()
    predictions = []
    with torch.no_grad():
        for batch_idx, batch in enumerate(test_loader):
            input_ids = batch['input_ids'].to(config['device'])
            token_type_ids = batch['token_type_ids'].to(config['device'])
            attention_mask = batch['attention_mask'].to(config['device'])
            weights = batch['weights'].to(config['device'])
            # logits = model(input_ids, token_type_ids, attention_mask, weights)
            logits = model(input_ids=input_ids, attention_mask=attention_mask,
                           token_type_ids=token_type_ids, weights=weights)
            predictions.extend(torch.argmax(logits, dim=1).tolist())
    return predictions


def evaluate(model, valid_loader, config):
    criterion = nn.CrossEntropyLoss()
    model.eval()
    total_loss = 0
    acc = 0
    with torch.no_grad():
        for batch_idx, batch in enumerate(valid_loader):
            input_ids = batch['input_ids'].to(config['device'])
            token_type_ids = batch['token_type_ids'].to(config['device'])
            attention_mask = batch['attention_mask'].to(config['device'])
            weights = batch['weights'].to(config['device'])
            label = batch['label'].to(config['device'])
            logits = model(input_ids=input_ids, attention_mask=attention_mask,
                           token_type_ids=token_type_ids, weights=weights)
            loss = criterion(logits, label.long())
            acc += (logits.argmax(dim=-1) == label).float().mean()
            total_loss += loss.item()
    acc /= len(valid_loader)
    return total_loss / len(valid_loader), acc


def train(model, train_loader, valid_loader, config):
    optimizer = torch.optim.Adam(model.parameters(), lr=config['lr'])
    # optimizer = torch.optim.SGD(model.parameters(), lr=config['lr'])
    criterion = nn.CrossEntropyLoss()
    best_valid_acc = 0
    for epoch in range(config['epochs']):
        model.train()
        acc = 0
        for batch_idx, batch in enumerate(train_loader):
            input_ids = batch['input_ids'].to(config['device'])
            token_type_ids = batch['token_type_ids'].to(config['device'])
            attention_mask = batch['attention_mask'].to(config['device'])
            weights = batch['weights'].to(config['device'])
            label = batch['label'].to(config['device'])
            label = label.reshape(-1)
            # logits = model(input_ids, token_type_ids, attention_mask, weights)
            logits = model(input_ids=input_ids, attention_mask=attention_mask,
                           token_type_ids=token_type_ids, weights=weights)
            loss = criterion(logits, label.long())
            acc += (logits.argmax(dim=-1) == label).float().mean()
            optimizer.zero_grad()
            loss.backward()
            optimizer.step()
        acc /= len(train_loader)
        print(
            f'Epoch: {epoch}, Train Loss: {loss.item()}, Train Acc: {acc.item()}')
        valid_loss, valid_acc = evaluate(model, valid_loader, config)
        print(f'valid Loss: {valid_loss}, Valid Acc: {valid_acc}')
        if valid_acc > best_valid_acc:
            best_valid_acc = valid_acc
            torch.save(model.state_dict(),
                       config['cwd'] + config['model_path'] + '/model.pth')
            print(f'epoch {epoch}: Save model successfully')


if config['is_train']:
    train(model, train_loader, valid_loader, config)

Truncation was not explicitly activated but `max_length` is provided a specific value, please use `truncation=True` to explicitly truncate examples to max length. Defaulting to 'longest_first' truncation strategy. If you encode pairs of sequences (GLUE-style) with the tokenizer you can select this strategy more precisely by providing a specific strategy to `truncation`.


Epoch: 0, Train Loss: 0.16021403670310974, Train Acc: 0.7474775910377502


In [None]:
if config['is_save_result']:
    predictions = predict(model, test_loader, config, is_load=True)
    with open(config['cwd'] + config['result_data'] + '.txt', 'w') as f:
        for pred in predictions:
            f.write(str(pred) + '\n')