In [1]:
import numpy as np
import pandas as pd
from sklearn import metrics
import transformers
import time
import torch
from tqdm import tqdm
from torch.utils.data import Dataset, DataLoader, RandomSampler, SequentialSampler
from transformers import BertTokenizer, BertModel, BertConfig

device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

In [35]:
train_df = pd.read_csv('../data/train_set.csv', sep='\t')
test_df = pd.read_csv('../data/test_a.csv', sep='\t')
test_df['label'] = 0

In [36]:
train_df.head()

Unnamed: 0,label,text
0,2,2967 6758 339 2021 1854 3731 4109 3792 4149 15...
1,11,4464 486 6352 5619 2465 4802 1452 3137 5778 54...
2,3,7346 4068 5074 3747 5681 6093 1777 2226 7354 6...
3,2,7159 948 4866 2109 5520 2490 211 3956 5520 549...
4,3,3646 3055 3055 2490 4659 6065 3370 5814 2465 5...


In [37]:
tokenizer = BertTokenizer.from_pretrained('../emb/bert-mini/vocab.txt')
tokenizer.encode_plus("2967 6758 339 2021 1854",
        add_special_tokens=True,
        max_length=20,
        truncation=True)
# token_type_ids 通常第一个句子全部标记为0，第二个句子全部标记为1。
# attention_mask padding的地方为0，未padding的地方为1。

Calling BertTokenizer.from_pretrained() with the path to a single file or url is deprecated


{'input_ids': [2, 280, 1106, 1529, 518, 193, 3], 'token_type_ids': [0, 0, 0, 0, 0, 0, 0], 'attention_mask': [1, 1, 1, 1, 1, 1, 1]}

In [38]:
class CustomDataset(Dataset):

    def __init__(self, dataframe, tokenizer, max_len):
        self.tokenizer = tokenizer
        self.data = dataframe
        self.comment_text = self.data.text
        self.targets = self.data.label
        self.max_len = max_len

    def __len__(self):
        return len(self.comment_text)

    def __getitem__(self, index):
        comment_text = str(self.comment_text[index])
        comment_text = " ".join(comment_text.split())

        inputs = self.tokenizer.encode_plus(
            comment_text,
            truncation=True,
            add_special_tokens=True,
            max_length=self.max_len,
            pad_to_max_length=True,
            return_token_type_ids=True
        )
        ids = inputs['input_ids']
        mask = inputs['attention_mask']
        token_type_ids = inputs["token_type_ids"]

        return {
            'ids': torch.tensor(ids, dtype=torch.long),
            'mask': torch.tensor(mask, dtype=torch.long),
            'token_type_ids': torch.tensor(token_type_ids, dtype=torch.long),
            'targets': torch.tensor(self.targets[index], dtype=torch.float)
        }

In [39]:
# Creating the dataset and dataloader for the neural network
MAX_LEN = 256
train_size = 0.8
train_dataset = train_df.sample(frac=train_size,random_state=7)
valid_dataset = train_df.drop(train_dataset.index).reset_index(drop=True)
train_dataset = train_dataset.reset_index(drop=True)


print("FULL Dataset: {}".format(train_df.shape))
print("TRAIN Dataset: {}".format(train_dataset.shape))
print("VALID Dataset: {}".format(valid_dataset.shape))
print("TEST Dataset: {}".format(test_df.shape))

train_set = CustomDataset(train_dataset, tokenizer, MAX_LEN)
valid_set = CustomDataset(valid_dataset, tokenizer, MAX_LEN)
test_set = CustomDataset(test_df, tokenizer, MAX_LEN)

FULL Dataset: (10000, 2)
TRAIN Dataset: (8000, 2)
VALID Dataset: (2000, 2)
TEST Dataset: (50000, 2)


In [40]:
TRAIN_BATCH_SIZE = 32
VALID_BATCH_SIZE = 16
TEST_BATCH_SIZE = 16
train_params = {'batch_size': TRAIN_BATCH_SIZE,
                'shuffle': True}

valid_params = {'batch_size': VALID_BATCH_SIZE,
                'shuffle': True}

test_params = {'batch_size': TEST_BATCH_SIZE,
                'shuffle': False}

train_loader = DataLoader(train_set, **train_params)
valid_loader = DataLoader(valid_set, **valid_params)
test_loader = DataLoader(test_set, **test_params)

In [8]:
# Creating the customized model, by adding a drop out and a dense layer on top of distil bert to get the final output for the model. 

class BERTClass(torch.nn.Module):
    def __init__(self):
        super(BERTClass, self).__init__()
        self.config = BertConfig.from_pretrained('../emb/bert-mini/bert_config.json', output_hidden_states=True)
        self.l1 = BertModel.from_pretrained('../emb/bert-mini/pytorch_model.bin', config=self.config)
        self.l2 = torch.nn.Dropout(0.3)
        self.l3 = torch.nn.Linear(256, 14)
    
    def forward(self, ids, mask, token_type_ids):
        sequence_output, pooler_output, hidden_states= self.l1(ids, attention_mask=mask, token_type_ids=token_type_ids)
        # [bs, 200, 256]  [bs,256]
        output_2 = self.l2(pooler_output)
        output = self.l3(output_2)
        return output

net = BERTClass()
net.to(device)

BERTClass(
  (l1): BertModel(
    (embeddings): BertEmbeddings(
      (word_embeddings): Embedding(5981, 256, padding_idx=0)
      (position_embeddings): Embedding(256, 256)
      (token_type_embeddings): Embedding(2, 256)
      (LayerNorm): LayerNorm((256,), eps=1e-12, elementwise_affine=True)
      (dropout): Dropout(p=0.1, inplace=False)
    )
    (encoder): BertEncoder(
      (layer): ModuleList(
        (0): BertLayer(
          (attention): BertAttention(
            (self): BertSelfAttention(
              (query): Linear(in_features=256, out_features=256, bias=True)
              (key): Linear(in_features=256, out_features=256, bias=True)
              (value): Linear(in_features=256, out_features=256, bias=True)
              (dropout): Dropout(p=0.1, inplace=False)
            )
            (output): BertSelfOutput(
              (dense): Linear(in_features=256, out_features=256, bias=True)
              (LayerNorm): LayerNorm((256,), eps=1e-12, elementwise_affine=True)
     

In [9]:
# 超参数设置
lr, num_epochs = 1e-5, 30
criterion = torch.nn.CrossEntropyLoss()  # 选择损失函数
optimizer = torch.optim.Adam(net.parameters(), lr=lr)  # 选择优化器

In [10]:
def evaluate_accuracy(data_iter, net, device=torch.device('cpu')):
    """Evaluate accuracy of a model on the given data set."""
    acc_sum, n = torch.tensor([0], dtype=torch.float32,device=device), 0
    for data in tqdm(data_iter):
        # If device is the GPU, copy the data to the GPU.
        ids = data['ids'].to(device, dtype = torch.long)
        mask = data['mask'].to(device, dtype = torch.long)
        token_type_ids = data['token_type_ids'].to(device, dtype = torch.long)
        targets = data['targets'].to(device, dtype = torch.float)
        net.eval()
        with torch.no_grad():
            targets = targets.long()
            # [[0.2 ,0.4 ,0.5 ,0.6 ,0.8] ,[ 0.1,0.2 ,0.4 ,0.3 ,0.1]] => [ 4 , 2 ]
            acc_sum += torch.sum((torch.argmax(net(ids, mask, token_type_ids), dim=1) == targets))
            n += targets.shape[0]
    return acc_sum.item()/n

In [29]:
def train(epoch,train_iter, test_iter, criterion, num_epochs, optimizer, device):
    print('training on', device)
    net.to(device)
    best_test_acc = 0
    scheduler = torch.optim.lr_scheduler.StepLR(optimizer, step_size=10, gamma=0.5)  # 设置学习率下降策略
#     scheduler = torch.optim.lr_scheduler.CosineAnnealingLR(optimizer, T_max=5, eta_min=2e-06)  # 余弦退火
    for epoch in range(num_epochs):
        train_l_sum = torch.tensor([0.0], dtype=torch.float32, device=device)
        train_acc_sum = torch.tensor([0.0], dtype=torch.float32, device=device)
        n, start = 0, time.time()
        for data in tqdm(train_iter):
            net.train()
            optimizer.zero_grad()
            ids = data['ids'].to(device, dtype=torch.long)
            mask = data['mask'].to(device, dtype=torch.long)
            token_type_ids = data['token_type_ids'].to(device, dtype=torch.long)
            targets = data['targets'].to(device, dtype = torch.float)
            y_hat = net(ids, mask, token_type_ids)
            loss = criterion(y_hat, targets.long())
            optimizer.zero_grad()
            loss.backward()
            optimizer.step()
            
            with torch.no_grad():
                targets = targets.long()
                train_l_sum += loss.float()
                train_acc_sum += (torch.sum((torch.argmax(y_hat, dim=1) == targets))).float()
                n += targets.shape[0]
        valid_acc = evaluate_accuracy(test_iter, net, device)
        train_acc = train_acc_sum / n
        print('epoch %d, loss %.4f, train acc %.3f, valid acc %.3f, '
              'time %.1f sec'
              % (epoch + 1, train_l_sum / n, train_acc, valid_acc,
                 time.time() - start))
        if valid_acc > best_test_acc:
            print('find best! save at model/best.pth')
            best_test_acc = valid_acc
            torch.save(net.state_dict(), 'model/best.pth')
        scheduler.step()  # 更新学习率

In [30]:
train(net,train_loader, valid_loader, criterion, num_epochs, optimizer, device)

  0%|          | 0/250 [00:00<?, ?it/s]

training on cuda


100%|██████████| 250/250 [01:52<00:00,  2.21it/s]
100%|██████████| 125/125 [00:21<00:00,  5.77it/s]
  0%|          | 0/250 [00:00<?, ?it/s]

epoch 1, loss 0.0110, train acc 0.915, valid acc 0.889, time 134.6 sec
find best! save at model/best.pth


100%|██████████| 250/250 [01:52<00:00,  2.23it/s]
100%|██████████| 125/125 [00:21<00:00,  5.76it/s]
  0%|          | 0/250 [00:00<?, ?it/s]

epoch 2, loss 0.0101, train acc 0.919, valid acc 0.890, time 133.9 sec
find best! save at model/best.pth


  5%|▍         | 12/250 [00:05<01:54,  2.07it/s]


KeyboardInterrupt: 

In [41]:
def model_predict(net, test_iter):
    # 预测模型
    preds_list = []
    print('加载最优模型')
    net.load_state_dict(torch.load('model/best.pth'))
    net = net.to(device)
    print('inference测试集')
    with torch.no_grad():
        for data in tqdm(test_iter):
            ids = data['ids'].to(device, dtype=torch.long)
            mask = data['mask'].to(device, dtype=torch.long)
            token_type_ids = data['token_type_ids'].to(device, dtype=torch.long)
            batch_preds = list(net(ids, mask, token_type_ids).argmax(dim=1).cpu().numpy())
            for preds in batch_preds:
                preds_list.append(preds)           
    return preds_list
preds_list = model_predict(net, test_loader)

  0%|          | 0/3125 [00:00<?, ?it/s]

加载最优模型
inference测试集


100%|██████████| 3125/3125 [10:03<00:00,  5.18it/s]


In [43]:
submission = pd.read_csv('../data/test_a_sample_submit.csv')
submission['label'] = preds_list
submission.to_csv('../output/submission.csv', index=False)