### 1. 安装依赖库

In [1]:
! pip install jieba

### 2. 导入依赖库

In [2]:
import jieba
import pandas as pd
from tqdm import tqdm

import torch
import torch.nn as nn
import torch.optim as optim
import torch.nn.functional as F
import numpy as np
from sklearn.metrics import classification_report
from torch.nn.utils.rnn import pad_sequence
from torch.utils.data import DataLoader, TensorDataset

  from .autonotebook import tqdm as notebook_tqdm


### 3. 数据加载

In [18]:
train_data = pd.read_csv('data/train.tsv', sep='\t')
valid_data = pd.read_csv('data/dev.tsv', sep='\t')
test_data = pd.read_csv('data/test.tsv', sep='\t') 
x_train, y_train = train_data.text_a.values, train_data.label.values # 训练集
x_valid, y_valid = valid_data.text_a.values, valid_data.label.values # 验证集
x_test, y_test = test_data.text_a.values, test_data.label.values # 测试集

In [4]:
train_data

Unnamed: 0,label,text_a
0,1,选择珠江花园的原因就是方便，有电动扶梯直接到达海边，周围餐馆、食廊、商场、超市、摊位一应俱全...
1,1,15.4寸笔记本的键盘确实爽，基本跟台式机差不多了，蛮喜欢数字小键盘，输数字特方便，样子也很...
2,0,房间太小。其他的都一般。。。。。。。。。
3,0,"1.接电源没有几分钟,电源适配器热的不行. 2.摄像头用不起来. 3.机盖的钢琴漆，手不能摸..."
4,1,"今天才知道这书还有第6卷,真有点郁闷:为什么同一套书有两种版本呢?当当网是不是该跟出版社商量..."
...,...,...
9141,1,看过该书，感觉中医暂时不会消亡，尚有一、二十株老树活着，还有毛以林、黄煌、刘力红等一批有一定...
9142,0,这本书没读到底，不是特别喜欢。完全可以用序中的评价来表达我的感受：可以包容，却不想实践。除了...
9143,1,"虽是观景房,不过我住的楼层太低(19楼)看不到江景,但地点很好,离轻轨临江门站和较场口站(起..."
9144,1,性价比不错，交通方便。行政楼层感觉很好，只是早上8点楼上装修，好吵。 中餐厅档次太低，虽然便...


In [5]:
x_train, y_train

(array(['选择珠江花园的原因就是方便，有电动扶梯直接到达海边，周围餐馆、食廊、商场、超市、摊位一应俱全。酒店装修一般，但还算整洁。 泳池在大堂的屋顶，因此很小，不过女儿倒是喜欢。 包的早餐是西式的，还算丰富。 服务吗，一般',
        '15.4寸笔记本的键盘确实爽，基本跟台式机差不多了，蛮喜欢数字小键盘，输数字特方便，样子也很美观，做工也相当不错',
        '房间太小。其他的都一般。。。。。。。。。', ...,
        '虽是观景房,不过我住的楼层太低(19楼)看不到江景,但地点很好,离轻轨临江门站和较场口站(起点)很近,解放碑就在附近(大约100多公尺吧)!',
        '性价比不错，交通方便。行政楼层感觉很好，只是早上8点楼上装修，好吵。 中餐厅档次太低，虽然便宜，但是和酒店档次不相配。',
        '跟心灵鸡汤没什么本质区别嘛，至少我不喜欢这样读经典，把经典都解读成这样有点去中国化的味道了'], dtype=object),
 array([1, 1, 0, ..., 1, 1, 0], dtype=int64))

### 4. 构建词汇表

In [6]:
vocab = set()
cut_docs = train_data.text_a.apply(lambda x: jieba.cut(x)).values
for doc in cut_docs:
    for word in doc:
        if word.strip():
            vocab.add(word.strip())

# 将词表写入本地vocab.txt文件
with open('data/vocab.txt', 'w') as file:
    for word in  vocab:
        file.write(word)
        file.write('\n')
        
print("len(vocal) = %d" % len(vocab))
list(vocab)[:10]

Building prefix dict from the default dictionary ...
Loading model from cache C:\Users\Jun\AppData\Local\Temp\jieba.cache


Loading model cost 0.673 seconds.
Prefix dict has been built successfully.


len(vocal) = 35091


['广博', '香', '摩花', '鼓点', '谋求', '美美', '法子', '口语化', '写作者', '伪品']

### 5. 定义配置参数

In [7]:
class Config():
    embedding_dim = 300 # 词向量维度
    max_seq_len = 200   # 文章最大词数 200
    vocab_file = 'data/vocab.txt' # 词汇表文件路径
config = Config()

### 6. 定义预处理类

In [8]:
class Preprocessor():
    def __init__(self, config):
        self.config = config
        # 初始化词和id的映射词典，预留0给padding字符，1给词表中未见过的词
        token2idx = {"[PAD]": 0, "[UNK]": 1} # {word：id}
        with open(config.vocab_file, 'r') as reader:
            for index, line in enumerate(reader):
                token = line.strip()
                token2idx[token] = index+2
                
        self.token2idx = token2idx
        
    def transform(self, text_list):
        # 文本分词，并将词转换成相应的id, 最后不同长度的文本padding长统一长度，后面补0
        idx_list = [[self.token2idx.get(word.strip(), self.token2idx['[UNK]']) for word in jieba.cut(text)] for text in text_list]
        tensor_list = [torch.tensor(sublist) for sublist in idx_list]
        padded_sequences = [F.pad(sequence, (0, self.config.max_seq_len - sequence.size(0))) for sequence in tensor_list]
        stacked_tensor = torch.stack(padded_sequences)
        
        return stacked_tensor

In [11]:
preprocessor = Preprocessor(config)
res_show = preprocessor.transform(['性价比不错，交通方便。', '宝贝我爱你'])
len(preprocessor.token2idx)

35093

In [19]:
# make dataset
x_train = preprocessor.transform(x_train)
y_train = torch.LongTensor(y_train)
train_dataset = TensorDataset(x_train, y_train)

x_valid = preprocessor.transform(x_valid)
y_valid = torch.LongTensor(y_valid)
valid_dataset = TensorDataset(x_valid, y_valid)

x_test = preprocessor.transform(x_test)
y_test = torch.LongTensor(y_test)
test_dataset = TensorDataset(x_test, y_test)


In [41]:
# make dataloader

batch_size = 64
num_workers = 4
train_loader = DataLoader(train_dataset, 
                            batch_size=batch_size, 
                            shuffle=True,
                            pin_memory = True,
                            num_workers=num_workers)
    
valid_loader = DataLoader(valid_dataset, 
                            batch_size=batch_size, 
                            shuffle=True,
                            pin_memory = True,
                            num_workers=num_workers)

test_loader = DataLoader(test_dataset, 
                            batch_size=batch_size, 
                            shuffle=True,
                            pin_memory = True,
                            num_workers=num_workers)

### 7. 定义模型类

In [47]:
import torch
import torch.nn as nn
from tqdm import tqdm
import sys
from sklearn.metrics import accuracy_score, f1_score

class TextCNN(nn.Module):
    def __init__(self, filter_sizes, num_classes, vocab_size, num_filters, emb_dim, dropout):
        super().__init__()
        self.embedding = nn.Embedding(vocab_size, emb_dim)
        self.convs = nn.ModuleList(
            [nn.Conv1d(emb_dim, num_filters, x) for x in filter_sizes]
        )
        self.dropout = nn.Dropout(dropout)
        self.fc = nn.Linear(num_filters * len(filter_sizes), num_classes)
        self.relu = nn.ReLU()

    def pool(self, out, conv):
        out = self.relu(conv(out))
        max_pool = nn.MaxPool1d(out.shape[-1])
        out = max_pool(out)
        out = out.squeeze(2)
        return out

    def forward(self, x):

        embedded = self.dropout(self.embedding(x))      # x = [batch_size, seq_len]
        embedded = embedded.permute(0,2,1)              # embedded = [batch_size, seq_len, emb_dim]
        output = [self.pool(embedded, conv) for conv in self.convs]
        out = torch.cat(output, dim=1)
        out = self.fc(out)
        return out
    
    # Train stage
    def fit(self, data_loader, data_loader2, epochs):
        
        device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")
        self.to(device)
        optimizer = torch.optim.Adam(self.parameters(), lr=0.004)
        criterion = nn.CrossEntropyLoss()

        for epoch in range(epochs):
            self.train()
            optimizer.zero_grad()

            # for calculating loss and acc
            train_loss = torch.zeros(1).to(device)
            total_correct = 0
            total_samples = 0

            # for calculating accuracy
            data_loader = tqdm(data_loader, ncols=100)
            data_loader.set_description(f"Epoch [{epoch}] [Train]")

            for step, data in enumerate(data_loader):

                x_train, y_train = data
                x_train, y_train = x_train.to(device), y_train.to(device)
                outputs = self(x_train)
                _, predicted = torch.max(outputs, 1)  # 获取预测的类别
                total_correct += (predicted == y_train).sum().item()  # 统计预测正确的数量
                total_samples += y_train.size(0)  # 统计样本总数

                loss = criterion(outputs, y_train)
                train_loss = (train_loss * step + loss.detach()) / (step + 1)  # update mean losses
                loss.backward()
                optimizer.step()
                optimizer.zero_grad()
                
                data_loader.set_postfix(Loss=train_loss.item(), Acc=total_correct/total_samples)

                if not torch.isfinite(loss):
                    print('WARNING: non-finite loss, ending training ', loss)
                    sys.exit(1)

            self.valid(data_loader2, epoch, criterion, device)


    # Valid stage
    def valid(self, data_loader, epoch, criterion, device):
        
        self.eval()
        eval_loss = torch.zeros(1).to(device)
        total_correct = 0
        total_samples = 0

        # for calculating accuracy
        data_loader = tqdm(data_loader,  ncols=100)
        data_loader.set_description(f"Epoch [{epoch}] [Valid]")

        with torch.no_grad():  
            for step, data in enumerate(data_loader):

                x_valid, y_valid = data
                x_valid, y_valid = x_valid.to(device), y_valid.to(device)
                outputs = self(x_valid)
                _, predicted = torch.max(outputs, 1)  # 获取预测的类别

                total_correct += (predicted == y_valid).sum().item()  # 统计预测正确的数量
                total_samples += y_valid.size(0)  # 统计样本总数

                loss = criterion(outputs, y_valid)
                eval_loss = (eval_loss * step + loss.detach()) / (step + 1)  # update mean losses
                data_loader.set_postfix(Loss=eval_loss.item(), Acc=total_correct/total_samples)

            self.save_model(epoch, round(total_correct/total_samples*100,2))

    def evaluate(self, x_test, y_test):
        device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")
        self.eval()
        x_test, y_test = x_test.to(device), y_test.to(device)

        outputs = self(x_test)
        _, predicted = torch.max(outputs, 1)

        y_test, predicted = y_test.to('cpu'), predicted.to('cpu')
        test_acc = accuracy_score(y_test, predicted)
        test_f1  = f1_score(y_test, predicted)
        print(f"[Test ]: Accuracy: {test_acc}, F1-Score: {test_f1}")

    def save_model(self, epoch, val_acc):
        ckpt_path = "./checkpoints/TextCNN_epoch{}_valid{}.pth".format(epoch, val_acc)
        torch.save(self.state_dict(), ckpt_path)
        print("Saving weight to [%s] successfully." % (ckpt_path))
    
    def load_model(self, ckpt_path):
        self.load_state_dict(torch.load(ckpt_path, map_location='cpu'))
        print("Loading weight from [%s] successfully." % (ckpt_path))

### 8. 启动训练

In [51]:
textcnn = TextCNN(filter_sizes = [3,4,5],
                  num_classes = 2,
                  vocab_size = len(preprocessor.token2idx),
                  num_filters = 128,
                  emb_dim = config.embedding_dim,
                  dropout = 0.4)

textcnn.fit(train_loader,valid_loader, 8)

Epoch [0] [Train]: 100%|███████████████████| 143/143 [01:02<00:00,  2.30it/s, Acc=0.725, Loss=0.704]
Epoch [0] [Valid]: 100%|██████████████████████| 19/19 [00:04<00:00,  4.27it/s, Acc=0.853, Loss=0.39]


Saving weight to [./checkpoints/TextCNN_epoch0_valid85.33.pth] successfully.


Epoch [1] [Train]: 100%|███████████████████| 143/143 [00:56<00:00,  2.54it/s, Acc=0.863, Loss=0.346]
Epoch [1] [Valid]: 100%|█████████████████████| 19/19 [00:04<00:00,  4.38it/s, Acc=0.882, Loss=0.385]


Saving weight to [./checkpoints/TextCNN_epoch1_valid88.25.pth] successfully.


Epoch [2] [Train]: 100%|███████████████████| 143/143 [01:11<00:00,  1.99it/s, Acc=0.919, Loss=0.228]
Epoch [2] [Valid]: 100%|█████████████████████| 19/19 [00:07<00:00,  2.66it/s, Acc=0.877, Loss=0.495]


Saving weight to [./checkpoints/TextCNN_epoch2_valid87.67.pth] successfully.


Epoch [3] [Train]: 100%|███████████████████| 143/143 [01:07<00:00,  2.12it/s, Acc=0.944, Loss=0.174]
Epoch [3] [Valid]: 100%|█████████████████████| 19/19 [00:05<00:00,  3.22it/s, Acc=0.883, Loss=0.543]


Saving weight to [./checkpoints/TextCNN_epoch3_valid88.33.pth] successfully.


Epoch [4] [Train]: 100%|██████████████████| 143/143 [01:09<00:00,  2.05it/s, Acc=0.969, Loss=0.0991]
Epoch [4] [Valid]: 100%|█████████████████████| 19/19 [00:06<00:00,  2.90it/s, Acc=0.868, Loss=0.622]


Saving weight to [./checkpoints/TextCNN_epoch4_valid86.75.pth] successfully.


Epoch [5] [Train]: 100%|██████████████████| 143/143 [01:11<00:00,  1.99it/s, Acc=0.975, Loss=0.0907]
Epoch [5] [Valid]: 100%|█████████████████████| 19/19 [00:05<00:00,  3.34it/s, Acc=0.899, Loss=0.528]


Saving weight to [./checkpoints/TextCNN_epoch5_valid89.92.pth] successfully.


Epoch [6] [Train]: 100%|███████████████████| 143/143 [01:12<00:00,  1.98it/s, Acc=0.957, Loss=0.168]
Epoch [6] [Valid]: 100%|███████████████████████| 19/19 [00:08<00:00,  2.32it/s, Acc=0.9, Loss=0.756]


Saving weight to [./checkpoints/TextCNN_epoch6_valid90.0.pth] successfully.


Epoch [7] [Train]: 100%|███████████████████| 143/143 [01:13<00:00,  1.94it/s, Acc=0.976, Loss=0.122]
Epoch [7] [Valid]: 100%|█████████████████████| 19/19 [00:06<00:00,  2.86it/s, Acc=0.892, Loss=0.754]

Saving weight to [./checkpoints/TextCNN_epoch7_valid89.17.pth] successfully.





### 9. 测试评估

In [52]:
textcnn.evaluate(x_test, y_test) # 测试集评估

[Test ]: Accuracy: 0.8866666666666667, F1-Score: 0.8934169278996865


### 10. 离线加载预测

In [53]:
ckpt = "./checkpoints/TextCNN_epoch5_valid90.2.pth"
textcnn = TextCNN(filter_sizes = [3,4,5],
                  num_classes = 2,
                  vocab_size = len(preprocessor.token2idx),
                  num_filters = 128,
                  emb_dim = config.embedding_dim,
                  dropout = 0.4)
textcnn.load_model(ckpt)
textcnn.evaluate(x_test, y_test) # 测试集评估

Loading weight from [./checkpoints/TextCNN_epoch5_valid90.2.pth] successfully.
[Test ]: Accuracy: 0.8875, F1-Score: 0.8899755501222494
