In [44]:
import pandas as pd
import torch 
import torch.nn as nn
import pickle
import pandas as pd
import matplotlib.pyplot as plt
from collections.abc import Iterable, Iterator
from torch.utils.data import DataLoader,Dataset
from tqdm import tqdm

#####  当模型在推理过程中遇到了字典中没有包含的token时，会出现key index错误, Out Of Value (OOV问题)
##### 解决方案：通过特殊token：\<UNK\> 替代没有见过的token

In [45]:
class CommentDataset:
    def __init__(self, comments, labels):
        self.comments, self.labels = comments, labels
    
        # 字典构建 (字符为token / 词汇为token)
        self._build_vocab()

    def __getitem__(self, index):
        token_index = [self.vocab.get(tk, self.vocab["<UNK>"]) for tk in self.comments[index]]
        index_tensor = torch.zeros(size=(125,))
        for i in range(len(token_index)):
            index_tensor[i] = token_index[i]
        return index_tensor, torch.tensor(self.labels[index])

    def __len__(self):
        return len(self.labels)
    
    def _build_vocab(self): # 自定义内置函数(_:不希望用户调用)
        tokens = set()
        for cmt in self.comments:
            tokens.update(list(cmt))
        tokens = ["<PAD>", "<UNK>"] + list(tokens)
        self.vocab = {token:i for i, token in enumerate(tokens)} 

In [46]:
print(isinstance(CommentDataset, Iterator))
print(isinstance(CommentDataset, Iterable))

False
False


In [47]:
data = pd.read_pickle("../data/comments.bin")
display(data)
comments, labels = data["Comment"].values, data["labels"].values
comments[[0,1]]

Unnamed: 0,Comment,labels
15,什么破烂反派，毫无戏剧冲突能消耗两个多小时生命，还强加爱情戏 脑残片好圈钱倒是真的,0
21,说实话其实剧情就那样吧，非漫威粉看着可能有的地方会get不到G点吧 （其实漫威卖的不是剧情...,1
25,没有了洛基这个小基仔真是觉得即墨如雪啊,1
40,看毕，我激动地对友人说，等等奥创要来毁灭台北怎么办厚，她拍了拍我肩膀，没事，反正你买了两份...,1
43,不出意料得烂，喜欢这部电影的孩子，大概也喜欢变4……,0
...,...,...
256031,我只能用搞笑的标签，可惜没有吐槽的标签！,0
256032,"剧情逗比,调色二比,绝色塑造无力,渣渣",0
256039,不给行业毒瘤乐视和陆川送一毛钱，雇佣水军黑港囧，电影上映第二天就散播港囧的高清电影复刻版拷...,0
256043,浪费时间浪费钱，虽然是9.9的特价票，我还是觉得不值,0


array([' 什么破烂反派，毫无戏剧冲突能消耗两个多小时生命，还强加爱情戏 脑残片好圈钱倒是真的 ',
       ' 说实话其实剧情就那样吧，非漫威粉看着可能有的地方会get不到G点吧 （其实漫威卖的不是剧情而是人物和世界观呀，漫威宇宙棒棒哒）但对于漫威粉来说真是全程高能+IMAX燃爆啊！#漫威大法好#'],
      dtype=object)

In [48]:
ds = CommentDataset(comments, labels)
for item in ds:
    print(item)
    break
print(next(iter(ds)))

(tensor([2446., 2836.,  814., 2421., 1492., 3508., 3931., 1073.,  946., 3037.,
        3905., 1437., 2188., 3736.,  711., 1794., 1767., 4277., 3443., 1305.,
        3686., 3460., 1188., 4035., 1073., 3988., 1560., 1729., 2616., 1447.,
        3905., 2446., 2362., 2461., 3025.,  268.,  372., 2553., 1731., 3547.,
        2517., 3974., 2446.,    0.,    0.,    0.,    0.,    0.,    0.,    0.,
           0.,    0.,    0.,    0.,    0.,    0.,    0.,    0.,    0.,    0.,
           0.,    0.,    0.,    0.,    0.,    0.,    0.,    0.,    0.,    0.,
           0.,    0.,    0.,    0.,    0.,    0.,    0.,    0.,    0.,    0.,
           0.,    0.,    0.,    0.,    0.,    0.,    0.,    0.,    0.,    0.,
           0.,    0.,    0.,    0.,    0.,    0.,    0.,    0.,    0.,    0.,
           0.,    0.,    0.,    0.,    0.,    0.,    0.,    0.,    0.,    0.,
           0.,    0.,    0.,    0.,    0.,    0.,    0.,    0.,    0.,    0.,
           0.,    0.,    0.,    0.,    0.]), tensor(0, dtype=to

In [49]:
print(list(ds.vocab.keys())[:10])
print(list(ds.vocab.values())[:10])
print(ds.vocab["<PAD>"])
print(ds.vocab["<UNK>"])


['<PAD>', '<UNK>', '截', '孱', '贤', '捂', '鋼', '盗', '議', '碰']
[0, 1, 2, 3, 4, 5, 6, 7, 8, 9]
0
1


In [50]:
dl = DataLoader(ds, batch_size=10, shuffle=True)
next(iter(dl))[0].shape

torch.Size([10, 125])

In [81]:
print(len(ds))
print(len(dl))

20000
625


In [51]:
class CommentsClassifier(nn.Module):
    def __init__(self, vocab_szie, embedding_size, rnn_hidden_size, num_labels):
        super().__init__()
        self.emb = nn.Embedding(vocab_szie, embedding_size, padding_idx=0)
        self.rnn = nn.LSTM(input_size=embedding_size, hidden_size=rnn_hidden_size, batch_first=True)
        self.classifier = nn.Linear(rnn_hidden_size, num_labels)


    def forward(self, X):
        out = self.emb(X) # (batch_size, seq_len, embedding_size)
        output,_ = self.rnn(out) # (batch_size, seq_len, rnn_hidden_size)
        return self.classifier(output[:,-1,:]) # (batch_size, num_labels)
        pass

In [52]:
X = torch.randint(1, 10, size=(10, 12))
model = CommentsClassifier(vocab_szie=10, embedding_size=30, rnn_hidden_size=20, num_labels=2)

In [53]:
result = model(X)
print(result.shape)

torch.Size([10, 2])


## 模型训练

In [79]:
class CommentDataset:
    def __init__(self, comments, labels):
        self.comments, self.labels = comments, labels
    
        # 字典构建 (字符为token / 词汇为token)
        self._build_vocab()

    def __getitem__(self, index):
        token_index = [self.vocab.get(tk, self.vocab["<UNK>"]) for tk in self.comments[index]]
        index_tensor = torch.zeros(size=(125,), dtype=torch.long)
        for i in range(len(token_index)):
            index_tensor[i] = token_index[i]
        return index_tensor, torch.tensor(self.labels[index], dtype=torch.long)

    def __len__(self):
        return len(self.labels)
    
    def _build_vocab(self): # 自定义内置函数(_:不希望用户调用)
        tokens = set()
        for cmt in self.comments:
            tokens.update(list(cmt))
        tokens = ["<PAD>", "<UNK>"] + list(tokens)
        self.vocab = {token:i for i, token in enumerate(tokens)} 

batch_size = 32
lr = 1e-4
device = "cuda" if torch.cuda.is_available() else "cpu"
epoch = 5
embedding_size = 200
rnn_hidden_size = 100
num_labels = 2

data = pd.read_pickle("../data/comments.bin")
comments, labels = data["Comment"].values, data["labels"].values

ds = CommentDataset(comments, labels)
dl = DataLoader(ds, batch_size=batch_size, shuffle=True)

class CommentsClassifier(nn.Module):
    def __init__(self, vocab_szie, embedding_size, rnn_hidden_size, num_labels):
        super().__init__()
        self.emb = nn.Embedding(vocab_szie, embedding_size, padding_idx=0)
        self.rnn = nn.LSTM(input_size=embedding_size, hidden_size=rnn_hidden_size, batch_first=True)
        self.classifier = nn.Linear(rnn_hidden_size, num_labels)


    def forward(self, X):
        out = self.emb(X) # (batch_size, seq_len, embedding_size)
        output,_ = self.rnn(out) # (batch_size, seq_len, rnn_hidden_size)
        return self.classifier(output[:,-1,:]) # (batch_size, num_labels)
        pass

model = CommentsClassifier(
    vocab_szie=len(ds.vocab),
    embedding_size=embedding_size,
    rnn_hidden_size=rnn_hidden_size,
    num_labels=num_labels
)

model = model.to(device)
optimizer = torch.optim.Adam(model.parameters(), lr=lr)
loss_fn = nn.CrossEntropyLoss()
loss_fn = loss_fn.to(device)

for e in range(epoch):
    process_bar = tqdm(dl)
    for cmt, lbl in process_bar:
        cmt, lbl = cmt.to(device), lbl.to(device)
        y_hat = model(cmt)
        loss = loss_fn(y_hat, lbl)
        optimizer.zero_grad()
        loss.backward()
        optimizer.step()
        process_bar.set_description(f"epoch: {e + 1}, loss: {loss.item():.4f}")
    

epoch: 1, loss: 0.6926: 100%|██████████| 625/625 [00:05<00:00, 113.96it/s]
epoch: 2, loss: 0.6914: 100%|██████████| 625/625 [00:05<00:00, 117.45it/s]
epoch: 3, loss: 0.6939: 100%|██████████| 625/625 [00:05<00:00, 117.54it/s]
epoch: 4, loss: 0.6907: 100%|██████████| 625/625 [00:05<00:00, 116.77it/s]
epoch: 5, loss: 0.6928: 100%|██████████| 625/625 [00:05<00:00, 110.26it/s]
