In [1]:
from utils import load_corpus
import pandas as pd

# Locations (relative to this notebook) of the train and test CSV files handed to load_corpus.
TRAIN_PATH = "../data/weibo_senti_100k/train.csv"
TEST_PATH = "../data/weibo_senti_100k/test.csv"

In [3]:
# Read both splits with the project helper from utils.
# NOTE(review): load_corpus presumably tokenizes with jieba (its dictionary is loaded
# when this cell runs) and yields (review, label) rows — confirm in utils.
train_data = load_corpus(TRAIN_PATH)
test_data = load_corpus(TEST_PATH)

Building prefix dict from the default dictionary ...
Loading model from cache /var/folders/4d/xybtwjyj2d3dv4p2_6s1yccw0000gn/T/jieba.cache
Loading model cost 0.350 seconds.
Prefix dict has been built successfully.


In [4]:
# Wrap each loaded split in a two-column frame: the review text and its label.
df_train, df_test = (
    pd.DataFrame(split, columns=["review", "label"])
    for split in (train_data, test_data)
)
df_train.head()

Unnamed: 0,review,label
0,乐乐 的 武汉 之 武汉 步行街 哈哈 我 窃以为 只 比 南京 的 夫子庙 好 了 那么 ...,1
1,然后 去 医院 医生 给 火柴 包 起 一圈 绷带 出来 就 成 棉签 了 哈哈,1
2,午饭 来不及 吃 先 垫下 肚子 泪,0
3,汗 想想 办法 吧 狗 X 的 贵 G 福利 真 好 看 了 公司 照片 真是 本尊 啊 太强 了,0
4,昨天 被 人 恶作剧 了 表白 什么 的 不是 真的 晕 晕 晕,0


In [5]:
# Stack the train and test frames (with a fresh index) so the word-vector corpus covers every review.
df = pd.concat([df_train, df_test], ignore_index=True)

In [6]:
# word2vec expects every document as a list of tokens, so split the space-joined reviews.
# Stop words are not filtered out here.
wv_input = df["review"].apply(lambda text: text.split(" "))
wv_input.head()

0    [乐乐, 的, 武汉, 之, 武汉, 步行街, 哈哈, 我, 窃以为, 只, 比, 南京, ...
1    [然后, 去, 医院, 医生, 给, 火柴, 包, 起, 一圈, 绷带, 出来, 就, 成,...
2                           [午饭, 来不及, 吃, 先, 垫下, 肚子, 泪]
3    [汗, 想想, 办法, 吧, 狗, X, 的, 贵, G, 福利, 真, 好, 看, 了, ...
4       [昨天, 被, 人, 恶作剧, 了, 表白, 什么, 的, 不是, 真的, 晕, 晕, 晕]
Name: review, dtype: object

In [7]:
# Train the word vectors (left commented out: the saved model is loaded from disk further down)
from gensim import models
#
# word2vec = models.Word2Vec(wv_input,
#                            vector_size=100,   # word-vector dimension
#                            min_count=1,      # minimum word frequency; 1 because the corpus is small
#                            workers=8,
#                            epochs=100)      # number of training epochs
# word2vec.save("../models/word2vec.model")



In [138]:
# 查看词向量训练效果
# word2vec.wv.most_similar("你")

[('我', 0.8070616722106934),
 ('你们', 0.7611672878265381),
 ('他', 0.6921034455299377),
 ('她', 0.6857293844223022),
 ('他们', 0.6757404208183289),
 ('自己', 0.6481757760047913),
 ('我们', 0.6067619323730469),
 ('别人', 0.6038157939910889),
 ('的', 0.5902299880981445),
 ('妈妈', 0.5795382261276245)]

In [13]:
# word2vec.wv.most_similar("哈哈")

[('嘻嘻', 0.6881745457649231),
 ('哈哈哈', 0.6328049302101135),
 ('偷笑', 0.5760723948478699),
 ('哈哈哈哈', 0.5443381071090698),
 ('回复', 0.5043001174926758),
 ('媳妇', 0.4680759608745575),
 ('你', 0.44520074129104614),
 ('也', 0.43679869174957275),
 ('哈', 0.43084850907325745),
 ('啊', 0.41636788845062256)]

In [14]:
# word2vec.wv.most_similar("伤心")

[('泪', 0.5092527866363525),
 ('失望', 0.4898751974105835),
 ('而终', 0.48545870184898376),
 ('悲伤', 0.4719094932079315),
 ('心碎', 0.46313390135765076),
 ('受伤', 0.4509361684322357),
 ('痛心', 0.45078450441360474),
 ('可怜', 0.4503190219402313),
 ('难兄难弟', 0.44628793001174927),
 ('心疼', 0.4447227716445923)]

In [8]:
import torch
from torch import nn
from torch.nn.utils.rnn import pad_sequence
from torch.utils.data import Dataset, DataLoader
import numpy as np

# Load the previously saved word2vec model from disk
word2vec = models.Word2Vec.load("../models/word2vec.model")

# 自定义数据集
class TextDataset(Dataset):
    """Dataset of reviews encoded as sequences of word2vec vectors.

    Every item is a pair ``(vectors, label)``: ``vectors`` is a float32 tensor of
    shape ``(num_tokens, vector_size)`` with ``num_tokens >= 1`` and ``label`` is a
    scalar tensor.

    :param df: frame with a ``review`` column (space-separated tokens) and a ``label`` column
    :param word2vec: trained model whose ``wv`` offers ``key_to_index``, ``vector_size``
                     and per-word lookup
    """

    def __init__(self, df, word2vec):
        # All sentences are embedded once, up front, so __getitem__ stays cheap.
        self.data = [self.process_sentence(s, word2vec) for s in df["review"].tolist()]
        self.label = df["label"].to_numpy()

    def process_sentence(self, sentence, word2vec):
        """Embed one space-separated sentence.

        Tokens missing from the vocabulary are skipped. If no token is known
        (or the sentence is empty) a single all-zero vector is returned, so the
        sequence never has length 0: a zero-length sequence cannot be padded with
        the others and is rejected by ``pack_padded_sequence``.

        :param sentence: tokens joined by single spaces
        :param word2vec: trained word2vec model
        :return: float32 array of shape ``(num_tokens, vector_size)``
        """
        keyed_vectors = word2vec.wv
        vectors = [keyed_vectors[w] for w in sentence.split(" ") if w in keyed_vectors.key_to_index]
        if not vectors:
            return np.zeros((1, keyed_vectors.vector_size), dtype=np.float32)
        return np.asarray(vectors, dtype=np.float32)

    def __getitem__(self, index):
        """Return the ``(vectors, label)`` tensors of the sample at ``index``."""
        data = self.data[index]
        label = self.label[index]
        return torch.tensor(data), torch.tensor(label)

    def __len__(self):
        """Number of samples."""
        return len(self.label)
    
    
def collate_fn(data):
    """
    Collate ``(sequence, label)`` samples into one padded batch.

    :param data: list of samples; element 0 of a sample is its sequence tensor, element 1 its label
    :return: the zero-padded batch (batch first, longest sequence first), the float32
             label tensor, and the list of true sequence lengths, all in the same order
    """
    # Order by decreasing length, as pack_padded_sequence expects by default.
    # sorted() builds a new list, so the caller's list is not reordered in place.
    ordered = sorted(data, key=lambda sample: len(sample[0]), reverse=True)
    data_length = [len(sample[0]) for sample in ordered]
    sequences = [sample[0] for sample in ordered]
    labels = [float(sample[1]) for sample in ordered]
    # Pad the shorter sequences with zeros so the batch forms one dense tensor.
    padded = pad_sequence(sequences, batch_first=True, padding_value=0)
    return padded, torch.tensor(labels, dtype=torch.float32), data_length

In [9]:
# 网络结构
class LSTM(nn.Module):
    """Bidirectional LSTM binary classifier for padded variable-length sequences.

    :param input_size: feature size of every time step
    :param hidden_size: hidden units per direction
    :param num_layers: number of stacked LSTM layers
    """

    def __init__(self, input_size, hidden_size, num_layers):
        super(LSTM, self).__init__()
        self.hidden_size = hidden_size
        self.num_layers = num_layers
        self.lstm = nn.LSTM(input_size, hidden_size, num_layers, batch_first=True, bidirectional=True)
        self.fc = nn.Linear(hidden_size * 2, 1)  # bidirectional: the classifier sees both directions
        self.sigmoid = nn.Sigmoid()

    def forward(self, x, lengths):
        """Classify a padded batch.

        :param x: padded batch of shape ``(batch, max_len, input_size)``
        :param lengths: true length of every sequence in ``x`` (any order)
        :return: tensor of shape ``(batch, 1)`` with sigmoid probabilities
        """
        # Create the initial states from the input itself so they share its device and
        # dtype; this removes the dependency on a notebook-level `device` variable.
        # Bidirectional, hence num_layers * 2 state slices.
        h0 = x.new_zeros(self.num_layers * 2, x.size(0), self.hidden_size)
        c0 = x.new_zeros(self.num_layers * 2, x.size(0), self.hidden_size)

        # enforce_sorted=False lets PyTorch sort internally and restore the batch order
        # afterwards; batches that are already sorted give the same result as before.
        packed_input = torch.nn.utils.rnn.pack_padded_sequence(
            input=x, lengths=lengths, batch_first=True, enforce_sorted=False
        )
        _, (h_n, _) = self.lstm(packed_input, (h0, c0))

        # The last two slices of h_n are the top layer's forward and backward final
        # states; concatenated they summarise the whole sequence.
        lstm_out = torch.cat([h_n[-2], h_n[-1]], 1)
        out = self.fc(lstm_out)
        out = self.sigmoid(out)
        return out

In [10]:
# Pick the compute device: CUDA when it is available, otherwise the CPU.
# An available Apple MPS backend is deliberately not used; such machines run on the CPU.
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

In [13]:
# Hyperparameters

# Learning rate
learning_rate = 5e-4
# Input dimension
# NOTE(review): not referenced by the visible cells — the model below is built with embed_size.
input_size = 768
# Number of training epochs
num_epochs = 100
# Batch size
batch_size = 100
# Word-vector dimension (fed to the LSTM as its input size)
embed_size = 100
# Hidden size per direction
hidden_size = 64
# Number of stacked LSTM layers
num_layers = 2

# Build the model and move it to the selected device
model = LSTM(embed_size, hidden_size, num_layers).to(device)

# Loss function and optimizer (the model outputs probabilities, hence BCELoss)
loss_fn = nn.BCELoss()
optimizer = torch.optim.Adam(model.parameters(), lr=learning_rate)

# Datasets and loaders
# Training set: reshuffled every epoch
train_data = TextDataset(df_train, word2vec)
train_loader = DataLoader(train_data, batch_size=batch_size, collate_fn=collate_fn, shuffle=True)

# Test set: not shuffled, so the per-batch averaged evaluation metrics are reproducible
test_data = TextDataset(df_test, word2vec)
test_loader = DataLoader(test_data, batch_size=batch_size, collate_fn=collate_fn, shuffle=False)

In [14]:
def train_step(model: torch.nn.Module,
               dataloader: torch.utils.data.DataLoader,
               loss_fn: torch.nn.Module,
               optimizer: torch.optim.Optimizer):
    """Train ``model`` for one pass over ``dataloader``.

    :param model: module called as ``model(X, lengths)`` that returns probabilities
    :param dataloader: iterable of ``(X, labels, lengths)`` batches that supports ``len()``
    :param loss_fn: loss applied to the flattened outputs and the labels
    :param optimizer: optimizer updating the parameters of ``model``
    :return: ``(train_loss, train_acc)``, both averaged over the batches
    """
    # Put model in train mode
    model.train()
    # Send batches to wherever the model lives rather than relying on a global variable.
    first_param = next(model.parameters(), None)
    target_device = first_param.device if first_param is not None else torch.device("cpu")
    # Setup train loss and train accuracy values
    train_loss, train_acc = 0, 0
    # Loop through data loader data batches
    for X, labels, lengths in dataloader:
        X = X.to(target_device)
        labels = labels.to(target_device)
        # 1. Forward pass; flatten (batch, 1) probabilities to (batch,)
        probs = model(X, lengths).view(-1)
        # 2. Calculate and accumulate loss
        loss = loss_fn(probs, labels)
        train_loss += loss.item()
        # 3. Optimizer zero grad
        optimizer.zero_grad()
        # 4. Loss backward. A fresh graph is built for every batch, so nothing has to be
        #    retained; retaining it only keeps memory alive.
        loss.backward()
        # 5. Optimizer step
        optimizer.step()
        # Accuracy of this batch: probabilities rounded to 0/1 and compared with the labels
        y_pred_class = torch.round(probs.detach())
        train_acc += (y_pred_class == labels).sum().item() / len(labels)

    # Adjust metrics to get average loss and accuracy per batch
    train_loss /= len(dataloader)
    train_acc /= len(dataloader)
    return train_loss, train_acc

def test_step(model: torch.nn.Module,
              dataloader: torch.utils.data.DataLoader,
              loss_fn: torch.nn.Module):
    # Put model in eval mode
    model.eval()
    # Setup test loss and test accuracy values
    test_loss, test_acc = 0, 0
    # Turn on inference context manager
    with torch.inference_mode():
        # Loop through DataLoader batches
        for i, (X, labels, lengths)  in enumerate(dataloader):
            # Send data to target device
            X = X.to(device)
            labels = labels.to(device)
            # 1. Forward pass
            outputs = model(X, lengths)
            logits = outputs.view(-1)
            # 2. Calculate  and accumulate loss
            loss = loss_fn(logits, labels)
            test_loss += loss.item()
            # Calculate and accumulate accuracy
            y_pred_class = torch.round(outputs).squeeze()
            test_acc += (y_pred_class == labels).sum().item() / len(labels)
            
    # Adjust metrics to get average loss and accuracy per batch 
    test_loss /= len(dataloader)
    test_acc /= len(dataloader)
    return test_loss, test_acc

In [15]:
from tqdm.auto import tqdm

# History of the per-epoch metrics, one list per metric.
results = {
    metric: []
    for metric in ("train_loss", "train_acc", "test_loss", "test_acc")
}

for epoch in tqdm(range(num_epochs)):
    # One optimisation pass over the training set ...
    train_loss, train_acc = train_step(
        model=model, dataloader=train_loader, loss_fn=loss_fn, optimizer=optimizer
    )
    # ... followed by an evaluation pass over the test set.
    test_loss, test_acc = test_step(
        model=model, dataloader=test_loader, loss_fn=loss_fn
    )

    summary = (
        f"Epoch: {epoch+1} | "
        f"train_loss: {train_loss:.4f} | "
        f"train_acc: {train_acc:.4f} | "
        f"test_loss: {test_loss:.4f} | "
        f"test_acc: {test_acc:.4f}"
    )
    print(summary)

    # Record this epoch's numbers.
    epoch_metrics = (
        ("train_loss", train_loss),
        ("train_acc", train_acc),
        ("test_loss", test_loss),
        ("test_acc", test_acc),
    )
    for metric, value in epoch_metrics:
        results[metric].append(value)
print("Finished training!")

  0%|          | 0/100 [00:00<?, ?it/s]


KeyboardInterrupt



In [ ]:
from pathlib import Path

# Directory for saved models; created on demand together with any missing parents.
MODEL_PATH = Path("../models/")
MODEL_PATH.mkdir(exist_ok=True, parents=True)

# Full path of the checkpoint file inside that directory.
MODEL_NAME = "01_lstm_model.pth"
MODEL_SAVE_PATH = MODEL_PATH.joinpath(MODEL_NAME)

# Persist only the state_dict, i.e. the learned parameters rather than the whole module object.
print(f"Saving model to: {MODEL_SAVE_PATH}")
torch.save(model.state_dict(), MODEL_SAVE_PATH)