In [112]:
import pandas as pd
import torch 
import torch.nn as nn
import pickle
import pandas as pd
import matplotlib.pyplot as plt
from collections.abc import Iterable, Iterator
from torch.utils.data import DataLoader,Dataset
from tqdm import tqdm
import random
import numpy as np
from torch.utils.tensorboard import SummaryWriter

In [113]:
class CommentsClassifier(nn.Module):
    """Sequence classifier: token embedding -> single-layer LSTM -> linear head.

    Args:
        vocab_szie: Number of rows in the embedding table. The spelling is
            kept as-is because callers pass this argument by keyword.
        embedding_size: Dimension of each token embedding.
        rnn_hidden_size: Hidden size of the LSTM.
        num_labels: Number of output classes.
    """

    def __init__(self, vocab_szie, embedding_size, rnn_hidden_size, num_labels):
        super().__init__()
        # Attribute names (emb / rnn / classifier) are the state_dict key
        # prefixes, so they must not be renamed.
        # Index 0 is declared as the padding index of the embedding table.
        self.emb = nn.Embedding(vocab_szie, embedding_size, padding_idx=0)
        self.rnn = nn.LSTM(input_size=embedding_size, hidden_size=rnn_hidden_size, batch_first=True)
        self.classifier = nn.Linear(rnn_hidden_size, num_labels)

    def forward(self, X):
        """Return unnormalised class scores (logits) for a batch of index sequences.

        Args:
            X: Integer tensor of token indices, batch first: (batch_size, seq_len).

        Returns:
            Tensor of shape (batch_size, num_labels).
        """
        out = self.emb(X)  # (batch_size, seq_len, embedding_size)
        output, _ = self.rnn(out)  # (batch_size, seq_len, rnn_hidden_size)
        # The LSTM output at the last sequence position feeds the classifier;
        # that position is used as-is, whether or not it is padding.
        return self.classifier(output[:, -1, :])  # (batch_size, num_labels)

In [114]:
class Vocabulary:
    """Thin wrapper around a token -> index mapping."""

    def __init__(self, vocab):
        # Mapping from token to its integer index.
        self.vocab = vocab

    @classmethod
    def from_documents(cls, documents):
        """Build a vocabulary from the individual elements of every document.

        Each document is iterated element by element (character by character
        for strings). "<PAD>" and "<UNK>" take indices 0 and 1; the remaining
        tokens follow in sorted order.
        """
        unique_tokens = {ch for doc in documents for ch in doc}
        # A set has no defined order, so sort it to get the same index
        # assignment every time the vocabulary is built.
        ordered = ["<PAD>", "<UNK>", *sorted(unique_tokens)]
        return cls(dict(zip(ordered, range(len(ordered)))))

In [115]:
# Data preparation: load the pickled dataset and pull out the comment texts
# and their labels as arrays.
data = pd.read_pickle(r"F:\VsConde-Python\chen\data\comments.bin")
comments = data["Comment"].values
labels = data["labels"].values

# Build a vocabulary from every comment in the dataset.
vocab = Vocabulary.from_documents(comments)

In [116]:
# Restore the saved objects (vocabulary and model weights) from disk.
# The path is a raw string: written as a normal literal, the backslashes form
# invalid escape sequences (\V, \c, \s, \m), which emit a SyntaxWarning on
# recent Python versions. The resulting path is unchanged.
# NOTE(review): weights_only=False allows arbitrary unpickling (presumably
# needed because the file stores a Vocabulary object) - only load checkpoints
# from a trusted source.
state = torch.load(r"F:\VsConde-Python\chen\source\saved_models\model_objs.bin", weights_only=False)
# Replaces any vocabulary built earlier with the one stored in the checkpoint.
vocab = state["model_vacob"]

In [117]:
# --- Model dimensions (passed to CommentsClassifier) ---
vocab_size = len(vocab.vocab)  # one embedding row per vocabulary entry
embedding_size = 128
rnn_hidden_size = 256
num_labels = 2

# --- Optimisation settings ---
epoch = 20
batch_size = 64
lr = 1e-2

# --- Runtime: use the GPU when one is available ---
device = "cuda" if torch.cuda.is_available() else "cpu"

In [118]:
# Instantiate the classifier from the configured dimensions.
# `vocab_szie` is the keyword exactly as spelled in CommentsClassifier.__init__.
model = CommentsClassifier(
    vocab_szie=vocab_size, embedding_size=embedding_size,
    rnn_hidden_size=rnn_hidden_size, num_labels=num_labels,
)

In [119]:
# Load the saved parameters from the checkpoint into the model; the returned
# key-matching report is displayed as the cell output.
model.load_state_dict(state["model_state"])


<All keys matched successfully>

In [120]:
# Show the module structure (layers and their sizes).
print(model)

CommentsClassifier(
  (emb): Embedding(4415, 128, padding_idx=0)
  (rnn): LSTM(128, 256, batch_first=True)
  (classifier): Linear(in_features=256, out_features=2, bias=True)
)


In [154]:
# Single-sentence inference: encode the text character by character,
# right-pad with index 0 to a fixed length, and run the classifier.
max_len = 125
text = "这个电影很好"  # named `text` so the builtin `input` is not shadowed

# Characters missing from the vocabulary map to <UNK>; a plain `.get` would
# return None and break tensor creation. Text longer than max_len is truncated
# instead of raising IndexError, and an empty string is handled as well.
unk_index = vocab.vocab["<UNK>"]
input_index = torch.tensor(
    [vocab.vocab.get(ch, unk_index) for ch in text[:max_len]], dtype=torch.long
)

X = torch.zeros((1, max_len), dtype=torch.long)
X[0, :len(input_index)] = input_index

# NOTE(review): the classifier reads the last time step, which is a padding
# position for short texts; presumably training used the same right-padding - confirm.
model.eval()
with torch.no_grad():  # inference only: no autograd graph needed
    # 0: negative, 1: positive
    pred_prob = model(X)  # raw class scores (logits), not probabilities
print(pred_prob)
pred = pred_prob.argmax(-1)
pred.item()

tensor([[-1.2605,  1.3717]], grad_fn=<AddmmBackward0>)


1

In [149]:
# Single-sentence inference: encode the text character by character,
# right-pad with index 0 to a fixed length, and run the classifier.
max_len = 125
text = "这个电影很差"  # named `text` so the builtin `input` is not shadowed

# Characters missing from the vocabulary map to <UNK>; a plain `.get` would
# return None and break tensor creation. Text longer than max_len is truncated
# instead of raising IndexError, and an empty string is handled as well.
unk_index = vocab.vocab["<UNK>"]
input_index = torch.tensor(
    [vocab.vocab.get(ch, unk_index) for ch in text[:max_len]], dtype=torch.long
)

X = torch.zeros((1, max_len), dtype=torch.long)
X[0, :len(input_index)] = input_index

# NOTE(review): the classifier reads the last time step, which is a padding
# position for short texts; presumably training used the same right-padding - confirm.
model.eval()
with torch.no_grad():  # inference only: no autograd graph needed
    # 0: negative, 1: positive
    pred_prob = model(X)  # raw class scores (logits), not probabilities
print(pred_prob)
pred = pred_prob.argmax(-1)
pred.item()

tensor([[ 1.3652, -1.2173]], grad_fn=<AddmmBackward0>)


0