In [17]:
import time
import re

import torch
import torch.nn as nn
import torch.optim as optim
from torch.utils.data.dataset import random_split
from torchtext.data.functional import to_map_style_dataset
from torchtext import data
from torchtext.vocab import build_vocab_from_iterator
import jieba
from torch.utils.data import Dataset, DataLoader
import pandas as pd

device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

In [18]:

def tokenizer(text):
    text = re.sub('@[\u4e00-\u9fa5]*|#(.)*#'," ", text) #先去除用户名和话题标签
    text = re.sub('[^\u4e00-\u9fa5]','',text) #再去除标点和英文
    text = jieba.lcut(text) #分词
    return text

def yield_tokens(data_iter):
    for _, text in data_iter.iterrows():
        yield tokenizer(text['review'])

raw_data = pd.read_csv(r'BaseCode\WeiboSentimentClassification\data\weibo_senti_100k.csv')

vocab = build_vocab_from_iterator(yield_tokens(raw_data), min_freq=5, specials=['<unk>', '<pad>'])
vocab.set_default_index(vocab["<unk>"])


In [19]:
label = raw_data['label']
text = [vocab(tokenizer(s)) for s in raw_data['review']]
padded_text = torch.nn.utils.rnn.pad_sequence([torch.tensor(s,dtype=torch.float32) for s in text], batch_first=True).tolist()


In [20]:
class TextCNNDataSet(Dataset):
    def __init__(self, data, data_targets):
        self.content = data
        self.pos = data_targets

    def __getitem__(self, index):
        return self.content[index], self.pos[index]

    def __len__(self):
        return len(self.pos)

data_iter = TextCNNDataSet(padded_text, label)

In [21]:
text_pipeline = lambda x: vocab(tokenizer(x))
label_pipeline = lambda x: int(x) 

def collate_batch(batch):
    label_list, text_list, lengths_list = [], [], []
    for (_text, _label) in batch:
        label_list.append(label_pipeline(_label))
        text_list.append(_text)
        
    label_list = torch.tensor(label_list, dtype=torch.float32) 
    text_list = torch.tensor(text_list, dtype=torch.int64)   


    lengths_list = [len(sentence) for sentence in text_list]
    lengths_list = torch.tensor(lengths_list, dtype=torch.int64)
    return  text_list.to(device), label_list.to(device), lengths_list.to(device)

In [22]:
class SentimentAnalysis(nn.Module):
    def __init__(self, vocab_size, embedding_dim, hidden_dim, output_dim, n_layers, bidirectional, dropout):
        super().__init__()

        self.embedding = nn.Embedding(vocab_size, embedding_dim, sparse=False)
        self.rnn = nn.LSTM(embedding_dim, hidden_dim, num_layers=n_layers, bidirectional=bidirectional, dropout=dropout)
        self.fc = nn.Linear(hidden_dim * 4, output_dim)
        self.dropout = nn.Dropout(dropout)
        self.sigmoid = nn.Sigmoid()

        self.layers = nn.ModuleList([self.embedding, self.rnn, self.fc, self.dropout, self.sigmoid])

    def forward(self, text, text_length):
        embedded = self.embedding(text.T)
        output, (hidden, cell) = self.rnn(embedded)
        hidden = torch.cat((output[0], output[-1]), dim=1)
        hidden = self.dropout(hidden)
        result = self.fc(hidden.squeeze(0))
        result = self.sigmoid(result)
        return result


In [23]:
VOCAB_SIZE = len(vocab)
EMBEDDING_DIM = 64
BATCH_SIZE = 64
HIDDEN_DIM = 64
OUTPUT_DIM = 1
N_LAYERS = 2
BIDIRECTIONAL = True
DROPOUT = 0.2

LR=0.001

MAX_LEN = len(padded_text[0])

In [8]:
model = SentimentAnalysis(VOCAB_SIZE, EMBEDDING_DIM, HIDDEN_DIM, OUTPUT_DIM, N_LAYERS, BIDIRECTIONAL, DROPOUT)
criterion = nn.BCEWithLogitsLoss()
optimizer = optim.Adam(model.parameters(), lr=LR)
# scheduler = torch.optim.lr_scheduler.StepLR(optimizer, 1.0, gamma=0.1) # 学习率衰减

model = model.to(device)
criterion = criterion.to(device)

In [9]:
def train(model, dataloader, optimizer, criterion):
    epoch_loss = 0
    epoch_acc = 0

    model.train()

    for index, (text, label, text_lengths) in enumerate(dataloader):
        optimizer.zero_grad()

        predictions = model(text, text_lengths).squeeze(1)
        loss = criterion(predictions, label)

        # 计算模型在该batch上的acc
        rounded_preds = torch.round(predictions)
        # loss = criterion(rounded_preds, label)
        correct = (rounded_preds == label).float()
        acc = correct.sum() / len(correct)

        loss.backward()
        optimizer.step()
        
        # #梯度裁剪，防止梯度爆炸
        # torch.nn.utils.clip_grad_norm_(model.parameters(), 0.1)

        epoch_loss += loss.item()
        epoch_acc += acc.item()

    return epoch_loss / len(dataloader), epoch_acc / len(dataloader)


In [10]:
def evaluate(dataloader):
    model.eval()
    total_loss = 0
    total_acc = 0

    with torch.no_grad():
        for index, (text, label, text_lengths) in enumerate(dataloader):
            predictions = model(text, text_lengths).squeeze(1)
            loss = criterion(predictions, label)

            rounded_preds = torch.round(predictions)
            correct = (rounded_preds == label).float()
            acc = correct.sum() / len(correct)

            total_loss += loss.item()
            total_acc += acc.item()

    return total_loss/len(dataloader), total_acc/len(dataloader)

            

In [11]:
data_size = len(data_iter)
train_size = int(data_size * 0.80)
valid_size = int(data_size * 0.15)
test_size = data_size - train_size - valid_size

data_set = to_map_style_dataset(data_iter)
train_set, valid_set, test_set = random_split(data_set, [train_size, valid_size, test_size])

train_dataloader = DataLoader(train_set, batch_size=BATCH_SIZE,
                              shuffle=True, collate_fn=collate_batch)
valid_dataloader = DataLoader(valid_set, batch_size=BATCH_SIZE,
                              shuffle=True, collate_fn=collate_batch)
test_dataloader = DataLoader(test_set, batch_size=BATCH_SIZE,
                             shuffle=True, collate_fn=collate_batch)

N_EPOCHS = 5

for epoch in range(N_EPOCHS):
    
    train_loss, train_acc = train(model, train_dataloader, optimizer, criterion)
    valid_loss, valid_acc = evaluate(valid_dataloader)

    print(f'Epoch: {epoch+1:02} \n Train Loss: {train_loss:.3f} | Train Acc: {train_acc*100:.2f}%')
    print(f'valid Loss: {valid_loss:.3f} | valid Acc: {valid_acc*100:.2f}%')

Epoch: 01 
 Train Loss: 0.647 | Train Acc: 66.33%
valid Loss: 0.603 | valid Acc: 77.88%
Epoch: 02 
 Train Loss: 0.602 | Train Acc: 77.57%
valid Loss: 0.587 | valid Acc: 81.25%
Epoch: 03 
 Train Loss: 0.583 | Train Acc: 82.62%
valid Loss: 0.578 | valid Acc: 82.69%
Epoch: 04 
 Train Loss: 0.580 | Train Acc: 82.56%
valid Loss: 0.572 | valid Acc: 86.59%
Epoch: 05 
 Train Loss: 0.572 | Train Acc: 85.28%
valid Loss: 0.572 | valid Acc: 84.91%


In [12]:
for batch in train_dataloader:
    print(batch)

    print(batch[1].size())
    break
    

(tensor([[ 5114,   102,    14,  ...,     0,     0,     0],
        [  171,   221,   339,  ...,     0,     0,     0],
        [  284,  2628, 24332,  ...,     0,     0,     0],
        ...,
        [   37,     7,   965,  ...,     0,     0,     0],
        [   26,   424,     3,  ...,     0,     0,     0],
        [ 9062,   136,   474,  ...,     0,     0,     0]]), tensor([1., 0., 0., 0., 0., 1., 0., 0., 0., 1., 1., 0., 1., 1., 1., 0., 1., 1.,
        1., 0., 1., 1., 1., 0., 1., 1., 0., 1., 0., 1., 1., 1., 0., 0., 1., 0.,
        1., 1., 1., 0., 0., 1., 1., 0., 0., 1., 0., 0., 0., 1., 1., 1., 0., 0.,
        1., 1., 1., 0., 0., 0., 0., 0., 0., 0.]), tensor([130, 130, 130, 130, 130, 130, 130, 130, 130, 130, 130, 130, 130, 130,
        130, 130, 130, 130, 130, 130, 130, 130, 130, 130, 130, 130, 130, 130,
        130, 130, 130, 130, 130, 130, 130, 130, 130, 130, 130, 130, 130, 130,
        130, 130, 130, 130, 130, 130, 130, 130, 130, 130, 130, 130, 130, 130,
        130, 130, 130, 130, 130, 1

In [13]:
l = [[1,2],
[3,4]]

a = torch.tensor(l)

In [14]:
a.tolist()

[[1, 2], [3, 4]]

In [15]:
# from torch.utils.tensorboard import SummaryWriter
# # tensorboard --logdir=./ --port 8123

# # 创建SummaryWriter对象，log_dir为保存TensorBoard日志的路径
# writer = SummaryWriter(log_dir='logs')

# # 定义模型
# model = SentimentAnalysis(VOCAB_SIZE, EMBEDDING_DIM, HIDDEN_DIM, OUTPUT_DIM, N_LAYERS, BIDIRECTIONAL, DROPOUT)

# # 将模型添加到TensorBoard中
# text_ = torch.zeros(BATCH_SIZE, MAX_LEN).long()
# text_length_ = torch.zeros(BATCH_SIZE).long()

# writer.add_graph(model, (text_, text_length_))

# # 关闭SummaryWriter对象
# writer.close()