In [1]:
import numpy as np
import pandas as pd
import pickle as pkl
import json

import torch
import torch.nn as nn
import torch.nn.functional as F
import torchsnooper
from torch.utils.data import TensorDataset, DataLoader,Dataset
from torch.autograd import Variable

from dataFunc import tensor_load, tensor_save

In [2]:
# True when a CUDA device is available; used below to decide whether the data tensors are moved to the GPU.
USE_CUDA = torch.cuda.is_available()

In [32]:
# Load the training data
train_X = tensor_load('./dataset/train_X.npy')
train_y = tensor_load('./dataset/train_y.npy')

# Load the validation data
valid_X = tensor_load('./dataset/valid_X.npy')
valid_y = tensor_load('./dataset/valid_y.npy')

In [33]:
# torch needs non-negative class labels, so shift every label up by one.
# NOTE(review): presumably the raw labels start at -1 — confirm against the dataset.
# These are in-place updates: re-running this cell shifts the labels a second time.
train_y += 1
valid_y += 1

In [34]:
# Cast the validation tensors to int64. The deprecated torch.autograd.Variable
# wrapper is dropped: since PyTorch 0.4 it simply returns a tensor.
valid_X, valid_y = valid_X.long(), valid_y.long()

In [102]:
# Move both splits to the GPU when one is available
if USE_CUDA:
    train_X, train_y = train_X.cuda(), train_y.cuda()
    valid_X, valid_y = valid_X.cuda(), valid_y.cuda()

# Build the training and validation datasets
train_dataset = TensorDataset(train_X, train_y)
valid_dataset = TensorDataset(valid_X, valid_y)

In [9]:
# Load the vocabulary (token -> index mapping).
# The encoding is given explicitly so that reading the file does not depend on
# the platform's locale default.
with open('./dataset/vocab_dict.json', 'r', encoding='utf-8') as f:
    vocab_dict = json.load(f)


In [77]:
class WordAVGModel(nn.Module):
    """Sentence classifier that averages word embeddings.

    Every token id is embedded, the embeddings of one sentence are averaged along
    the sequence axis, and the averaged vector is fed to a single linear layer.

    Args:
        vocab_size: number of rows of the embedding table.
        embedding_size: dimensionality of each word vector.
        output_size: number of values produced per sentence.
        pad_idx: index of the padding token, forwarded to ``nn.Embedding`` as
            ``padding_idx``.
    """

    def __init__(self,vocab_size, embedding_size, output_size, pad_idx):
        super(WordAVGModel, self).__init__()
        self.embed = nn.Embedding(vocab_size, embedding_size, padding_idx = pad_idx)
        self.linear = nn.Linear(embedding_size, output_size)

    def forward(self, text):
        """Compute the output of the linear layer for a batch of sentences.

        Args:
            text: integer tensor of token ids, shape (batch_size, seq_len).

        Returns:
            Tensor of shape (batch_size, output_size).
        """
        embedded = self.embed(text)  # (batch_size, seq_len, embedding_size)

        # Average the word vectors of each sentence to get one vector per sentence.
        # The mean runs over all seq_len positions, padding included.
        # Reducing only dim 1 keeps the batch axis even for a batch of one sample,
        # which an argument-less squeeze() would silently drop.
        pooled = embedded.mean(dim=1)  # (batch_size, embedding_size)
        return self.linear(pooled)

In [98]:
#正确率计算

from sklearn.metrics import f1_score, precision_score, recall_score

def accuracy_computing(preds, y, method = 'acc'):
    """Score a batch of predictions against the true labels.

    Args:
        preds: float tensor of per-class scores, shape (batch_size, n_classes).
        y: integer tensor of true class indices, shape (batch_size,).
        method: ``'acc'`` for plain accuracy, ``'map'`` for the macro-averaged
            F1 score over the labels 0, 1 and 2 (despite its name, this branch
            computes F1).

    Returns:
        A 0-d tensor for ``'acc'``, the value returned by ``f1_score`` for
        ``'map'``, and ``None`` for any other ``method``.
    """
    if method == 'acc':
        rounded_preds = preds.argmax(dim = 1)
        correct = (rounded_preds == y).float()
        acc = correct.sum() / len(correct)
    elif method == 'map':
        # f1_score compares class labels, so the per-class scores must be reduced
        # to a predicted class index first; passing the raw 2-D scores is rejected.
        predicted = preds.argmax(dim = 1)
        acc = f1_score(y.cpu().detach().numpy(),
                       predicted.cpu().detach().numpy(),
                       average = 'macro',
                       labels=[0, 1, 2])
    else:
        acc = None
    return acc
    

In [12]:
# Hyper-parameter definitions
VOCAB_SIZE = len(vocab_dict)
EMBEDDING_SIZE = 128
PAD_IDX = vocab_dict['<pad>'] # index of the <pad> token
UNK_IDX = vocab_dict['<unk>'] # index of the <unk> token

OUTPUT_SIZE = 3 # number of values the model outputs per sentence (passed as output_size)

In [82]:
# Training configuration
BATCH_SIZE = 64
LEARNING_RATE = 0.001
epoches = 10  # number of epochs the training loop runs
# Device the model and the loss are moved to
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

In [103]:
# Training batches are reshuffled every epoch.
train_loader = DataLoader(train_dataset,
                        batch_size = BATCH_SIZE,
                        shuffle = True,
                        num_workers = 0)

# Validation only aggregates metrics over the whole split, so a fixed order is
# used: shuffling adds nothing and makes runs harder to compare.
valid_loader = DataLoader(valid_dataset,
                         batch_size = BATCH_SIZE,
                         shuffle = False,
                         num_workers = 0)

In [78]:
# Model instantiation
model = WordAVGModel(vocab_size = VOCAB_SIZE, 
                     embedding_size = EMBEDDING_SIZE,
                     output_size = OUTPUT_SIZE,
                     pad_idx = PAD_IDX)

# Set the embedding rows of <pad> and <unk> to zero before training
model.embed.weight.data[PAD_IDX] = torch.zeros(EMBEDDING_SIZE)
model.embed.weight.data[UNK_IDX] = torch.zeros(EMBEDDING_SIZE)

In [79]:
# Set up the loss function and the optimizer.
# The model is moved to the target device first: the torch.optim documentation
# recommends constructing the optimizer only after the model has been moved.
model = model.to(device)
Loss = nn.CrossEntropyLoss().to(device)

optimizer = torch.optim.Adam(model.parameters(),lr = LEARNING_RATE)

In [64]:
device

device(type='cuda')

In [104]:
# Single-epoch training function
#@torchsnooper.snoop()
def train(model, Loss, optimizer, train_loader):
    """Train ``model`` for one pass over ``train_loader``.

    Args:
        model: the network to optimise; switched to training mode.
        Loss: loss callable taking ``(output, target)``.
        optimizer: optimizer stepped once per batch.
        train_loader: iterable yielding ``(batch_x, batch_y)`` tensor pairs.

    Returns:
        ``(mean_loss, mean_accuracy)``, both weighted by batch size.

    Raises:
        ValueError: if ``train_loader`` yields no samples.
    """
    epoch_loss, epoch_acc = 0., 0.
    total_len = 0
    # Feed every batch on the device the model lives on, so the function does not
    # depend on the data having been moved there beforehand.
    model_device = next(model.parameters()).device
    model.train()
    for batch_x, batch_y in train_loader:
        batch_x = batch_x.to(model_device).long()
        # reshape(-1) instead of squeeze(): the target stays 1-D even when the
        # batch holds a single sample.
        batch_y = batch_y.to(model_device).long().reshape(-1)
        output = model(batch_x)

        acc = accuracy_computing(output, batch_y, method = 'acc')
        loss = Loss(output, batch_y)

        # Gradient step
        optimizer.zero_grad()
        loss.backward()
        optimizer.step()

        # Weight the per-batch means by batch size so the last, possibly smaller,
        # batch is not over-represented.
        batch_len = len(batch_y)
        epoch_loss += loss.item() * batch_len
        epoch_acc += acc.item() * batch_len
        total_len += batch_len

    if total_len == 0:
        raise ValueError("train_loader yielded no samples")

    return epoch_loss / total_len, epoch_acc / total_len
        

In [105]:
# Model evaluation
def evaluate(model, Loss, optimizer, valid_loader):
    """Compute the mean loss and accuracy of ``model`` over ``valid_loader``.

    Args:
        model: the network to evaluate; put in eval mode for the duration of the
            call and restored to its previous mode afterwards.
        Loss: loss callable taking ``(output, target)``.
        optimizer: unused; kept so the signature mirrors ``train``.
        valid_loader: iterable yielding ``(batch_x, batch_y)`` tensor pairs.

    Returns:
        ``(mean_loss, mean_accuracy)``, both weighted by batch size.

    Raises:
        ValueError: if ``valid_loader`` yields no samples.
    """
    epoch_loss, epoch_acc = 0., 0.
    total_len = 0
    model_device = next(model.parameters()).device
    was_training = model.training
    model.eval()
    try:
        # No gradients are needed for evaluation; skipping graph construction
        # saves memory and time.
        with torch.no_grad():
            for batch_x, batch_y in valid_loader:
                batch_x = batch_x.to(model_device).long()
                # reshape(-1) instead of squeeze(): the target stays 1-D even
                # when the batch holds a single sample.
                batch_y = batch_y.to(model_device).long().reshape(-1)
                output = model(batch_x)

                acc = accuracy_computing(output, batch_y, method = 'acc')
                loss = Loss(output, batch_y)

                batch_len = len(batch_y)
                epoch_loss += loss.item() * batch_len
                epoch_acc += acc.item() * batch_len
                total_len += batch_len
    finally:
        # Put the model back in whichever mode the caller had it in.
        model.train(was_training)

    if total_len == 0:
        raise ValueError("valid_loader yielded no samples")

    return epoch_loss / total_len, epoch_acc / total_len
        

In [107]:
# Start training
best_valid_acc = 0.
best_epoch = 0
best_valid_loss = 0.

for epoch in range(epoches):
    print("进行第{}个epoch".format(epoch))

    train_loss, train_acc = train(model, Loss, optimizer, train_loader)
    valid_loss, valid_acc = evaluate(model, Loss, optimizer, valid_loader)

    # Checkpoint whenever validation accuracy improves, and remember the loss of
    # that same epoch so the final summary reports a real value instead of 0.
    if valid_acc > best_valid_acc:
        best_epoch = epoch
        best_valid_acc = valid_acc
        best_valid_loss = valid_loss
        torch.save(model.state_dict(), 'wordavg-model.pth')

    print('Epoch:', epoch, 'Train Loss:', train_loss, 'Train Accuracy:', train_acc)
    print('Epoch:', epoch, 'Valid Loss:', valid_loss, 'Valid Accuracy:', valid_acc)

print("Train finished!")
print('Best Epoch:', best_epoch, 'Best Valid Loss:', best_valid_loss, 'Best Valid Accuracy:', best_valid_acc)

进行第0个epoch
Epoch: 0 Train Loss: 0.8317320570868868 Train Accuracy: 0.6365211495665017
Epoch: 0 Valid Loss: 0.8348728526352208 Valid Accuracy: 0.6314785055196539
进行第1个epoch
Epoch: 1 Train Loss: 0.8315085263369422 Train Accuracy: 0.6368099237819824
Epoch: 1 Valid Loss: 0.8359780839381327 Valid Accuracy: 0.633085576500841
进行第2个epoch
Epoch: 2 Train Loss: 0.8315407373419437 Train Accuracy: 0.6375381370214114
Epoch: 2 Valid Loss: 0.8361260405310956 Valid Accuracy: 0.6333869023943771
进行第3个epoch


KeyboardInterrupt: 

In [None]:
import spacy

# NOTE(review): 'zh' looks like a legacy shortcut name; newer spaCy releases expect
# a full pipeline package name — confirm which model is installed.
nlp = spacy.load('zh')

def predict_sentence():
    """Placeholder for single-sentence prediction.

    The original cell ended at the bare signature, which is a syntax error and
    stops the notebook from running top to bottom. The stub keeps the name and
    signature and fails loudly until the body is written.

    Raises:
        NotImplementedError: always.
    """
    # TODO: tokenize with `nlp`, map tokens to ids, and run the model.
    raise NotImplementedError("predict_sentence is not implemented yet")

In [12]:
# Model parameter count
def count_parameters(model):
    """Return the total number of elements over the model's trainable parameters.

    Parameters whose ``requires_grad`` flag is False are not counted.
    """
    trainable_total = 0
    for param in model.parameters():
        if not param.requires_grad:
            continue
        trainable_total += param.numel()
    return trainable_total

count_parameters(model)

4807501

In [12]:
# Number of elements in the first parameter tensor yielded by model.parameters()
next(model.parameters()).numel()

4807400