# 微博情感分析

In [1]:
import torch
import torch.nn as nn
from torch.utils.data import DataLoader,TensorDataset
import torch.nn.functional as F 
import time
from tqdm import tqdm
import torchtext.vocab as Vocab
import argparse
import os
import pandas as pd
import numpy as np
import collections
import random
from sklearn.preprocessing import OneHotEncoder
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.model_selection import train_test_split
import datetime

## 数据介绍

标签编码（label encoding）：

- 0：喜悦 (joy)
- 1：愤怒 (anger)
- 2：厌恶 (disgust)
- 3：低落 (low mood)

In [2]:
# Load the vocabulary used to turn tokens into integer ids. Later cells call
# dictionary.item() to unwrap the stored object (presumably a dict mapping
# token -> id) from the array returned by np.load.
# NOTE(review): allow_pickle=True unpickles the file contents, which can execute
# arbitrary code -- only load dictionary.npy from a trusted source.
dictionary=np.load("dictionary.npy",allow_pickle=True)
#torch.set_default_tensor_type(torch.FloatTensor)

In [3]:
def labels_encode(path="labels.txt"):
    """Read integer class labels from ``path`` and one-hot encode them.

    The file is read as UTF-8 text and every line is parsed with ``int``.

    Args:
        path: Label file to read. Defaults to "labels.txt", the file the
            original zero-argument version always used.

    Returns:
        A dense 2-D NumPy array with one row per line of the file and one
        column per distinct label value found in it.

    Raises:
        ValueError: If a line cannot be parsed as an integer.
    """
    # A context manager closes the file deterministically; the previous bare
    # open(...).readlines() left the handle to the garbage collector.
    with open(path, encoding="utf-8") as label_file:
        labels = [int(line) for line in label_file]
    # OneHotEncoder expects a 2-D (n_samples, n_features) input, so turn the
    # flat label vector into a single-column matrix.
    label_column = np.expand_dims(np.array(labels), 1)
    encoder = OneHotEncoder()
    return encoder.fit_transform(label_column).toarray()

In [4]:
# One-hot label matrix: one row per line of labels.txt.
y=labels_encode()

In [5]:
def texts_encode(X):
    """Replace every token in every text with its integer vocabulary id.

    Args:
        X: Mutable sequence (e.g. a list) of token sequences. Every token is
            looked up in the module-level ``dictionary`` vocabulary.

    Returns:
        The same ``X`` object, whose entries are now lists of ``np.int64`` ids.

    Raises:
        KeyError: If a token is missing from the vocabulary (assuming the
            vocabulary is a plain dict). In that case ``X`` is left untouched.
    """
    # Unwrap the vocabulary once instead of calling dictionary.item() for
    # every single token.
    token_to_id = dictionary.item()
    encoded = [[np.int64(token_to_id[token]) for token in tokens] for tokens in X]
    # Commit in one step so a failed lookup cannot leave X half-encoded, while
    # still updating the caller's object in place as before.
    X[:] = encoded
    return X


In [6]:
# Read the corpus: each line of texts.txt is split on whitespace into a list of
# tokens. The context manager closes the file instead of leaking the handle.
with open("texts.txt",encoding="utf-8") as texts_file:
    X=[line.split() for line in texts_file]

In [7]:
# Convert token lists to id lists. X is rebound (and modified in place), so
# re-running this cell without first re-running the cell that reads texts.txt
# would look up ids, not tokens, in the vocabulary.
X=texts_encode(X)

In [8]:
def cut(X,length):
    """Force every sequence in ``X`` to hold exactly ``length`` items, in place.

    Sequences that are too short are right-padded with 0; sequences that are
    too long keep only their first ``length`` items.

    Args:
        X: Mutable sequence of lists; its entries are replaced in place.
        length: Target number of items per sequence.

    Returns:
        The same ``X`` object, after modification.
    """
    for idx in range(len(X)):
        seq = X[idx]
        missing = length - len(seq)
        if missing >= 0:
            # Short (or exact) sequence: append as many zeros as are missing.
            X[idx] = seq + [0] * missing
        else:
            # Long sequence: drop everything past the first `length` items.
            X[idx] = seq[:length]
    return X

In [9]:
# Pad/truncate every id sequence to 20 items so that all rows have the same
# length and can be stacked into a single 2-D array.
X=np.array(cut(X,20))

In [10]:
class Rnn_simple(nn.Module):
    """Bidirectional LSTM sequence classifier.

    Token ids are embedded, passed through a stacked bidirectional LSTM, and the
    LSTM outputs at the first and last time steps are concatenated and projected
    to one logit per class.

    Args:
        vocab: Number of rows in the embedding table.
        embed_size: Dimension of each token embedding.
        num_hiddens: LSTM hidden size per direction.
        num_layers: Number of stacked LSTM layers.
        num_classes: Number of output logits. Defaults to 4, the value that was
            previously hard-coded, so existing four-argument calls are unchanged.
    """
    def __init__(self,vocab,embed_size,num_hiddens,num_layers,num_classes=4):
        super().__init__()
        self.embedding=nn.Embedding(vocab,embed_size)
        self.encoder=nn.LSTM(input_size=embed_size,hidden_size=num_hiddens,num_layers=num_layers,bidirectional=True)
        # Each time step yields 2*num_hiddens features (both directions) and
        # forward() concatenates two time steps, hence 4*num_hiddens inputs.
        self.decoder=nn.Linear(4*num_hiddens,num_classes)

    def forward(self,X):
        """Compute class logits for a batch of id sequences.

        Args:
            X: 2-D integer tensor laid out as (batch, seq_len).

        Returns:
            Tensor of shape (batch, num_classes) holding unnormalised logits.
        """
        # nn.LSTM is used with its default batch_first=False, so swap the two
        # axes to get a time-major (seq_len, batch) layout before embedding.
        embedded=self.embedding(X.permute(1,0))
        outputs,_=self.encoder(embedded)
        # Summarise the sequence by its first and last time-step outputs.
        features=torch.cat((outputs[0],outputs[-1]),-1)
        return self.decoder(features)

In [11]:
# Model hyper-parameters. The embedding table gets one row more than the
# vocabulary has entries.
# NOTE(review): presumably the extra row is for id 0, which cut() uses as the
# padding value -- confirm that the ids stored in dictionary start at 1.
vocab,embed_size,num_hiddens,num_layers=len(dictionary.item())+1,100,100,2

In [12]:
# Build the classifier with the hyper-parameters defined above.
net=Rnn_simple(vocab,embed_size,num_hiddens,num_layers)

In [13]:
# Display the module structure (layer types and sizes).
net

Rnn_simple(
  (embedding): Embedding(24759, 100)
  (encoder): LSTM(100, 100, num_layers=2, bidirectional=True)
  (decoder): Linear(in_features=400, out_features=4, bias=True)
)

In [14]:
# Convert the padded id matrix to a LongTensor, the integer index type that
# nn.Embedding consumes.
# NOTE(review): the output saved under this cell (a float tensor with a grad_fn)
# cannot come from this assignment -- presumably a `net(X)` call was removed
# from the cell after it was run.
X=torch.LongTensor(X)

tensor([[-0.0009, -0.0294,  0.0336, -0.0037],
        [-0.0116,  0.0006,  0.0382, -0.0136],
        [-0.0135,  0.0070,  0.0373, -0.0358],
        ...,
        [-0.0202, -0.0179,  0.0555, -0.0178],
        [-0.0383, -0.0116,  0.0644, -0.0483],
        [ 0.0044, -0.0294,  0.0552, -0.0102]], grad_fn=<AddmmBackward>)

In [15]:
    def evaluate(valdata,vallabel,model=None):
        """Print and return the classification accuracy on ``valdata``.

        A sample counts as correct when its label row holds a 1 at the position
        of the highest predicted score.

        Args:
            valdata: Input batch passed to the model as-is.
            vallabel: One-hot encoded labels, one row per sample in ``valdata``.
            model: Callable used for prediction. Defaults to the module-level
                ``network`` so existing two-argument calls keep working.

        Returns:
            The fraction of correctly classified samples as a float; 0.0 when
            the batch is empty.
        """
        if model is None:
            model=network
        # Predict in eval mode and restore the previous mode afterwards, so that
        # mode-dependent layers (dropout, batch norm) do not distort the score
        # and training continues unaffected.
        was_training=isinstance(model,nn.Module) and model.training
        if was_training:
            model.eval()
        try:
            with torch.no_grad():
                prelabel=model(valdata)
        finally:
            if was_training:
                model.train()
        total=len(prelabel)
        if total==0:
            # Nothing to score; avoid dividing by zero.
            accuracy=0.0
        else:
            # Vectorised replacement for the per-sample Python loop: pick each
            # row's label entry at the predicted class and test it against 1.
            predicted=prelabel.argmax(dim=1,keepdim=True)
            targets=torch.as_tensor(vallabel,device=prelabel.device)
            correct=(targets.gather(1,predicted)==1).sum().item()
            accuracy=correct/float(total)
        print("the accuracy is",accuracy)
        return accuracy

In [16]:

def trainloop( n_epochs,dataloader,network,optim,loss_fn):
    """Train ``network`` for ``n_epochs`` passes over ``dataloader``.

    At the start of every epoch evaluate() is called on the notebook-level
    ``X`` and ``y`` (not on anything passed in as an argument). At the end of
    every epoch the mean per-batch loss is printed with a timestamp.

    Args:
        n_epochs: Number of epochs to run; epochs are numbered from 1.
        dataloader: Iterable yielding (inputs, targets) batches; must support
            ``len()`` for the mean-loss computation.
        network: Model called on each input batch.
        optim: Optimizer that updates the model parameters.
        loss_fn: Callable ``loss_fn(predictions, targets)`` returning a scalar
            tensor.
    """
    for epoch in range(1, n_epochs+1):
        loss_train = 0.0
        # Accuracy before this epoch's parameter updates.
        evaluate(X,torch.Tensor(y))
        for batch_inputs, batch_targets in dataloader:
            batch_loss = loss_fn(network(batch_inputs), batch_targets)
            # Clear gradients left over from the previous step before
            # back-propagating the current batch.
            optim.zero_grad()
            batch_loss.backward()
            optim.step()
            loss_train += batch_loss.item()
        print(f'{datetime.datetime.now()} epoch {epoch} training loss {loss_train/len(dataloader)}')

In [17]:
# Training configuration.
n_epochs=50
# evaluate() reads the global name `network`, so this alias must exist before
# trainloop() is called.
network=net
optim=torch.optim.Adam(network.parameters(),lr=0.01)
# NOTE(review): the dataset wraps the full X / y; no held-out split is created
# in the visible cells, so the accuracy printed during training is measured on
# the training data itself.
dataset=TensorDataset(X,torch.Tensor(y))
dataloader=DataLoader(dataset,batch_size=300,shuffle=True)
# BCEWithLogitsLoss scores each one-hot column as an independent binary target.
# NOTE(review): the labels are one-hot (mutually exclusive), for which
# nn.CrossEntropyLoss is the more common choice -- confirm BCE is intentional.
loss_fn=nn.BCEWithLogitsLoss()

In [18]:
# Start training. The output saved below ends in a KeyboardInterrupt, so it
# does not cover all n_epochs epochs.
trainloop(n_epochs,dataloader,network,optim,loss_fn)

the accuracy is 0.14483139856274185
2020-11-28 21:49:12.889386 epoch 1 training loss 0.5243786917282984
the accuracy is 0.5514096185737977
2020-11-28 21:49:18.572848 epoch 2 training loss 0.4582063624492058
the accuracy is 0.6160862354892206
2020-11-28 21:49:24.303086 epoch 3 training loss 0.3573974623129918
the accuracy is 0.6915422885572139
2020-11-28 21:49:30.615241 epoch 4 training loss 0.2570456014229701
the accuracy is 0.8253178551686015
2020-11-28 21:49:37.577732 epoch 5 training loss 0.1704218742939142
the accuracy is 0.9333886124930901
2020-11-28 21:49:44.621048 epoch 6 training loss 0.08517060772730754
the accuracy is 0.9781647318960752
2020-11-28 21:49:50.970137 epoch 7 training loss 0.03618166006456774
the accuracy is 0.9919845218352681
2020-11-28 21:49:57.406990 epoch 8 training loss 0.015587578993290663
the accuracy is 0.9975124378109452
2020-11-28 21:50:03.821116 epoch 9 training loss 0.00611537210464191
the accuracy is 0.9980652294085129
2020-11-28 21:50:10.387632 epoch

KeyboardInterrupt: 