In [12]:
import torch
import torch.nn as nn
import torch.nn.functional as F
import torch.utils.data as tud

from collections import Counter
import numpy as np
import random
import math

import pandas as pd 
import scipy
import sklearn
from sklearn.metrics.pairwise import cosine_similarity

import torchtext
from torchtext.vocab import Vectors
from torchtext import data, datasets

USE_CUDA = torch.cuda.is_available()

# Fix every random seed so that data splits and weight initialisation are the
# same on every run.
SEED = 1234

random.seed(SEED)
np.random.seed(SEED)
torch.manual_seed(SEED)
if USE_CUDA:
    # Seed all visible GPUs, not only the current device.
    torch.cuda.manual_seed_all(SEED)
    # cuDNN may otherwise select non-deterministic kernels, which would make
    # GPU runs differ even with identical seeds.
    torch.backends.cudnn.deterministic = True

# 超参数hyper parameters初始化

In [18]:
# Hyperparameters shared by the models in this notebook.
BATCH_SIZE = 64
EMBEDDING_SIZE = 100
HIDDEN_SIZE = 100
MAX_VOCAB_SIZE = 50000
NUM_EPOCHS = 2
LEARNING_RATE = 0.001
GRAD_CLIP = 5.0 # absolute clipping bound intended to guard against exploding gradients

# 根据IMDB数据集，对电影评论进行正面和负面评论分类

# 准备数据
1、TorchText中的一个重要概念是Field。Field决定了你的数据会被怎样处理。在我们的情感分类任务中，我们所需要接触到的数据有文本字符串和两种情感：“pos”或者“neg”。
2、Field的参数指定了数据会被怎样处理。
3、我们使用TEXT field来定义如何处理电影评论，使用LABEL field来处理两个情感类别。
4、我们的TEXT field带有tokenize=‘spacy’, 这表示我们会用spaCy tokenizer来tokenize英文句子。如果我们不特别声明tokenize这个参数，那么默认的分词方法是使用空格。

5、安装spaCy
    pip3 install -U spacy
    python3 -m spacy download en
6、LABEL由LabelField定义。这是一种特别的用来处理label的Field。
7、更多关于Fields，参见https://github.com/pytorch/text/blob/master/torchtext/data/field.py
8、和之前一样，我们会设定random seeds使实验可以复现

In [19]:
# TEXT tokenizes the review text with the spaCy tokenizer;
# LABEL stores the sentiment label as a float tensor.
TEXT = data.Field(tokenize='spacy')
LABEL = data.LabelField(dtype=torch.float)

In [21]:
# By default the data is downloaded into a '.data' directory under the current
# directory; root='path' selects a different location.
# Download URL: 'http://ai.stanford.edu/~amaas/data/sentiment/aclImdb_v1.tar.gz'
# Remember to extract the archive after downloading.
train_data,test_data = datasets.IMDB.splits(TEXT,LABEL,root='/Users/zhenwuzhou/AiProject/data/')
# Show how many examples each split contains.
print(f'Number of training examples:{len(train_data)}')
print(f'Number of testing examples:{len(test_data)}')

Number of training examples:25000
Number of testing examples:25000


In [17]:
# Inspect a single example (its tokenized text and its label).
print(vars(train_data.examples[0]))

{'text': ['For', 'a', 'movie', 'that', 'gets', 'no', 'respect', 'there', 'sure', 'are', 'a', 'lot', 'of', 'memorable', 'quotes', 'listed', 'for', 'this', 'gem', '.', 'Imagine', 'a', 'movie', 'where', 'Joe', 'Piscopo', 'is', 'actually', 'funny', '!', 'Maureen', 'Stapleton', 'is', 'a', 'scene', 'stealer', '.', 'The', 'Moroni', 'character', 'is', 'an', 'absolute', 'scream', '.', 'Watch', 'for', 'Alan', '"', 'The', 'Skipper', '"', 'Hale', 'jr', '.', 'as', 'a', 'police', 'Sgt', '.'], 'label': 'pos'}


In [117]:
import random

# Split the original training set into training and validation parts.
# `random.seed()` returns None, so passing its result as `random_state`
# handed None to `split`. Seed the generator first and then pass its actual
# state, which makes the reproducible split explicit.
random.seed(SEED)
train_data,valid_data = train_data.split(random_state=random.getstate())

In [29]:
# Check how many examples each part contains after the split.
print(f'Number of training examples:{len(train_data)}')
print(f'Number of validation examples:{len(valid_data)}')
print(f'Number of testing examples:{len(test_data)}')

Number of training examples:17500
Number of validation examples:7500
Number of testing examples:25000


![Image1](../common/images/4-pytorch.png) 

In [22]:
# Build the vocabulary (and its word-vector table) from the training data.
# GloVe = "Global Vectors for Word Representation".
# vectors="glove.6B.100d" attaches the pre-trained GloVe word vectors.
# Download URL: http://nlp.stanford.edu/data/glove.6B.zip
# vectors_cache sets where the pre-trained vectors are cached.
# NOTE(review): max_size is the literal 25000 here, not the MAX_VOCAB_SIZE constant.
TEXT.build_vocab(train_data,max_size=25000,vectors="glove.6B.100d",vectors_cache="/Users/zhenwuzhou/AiProject/data/vector_cache",unk_init=torch.Tensor.normal_)
LABEL.build_vocab(train_data)
print(f'Unique tokens in TEXT vocabulary:{len(TEXT.vocab)}')
print(f'Unique tokens in lABEL vocabulary:{len(LABEL.vocab)}')

Unique tokens in TEXT vocabulary:25002
Unique tokens in lABEL vocabulary:2


![Image of Yaktocat](../common/images/4-pytorch.png) 

In [32]:
# Show the 20 most frequent tokens together with their counts.
print(TEXT.vocab.freqs.most_common(20))

[('the', 202476), (',', 192116), ('.', 165496), ('a', 109230), ('and', 109174), ('of', 101087), ('to', 93504), ('is', 76398), ('in', 61292), ('I', 54008), ('it', 53328), ('that', 48904), ('"', 44043), ("'s", 43247), ('this', 42369), ('-', 37002), ('/><br', 35684), ('was', 34978), ('as', 30125), ('with', 29740)]


In [33]:
# The vocabulary can be inspected directly through stoi (string to int)
# or itos (int to string).
print(TEXT.vocab.itos[:10])

['<unk>', '<pad>', 'the', ',', '.', 'a', 'and', 'of', 'to', 'is']


In [40]:
# Inspect the label-to-index mapping.
print(LABEL.vocab.stoi)

defaultdict(None, {'neg': 0, 'pos': 1})


# 最后一步的数据预处理
1、最后一步数据的准备是创建iterators。每个iteration都会返回一个batch的examples。
2、我们会使用BucketIterator。BucketIterator会把长度差不多的句子放到同一个batch中，确保每个batch中不出现太多的padding。
3、严格来说，我们这份notebook中的模型代码都有一个问题，也就是我们把<pad>也当做了模型的输入进行训练。更好的做法是在模型中把由<pad>产生的输出给消除掉。这里我们暂时简单处理，直接把<pad>也用作模型的输入了。由于<pad>数量不多，模型的效果也不差
4、如果我们有GPU，还可以指定每个Iteration返回的tensor都在GPU上

In [41]:
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

# BucketIterator sorts sentences by length and tries to put sentences of
# similar length into the same batch, so that little padding is needed.
train_iterator,valid_iterator,test_iterator = data.BucketIterator.splits(
    (train_data,valid_data,test_data),
    batch_size = BATCH_SIZE,
    device=device)

# batch.text is laid out as [seq_len, batch_size]

In [42]:
# Pull one batch from the validation iterator for inspection.
batch = next(iter(valid_iterator))
batch


[torchtext.data.batch.Batch of size 64]
	[.text]:[torch.LongTensor of size 49x64]
	[.label]:[torch.FloatTensor of size 64]

In [43]:
print(batch.text)
print(batch.label)

tensor([[   0, 1422,  393,  ...,   66,  148,   11],
        [  46,  520,  395,  ...,   23,  860,   57],
        [2369, 5033, 1589,  ...,   97,    7,   28],
        ...,
        [  39,  520,   68,  ...,    1,    1,    1],
        [  39,  205,    4,  ...,    1,    1,    1],
        [  39,    4,    1,  ...,    1,    1,    1]])
tensor([1., 1., 0., 0., 0., 0., 1., 1., 0., 1., 1., 1., 1., 1., 0., 1., 1., 1.,
        1., 1., 0., 0., 0., 1., 1., 0., 1., 0., 1., 0., 1., 0., 1., 1., 1., 1.,
        0., 0., 1., 1., 1., 1., 0., 0., 1., 1., 0., 1., 1., 0., 1., 0., 1., 0.,
        0., 1., 1., 1., 1., 0., 0., 1., 0., 1.])


In [49]:
# itos is a list of strings; the position of each string in the list is its
# vocabulary index. Decode the first column of the batch back into words.
print(" ".join(TEXT.vocab.itos[i] for i in batch.text[:,0].data.cpu()))

<unk> from Germany and I love the <unk> . I go 200 times a year . Tonight I saw " Pecker " , it was a wonderful evening . Thank you , Mr. Waters . Everybody who has a chance to see the movie , go ! ! !


# Word Averaging模型
1、我们首先介绍一个简单的Word Averaging模型。这个模型非常简单，我们把每个单词都通过Embedding层投射成word embedding vector，然后把一句话中的所有word vector做个平均，就是整个句子的vector表示了。接下来把这个sentence vector传入一个Linear层，做分类即可。
![image.png](../common/images/4-pytorch03.png)

2、我们使用avg_pool2d来做average pooling。我们的目标是把sentence length那个维度平均成1，然后保留embedding这个维度。
![image.png](../common/images/4-pytorch04.png)

3、avg_pool2d的kernel size是(embedded.shape[1],1),所以句子长度的那个维度压扁。
![image.png](../common/images/4-pytorch05.png) 

# 定义一个模型

In [118]:
import torch.nn as nn

class WordAVGModel(nn.Module):
    """Word-averaging classifier.

    Every token is embedded, the embeddings of a sentence are averaged over
    the sequence dimension, and the resulting sentence vector is fed through a
    single linear layer.

    Args:
        vocab_size: number of rows in the embedding table.
        embedding_size: width of each word vector.
        output_size: number of output units of the linear layer.
        pad_idx: vocabulary index passed to ``nn.Embedding`` as ``padding_idx``.
    """
    def __init__(self,vocab_size,embedding_size,output_size,pad_idx):
        super(WordAVGModel,self).__init__()
        self.embed = nn.Embedding(vocab_size,embedding_size, padding_idx=pad_idx)
        self.linear = nn.Linear(embedding_size,output_size)

    def forward(self,text):
        """Map token indices ``[seq_len, batch_size]`` to logits ``[batch_size, output_size]``."""
        embeded = self.embed(text) # [seq_len, batch_size, embedding_size]
        # Move the batch dimension to the front; permute can reorder any axes
        # (transpose(1,0) would do the same for this two-axis swap).
        embeded = embeded.permute(1,0,2) # [batch_size, seq_len, embedding_size]

        # Average over the whole sequence: the kernel spans seq_len rows and a
        # single embedding column, giving [batch_size, 1, embedding_size].
        # Squeeze only that pooled dimension. A bare squeeze() would also
        # drop the batch dimension when the batch holds a single sentence.
        pooled = F.avg_pool2d(embeded,(embeded.shape[1],1)).squeeze(1) # [batch_size, embedding_size]

        # Final fully connected layer.
        return self.linear(pooled)


In [119]:
VOCAB_SIZE = len(TEXT.vocab)
EMBEDDING_SIZE = 100
OUTPUT_SIZE = 1
# Index of the '<pad>' string in the vocabulary; it is the value handed to the
# embedding layer as padding_idx.
PAD_IDX = TEXT.vocab.stoi[TEXT.pad_token] 

model = WordAVGModel(vocab_size=VOCAB_SIZE,
                     embedding_size=EMBEDDING_SIZE,
                     output_size=OUTPUT_SIZE,
                     pad_idx=PAD_IDX)

In [56]:
# Display the model architecture.
model

WordAVGModel(
  (embed): Embedding(25002, 100, padding_idx=1)
  (linear): Linear(in_features=100, out_features=1, bias=True)
)

In [57]:
next(model.parameters()).numel() # numel() returns the number of elements in the first parameter tensor

2500200

In [59]:
# 计算模型一共有多少参数
def count_parameers(model):
    """Return how many scalar parameters of ``model`` require gradients."""
    trainable_total = 0
    for param in model.parameters():
        # Frozen parameters are not updated by the optimizer, so skip them.
        if not param.requires_grad:
            continue
        trainable_total += param.numel()
    return trainable_total
count_parameers(model)

2500301

In [61]:
model.embed.weight.data

tensor([[-1.1807,  0.2208, -0.8832,  ...,  0.8083, -0.2904,  0.1586],
        [ 0.0000,  0.0000,  0.0000,  ...,  0.0000,  0.0000,  0.0000],
        [-0.7884, -2.0365, -1.1188,  ..., -0.2311, -1.8755, -0.9730],
        ...,
        [-0.0396, -0.3214, -0.6717,  ...,  0.2224, -0.9313, -0.6235],
        [-1.2636,  1.4050, -0.0456,  ..., -0.9127,  0.1079,  0.8846],
        [-0.2148,  0.5147, -0.9955,  ..., -0.5769,  0.9640,  1.3750]])

# 模型参数的一些初始化

In [120]:
# Stanford's pre-trained GloVe word vectors attached to the vocabulary.
pretrained_embedding = TEXT.vocab.vectors
print(pretrained_embedding.shape)
print(pretrained_embedding)
# Initialise the embedding layer with the GloVe vectors.
model.embed.weight.data.copy_(pretrained_embedding)

# Index of the '<unk>' string in the vocabulary.
UNK_IDX = TEXT.vocab.stoi[TEXT.unk_token]
# Reset the embedding rows of '<pad>' and '<unk>' to zero.
model.embed.weight.data[PAD_IDX] = torch.zeros(EMBEDDING_SIZE)
model.embed.weight.data[UNK_IDX] = torch.zeros(EMBEDDING_SIZE)

torch.Size([25002, 100])
tensor([[-0.1117, -0.4966,  0.1631,  ...,  1.2647, -0.2753, -0.1325],
        [-0.8555, -0.7208,  1.3755,  ...,  0.0825, -1.1314,  0.3997],
        [-0.0382, -0.2449,  0.7281,  ..., -0.1459,  0.8278,  0.2706],
        ...,
        [ 0.4765,  0.2254,  0.3035,  ..., -0.2082,  0.1948,  0.8972],
        [-0.2472, -1.1190,  0.3695,  ..., -0.5236, -1.1763,  1.4334],
        [-0.2821,  0.0417,  0.4807,  ..., -0.5425, -0.7024,  1.3024]])


# 定义优化器

In [121]:
# Define the optimizer. The learning rate is taken from the LEARNING_RATE
# hyperparameter instead of relying on Adam's implicit default, so the value
# declared in the hyperparameter cell is the one actually used.
optimizer = torch.optim.Adam(model.parameters(), lr=LEARNING_RATE)


# 定义Loss_fun:
![image.png](../common/images/BCELoss.png) 
# BCEWithLogitsLoss = Sigmoid+BCELoss，
# 当网络最后一层使用nn.Sigmoid时，就用BCELoss，
# 当网络最后一层不使用nn.Sigmoid时，就用BCEWithLogitsLoss。

In [122]:
# The model's last layer has no nn.Sigmoid, so BCEWithLogitsLoss is used.
loss_fun = nn.BCEWithLogitsLoss()

# Move the model and the loss to the selected device (CPU or GPU).
model = model.to(device)
loss_fun = loss_fun.to(device)

# 计算预测的准确率

In [123]:
def binart_accuracy(preds,y):
    """Return the fraction of predictions that match the labels.

    ``preds`` are raw logits: they go through a sigmoid and are rounded to
    0/1 before being compared element-wise with ``y``.
    """
    predicted_labels = torch.sigmoid(preds).round()
    hits = predicted_labels.eq(y).float()
    return hits.sum() / len(hits)


# 定义训练方法

In [124]:
def train(model,iterator,optimier,loss_fun):
    """Train ``model`` for one pass over ``iterator``.

    Args:
        model: module called as ``model(batch.text)`` to produce logits.
        iterator: iterable of batches exposing ``.text`` and ``.label``.
        optimier: optimizer whose ``zero_grad()`` / ``step()`` are called once
            per batch.
        loss_fun: callable ``loss_fun(preds, labels)`` returning a scalar loss.

    Returns:
        ``(mean_loss, mean_accuracy)`` averaged over every example seen.
    """
    epoch_loss,epoch_acc = 0.,0.
    total_len = 0.
    model.train()
    for batch in iterator:
        # Forward pass. Flatten the logits to [batch_size] so they line up
        # with the labels. reshape(-1) keeps the batch dimension even when a
        # batch holds one example, where a bare squeeze() would yield a 0-d
        # tensor that no longer matches the label shape.
        preds = model(batch.text).reshape(-1) # [batch_size]
        loss = loss_fun(preds,batch.label)

        # Accuracy of this batch.
        acc = binart_accuracy(preds,batch.label)

        # Clear old gradients, back-propagate, take one optimizer step.
        optimier.zero_grad()
        loss.backward()
        optimier.step()

        # loss and acc are per-example means of this batch. Weight them by the
        # batch size so the totals can be averaged over all examples, since
        # batches are not guaranteed to have equal size.
        batch_len = len(batch.label)
        epoch_loss += loss.item() * batch_len
        epoch_acc += acc.item() * batch_len
        total_len += batch_len

    # Overall mean loss and mean accuracy.
    # For binary classification an F1 score would be more informative when the
    # positive/negative classes are heavily imbalanced.
    return epoch_loss/total_len, epoch_acc/total_len

# 定义评价方法

In [67]:
def evaluate(model,iterator,loss_fun):
    """Compute mean loss and accuracy of ``model`` over ``iterator``.

    The model is switched to eval mode for the pass and put back into training
    mode before returning. No parameters are updated.

    Args:
        model: module called as ``model(batch.text)`` to produce logits.
        iterator: iterable of batches exposing ``.text`` and ``.label``.
        loss_fun: callable ``loss_fun(preds, labels)`` returning a scalar loss.

    Returns:
        ``(mean_loss, mean_accuracy)`` averaged over every example seen.
    """
    epoch_loss,epoch_acc = 0.,0.
    total_len = 0.
    model.eval()
    # Nothing is back-propagated here, so skip building the autograd graph.
    with torch.no_grad():
        for batch in iterator:
            # reshape(-1) keeps a [batch_size] vector even for a one-example
            # batch, where a bare squeeze() would collapse it to 0-d.
            preds = model(batch.text).reshape(-1)
            loss = loss_fun(preds,batch.label)
            acc = binart_accuracy(preds,batch.label)

            # Weight the per-batch means by the batch size.
            batch_len = len(batch.label)
            epoch_loss += loss.item() * batch_len
            epoch_acc += acc.item() * batch_len
            total_len += batch_len

    # Switch the model back to training mode once evaluation is finished.
    model.train()
    # Overall mean loss and mean accuracy.
    # For binary classification an F1 score would be more informative when the
    # positive/negative classes are heavily imbalanced.
    return epoch_loss/total_len, epoch_acc/total_len

# 开始训练

In [125]:
# Train the word-averaging model; this reassigns NUM_EPOCHS to 10 for this run.
NUM_EPOCHS = 10
best_valid_acc = 0.
for epoch in range(NUM_EPOCHS):
    train_loss, train_acc = train(model,train_iterator,optimizer,loss_fun)
    vaild_loss, valid_acc = evaluate(model,valid_iterator,loss_fun)
    
    # Checkpoint only when the validation accuracy improves.
    if valid_acc > best_valid_acc:
        best_valid_acc = valid_acc
        # First the weights only (state_dict), then the whole model object.
        torch.save(model.state_dict(),"/Users/zhenwuzhou/.keras/models/wordavg/1model_dict.pth")
        torch.save(model,"/Users/zhenwuzhou/.keras/models/wordavg/1model.pth")
            
    print("Epoch",epoch,"Train Loss:",train_loss,"Train Acc",train_acc)
    print("Epoch",epoch,"Valid Loss:",vaild_loss,"Valid Acc",valid_acc)

  "type " + obj.__name__ + ". It won't be checked "
  "type " + obj.__name__ + ". It won't be checked "
  "type " + obj.__name__ + ". It won't be checked "


Epoch 0 Train Loss: 0.6845390934807913 Train Acc 0.6138857142584665
Epoch 0 Valid Loss: 0.6252183569908142 Valid Acc 0.6944000000317891
Epoch 1 Train Loss: 0.6419278195108686 Train Acc 0.7377142857142858
Epoch 1 Valid Loss: 0.531813969039917 Valid Acc 0.7402666666984558
Epoch 2 Train Loss: 0.5729172680309841 Train Acc 0.7889714285850525
Epoch 2 Valid Loss: 0.4714519806067149 Valid Acc 0.7912
Epoch 3 Train Loss: 0.4991948090893882 Train Acc 0.8277714285714286
Epoch 3 Valid Loss: 0.41284211173057556 Valid Acc 0.8317333333333333
Epoch 4 Train Loss: 0.4395295122078487 Train Acc 0.8586285714694432
Epoch 4 Valid Loss: 0.40660085196495055 Valid Acc 0.8490666666666666


KeyboardInterrupt: 

In [126]:
# Load the trained weights from the given path. map_location remaps the stored
# tensors onto the current device, so a checkpoint written on a GPU machine
# can still be loaded when only a CPU is available (and vice versa).
model.load_state_dict(torch.load("/Users/zhenwuzhou/.keras/models/wordavg/1model_dict.pth", map_location=device))

<All keys matched successfully>

In [127]:
# 对给出的一条新的评论进行判断
import spacy
nlp = spacy.load("en")

def predict_sentiment(sentence):
    """Return the model's sigmoid score (a float in [0, 1]) for ``sentence``.

    The sentence is tokenized with the spaCy tokenizer, mapped to vocabulary
    indices and passed through the current global ``model`` as a batch of one.

    Raises:
        ValueError: if ``sentence`` yields no tokens.
    """
    # Tokenize the sentence with spaCy.
    tokenized = [tok.text for tok in nlp.tokenizer(sentence)]
    if not tokenized:
        raise ValueError("sentence must contain at least one token")
    indexed = [TEXT.vocab.stoi[t] for t in tokenized]
    tensor = torch.LongTensor(indexed).to(device) # [seq_len]
    tensor = tensor.unsqueeze(1) # [seq_len, batch_size=1]

    # Run in eval mode so dropout layers are disabled, and without autograd.
    # Restore the previous mode afterwards so later training is unaffected.
    was_training = model.training
    model.eval()
    try:
        with torch.no_grad():
            pred = torch.sigmoid(model(tensor))
    finally:
        model.train(was_training)
    return pred.item()

In [128]:
# Score a few new reviews.
print(predict_sentiment("This film is horrible"))

print(predict_sentiment("This film is terrificl"))

print(predict_sentiment("This film is terrible"))

print(predict_sentiment("This film is good"))

print(predict_sentiment("This film is not bad"))

print(predict_sentiment("This film is not good"))


2.2129119869873648e-08
0.9547051787376404
4.03268041182514e-09
0.9999457597732544
1.0582433367101185e-11
0.7428852915763855


In [142]:
predict_sentiment("good")

0.9999959468841553

# RNN模型来实现情感分类
#1、下面我们尝试把模型换成一个recurrent neural network(RNN)。RNN经常会被用来encode一个sequence：
                ![image.png](../common/images/4-pytroch06.png) 
#2、我们使用最后一个hidden state hT 来表示整个句子。
#3、然后我们把hT通过一个线性变换f(全连接)，然后用来预测句子的情感。
![image.png](../common/images/4-pytroch07.png) 

# 定义个RNN的Model

In [92]:
import torch.nn as nn

class RNNModel(nn.Module):
    """Two-layer bidirectional LSTM classifier.

    Tokens are embedded, passed through the LSTM, and the final hidden states
    of the last layer (both directions) are concatenated and fed through a
    linear layer.

    Args:
        vocab_size: number of rows in the embedding table.
        embedding_size: width of each word vector (LSTM input size).
        output_size: number of output units of the linear layer.
        pad_idx: vocabulary index passed to ``nn.Embedding`` as ``padding_idx``.
        hidden_size: LSTM hidden width per direction.
        drop_out: dropout probability applied to the embeddings and to the
            concatenated hidden state.
    """
    def __init__(self,vocab_size,embedding_size,output_size,pad_idx,hidden_size,drop_out):
        super(RNNModel,self).__init__()
        self.embed = nn.Embedding(vocab_size,embedding_size, padding_idx=pad_idx)
        # bidirectional=True gives a two-direction LSTM and num_layers=2 stacks
        # two layers. batch_first is left at its default, so the LSTM expects
        # [seq_len, batch_size, embedding_size].
        self.lstm = nn.LSTM(embedding_size,hidden_size,bidirectional=True,num_layers=2)
        # The classifier consumes the two concatenated final hidden states, so
        # its input width is 2 * hidden_size. It was previously sized from
        # embedding_size, which only worked when the two sizes were equal.
        self.linear = nn.Linear(hidden_size*2,output_size)
        self.dropout = nn.Dropout(drop_out)

    def forward(self,text):
        """Map token indices ``[seq_len, batch_size]`` to logits ``[batch_size, output_size]``."""
        embeded = self.embed(text) # [seq_len, batch_size, embedding_size]
        # Dropout on the embeddings as regularisation against overfitting.
        embeded = self.dropout(embeded)

        # output: last-layer h_t for every time step,
        #         (seq_len, batch, num_directions * hidden_size)
        # hidden: final h_t of every layer and direction,
        #         (num_layers * num_directions, batch, hidden_size)
        # cell:   same layout as hidden, holding the cell state
        output,(hidden,cell) = self.lstm(embeded)

        # The last two entries of hidden belong to the top layer (one per
        # direction); join them along the feature dimension.
        hidden = torch.cat([hidden[-1],hidden[-2]],dim=1) # [batch_size, 2 * hidden_size]

        # No squeeze here: it would remove the batch dimension for a batch of one.
        hidden = self.dropout(hidden)

        # Final fully connected layer.
        return self.linear(hidden)

In [23]:
A = [1,2,3]
A[-1] # index -1 selects the last element

3

In [93]:
# Build the LSTM classifier. The hidden width comes from the HIDDEN_SIZE
# hyperparameter instead of a repeated literal, so there is one place to tune it.
model = RNNModel(vocab_size=VOCAB_SIZE,
                 embedding_size=EMBEDDING_SIZE,
                 output_size=OUTPUT_SIZE,
                 pad_idx=PAD_IDX,
                 hidden_size=HIDDEN_SIZE,
                 drop_out=0.5)

# RNNModel的参数初始化

In [95]:
# Stanford's pre-trained GloVe word vectors attached to the vocabulary.
pretrained_embedding = TEXT.vocab.vectors
print(pretrained_embedding.shape)
print(pretrained_embedding)
# Initialise the embedding layer with the GloVe vectors.
model.embed.weight.data.copy_(pretrained_embedding)

# Index of the '<unk>' string in the vocabulary.
UNK_IDX = TEXT.vocab.stoi[TEXT.unk_token]
# Reset the embedding rows of '<pad>' and '<unk>' to zero.
model.embed.weight.data[PAD_IDX] = torch.zeros(EMBEDDING_SIZE)
model.embed.weight.data[UNK_IDX] = torch.zeros(EMBEDDING_SIZE)

torch.Size([25002, 100])
tensor([[-0.1117, -0.4966,  0.1631,  ...,  1.2647, -0.2753, -0.1325],
        [-0.8555, -0.7208,  1.3755,  ...,  0.0825, -1.1314,  0.3997],
        [-0.0382, -0.2449,  0.7281,  ..., -0.1459,  0.8278,  0.2706],
        ...,
        [ 0.4765,  0.2254,  0.3035,  ..., -0.2082,  0.1948,  0.8972],
        [-0.2472, -1.1190,  0.3695,  ..., -0.5236, -1.1763,  1.4334],
        [-0.2821,  0.0417,  0.4807,  ..., -0.5425, -0.7024,  1.3024]])


In [96]:
# Define the optimizer, passing the LEARNING_RATE hyperparameter explicitly
# instead of relying on Adam's implicit default.
optimizer = torch.optim.Adam(model.parameters(), lr=LEARNING_RATE)

# The model's last layer has no nn.Sigmoid, so BCEWithLogitsLoss is used.
loss_fun = nn.BCEWithLogitsLoss()

# Move the model and the loss to the selected device (CPU or GPU).
model = model.to(device)
loss_fun = loss_fun.to(device)

# 开始RNN模型训练

In [97]:
# Train the LSTM model with the same loop as before, writing to its own checkpoint files.
NUM_EPOCHS = 10
best_valid_acc = 0.
for epoch in range(NUM_EPOCHS):
    train_loss, train_acc = train(model,train_iterator,optimizer,loss_fun)
    vaild_loss, valid_acc = evaluate(model,valid_iterator,loss_fun)
    
    # Checkpoint only when the validation accuracy improves.
    if valid_acc > best_valid_acc:
        best_valid_acc = valid_acc
        # First the weights only (state_dict), then the whole model object.
        torch.save(model.state_dict(),"/Users/zhenwuzhou/.keras/models/wordavg/rnnmodel_dict.pth")
        torch.save(model,"/Users/zhenwuzhou/.keras/models/wordavg/rnnmodel.pth")
            
    print("Epoch",epoch,"Train Loss:",train_loss,"Train Acc",train_acc)
    print("Epoch",epoch,"Valid Loss:",vaild_loss,"Valid Acc",valid_acc)

KeyboardInterrupt: 

In [98]:
# Run the embedding and LSTM layers directly on the sample batch to inspect
# the shapes of the raw LSTM outputs (the model's dropout is not applied here).
outputs,(hidden,cell) = model.lstm(model.embed(batch.text))

In [99]:
outputs.shape#(seq_len, batch, num_directions * hidden_size)

torch.Size([49, 64, 200])

In [100]:
hidden.shape# (num_layers * num_directions, batch, hidden_size)

torch.Size([4, 64, 100])

# CNN模型来实现情感分类
1、我们先对每个词做embedding
2、把每个句子变成等长的词组成，然后每个句子就可以变成seq_len * embedding_size的矩阵
3、做词向量卷积，用卷积单词数*embedding_size当做filter的kernelSize
4、选取不同单词数的多个filter对文本做卷积操作；
5、卷积后做池化操作MAXpooling或者AVGpooling，最后把不同单词数filter的pooling结果cat连接在一起
6、做全连接最后输出想要的维度的输出

In [113]:
import torch.nn as nn

class CNNModel(nn.Module):
    """Convolutional sentence classifier with several n-gram filter widths.

    Each sentence is embedded into a ``seq_len x embedding_size`` matrix. One
    convolution per entry of ``filter_sizes`` slides over ``fs`` consecutive
    words at full embedding width, the responses are max-pooled over time,
    concatenated and fed through a linear layer.

    Args:
        vocab_size: number of rows in the embedding table.
        embedding_size: width of each word vector.
        output_size: number of output units of the linear layer.
        pad_idx: vocabulary index passed to ``nn.Embedding`` as ``padding_idx``.
        num_filters: output channels of every convolution.
        filter_sizes: iterable of window heights in words, e.g. ``[3,4,5]``.
        drop_out: dropout probability applied before the linear layer.
    """
    def __init__(self,vocab_size,embedding_size,output_size,pad_idx,num_filters,filter_sizes,drop_out):
        super(CNNModel,self).__init__()
        self.embed = nn.Embedding(vocab_size,embedding_size, padding_idx=pad_idx)
        # One convolution per window size; each kernel covers `fs` words and
        # the full embedding width.
        self.convs = nn.ModuleList([
            nn.Conv2d(in_channels=1,out_channels=num_filters,
                      kernel_size=(fs,embedding_size))for fs in filter_sizes
        ])
        self.linear = nn.Linear(num_filters * len(filter_sizes),output_size)
        self.dropout = nn.Dropout(drop_out)

    def forward(self,text):
        """Map token indices ``[seq_len, batch_size]`` to logits ``[batch_size, output_size]``."""
        # [seq_len, batch_size] -> [batch_size, seq_len]
        text = text.permute(1,0)

        # A convolution cannot run on fewer rows than its kernel height, so a
        # sentence shorter than the widest filter is right-padded. The pad
        # value is the embedding's padding index, or 0 when none is set.
        widest = max(conv.kernel_size[0] for conv in self.convs)
        shortfall = widest - text.shape[1]
        if shortfall > 0:
            pad_value = self.embed.padding_idx if self.embed.padding_idx is not None else 0
            filler = text.new_full((text.shape[0],shortfall),pad_value)
            text = torch.cat([text,filler],dim=1)

        embeded = self.embed(text) # [batch_size, seq_len, embedding_size]
        # Conv2d expects (N, C_in, H_in, W_in); text has a single channel, so
        # add the channel dimension.
        embeded = embeded.unsqueeze(1) # [batch_size, 1, seq_len, embedding_size]

        # Convolution with ReLU for every window size; the trailing width of 1
        # is squeezed away: [batch_size, num_filters, seq_len - fs + 1]
        conved = [F.relu(conv(embeded)).squeeze(3) for conv in self.convs]
        # Max-pool over the whole time axis: [batch_size, num_filters]
        pooled = [F.max_pool1d(conv,conv.shape[2]).squeeze(2) for conv in conved]
        # Concatenate the pooled features of all window sizes.
        pooled = torch.cat(pooled,dim=1) # [batch_size, num_filters * len(filter_sizes)]

        # Dropout before the final layer.
        pooled = self.dropout(pooled)

        return self.linear(pooled)

In [114]:
model = CNNModel(vocab_size=VOCAB_SIZE,
                 embedding_size=EMBEDDING_SIZE,
                 output_size=OUTPUT_SIZE,
                 pad_idx=PAD_IDX,
                 num_filters=100, # number of filters per window size
                 filter_sizes= [3,4,5], # window sizes, in words, used for the convolutions
                 drop_out=0.5)

# Stanford's pre-trained GloVe word vectors attached to the vocabulary.
pretrained_embedding = TEXT.vocab.vectors
print(pretrained_embedding.shape)
print(pretrained_embedding)
# Initialise the embedding layer with the GloVe vectors.
model.embed.weight.data.copy_(pretrained_embedding)

# Index of the '<unk>' string in the vocabulary.
UNK_IDX = TEXT.vocab.stoi[TEXT.unk_token]
# Reset the embedding rows of '<pad>' and '<unk>' to zero.
model.embed.weight.data[PAD_IDX] = torch.zeros(EMBEDDING_SIZE)
model.embed.weight.data[UNK_IDX] = torch.zeros(EMBEDDING_SIZE)


# Define the optimizer, passing the LEARNING_RATE hyperparameter explicitly
# instead of relying on Adam's implicit default.
optimizer = torch.optim.Adam(model.parameters(), lr=LEARNING_RATE)

# The model's last layer has no nn.Sigmoid, so BCEWithLogitsLoss is used.
loss_fun = nn.BCEWithLogitsLoss()

# Move the model and the loss to the selected device (CPU or GPU).
model = model.to(device)
loss_fun = loss_fun.to(device)



# Train the CNN model with the same loop as before, writing to its own checkpoint files.
NUM_EPOCHS = 10
best_valid_acc = 0.
for epoch in range(NUM_EPOCHS):
    train_loss, train_acc = train(model,train_iterator,optimizer,loss_fun)
    vaild_loss, valid_acc = evaluate(model,valid_iterator,loss_fun)
    
    # Checkpoint only when the validation accuracy improves.
    if valid_acc > best_valid_acc:
        best_valid_acc = valid_acc
        # First the weights only (state_dict), then the whole model object.
        torch.save(model.state_dict(),"/Users/zhenwuzhou/.keras/models/wordavg/cnnmodel_dict.pth")
        torch.save(model,"/Users/zhenwuzhou/.keras/models/wordavg/cnnmodel.pth")
            
    print("Epoch",epoch,"Train Loss:",train_loss,"Train Acc",train_acc)
    print("Epoch",epoch,"Valid Loss:",vaild_loss,"Valid Acc",valid_acc)

torch.Size([25002, 100])
tensor([[-0.1117, -0.4966,  0.1631,  ...,  1.2647, -0.2753, -0.1325],
        [-0.8555, -0.7208,  1.3755,  ...,  0.0825, -1.1314,  0.3997],
        [-0.0382, -0.2449,  0.7281,  ..., -0.1459,  0.8278,  0.2706],
        ...,
        [ 0.4765,  0.2254,  0.3035,  ..., -0.2082,  0.1948,  0.8972],
        [-0.2472, -1.1190,  0.3695,  ..., -0.5236, -1.1763,  1.4334],
        [-0.2821,  0.0417,  0.4807,  ..., -0.5425, -0.7024,  1.3024]])


KeyboardInterrupt: 

In [None]:
# 长文本分类:hierarchical分层的
hierarchical LSTM 