In [1]:
import torch
import torch.nn as nn
from torchtext import data
import pandas as pd
from sklearn.model_selection import train_test_split

In [2]:
DEVICE = torch.device("cuda" if torch.cuda.is_available() else "cpu")
DEVICE.type

'cuda'

In [3]:
BATCH_SIZE=10

In [4]:
data_all = pd.read_csv("../data/sentiment-analysis-on-movie-reviews/train.tsv",delimiter='\t')

In [5]:
train_data,test_data = train_test_split(data_all,test_size=0.2,random_state=2)

In [6]:
train_data.to_csv('../data/task2/train.csv',index=False)
test_data.to_csv('../data/task2/test.csv',index=False)

In [7]:
PAD_TOKEN='<pad>'
TEXT = data.Field(sequential=True,batch_first=True, lower=True, pad_token=PAD_TOKEN)
LABEL = data.Field(sequential=False, batch_first=True, unk_token=None)

In [8]:
datafields = [("PhraseId", None), # 不需要的filed设置为None
              ("SentenceId", None),
              ('Phrase', TEXT),
              ('Sentiment', LABEL)]

In [9]:
train_data = data.TabularDataset(path='../data/task2/train.csv',format='csv',fields=datafields)
test_data = data.TabularDataset(path='../data/task2/test.csv',format='csv',fields=datafields)

In [10]:
#构建词典，字符映射到embedding
#TEXT.vocab.vectors 就是词向量
TEXT.build_vocab(train_data,  vectors= 'glove.6B.50d',
                 unk_init= lambda x:torch.nn.init.uniform_(x, a=-0.25, b=0.25))
LABEL.build_vocab(train_data)


#得到索引，PAD_TOKEN='<pad>'
PAD_INDEX = TEXT.vocab.stoi[PAD_TOKEN]
TEXT.vocab.vectors[PAD_INDEX] = 0.0

In [11]:
#构建迭代器
train_iterator = data.BucketIterator(train_data, batch_size=BATCH_SIZE,train=True, shuffle=True,device=DEVICE)

test_iterator = data.Iterator(test_data, batch_size=len(test_data),train=False,sort=False, device=DEVICE)

In [12]:
#部分参数设置
embedding_choice='glove'   #  'static'    'non-static'
num_embeddings = len(TEXT.vocab)
embedding_dim =50
dropout_p=0.5
filters_num=100

vocab_size=len(TEXT.vocab)
label_num=len(LABEL.vocab)
print(vocab_size,label_num)

16517 6


In [13]:
import torch.nn.functional as F
class CNN(nn.Module):
    def __init__(self):
        super(CNN,self).__init__()
        self.embedding_choice=embedding_choice

        if self.embedding_choice==  'rand':
            self.embedding=nn.Embedding(num_embeddings,embedding_dim)
        if self.embedding_choice==  'glove':
            self.embedding = nn.Embedding(num_embeddings, embedding_dim,
                padding_idx=PAD_INDEX).from_pretrained(TEXT.vocab.vectors, freeze=True)


        self.conv1 = nn.Conv2d(in_channels=1,out_channels=filters_num ,  #卷积产生的通道
                               kernel_size=(3, embedding_dim), padding=(2,0))

        self.conv2 = nn.Conv2d(in_channels=1,out_channels=filters_num ,  #卷积产生的通道
                               kernel_size=(4, embedding_dim), padding=(3,0))

        self.conv3 = nn.Conv2d(in_channels=1,out_channels=filters_num ,  #卷积产生的通道
                               kernel_size=(5, embedding_dim), padding=(4,0))

        self.dropout = nn.Dropout(dropout_p)

        self.fc = nn.Linear(filters_num * 3, label_num)

    def forward(self,x):      # (Batch_size, Length)
        x=self.embedding(x).unsqueeze(1)      #(Batch_size, Length, Dimention)
                                       #(Batch_size, 1, Length, Dimention)

        x1 = F.relu(self.conv1(x)).squeeze(3)    #(Batch_size, filters_num, length+padding, 1)
                                          #(Batch_size, filters_num, length+padding)
        x1 = F.max_pool1d(x1, x1.size(2)).squeeze(2)  #(Batch_size, filters_num, 1)
                                               #(Batch_size, filters_num)

        x2 = F.relu(self.conv2(x)).squeeze(3)
        x2 = F.max_pool1d(x2, x2.size(2)).squeeze(2)

        x3 = F.relu(self.conv3(x)).squeeze(3)
        x3 = F.max_pool1d(x3, x3.size(2)).squeeze(2)

        x = torch.cat((x1, x2, x3), dim=1)  #(Batch_size, filters_num *3 )
        x = self.dropout(x)      #(Batch_size, filters_num *3 )
        out = self.fc(x)       #(Batch_size, label_num  )
        return out

In [14]:
#构建模型

model=CNN()
optimizer = torch.optim.Adam(model.parameters(), lr=0.001)#创建优化器SGD
criterion = nn.CrossEntropyLoss()   #损失函数

if DEVICE.type=='cuda':
    model.cuda()

In [15]:
#开始训练
import time
epoch=100
best_accuracy=0.0
start_time=time.time()

for i in range(epoch):
    model.train()
    total_loss=0.0
    accuracy=0.0
    total_correct=0.0
    total_data_num = len(train_iterator.dataset)
    steps = 0.0
    #训练
    for batch in train_iterator:
        steps+=1
        #print(steps)
        optimizer.zero_grad() #  梯度缓存清零

        batch_text=batch.Phrase
        batch_label=batch.Sentiment
        out=model(batch_text)    #[batch_size, label_num]
        loss = criterion(out, batch_label)
        total_loss = total_loss + loss.item()

        loss.backward()
        optimizer.step()

        correct = (torch.max(out, dim=1)[1]  #get the indices
                   .view(batch_label.size()) == batch_label).sum()
        total_correct = total_correct + correct.item()

        if steps%100==0:
            print("Epoch %d_%.3f%%:  Training average Loss: %f"
                      %(i, steps * train_iterator.batch_size*100/len(train_iterator.dataset),total_loss/steps))

    #每个epoch都验证一下
    model.eval()
    total_loss=0.0
    accuracy=0.0
    total_correct=0.0
    total_data_num = len(test_iterator.dataset)
    steps = 0.0
    for batch in test_iterator:
        steps+=1
        batch_text=batch.Phrase
        batch_label=batch.Sentiment
        out=model(batch_text)
        loss = criterion(out, batch_label)
        total_loss = total_loss + loss.item()

        correct = (torch.max(out, dim=1)[1].view(batch_label.size()) == batch_label).sum()
        total_correct = total_correct + correct.item()

        print("Epoch %d :  Verification average Loss: %f, Verification accuracy: %f%%,Total Time:%f"
          %(i, total_loss/steps, total_correct*100/total_data_num,time.time()-start_time))

        if best_accuracy < total_correct/total_data_num :
            best_accuracy =total_correct/total_data_num
            torch.save(model,'model_dict/model_glove/epoch_%d_accuracy_%f'%(i,total_correct/total_data_num))
            print('Model is saved in model_dict/model_glove/epoch_%d_accuracy_%f'%(i,total_correct/total_data_num))
            #torch.cuda.empty_cache()
    break #运行时去除break

Epoch 0_0.801%:  Training average Loss: 1.440895
Epoch 0_1.602%:  Training average Loss: 1.365857
Epoch 0_2.403%:  Training average Loss: 1.304160
Epoch 0_3.204%:  Training average Loss: 1.275250
Epoch 0_4.005%:  Training average Loss: 1.247017
Epoch 0_4.806%:  Training average Loss: 1.227093
Epoch 0_5.607%:  Training average Loss: 1.216458
Epoch 0_6.408%:  Training average Loss: 1.202484
Epoch 0_7.209%:  Training average Loss: 1.194820
Epoch 0_8.010%:  Training average Loss: 1.186466
Epoch 0_8.811%:  Training average Loss: 1.176250
Epoch 0_9.612%:  Training average Loss: 1.171777
Epoch 0_10.413%:  Training average Loss: 1.164574
Epoch 0_11.214%:  Training average Loss: 1.159322
Epoch 0_12.015%:  Training average Loss: 1.156394
Epoch 0_12.815%:  Training average Loss: 1.151346
Epoch 0_13.616%:  Training average Loss: 1.146390
Epoch 0_14.417%:  Training average Loss: 1.143681
Epoch 0_15.218%:  Training average Loss: 1.140035
Epoch 0_16.019%:  Training average Loss: 1.133156
Epoch 0_16.8



In [16]:
embedding_choice='glove'   #  'static'    'non-static'
num_embeddings = len(TEXT.vocab)
embedding_dim =50
dropout_p=0.5
hidden_size=50  #隐藏单元数
num_layers=2  #层数

vocab_size=len(TEXT.vocab)
label_num=len(LABEL.vocab)
print(vocab_size,label_num)

16517 6


In [17]:
class LSTM(nn.Module):
    def __init__(self):
        super(LSTM,self).__init__()

        self.embedding_choice=embedding_choice
        self.hidden_size = hidden_size
        self.num_layers = num_layers


        if self.embedding_choice==  'rand':
            self.embedding=nn.Embedding(num_embeddings,embedding_dim)
        if self.embedding_choice==  'glove':
            self.embedding = nn.Embedding(num_embeddings, embedding_dim,
                padding_idx=PAD_INDEX).from_pretrained(TEXT.vocab.vectors, freeze=True)
        #input_size (输入的特征维度),hidden_size ,num_layers
        self.lstm = nn.LSTM(embedding_dim, hidden_size, num_layers,
                            batch_first=True,dropout=dropout_p,bidirectional=True)
        self.dropout = nn.Dropout(dropout_p)
        self.fc = nn.Linear(hidden_size * 2, label_num)  # 2 for bidirection



    def forward(self,x):      # (Batch_size, Length)
        # Set initial hidden and cell states
        # h_n (num_layers * num_directions, batch, hidden_size)
        h0 = torch.zeros(self.num_layers * 2, x.size(0), self.hidden_size)
        # c_n (num_layers * num_directions, batch, hidden_size):
        c0 = torch.zeros(self.num_layers * 2, x.size(0), self.hidden_size)

        if DEVICE.type=='cuda':
            h0=h0.cuda()
            c0=c0.cuda()

        x=self.embedding(x)     #(Batch_size, Length)
                                       #(Batch_size,  Length, Dimention)

        out, _ = self.lstm(x, (h0, c0))   #(Batch_size, Length，Dimention)
                                        # (batch_size, Length, hidden_size)
        out=self.dropout(out)

        out = self.fc(out[:, -1, :])   # (batch_size, Length, hidden_size)
                           # (batch_size, label_num)
        return out

In [18]:
#构建模型

model=LSTM()
optimizer = torch.optim.Adam(model.parameters(), lr=0.001)#创建优化器SGD
criterion = nn.CrossEntropyLoss()   #损失函数

if DEVICE.type=='cuda':
    model.cuda()

In [20]:
epoch=100
best_accuracy=0.0
start_time=time.time()

for i in range(epoch):
    model.train()
    total_loss=0.0
    accuracy=0.0
    total_correct=0.0
    total_data_num = len(train_iterator.dataset)
    steps = 0.0
    #训练
    for batch in train_iterator:
        steps+=1
        #print(steps)
        optimizer.zero_grad() #  梯度缓存清零

        batch_text=batch.Phrase
        batch_label=batch.Sentiment
        out=model(batch_text)    #[batch_size, label_num]
        loss = criterion(out, batch_label)
        total_loss = total_loss + loss.item()

        loss.backward()
        optimizer.step()

        correct = (torch.max(out, dim=1)[1]  #get the indices
                   .view(batch_label.size()) == batch_label).sum()
        total_correct = total_correct + correct.item()

        if steps%100==0:
            print("Epoch %d_%.3f%%:  Training average Loss: %f"
                      %(i, steps * train_iterator.batch_size*100/len(train_iterator.dataset),total_loss/steps))

    #验证
    model.eval()
    total_loss=0.0
    accuracy=0.0
    total_correct=0.0
    total_data_num = len(test_iterator.dataset)
    steps = 0.0
    for batch in test_iterator:
        steps+=1
        batch_text=batch.Phrase
        batch_label=batch.Sentiment
        out=model(batch_text)
        loss = criterion(out, batch_label)
        total_loss = total_loss + loss.item()

        correct = (torch.max(out, dim=1)[1].view(batch_label.size()) == batch_label).sum()
        total_correct = total_correct + correct.item()

        print("Epoch %d :  Verification average Loss: %f, Verification accuracy: %f%%,Total Time:%f"
          %(i, total_loss/steps, total_correct*100/total_data_num,time.time()-start_time))

        if best_accuracy < total_correct/total_data_num :
            best_accuracy =total_correct/total_data_num
            torch.save(model,'model_dict/model_lstm/epoch_%d_accuracy_%f'%(i,total_correct/total_data_num))
            print('Model is saved in model_dict/model_lstm/epoch_%d_accuracy_%f'%(i,total_correct/total_data_num))
            #torch.cuda.empty_cache()

Epoch 0_0.801%:  Training average Loss: 0.975373
Epoch 0_1.602%:  Training average Loss: 1.001408
Epoch 0_2.403%:  Training average Loss: 1.004385
Epoch 0_3.204%:  Training average Loss: 0.989824
Epoch 0_4.005%:  Training average Loss: 0.994228
Epoch 0_4.806%:  Training average Loss: 0.997428
Epoch 0_5.607%:  Training average Loss: 0.999208
Epoch 0_6.408%:  Training average Loss: 0.997163
Epoch 0_7.209%:  Training average Loss: 0.993184
Epoch 0_8.010%:  Training average Loss: 0.991081
Epoch 0_8.811%:  Training average Loss: 0.990124
Epoch 0_9.612%:  Training average Loss: 0.993935
Epoch 0_10.413%:  Training average Loss: 0.992488
Epoch 0_11.214%:  Training average Loss: 0.992907
Epoch 0_12.015%:  Training average Loss: 0.991923
Epoch 0_12.815%:  Training average Loss: 0.991285
Epoch 0_13.616%:  Training average Loss: 0.988973
Epoch 0_14.417%:  Training average Loss: 0.989150
Epoch 0_15.218%:  Training average Loss: 0.987931
Epoch 0_16.019%:  Training average Loss: 0.989645
Epoch 0_16.8



Epoch 1_0.801%:  Training average Loss: 0.933823
Epoch 1_1.602%:  Training average Loss: 0.934040
Epoch 1_2.403%:  Training average Loss: 0.955319
Epoch 1_3.204%:  Training average Loss: 0.937806
Epoch 1_4.005%:  Training average Loss: 0.928102
Epoch 1_4.806%:  Training average Loss: 0.925860
Epoch 1_5.607%:  Training average Loss: 0.932861
Epoch 1_6.408%:  Training average Loss: 0.937918
Epoch 1_7.209%:  Training average Loss: 0.941890
Epoch 1_8.010%:  Training average Loss: 0.943666
Epoch 1_8.811%:  Training average Loss: 0.936575
Epoch 1_9.612%:  Training average Loss: 0.937170
Epoch 1_10.413%:  Training average Loss: 0.937787
Epoch 1_11.214%:  Training average Loss: 0.938969
Epoch 1_12.015%:  Training average Loss: 0.940052
Epoch 1_12.815%:  Training average Loss: 0.938778
Epoch 1_13.616%:  Training average Loss: 0.940761
Epoch 1_14.417%:  Training average Loss: 0.941858
Epoch 1_15.218%:  Training average Loss: 0.941621
Epoch 1_16.019%:  Training average Loss: 0.943671
Epoch 1_16.8

KeyboardInterrupt: 