In [20]:
import torch
import torch.nn as nn
from torchtext import data
import pandas as pd
from sklearn.model_selection import train_test_split

In [23]:
DEVICE = torch.device("cuda" if torch.cuda.is_available() else "cpu")
DEVICE.type

'cuda'

In [15]:
BATCH_SIZE=10

In [3]:
data_all = pd.read_csv("../data/sentiment-analysis-on-movie-reviews/train.tsv",delimiter='\t')

In [4]:
train_data,test_data = train_test_split(data_all,test_size=0.2,random_state=2)

In [5]:
train_data.to_csv('../data/task2/train.csv',index=False)
test_data.to_csv('../data/task2/test.csv',index=False)

In [6]:
PAD_TOKEN='<pad>'
TEXT = data.Field(sequential=True,batch_first=True, lower=True, pad_token=PAD_TOKEN)
LABEL = data.Field(sequential=False, batch_first=True, unk_token=None)

In [7]:
datafields = [("PhraseId", None), # 不需要的filed设置为None
              ("SentenceId", None),
              ('Phrase', TEXT),
              ('Sentiment', LABEL)]

In [8]:
train_data = data.TabularDataset(path='../data/task2/train.csv',format='csv',fields=datafields)
test_data = data.TabularDataset(path='../data/task2/test.csv',format='csv',fields=datafields)

In [14]:
#构建词典，字符映射到embedding
#TEXT.vocab.vectors 就是词向量
TEXT.build_vocab(train_data,  vectors= 'glove.6B.50d',
                 unk_init= lambda x:torch.nn.init.uniform_(x, a=-0.25, b=0.25))
LABEL.build_vocab(train_data)


#得到索引，PAD_TOKEN='<pad>'
PAD_INDEX = TEXT.vocab.stoi[PAD_TOKEN]
TEXT.vocab.vectors[PAD_INDEX] = 0.0

.vector_cache\glove.6B.zip: 862MB [09:46, 1.47MB/s]                                
100%|█████████▉| 399999/400000 [00:18<00:00, 21523.36it/s]


In [17]:
#构建迭代器
train_iterator = data.BucketIterator(train_data, batch_size=BATCH_SIZE,train=True, shuffle=True,device=DEVICE)

test_iterator = data.Iterator(test_data, batch_size=len(test_data),train=False,sort=False, device=DEVICE)

In [18]:
#部分参数设置
embedding_choice='glove'   #  'static'    'non-static'
num_embeddings = len(TEXT.vocab)
embedding_dim =50
dropout_p=0.5
filters_num=100

vocab_size=len(TEXT.vocab)
label_num=len(LABEL.vocab)
print(vocab_size,label_num)

16517 6


In [22]:
import torch.nn.functional as F
class CNN(nn.Module):
    def __init__(self):
        super(CNN,self).__init__()
        self.embedding_choice=embedding_choice

        if self.embedding_choice==  'rand':
            self.embedding=nn.Embedding(num_embeddings,embedding_dim)
        if self.embedding_choice==  'glove':
            self.embedding = nn.Embedding(num_embeddings, embedding_dim,
                padding_idx=PAD_INDEX).from_pretrained(TEXT.vocab.vectors, freeze=True)


        self.conv1 = nn.Conv2d(in_channels=1,out_channels=filters_num ,  #卷积产生的通道
                               kernel_size=(3, embedding_dim), padding=(2,0))

        self.conv2 = nn.Conv2d(in_channels=1,out_channels=filters_num ,  #卷积产生的通道
                               kernel_size=(4, embedding_dim), padding=(3,0))

        self.conv3 = nn.Conv2d(in_channels=1,out_channels=filters_num ,  #卷积产生的通道
                               kernel_size=(5, embedding_dim), padding=(4,0))

        self.dropout = nn.Dropout(dropout_p)

        self.fc = nn.Linear(filters_num * 3, label_num)

    def forward(self,x):      # (Batch_size, Length)
        x=self.embedding(x).unsqueeze(1)      #(Batch_size, Length, Dimention)
                                       #(Batch_size, 1, Length, Dimention)

        x1 = F.relu(self.conv1(x)).squeeze(3)    #(Batch_size, filters_num, length+padding, 1)
                                          #(Batch_size, filters_num, length+padding)
        x1 = F.max_pool1d(x1, x1.size(2)).squeeze(2)  #(Batch_size, filters_num, 1)
                                               #(Batch_size, filters_num)

        x2 = F.relu(self.conv2(x)).squeeze(3)
        x2 = F.max_pool1d(x2, x2.size(2)).squeeze(2)

        x3 = F.relu(self.conv3(x)).squeeze(3)
        x3 = F.max_pool1d(x3, x3.size(2)).squeeze(2)

        x = torch.cat((x1, x2, x3), dim=1)  #(Batch_size, filters_num *3 )
        x = self.dropout(x)      #(Batch_size, filters_num *3 )
        out = self.fc(x)       #(Batch_size, label_num  )
        return out

In [24]:
#构建模型

model=CNN()
optimizer = torch.optim.Adam(model.parameters(), lr=0.001)#创建优化器SGD
criterion = nn.CrossEntropyLoss()   #损失函数

if DEVICE.type=='cuda':
    model.cuda()

In [27]:
#开始训练
import time
epoch=100
best_accuracy=0.0
start_time=time.time()

for i in range(epoch):
    model.train()
    total_loss=0.0
    accuracy=0.0
    total_correct=0.0
    total_data_num = len(train_iterator.dataset)
    steps = 0.0
    #训练
    for batch in train_iterator:
        steps+=1
        #print(steps)
        optimizer.zero_grad() #  梯度缓存清零

        batch_text=batch.Phrase
        batch_label=batch.Sentiment
        out=model(batch_text)    #[batch_size, label_num]
        loss = criterion(out, batch_label)
        total_loss = total_loss + loss.item()

        loss.backward()
        optimizer.step()

        correct = (torch.max(out, dim=1)[1]  #get the indices
                   .view(batch_label.size()) == batch_label).sum()
        total_correct = total_correct + correct.item()

        if steps%100==0:
            print("Epoch %d_%.3f%%:  Training average Loss: %f"
                      %(i, steps * train_iterator.batch_size*100/len(train_iterator.dataset),total_loss/steps))

    #每个epoch都验证一下
    model.eval()
    total_loss=0.0
    accuracy=0.0
    total_correct=0.0
    total_data_num = len(test_iterator.dataset)
    steps = 0.0
    for batch in test_iterator:
        steps+=1
        batch_text=batch.Phrase
        batch_label=batch.Sentiment
        out=model(batch_text)
        loss = criterion(out, batch_label)
        total_loss = total_loss + loss.item()

        correct = (torch.max(out, dim=1)[1].view(batch_label.size()) == batch_label).sum()
        total_correct = total_correct + correct.item()

        print("Epoch %d :  Verification average Loss: %f, Verification accuracy: %f%%,Total Time:%f"
          %(i, total_loss/steps, total_correct*100/total_data_num,time.time()-start_time))

        if best_accuracy < total_correct/total_data_num :
            best_accuracy =total_correct/total_data_num
            torch.save(model,'model_dict/model_glove/epoch_%d_accuracy_%f'%(i,total_correct/total_data_num))
            print('Model is saved in model_dict/model_glove/epoch_%d_accuracy_%f'%(i,total_correct/total_data_num))
            #torch.cuda.empty_cache()
    break #运行时去除break

Epoch 0_0.801%:  Training average Loss: 1.025518
Epoch 0_1.602%:  Training average Loss: 1.017275
Epoch 0_2.403%:  Training average Loss: 1.032822
Epoch 0_3.204%:  Training average Loss: 1.020456
Epoch 0_4.005%:  Training average Loss: 1.007579
Epoch 0_4.806%:  Training average Loss: 1.009438
Epoch 0_5.607%:  Training average Loss: 1.005703
Epoch 0_6.408%:  Training average Loss: 1.005773
Epoch 0_7.209%:  Training average Loss: 1.004576
Epoch 0_8.010%:  Training average Loss: 1.005422
Epoch 0_8.811%:  Training average Loss: 1.011184
Epoch 0_9.612%:  Training average Loss: 1.011259
Epoch 0_10.413%:  Training average Loss: 1.011732
Epoch 0_11.214%:  Training average Loss: 1.009617
Epoch 0_12.015%:  Training average Loss: 1.008926
Epoch 0_12.815%:  Training average Loss: 1.009234
Epoch 0_13.616%:  Training average Loss: 1.011360
Epoch 0_14.417%:  Training average Loss: 1.011482
Epoch 0_15.218%:  Training average Loss: 1.011780
Epoch 0_16.019%:  Training average Loss: 1.013563
Epoch 0_16.8

