In [136]:
import random
from collections import defaultdict
from pathlib import Path

import numpy as np
import torch
import torch.nn as nn
import torch.nn.functional as F
import torchtext
import tqdm

def set_random_seed(seed):
    """Seed every random number generator this notebook relies on.

    Seeds Python's ``random`` module, NumPy and PyTorch (both the default
    generator and the CUDA ones), then switches cuDNN benchmarking off and
    cuDNN deterministic mode on.

    Args:
        seed: Integer seed passed unchanged to each generator.
    """
    seeders = (
        random.seed,
        np.random.seed,
        torch.manual_seed,
        torch.cuda.manual_seed,
        torch.cuda.manual_seed_all,
    )
    for apply_seed in seeders:
        apply_seed(seed)

    cudnn = torch.backends.cudnn
    cudnn.benchmark = False
    cudnn.deterministic = True

# Seed all RNGs before any data or model is created.
set_random_seed(2020)
# `is_available` has to be *called*: the bare function object is always
# truthy, which selected 'cuda:0' even on machines without a GPU.
device = torch.device('cuda:0' if torch.cuda.is_available() else 'cpu')

# 数据读取与处理

In [104]:
# Label field: one float per example, no vocabulary lookup; the raw label
# 'neg' becomes 0 and every other label becomes 1.
label_field = torchtext.data.Field(
    dtype=torch.float,
    sequential=False,
    use_vocab=False,
    preprocessing=lambda raw_label: 1 if raw_label != 'neg' else 0,
)
# Text field: lower-cased text, batch-first tensors, and the per-example
# lengths returned together with the padded batch.
text_field = torchtext.data.Field(lower=True, batch_first=True, include_lengths=True)

# Build the IMDB train/test datasets.
train_data, test_data = torchtext.datasets.IMDB.splits(
    root='/media/bnu/data/nlp-practice/sentiment-analysis/',
    text_field=text_field,
    label_field=label_field,
)
# Split a validation set off the training data (split ratio 0.8).
train_data, valid_data = train_data.split(split_ratio=0.8)

In [105]:
# Show the first training example (tokens re-joined with spaces) and its label.
print('Text Sample:', ' '.join(train_data.examples[0].text), '-' * 60, sep='\n')
print('Label Sample:', train_data.examples[0].label, '-' * 60, sep='\n')
print()
# Number of examples in each split.
print(
    'Train Examples: {}'.format(len(train_data)),
    'Valid Examples: {}'.format(len(valid_data)),
    'Test Examples: {}'.format(len(test_data)),
    sep='\n',
)

Text Sample:
you know this is gonna be a cheesy movie when:<br /><br />1. it was made it the 50's 2. it's in black and white. 3. it has no name actors! 4. screaming makes up for the lack of special effects!<br /><br />well not to be outdone - this movie brilliantly incorporated all four of the above elements to turn this into a true cinematic blunder.<br /><br />okay - shhhhh but i am gonna discuss special effects here - or lack of them - <br /><br />did you catch the underwater scenes? it looks like it was poorly filmed through an aquarium - note the cape flapping in the breeze.<br /><br />and the repeated re-use of stock footage, (exterior house shots, the bridges scenes -- great enhanced the k-r-a-f-tiness of this film - not since "plan 9" - have i seen such creative usage of stock footage.<br /><br />and hey where there was a lack of special effects - not to worry - screaming does take the place of special effects in this movie as well. yes this movie even cleverly used that old ha

In [106]:
# Build the vocabulary from the training split using pretrained word
# vectors: at most 25000 words, 'glove.6B.100d' vectors read from the local
# cache directory, and `torch.Tensor.zero_` as the initialiser for words
# without a pretrained vector.
text_field.build_vocab(
    train_data,
    unk_init=torch.Tensor.zero_,
    vectors_cache='/media/bnu/data/nlp-practice/word-vector',
    vectors='glove.6B.100d',
    max_size=25000,
)

In [107]:
# Vocabulary size followed by the first ten (token, index) pairs.
print('Text Vocab Size:', len(text_field.vocab))
print('-' * 60, 'Text Str -> Index:', sep='\n')
print([pair for _, pair in zip(range(10), text_field.vocab.stoi.items())])

Text Vocab Size: 25002
------------------------------------------------------------
Text Str -> Index:
[('<unk>', 0), ('<pad>', 1), ('the', 2), ('a', 3), ('and', 4), ('of', 5), ('to', 6), ('is', 7), ('in', 8), ('i', 9)]


In [108]:
# Create one batch iterator per split: batches of 32, placed on `device`,
# with shuffling requested for all three.
train_iter, valid_iter, test_iter = torchtext.data.BucketIterator.splits(
    datasets=(train_data, valid_data, test_data),
    shuffle=True,
    device=device,
    batch_sizes=(32, 32, 32),
)

In [109]:
# Draw ONE batch and report all three shapes from it. Calling
# next(iter(train_iter)) once per print started three separate passes over
# the iterator and reported shapes taken from three different batches.
sample_batch = next(iter(train_iter))
# `text` holds (padded token indices, lengths); see the shapes printed below.
print('Inputs Data Shape:', sample_batch.text[0].shape)
print('Inputs Length Shape:', sample_batch.text[1].shape)
print('Target Data Shape:', sample_batch.label.shape)

Inputs Data Shape: torch.Size([32, 484])
Inputs Length Shape: torch.Size([32])
Target Data Shape: torch.Size([32])


# 定义模型

In [118]:
class WordAVGModel(nn.Module):
    """Word-averaging classifier: embed, average over the sequence, project to one logit.

    Args:
        n_words: Number of rows in the embedding table.
        n_embed: Size of each embedding vector.
    """

    def __init__(self, n_words, n_embed):
        super(WordAVGModel, self).__init__()
        self.embed = nn.Embedding(n_words, n_embed)
        self.fc = nn.Linear(n_embed, 1)

    def forward(self, inputs):
        """Compute one output value per sequence.

        Args:
            inputs: Token-index tensor of shape (batch_size, max_len).

        Returns:
            Tensor of shape (batch_size,), including when batch_size == 1.
        """
        # x_embed shape: (batch_size, max_len, embed_size)
        x_embed = self.embed(inputs)
        # x_mean shape: (batch_size, embed_size)
        # NOTE(review): the mean runs over every position of dim 1, so any
        # padding positions present in `inputs` are averaged in as well.
        x_mean = torch.mean(x_embed, dim=1)

        # Drop only the trailing size-1 feature dim. A bare squeeze() would
        # also drop the batch dim for a single-example batch, producing a
        # 0-d tensor whose shape no longer matches (1,)-shaped targets.
        return self.fc(x_mean).squeeze(-1)

    def init_embed(self, pretrained_embeddings):
        """Copy `pretrained_embeddings` into the embedding weight in place.

        Args:
            pretrained_embeddings: Tensor copied into ``self.embed.weight``.
        """
        self.embed.weight.data.copy_(pretrained_embeddings)
        

# Smoke test: run one training batch through a freshly built WordAVGModel
# (initialised from the vocabulary's pretrained vectors) and print the
# shape of the result.
inputs = next(iter(train_iter)).text[0]
model = WordAVGModel(n_words=len(text_field.vocab), n_embed=100)
model.init_embed(pretrained_embeddings=text_field.vocab.vectors)
model = model.to(device)
outputs = model(inputs)
print(outputs.size())

torch.Size([32])


In [133]:
class BiLSTMModel(nn.Module):
    """Bidirectional LSTM classifier producing one logit per sequence.

    Args:
        n_words: Number of rows in the embedding table.
        n_embed: Size of each embedding vector.
        n_hidden: Hidden size of the LSTM, per direction.
        n_layers: Number of stacked LSTM layers.
    """

    def __init__(self, n_words, n_embed, n_hidden, n_layers):
        super(BiLSTMModel, self).__init__()
        self.embed = nn.Embedding(n_words, n_embed)
        self.rnn = nn.LSTM(n_embed, n_hidden,
                           num_layers=n_layers,
                           bidirectional=True,
                           # Inter-layer dropout only applies with 2+ layers;
                           # a non-zero value with one layer makes PyTorch warn.
                           dropout=0.2 if n_layers > 1 else 0.0,
                           batch_first=True)
        self.fc = nn.Linear(n_hidden * 2, 1)
        self.dropout = nn.Dropout(0.2)

    def forward(self, inputs, lengths=None):
        """Compute one output value per sequence.

        Args:
            inputs: Token-index tensor of shape (batch_size, max_len).
            lengths: Optional true (unpadded) length of every sequence, each
                at least 1. When given, the batch is packed so the LSTM
                stops at each sequence's real end instead of reading the
                padding. When omitted, the full padded tensor is fed to the
                LSTM, as before.

        Returns:
            Tensor of shape (batch_size,), including when batch_size == 1.
        """
        # x_embed shape: (batch_size, max_len, embed_size)
        x_embed = self.dropout(self.embed(inputs))

        if lengths is not None:
            # pack_padded_sequence wants the lengths on the CPU;
            # enforce_sorted=False lets the batch arrive in any order.
            cpu_lengths = torch.as_tensor(lengths, dtype=torch.int64).cpu()
            x_embed = nn.utils.rnn.pack_padded_sequence(
                x_embed, cpu_lengths, batch_first=True, enforce_sorted=False)

        # h_n shape: (num_layer * num_dir, batch_size, hidden_size)
        _, (h_n, _) = self.rnn(x_embed)

        # Per the PyTorch LSTM docs, the last two entries of h_n are the top
        # layer's forward (h_n[-2]) and backward (h_n[-1]) final states.
        x_hidden = self.dropout(torch.cat((h_n[-1], h_n[-2]), dim=1))

        # Drop only the trailing size-1 feature dim; a bare squeeze() would
        # also drop the batch dim for a single-example batch.
        return self.fc(x_hidden).squeeze(-1)

    def init_embed(self, pretrained_embeddings):
        """Copy `pretrained_embeddings` into the embedding weight in place.

        Args:
            pretrained_embeddings: Tensor copied into ``self.embed.weight``.
        """
        self.embed.weight.data.copy_(pretrained_embeddings)
        
        
# Smoke test: run one training batch through a freshly built BiLSTMModel
# (initialised from the vocabulary's pretrained vectors) and print the
# shape of the result.
inputs = next(iter(train_iter)).text[0]
model = BiLSTMModel(n_words=len(text_field.vocab), n_embed=100, n_hidden=100, n_layers=2)
model.init_embed(pretrained_embeddings=text_field.vocab.vectors)
model = model.to(device)
outputs = model(inputs)
print(outputs.size())

torch.Size([32])


In [145]:
class CNNModel(nn.Module):
    """1-D convolutional classifier with max-over-time pooling.

    One ``Conv1d`` per kernel size runs over the embedded sequence; each
    feature map is max-pooled over its full length, the pooled features are
    concatenated, passed through dropout and projected to a single logit.

    Args:
        n_words: Number of rows in the embedding table.
        n_embed: Size of each embedding vector.
        n_filters: Number of output channels of every convolution.
        kernel_sizes: Sequence of convolution kernel widths.
    """

    def __init__(self, n_words, n_embed, n_filters, kernel_sizes):
        super(CNNModel, self).__init__()
        self.embed = nn.Embedding(n_words, n_embed)
        self.convs = nn.ModuleList([
            nn.Conv1d(n_embed, n_filters, kernel_size=ks)
            for ks in kernel_sizes
        ])
        self.fc = nn.Linear(len(kernel_sizes) * n_filters, 1)
        self.dropout = nn.Dropout(0.5)
        # Widest kernel; forward() pads inputs that are shorter than this.
        self.max_kernel_size = max(kernel_sizes, default=0)

    def forward(self, inputs):
        """Compute one output value per sequence.

        Args:
            inputs: Token-index tensor of shape (batch_size, max_len).

        Returns:
            Tensor of shape (batch_size,), including when batch_size == 1.
        """
        # x_embed shape: (batch_size, max_len, embed_size)
        x_embed = self.embed(inputs)
        # x_embed shape: (batch_size, embed_size, max_len)
        x_embed = x_embed.transpose(1, 2)

        # A convolution cannot run on an input shorter than its kernel, so
        # right-pad short sequences with zeros up to the widest kernel.
        shortfall = self.max_kernel_size - x_embed.shape[2]
        if shortfall > 0:
            x_embed = F.pad(x_embed, (0, shortfall))

        # x_convs[0] shape: (batch_size, num_filter, max_len - kernel_size + 1)
        x_convs = [F.relu(conv(x_embed)) for conv in self.convs]
        # x_pools[0] shape: (batch_size, num_filter)
        # squeeze(2) removes only the pooled length dim. A bare squeeze()
        # also removed the batch dim when batch_size == 1, which made the
        # torch.cat(dim=1) below fail.
        x_pools = [F.max_pool1d(x_conv, x_conv.shape[2]).squeeze(2)
                   for x_conv in x_convs]

        # x_cat shape: (batch_size, num_kernel * num_filter)
        x_cat = self.dropout(torch.cat(x_pools, dim=1))

        # outputs shape: (batch_size)
        return self.fc(x_cat).squeeze(-1)

    def init_embed(self, pretrained_embeddings):
        """Copy `pretrained_embeddings` into the embedding weight in place.

        Args:
            pretrained_embeddings: Tensor copied into ``self.embed.weight``.
        """
        self.embed.weight.data.copy_(pretrained_embeddings)
        
        
# Smoke test: run one training batch through a freshly built CNNModel
# (initialised from the vocabulary's pretrained vectors) and print the
# shape of the result.
inputs = next(iter(train_iter)).text[0]
model = CNNModel(n_words=len(text_field.vocab), n_embed=100, n_filters=50, kernel_sizes=[2, 3, 4, 5])
model.init_embed(pretrained_embeddings=text_field.vocab.vectors)
model = model.to(device)
outputs = model(inputs)
print(outputs.size())

torch.Size([32])


# 模型训练

In [134]:
class IDMBLearner:
    """Train and validate a binary classifier, recording per-epoch metrics.

    Reads the notebook-level names `device`, `train_iter` and `valid_iter`;
    they must exist before an instance is created or `fit` is called.

    Attributes:
        model: The wrapped module, moved to `device` on construction.
        criterion: ``nn.BCEWithLogitsLoss`` applied to the raw model outputs.
        optimizer: Adam over the model's parameters, with default settings.
        history: Maps 'train_loss', 'train_acc', 'valid_loss' and
            'valid_acc' to lists holding one value per completed epoch.
    """

    def __init__(self, model):
        self.model = model
        self.model.to(device)
        self.criterion = nn.BCEWithLogitsLoss()
        self.optimizer = torch.optim.Adam(self.model.parameters())
        self.history = defaultdict(list)

    def calc_correct_count(self, outputs, targets):
        """Count the predictions that match `targets`.

        Args:
            outputs: Raw model outputs (logits).
            targets: Reference labels compared element-wise to predictions.

        Returns:
            Number of positions where round(sigmoid(outputs)) == targets,
            as a Python int.
        """
        preds = torch.round(torch.sigmoid(outputs))
        return (preds == targets).long().sum().item()

    def _run_epoch(self, data_iter, description, training):
        """Run one pass over `data_iter` and return (mean loss, accuracy).

        Args:
            data_iter: Iterable of batches exposing `.text[0]` and `.label`.
            description: Text shown on the progress bar.
            training: If True the model is put in train mode and the
                optimizer is stepped per batch; otherwise the model is put
                in eval mode and gradients are disabled.

        Raises:
            ZeroDivisionError: If `data_iter` yields no examples.
        """
        # Import the submodule explicitly: a plain `import tqdm` does not
        # reliably expose `tqdm.notebook` as an attribute, so
        # `tqdm.notebook.tqdm` could raise AttributeError on a fresh kernel.
        from tqdm.notebook import tqdm as notebook_tqdm

        self.model.train(training)
        total_loss = 0.0
        total_samples, total_correct = 0, 0

        pbar = notebook_tqdm(data_iter)
        pbar.set_description(description)

        with torch.set_grad_enabled(training):
            for batch in pbar:
                inputs, targets = batch.text[0].to(device), batch.label.to(device)
                outputs = self.model(inputs)
                loss = self.criterion(outputs, targets)

                if training:
                    self.optimizer.zero_grad()
                    loss.backward()
                    self.optimizer.step()

                # The criterion returns a batch mean, so weight it by the
                # batch size to get a correct mean over the whole pass.
                total_samples += len(targets)
                total_loss += loss.item() * len(targets)
                total_correct += self.calc_correct_count(outputs, targets)

                pbar.set_postfix(loss=total_loss/total_samples, acc=total_correct/total_samples)

        return total_loss / total_samples, total_correct / total_samples

    def fit(self, n_epochs):
        """Train for `n_epochs`, validating after every epoch.

        Each epoch appends one value to each of the four `history` lists.

        Args:
            n_epochs: Number of passes over `train_iter`.
        """
        for epoch in range(n_epochs):
            train_loss, train_acc = self._run_epoch(
                train_iter, f'Epoch {epoch+1} --> Train', training=True)
            self.history['train_loss'].append(train_loss)
            self.history['train_acc'].append(train_acc)

            valid_loss, valid_acc = self._run_epoch(
                valid_iter, f'Epoch {epoch+1} --> Valid', training=False)
            self.history['valid_loss'].append(valid_loss)
            self.history['valid_acc'].append(valid_acc)

In [120]:
# Train the word-averaging model for 10 epochs, starting from the
# vocabulary's pretrained vectors.
model = WordAVGModel(n_words=len(text_field.vocab), n_embed=100)
model.init_embed(pretrained_embeddings=text_field.vocab.vectors)
model = model.to(device)
learner = IDMBLearner(model=model)
learner.fit(n_epochs=10)

HBox(children=(FloatProgress(value=0.0, max=625.0), HTML(value='')))




HBox(children=(FloatProgress(value=0.0, max=157.0), HTML(value='')))




HBox(children=(FloatProgress(value=0.0, max=625.0), HTML(value='')))




HBox(children=(FloatProgress(value=0.0, max=157.0), HTML(value='')))




HBox(children=(FloatProgress(value=0.0, max=625.0), HTML(value='')))




HBox(children=(FloatProgress(value=0.0, max=157.0), HTML(value='')))




HBox(children=(FloatProgress(value=0.0, max=625.0), HTML(value='')))




HBox(children=(FloatProgress(value=0.0, max=157.0), HTML(value='')))




HBox(children=(FloatProgress(value=0.0, max=625.0), HTML(value='')))




HBox(children=(FloatProgress(value=0.0, max=157.0), HTML(value='')))




HBox(children=(FloatProgress(value=0.0, max=625.0), HTML(value='')))




HBox(children=(FloatProgress(value=0.0, max=157.0), HTML(value='')))




HBox(children=(FloatProgress(value=0.0, max=625.0), HTML(value='')))




HBox(children=(FloatProgress(value=0.0, max=157.0), HTML(value='')))




HBox(children=(FloatProgress(value=0.0, max=625.0), HTML(value='')))




HBox(children=(FloatProgress(value=0.0, max=157.0), HTML(value='')))




HBox(children=(FloatProgress(value=0.0, max=625.0), HTML(value='')))




HBox(children=(FloatProgress(value=0.0, max=157.0), HTML(value='')))




HBox(children=(FloatProgress(value=0.0, max=625.0), HTML(value='')))




HBox(children=(FloatProgress(value=0.0, max=157.0), HTML(value='')))




In [135]:
# Train the two-layer BiLSTM (hidden size 256) for 10 epochs, starting from
# the vocabulary's pretrained vectors.
model = BiLSTMModel(n_words=len(text_field.vocab), n_embed=100, n_hidden=256, n_layers=2)
model.init_embed(pretrained_embeddings=text_field.vocab.vectors)
model = model.to(device)
learner = IDMBLearner(model=model)
learner.fit(n_epochs=10)

HBox(children=(FloatProgress(value=0.0, max=625.0), HTML(value='')))




HBox(children=(FloatProgress(value=0.0, max=157.0), HTML(value='')))




HBox(children=(FloatProgress(value=0.0, max=625.0), HTML(value='')))




HBox(children=(FloatProgress(value=0.0, max=157.0), HTML(value='')))




HBox(children=(FloatProgress(value=0.0, max=625.0), HTML(value='')))




HBox(children=(FloatProgress(value=0.0, max=157.0), HTML(value='')))




HBox(children=(FloatProgress(value=0.0, max=625.0), HTML(value='')))




HBox(children=(FloatProgress(value=0.0, max=157.0), HTML(value='')))




HBox(children=(FloatProgress(value=0.0, max=625.0), HTML(value='')))




HBox(children=(FloatProgress(value=0.0, max=157.0), HTML(value='')))




HBox(children=(FloatProgress(value=0.0, max=625.0), HTML(value='')))




HBox(children=(FloatProgress(value=0.0, max=157.0), HTML(value='')))




HBox(children=(FloatProgress(value=0.0, max=625.0), HTML(value='')))




HBox(children=(FloatProgress(value=0.0, max=157.0), HTML(value='')))




HBox(children=(FloatProgress(value=0.0, max=625.0), HTML(value='')))




HBox(children=(FloatProgress(value=0.0, max=157.0), HTML(value='')))




HBox(children=(FloatProgress(value=0.0, max=625.0), HTML(value='')))




HBox(children=(FloatProgress(value=0.0, max=157.0), HTML(value='')))




HBox(children=(FloatProgress(value=0.0, max=625.0), HTML(value='')))




HBox(children=(FloatProgress(value=0.0, max=157.0), HTML(value='')))




In [146]:
# Train the CNN (50 filters for each of the kernel sizes 2-5) for 10 epochs,
# starting from the vocabulary's pretrained vectors.
model = CNNModel(n_words=len(text_field.vocab), n_embed=100, n_filters=50, kernel_sizes=[2, 3, 4, 5])
model.init_embed(pretrained_embeddings=text_field.vocab.vectors)
model = model.to(device)
learner = IDMBLearner(model=model)
learner.fit(n_epochs=10)

HBox(children=(FloatProgress(value=0.0, max=625.0), HTML(value='')))




HBox(children=(FloatProgress(value=0.0, max=157.0), HTML(value='')))




HBox(children=(FloatProgress(value=0.0, max=625.0), HTML(value='')))




HBox(children=(FloatProgress(value=0.0, max=157.0), HTML(value='')))




HBox(children=(FloatProgress(value=0.0, max=625.0), HTML(value='')))




HBox(children=(FloatProgress(value=0.0, max=157.0), HTML(value='')))




HBox(children=(FloatProgress(value=0.0, max=625.0), HTML(value='')))




HBox(children=(FloatProgress(value=0.0, max=157.0), HTML(value='')))




HBox(children=(FloatProgress(value=0.0, max=625.0), HTML(value='')))




HBox(children=(FloatProgress(value=0.0, max=157.0), HTML(value='')))




HBox(children=(FloatProgress(value=0.0, max=625.0), HTML(value='')))




HBox(children=(FloatProgress(value=0.0, max=157.0), HTML(value='')))




HBox(children=(FloatProgress(value=0.0, max=625.0), HTML(value='')))




HBox(children=(FloatProgress(value=0.0, max=157.0), HTML(value='')))




HBox(children=(FloatProgress(value=0.0, max=625.0), HTML(value='')))




HBox(children=(FloatProgress(value=0.0, max=157.0), HTML(value='')))




HBox(children=(FloatProgress(value=0.0, max=625.0), HTML(value='')))




HBox(children=(FloatProgress(value=0.0, max=157.0), HTML(value='')))




HBox(children=(FloatProgress(value=0.0, max=625.0), HTML(value='')))




HBox(children=(FloatProgress(value=0.0, max=157.0), HTML(value='')))


