In [840]:
import re
import unicodedata
import torch
import torch.nn as nn
from torch.autograd import Variable
from torch import optim
import torch.nn.functional as F
import pickle
import csv
import torch.utils.data as Data
import pandas as pd
import math
import torchtext.vocab as Vocab
import collections
import time
import numpy as np
import os
from sklearn.model_selection import train_test_split
from torchtext import data, datasets
# Special vocabulary tokens: padding, beginning-of-sequence and end-of-sequence.
PAD, BOS, EOS = '<pad>', '<bos>', '<eos>'
# Restrict CUDA to device 0 for this process.
os.environ["CUDA_VISIBLE_DEVICES"] = "0"
# Run on the GPU when one is available, otherwise fall back to the CPU.
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
# torch.set_printoptions(profile=8)
# Report the installed PyTorch version and the selected device.
print(torch.__version__, device)

1.4.0 cpu


# Data Processing

In [841]:
def process_one_seq(seq_tokens, all_tokens, all_seqs, max_seq_len):
    """Record one tokenised sequence and pad it to ``max_seq_len`` entries.

    Every token of ``seq_tokens`` is added to ``all_tokens`` (the flat list
    later used to build the vocabulary).  ``seq_tokens`` is then extended in
    place with one EOS marker followed by as many PAD markers as are needed to
    reach ``max_seq_len`` entries, and that same list object is appended to
    ``all_seqs``.
    """
    # Flat extension: all_tokens grows by the individual tokens.
    all_tokens.extend(seq_tokens)

    # A non-positive count simply yields no padding.
    pad_count = max_seq_len - len(seq_tokens) - 1
    tail = [EOS]
    tail.extend([PAD] * pad_count)
    seq_tokens += tail

    # Nested append: all_seqs grows by one element, the padded sequence itself.
    all_seqs.append(seq_tokens)

def build_data(all_tokens, all_seqs):
    """Build a vocabulary from all tokens and index-encode every sequence.

    Returns a ``(vocab, tensor)`` pair where the tensor holds one row of
    vocabulary indices per sequence in ``all_seqs``.
    """
    # Counter maps each distinct token to the number of times it occurs.
    token_counts = collections.Counter(all_tokens)
    vocab = Vocab.Vocab(token_counts, specials=[PAD, BOS, EOS])

    # Replace each token by its index in the vocabulary.
    lookup = vocab.stoi
    encoded = []
    for seq in all_seqs:
        encoded.append([lookup[token] for token in seq])
    return vocab, torch.tensor(encoded)

In [842]:
def normalizeString(s):
    """Normalise a pandas Series of raw review strings before tokenisation.

    Steps, applied element-wise:
      1. lower-case the text;
      2. replace the literal HTML line break ``<br />`` with a space;
      3. collapse runs of a repeated non-word character to a single one;
      4. put a space in front of ``.``, ``!`` and ``?``;
      5. replace every run of characters other than letters and ``.!?`` with
         a single space.

    ``regex`` is passed explicitly on every call: the default of
    ``Series.str.replace`` changed from ``True`` to ``False`` in pandas 2.0,
    which would otherwise make steps 3-5 look for the patterns literally.

    Args:
        s: pandas Series of strings.

    Returns:
        A new Series holding the normalised strings.
    """
    s = s.str.lower()
    s = s.str.replace("<br />", " ", regex=False)
    # s = re.sub(' +',' ',s)
    s = s.str.replace(r'(\W)(?=\1)', '', regex=True)
    s = s.str.replace(r"([.!?])", r" \1", regex=True)
    s = s.str.replace(r"[^a-zA-Z.!?]+", " ", regex=True)

    return s

In [843]:
def read_data(max_seq_len, data, max_rows=10):
    """Tokenise, pad and index-encode the comments of a review DataFrame.

    Args:
        max_seq_len: length every kept sequence is padded to (EOS included).
        data: DataFrame with a ``comment`` (text) and a ``label`` column. It
            is copied, so the caller's frame is left untouched.
        max_rows: number of leading rows to consider. Defaults to 10, the
            value that used to be hard-coded; pass ``None`` to use every row.
            The limit is capped at the number of rows actually available, so
            a frame with fewer rows no longer raises.

    Returns:
        ``(vocab, indices, labels)``: the vocabulary built from the kept
        comments, a tensor of token indices with one row per kept comment,
        and a tensor with the matching labels.
    """
    in_tokens, in_seqs = [], []
    df = data.copy()
    df['comment'] = normalizeString(df['comment'])
    df = df.reset_index(drop=True)

    row_limit = len(df) if max_rows is None else min(max_rows, len(df))
    target = []
    for row in range(row_limit):
        in_seq_tokens = df['comment'].iloc[row].split(' ')

        # Skip samples that would exceed max_seq_len once EOS is appended.
        if len(in_seq_tokens) > max_seq_len - 1:
            continue

        process_one_seq(in_seq_tokens, in_tokens, in_seqs, max_seq_len)
        target.append(df['label'].iloc[row])

    # in_tokens is the flat list of every word; in_seqs holds one padded
    # token list per kept comment.
    in_vocab, in_data = build_data(in_tokens, in_seqs)
    return in_vocab, in_data, torch.tensor(target)

In [884]:
# Load the IMDB reviews; header=0 replaces the file's own header row with the
# column names 'comment' and 'label'.
df = pd.read_csv("./IMDB Dataset.csv", names=['comment', 'label'], header=0, encoding='utf-8')
# Encode the sentiment as 1 for 'positive' and 0 for anything else.
df['label'] = df['label'].apply(lambda x: 1 if x=='positive' else 0)
# Hold out 30% of the rows for validation, with a fixed seed.
df_train, df_test = train_test_split(df, test_size=0.3, random_state=41)
# NOTE(review): each read_data call builds its own vocabulary, so `vocab` and
# `vocab_valid` presumably map the same word to different indices — confirm
# the validation indices are meant to be fed to a model sized from `vocab`.
vocab, corpus_indices, label = read_data(300, df_train)
vocab_valid, corpus_indices_valid, label_valid = read_data(300, df_test)

## RNN for Data augmentation

In [886]:
# Number of training sequences kept by read_data (over-long comments are skipped).
corpus_indices.shape[0]

7

In [877]:
class RNNModel(nn.Module):
    """GRU language model that scores the next token given a token window.

    Args:
        vocab_size: number of distinct token indices.
        embed_size: dimensionality of the token embeddings.
        hidden_size: number of GRU hidden units.
        drop_prob: dropout probability handed to ``nn.GRU``.
    """

    def __init__(self, vocab_size, embed_size, hidden_size, drop_prob = 0):
        super(RNNModel, self).__init__()

        self.embedding = nn.Embedding(vocab_size, embed_size)
        self.rnn = nn.GRU(embed_size, hidden_size, dropout=drop_prob)
        self.dense = nn.Linear(hidden_size, vocab_size)
        # Retained so existing attribute reads keep working; forward() no
        # longer consults it.
        self.state = None

    def forward(self, inputs, state):
        """Run the GRU over ``inputs`` and score the token that follows.

        Args:
            inputs: integer tensor of shape (seq_len, batch).
            state: initial GRU hidden state, or ``None`` to start from zeros.
                The argument used to be ignored in favour of ``self.state``
                (always ``None``), so a hidden state passed by the caller
                never reached the GRU.

        Returns:
            ``(output, state)``: logits of shape (batch, vocab_size) computed
            from the last time step, and the final GRU hidden state.
        """
        embedding = self.embedding(inputs.to(torch.int64))
        Y, state = self.rnn(embedding, state)
        # The last time step already summarises the whole window.
        output = self.dense(Y[-1, :, :])
        return output, state


In [878]:
# Hyper-parameters of the augmentation language model.
embed_size = 5
hidden_size = 256
# Size the embedding and output layers from the training vocabulary. The name
# `vocab_size` used to be assigned only in a later cell, so this cell raised
# NameError on a fresh kernel.
vocab_size = len(vocab)
model = RNNModel(vocab_size, embed_size, hidden_size).to(device)
loss_da = nn.CrossEntropyLoss()

In [879]:
def data_iter_consecutive(corpus_indices, batch_size, num_steps, device=None):
    """Yield (window, next-token) mini-batches from one token sequence.

    The sequence is cut into ``batch_size`` contiguous rows of equal length
    (surplus tokens at the end are dropped). Each step yields ``X``, a
    ``(batch_size, num_steps)`` window from every row, and ``Y``, the
    ``(batch_size,)`` tokens that immediately follow each window.

    Args:
        corpus_indices: 1-D tensor or list of token indices.
        batch_size: number of rows the sequence is split into.
        num_steps: window length.
        device: target device; defaults to CUDA when available, else CPU.
    """
    if device is None:
        device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
    # torch.tensor(tensor) warns about copy-construction, so tensors are
    # copied explicitly and only plain sequences go through torch.tensor.
    if torch.is_tensor(corpus_indices):
        corpus_indices = corpus_indices.detach().clone().to(device)
    else:
        corpus_indices = torch.tensor(corpus_indices, device=device)
    data_len = len(corpus_indices)
    batch_len = data_len // batch_size
    indices = corpus_indices[0: batch_size * batch_len].view(batch_size, batch_len)
    epoch_size = (batch_len - 1) // num_steps
    for i in range(epoch_size):
        X = indices[:, i: i + num_steps]
        # The window covers columns i .. i+num_steps-1, so the next token is
        # at column i+num_steps. The previous `+ 1` skipped a token and could
        # index past the end of short rows.
        Y = indices[:, i + num_steps]
        yield X, Y

In [880]:
def train_and_predict_rnn_pytorch(model, vocab_size, device,
                                corpus_indices, char_to_idx,
                                num_epochs, num_steps, lr, clipping_theta,
                                batch_size):
    """Fit ``model`` on one token sequence, then return its augmented version.

    Args:
        model: language model called as ``model(X, state)`` with ``X`` of
            shape (seq_len, batch) and returning ``(logits, state)``.
        vocab_size: forwarded to ``predict_rnn_pytorch``.
        device: device the model and mini-batches are placed on.
        corpus_indices: 1-D tensor of token indices for one sequence.
        char_to_idx: accepted for interface compatibility; not used.
        num_epochs: number of passes over the sequence.
        num_steps: window length handed to ``data_iter_consecutive``.
        lr: Adam learning rate.
        clipping_theta: accepted for interface compatibility; not used.
        batch_size: number of rows per mini-batch.

    Returns:
        The sequence returned by ``predict_rnn_pytorch`` after training.
    """
    print(corpus_indices.shape)
    # The locally built loss is now the one applied; the body previously
    # reached for the notebook-level `loss_da` instead.
    loss = nn.CrossEntropyLoss()
    optimizer = torch.optim.Adam(model.parameters(), lr=lr)
    model.to(device)
    state = None
    for _ in range(num_epochs):
        # Consecutive sampling over the sequence.
        data_iter = data_iter_consecutive(corpus_indices, batch_size, num_steps, device)
        for X, Y in data_iter:
            if state is not None:
                # Detach the hidden state from the graph so each step's
                # gradient depends on the current mini-batch only.
                if isinstance(state, tuple):  # LSTM, state: (h, c)
                    state = (state[0].detach(), state[1].detach())
                else:
                    state = state.detach()
            # The model expects (seq_len, batch) rather than (batch, seq_len).
            (output, state) = model(X.permute(1, 0), state)
            # output: (batch_size, vocab), Y: (batch_size,)
            l = loss(output, Y.long())

            optimizer.zero_grad()
            l.backward()
            optimizer.step()

    # `pred_len` was never defined in this notebook; the sequence length is
    # passed instead (predict_rnn_pytorch does not read this argument).
    corpus = predict_rnn_pytorch(corpus_indices, corpus_indices.shape[0], model, vocab_size, device)
    return corpus

In [881]:
def predict_rnn_pytorch(corpus, num_chars, model, vocab_size, device, window=3, stride=4):
    """Replace every ``stride``-th token by the model's next-token prediction.

    Starting at position ``window`` and advancing by ``stride``, the
    ``window`` tokens preceding position ``i`` are fed to the model and the
    highest-scoring token replaces the token at ``i``.

    The input tensor is no longer modified: predictions are written into a
    copy, which is returned.

    Args:
        corpus: 1-D tensor of token indices.
        num_chars: accepted for interface compatibility; not used.
        model: callable ``model(X, state)`` with ``X`` of shape (window, 1),
            returning ``(logits of shape (1, vocab), state)``.
        vocab_size: accepted for interface compatibility; not used.
        device: device the input window is moved to.
        window: number of context tokens (default 3, the former constant).
        stride: distance between replaced positions (default 4, the former
            constant).

    Returns:
        A new tensor with the predicted tokens substituted in.
    """
    augmented = corpus.detach().clone()
    # Inference only: no autograd graph is needed.
    with torch.no_grad():
        for i in range(window, augmented.shape[0], stride):
            # Shape the context as (seq_len, batch=1), as the model expects.
            X = augmented[i - window:i].to(device).view(window, 1)
            # Every window starts from a fresh hidden state.
            Y, _ = model(X, None)  # Y: (batch_size, vocab)
            augmented[i] = int(Y.argmax(dim=1).item())

    return augmented

In [891]:
start = time.time()

# Work on a copy: a plain `corpus = corpus_indices` binds both names to the
# same tensor, so writing the augmented rows would also overwrite the
# original training data.
corpus = corpus_indices.clone()

tokens_dic, vocab_size = vocab, len(vocab)
# Note the learning-rate setting here.
num_epochs, batch_size, lr, clipping_theta, num_steps = 20, 8, 1e-3, 1e-2, 4

for i in range(corpus.shape[0]):

    corpus_indice = corpus[i]
    print(corpus_indice.shape)

    # Train on this sequence, then store its augmented version.
    corpus[i] = train_and_predict_rnn_pytorch(model, vocab_size, device,
                            corpus_indice, tokens_dic, num_epochs, num_steps,
                            lr, clipping_theta, batch_size)
    if (i+1) % 5 == 0:
        print('i %d, time %.2f sec' % (
            i+1, time.time() - start))

torch.Size([300])
torch.Size([300])


  after removing the cwd from sys.path.
  


torch.Size([300])
torch.Size([300])
torch.Size([300])
torch.Size([300])
torch.Size([300])
torch.Size([300])
torch.Size([300])
torch.Size([300])
i 5, time 5.06 sec
torch.Size([300])
torch.Size([300])
torch.Size([300])
torch.Size([300])


In [943]:
# Original training sequences paired with their labels.
corpus_indices_ori = Data.TensorDataset(corpus_indices, label)
# Sequences produced by the augmentation loop, paired with the same labels.
# NOTE(review): confirm `corpus` does not share storage with `corpus_indices`;
# if it does, this dataset holds exactly the same values as the one above.
corpus_indices_da = Data.TensorDataset(corpus, label)
# Held-out sequences and labels used to evaluate the classifier.
corpus_indices_classification = Data.TensorDataset(corpus_indices_valid, label_valid)

## Data classification

In [944]:

# Select the device once and keep the boolean flag used further down.
use_cuda = torch.cuda.is_available()
device = torch.device("cuda" if use_cuda else "cpu")
# Size of the training vocabulary. This used to read `vocab_length`, which is
# only assigned in a later cell, so the cell failed on a fresh kernel.
vocabLimit = len(vocab)
input_dim = 300
# max_sequence_len = 500
# max_sequence_len = 500

## Model

In [945]:
class Model(torch.nn.Module):
    """LSTM classifier producing one score per sequence.

    Args:
        input_dim: accepted for interface compatibility; not used.
        embedding_dim: dimensionality of the token embeddings.
        hidden_dim: number of LSTM hidden units.
    """

    def __init__(self, input_dim, embedding_dim, hidden_dim):
        super(Model, self).__init__()
        self.hidden_dim = hidden_dim
        # The embedding table is sized from the notebook-level `vocabLimit`.
        self.embeddings = nn.Embedding(vocabLimit + 1, embedding_dim)
        self.lstm = nn.LSTM(embedding_dim, hidden_dim)
        self.linearOut = nn.Linear(hidden_dim, 1)

    def forward(self, inputs, hidden):
        """Score the sequences in ``inputs``.

        Args:
            inputs: integer tensor of token indices, time step first. Every
                index must be smaller than ``vocabLimit + 1`` or the
                embedding lookup raises.
            hidden: accepted for interface compatibility; the LSTM always
                starts from its default zero state.

        Returns:
            ``(scores, lstm_state)``: the linear output computed from the
            last time step, and the final ``(h, c)`` state of the LSTM.
        """
        # Debug prints and a no-op `assert 'this is '` were removed here.
        x = self.embeddings(inputs)
        lstm_out, lstm_h = self.lstm(x)
        x = self.linearOut(lstm_out[-1])
        return x, lstm_h

    def init_hidden(self, batch_size=1):
        """Return a zero ``(h0, c0)`` pair on the same device as the model.

        Both tensors have shape (1, batch_size, hidden_dim). The previous
        version returned (1, 1, hidden_dim) and (1, 300, hidden_dim), which an
        LSTM cannot accept together.
        """
        param_device = self.linearOut.weight.device
        shape = (1, batch_size, self.hidden_dim)
        return (torch.zeros(*shape, device=param_device),
                torch.zeros(*shape, device=param_device))

# Build the LSTM classifier, on the GPU when CUDA is in use.
# NOTE(review): the two branches pass different embedding sizes (50 vs 500) —
# confirm which one is intended.
if use_cuda:
	model = Model(input_dim, 50, 100).cuda()
else:
	model = Model(input_dim, 500, 100)

# NOTE(review): NLLLoss expects log-probabilities, while Model.forward returns
# the raw linear output (its log_softmax line is commented out) — confirm this
# loss is the intended one.
loss_function = nn.NLLLoss()
optimizer = optim.Adam(model.parameters(), lr=1e-3)

epochs = 4

# Save the freshly initialised weights to 'model0.pth'.
torch.save(model.state_dict(), 'model' + str(0)+'.pth')
print('starting training')

starting training


In [946]:
# Number of entries in the training and validation vocabularies.
vocab_length = len(vocab)
vocab_length_valid = len(vocab_valid)

In [947]:

class RNN(nn.Module):
    """Single-layer vanilla RNN classifier over token-index sequences."""

    def __init__(self, vocab_size, embedding_dim, hidden_dim, output_dim):
        super().__init__()
        # Token index -> dense vector.
        self.embedding = nn.Embedding(vocab_size, embedding_dim)
        # Recurrent encoder over the embedded sequence.
        self.rnn = nn.RNN(embedding_dim, hidden_dim)
        # Final hidden state -> output scores.
        self.fc = nn.Linear(hidden_dim, output_dim)

    def forward(self, text):
        """Map ``text`` (token indices, time step first) to output scores."""
        step_outputs, final_hidden = self.rnn(self.embedding(text))
        last_state = final_hidden.squeeze(0)

        # Sanity check: the last step's output must equal the final hidden state.
        assert torch.equal(step_outputs[-1, :, :], last_state)

        return self.fc(last_state)

In [948]:
# Classifier hyper-parameters: the embedding table is sized from the training
# vocabulary and the network emits a single score per sequence.
INPUT_DIM = vocab_length
EMBEDDING_DIM = 100
HIDDEN_DIM = 256
OUTPUT_DIM = 1

# Rebinds `model` to the RNN classifier used by the cells below.
model = RNN(INPUT_DIM, EMBEDDING_DIM, HIDDEN_DIM, OUTPUT_DIM)


## Accuracy

In [949]:
def binary_accuracy(preds, y):
    """
    Returns accuracy per batch, i.e. if you get 8/10 right, this returns 0.8, NOT 8

    Args:
        preds: raw scores (logits); any shape.
        y: binary labels with the same number of elements as ``preds``.

    Both tensors are flattened before the comparison, and the labels are moved
    to the predictions' device and dtype. Without this, a ``(batch, 1)``
    prediction compared with ``(batch,)`` labels broadcasts to a
    ``(batch, batch)`` matrix and yields a meaningless value, and labels left
    on the CPU fail against GPU predictions.
    """
    flat_preds = preds.reshape(-1)
    flat_targets = y.reshape(-1).to(device=flat_preds.device, dtype=flat_preds.dtype)
    # Round the sigmoid probabilities to the closest integer (0 or 1).
    rounded_preds = torch.round(torch.sigmoid(flat_preds))
    correct = (rounded_preds == flat_targets).float()  # float for the division
    acc = correct.sum() / len(correct)
    return acc
criterion = nn.BCEWithLogitsLoss()

In [950]:
# Move the classifier and the loss to the selected device.
model = model.to(device)
criterion = criterion.to(device)

# NOTE(review): `optim` is already imported in the first cell; this mid-notebook
# import looks redundant.
import torch.optim as optim

# Plain SGD over the classifier's parameters; rebinds `optimizer`.
optimizer = optim.SGD(model.parameters(), lr=1e-3)

In [951]:
def train(model, corpus_indices, optimizer, criterion):
    """Train ``model`` for one pass over ``corpus_indices``.

    Args:
        model: classifier called as ``model(X)`` with ``X`` of shape
            (seq_len, batch).
        corpus_indices: dataset yielding ``(sequence, label)`` pairs.
        optimizer: optimizer stepped once per mini-batch.
        criterion: loss applied to the predictions and float labels.

    Returns:
        ``(mean_loss, mean_accuracy)`` averaged over the mini-batches.
    """
    # evaluate() switches the model to eval mode; switch back before training.
    model.train()
    # Follow the model's own device rather than notebook-level globals, so
    # inputs and labels always sit next to the parameters.
    model_device = next(model.parameters()).device

    epoch_loss = 0
    epoch_acc = 0

    data_iter = Data.DataLoader(corpus_indices, batch_size = 4, shuffle=True)

    for X, Y in data_iter:
        X = X.to(model_device).long()
        Y = Y.to(model_device)

        optimizer.zero_grad()

        # The model expects (seq_len, batch) rather than (batch, seq_len).
        y_pred = model(X.permute(1, 0)).squeeze(1)

        loss = criterion(y_pred, Y.float())
        acc = binary_accuracy(y_pred, Y)

        loss.backward()
        optimizer.step()

        epoch_loss += loss.item()
        epoch_acc += acc.item()
    return epoch_loss / len(data_iter), epoch_acc / len(data_iter)




In [952]:
def evaluate(model, iterator, criterion):
    """Compute the mean loss and accuracy of ``model`` on a dataset.

    Args:
        model: classifier called as ``model(X)`` with ``X`` of shape
            (seq_len, batch).
        iterator: dataset yielding ``(sequence, label)`` pairs.
        criterion: loss applied to the predictions and float labels.

    Returns:
        ``(mean_loss, mean_accuracy)`` averaged over the mini-batches.
    """
    epoch_loss = 0
    epoch_acc = 0

    model.eval()
    model_device = next(model.parameters()).device
    # Evaluation needs no shuffling; a fixed order keeps the result repeatable.
    data_iter = Data.DataLoader(iterator, batch_size = 4, shuffle=False)

    # No gradients are needed here. The earlier call to the notebook-level
    # optimizer's zero_grad() was dropped: evaluation must not depend on it.
    with torch.no_grad():
        for X, Y in data_iter:
            X = X.to(model_device).long()
            Y = Y.to(model_device)

            y_pred = model(X.permute(1, 0)).squeeze(1)

            loss = criterion(y_pred, Y.float())
            acc = binary_accuracy(y_pred, Y)

            epoch_loss += loss.item()
            epoch_acc += acc.item()
    return epoch_loss / len(data_iter), epoch_acc / len(data_iter)


In [953]:
import time

def epoch_time(start_time, end_time):
    """Split the span between two timestamps into whole minutes and seconds."""
    span = end_time - start_time
    minutes = int(span / 60)
    # Whatever is left once the whole minutes are removed, truncated.
    seconds = int(span - (minutes * 60))
    return minutes, seconds


## Training

In [954]:
def validation(model_for_ori, model_for_aug, traing_data_ori, traing_data_aug, validation_data, optimizer, criterion, num_epochs):
    """Train on original and augmented data side by side and report both.

    Each epoch trains ``model_for_ori`` on ``traing_data_ori`` and
    ``model_for_aug`` on ``traing_data_aug``, evaluating each on
    ``validation_data`` right after its training pass. Whenever either
    validation loss improves on the best seen so far, that model's weights
    are written to 'tut1-model.pt'. Metrics are printed every second epoch.

    The same ``optimizer`` is handed to both training passes, so it only
    updates the parameters it was constructed with.

    Args:
        model_for_ori: model trained on the original data.
        model_for_aug: model trained on the augmented data.
        traing_data_ori: dataset of original sequences.
        traing_data_aug: dataset of augmented sequences.
        validation_data: dataset used for evaluation.
        optimizer: optimizer passed to ``train``.
        criterion: loss passed to ``train`` and ``evaluate``.
        num_epochs: number of epochs to run.
    """
    best_valid_loss = float('inf')

    for epoch in range(num_epochs):

        start_time = time.time()

        train_loss_ori, train_acc_ori = train(model_for_ori, traing_data_ori, optimizer, criterion)
        valid_loss_ori, valid_acc_ori = evaluate(model_for_ori, validation_data, criterion)
        train_loss_aug, train_acc_aug = train(model_for_aug, traing_data_aug, optimizer, criterion)
        valid_loss_aug, valid_acc_aug = evaluate(model_for_aug, validation_data, criterion)

        end_time = time.time()

        epoch_mins, epoch_secs = epoch_time(start_time, end_time)

        # The checkpoint test used to read `valid_loss` and `model`, neither
        # of which is defined in this function. Compare the losses actually
        # computed above and save the model they belong to.
        for candidate_loss, candidate in ((valid_loss_ori, model_for_ori),
                                          (valid_loss_aug, model_for_aug)):
            if candidate_loss < best_valid_loss:
                best_valid_loss = candidate_loss
                torch.save(candidate.state_dict(), 'tut1-model.pt')

        if (epoch + 1) % 2 == 0:
            print(f'Epoch: {epoch+1:2} | Epoch Time: {epoch_mins}m {epoch_secs}s')
            print('Before data augmentation')
            print(f'\tTrain Loss: {train_loss_ori:.3f} | Train Acc: {train_acc_ori*100:.2f}%')
            print(f'\t Val. Loss: {valid_loss_ori:.3f} |  Val. Acc: {valid_acc_ori*100:.2f}%')
            print('After data augmentation')
            print(f'\tTrain Loss: {train_loss_aug:.3f} | Train Acc: {train_acc_aug*100:.2f}%')
            print(f'\t Val. Loss: {valid_loss_aug:.3f} |  Val. Acc: {valid_acc_aug*100:.2f}%')

In [958]:
# NOTE(review): both names are bound to the same `model` object, so the
# "original" and "augmented" runs train and evaluate one shared set of
# weights — confirm whether two independent models were intended.
model_for_ori = model
model_for_aug = model
# Run the side-by-side comparison for 6 epochs.
validation(model_for_ori, model_for_aug,corpus_indices_ori, corpus_indices_da, 
           corpus_indices_classification, optimizer, 
           criterion , 6)
# validation(model, corpus_indices_da, corpus_indices_classification, optimizer, criterion, 6, 'Data augmentation')


Epoch:  2 | Epoch Time: 0m 0s
Before data augmentation
	Train Loss: 0.700 | Train Acc: 45.83%
	 Val. Loss: 0.704 |  Val. Acc: 37.50%
After data augmentation
	Train Loss: 0.701 | Train Acc: 41.67%
	 Val. Loss: 0.701 |  Val. Acc: 37.50%
Epoch:  4 | Epoch Time: 0m 0s
Before data augmentation
	Train Loss: 0.697 | Train Acc: 45.83%
	 Val. Loss: 0.699 |  Val. Acc: 37.50%
After data augmentation
	Train Loss: 0.704 | Train Acc: 37.50%
	 Val. Loss: 0.695 |  Val. Acc: 37.50%
Epoch:  6 | Epoch Time: 0m 0s
Before data augmentation
	Train Loss: 0.693 | Train Acc: 58.33%
	 Val. Loss: 0.691 |  Val. Acc: 62.50%
After data augmentation
	Train Loss: 0.692 | Train Acc: 58.33%
	 Val. Loss: 0.689 |  Val. Acc: 62.50%


In [None]:
# One extra training pass on the augmented data followed by an evaluation.
# The statements were indented at top level, which raises IndentationError.
train(model, corpus_indices_da, optimizer, criterion)
valid_loss, valid_acc = evaluate(model, corpus_indices_classification, criterion)
    