In [181]:
import re
import unicodedata
import torch
import torch.nn as nn
from torch.autograd import Variable
from torch import optim
import torch.nn.functional as F
import pickle
import csv
import torch.utils.data as Data
from sklearn.utils import resample
import pandas as pd
import math
import torchtext.vocab as Vocab
import collections
import time
import numpy as np
import torch.optim as optim
import os
from sklearn.model_selection import train_test_split
from torchtext import data, datasets
PAD, BOS, EOS = '<pad>', '<bos>', '<eos>'
os.environ["CUDA_VISIBLE_DEVICES"] = "0"
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
# torch.set_printoptions(profile=8)
print(torch.__version__, device)

1.4.0 cpu


# Data Processing

In [182]:
class loading_data():
    
    def __init__(self, max_seq_len, data):
        self.max_seq_len = max_seq_len
        self.data = data
    # in和out分别是input和output的缩写
    # 将一个序列中所有的词记录在all_tokens中以便之后构造词典，然后在该序列后面添加PAD直到序列
    # 长度变为max_seq_len，然后将序列保存在all_seqs中
    def process_one_seq(self, seq_tokens, all_tokens, all_seqs, max_seq_len):
        all_tokens.extend(seq_tokens) # add seq_tokens into one list, like extend the list, [..., seq_tokens].
    #     print('all_tokens', len(all_tokens))
        seq_tokens += [EOS] + [PAD] * (max_seq_len - len(seq_tokens) - 1)
    #     print('after add pad seq_tokens: ', seq_tokens)
        all_seqs.append(seq_tokens) # add seq_tokens become a list element, [...,[seq_tokens]].
    #     print('all_seqs', all_seqs)

    # 使用所有的词来构造词典。并将所有序列中的词变换为词索引后构造Tensor
    def build_data(self, all_tokens, all_seqs):

    #   collections.Counter(), A counter is a container that stores elements as dictionary keys, and their counts are stored as dictionary values.
        tokens_dic = collections.Counter(all_tokens) 
    #     print('tokens_dic', tokens_dic)
        vocab = Vocab.Vocab(tokens_dic, specials = [PAD, BOS, EOS])
    #     print('vocab', vocab)
        indices = [[vocab.stoi[w] for w in seq] for seq in all_seqs] # use its vocab_dic to represent the sentence. 
    #     print('length of indices', len(indices))
    #     print('indices', torch.tensor(indices))
        return vocab, torch.tensor(indices)
    
    def normalizeString(self, s):
        s = s.str.lower()
        s = s.str.replace(r"<br />",r" ")
        # s = re.sub(' +',' ',s)
        s = s.str.replace(r'(\W)(?=\1)', '')
        s = s.str.replace(r"([.!?])", r" \1")
        s = s.str.replace(r"[^a-zA-Z.!?]+", r" ")

        return s
    
    def main(self):
        
        in_tokens, in_seqs = [], []
        df = self.data.copy()
        df['comment'] = normalizeString(df['comment'])
        df.reset_index(inplace=True, drop=True)
    #     print(df[:10])
        k = 10
        target = []
        for line in range(k):
    #         print('line', line)
            in_seq_tokens = df['comment'][line].split(' ')

            if len(in_seq_tokens) > self.max_seq_len - 1:
                continue  # 如果加上EOS后长于max_seq_len，则忽略掉此样本

            self.process_one_seq(in_seq_tokens, in_tokens, in_seqs, self.max_seq_len)
            target.append(df.label[line])

        in_vocab, in_data = self.build_data(in_tokens, in_seqs) # in_tokens is the list where contains every word, in_seqs is a list where its element are the sentence in French.
        return in_vocab, in_data, torch.tensor(target)

## RNN for Data augmentation

In [183]:
class RNNModel(nn.Module):
    def __init__(self, vocab_size, embed_size, hidden_size, drop_prob = 0):
        super(RNNModel, self).__init__()
        
        self.embedding = nn.Embedding(vocab_size, embed_size)
        self.rnn = nn.GRU(embed_size, hidden_size, dropout=drop_prob)
        self.dense = nn.Linear(hidden_size, vocab_size)
        self.state = None

    def forward(self, inputs, state): # inputs: (seq_len, batch), 
        
        embedding = self.embedding(inputs.to(torch.int64))
        Y, state = self.rnn(embedding, self.state)
        output = self.dense(Y[-1,:,:]) # so this is why we need Y[:, -1, :] 最外层的hidden state, because it already contains all the information of last all words.
        return output, state



In [184]:

class augmentation():
    
    def __init__(self, embed_size, hidden_size, vocab_size, loss_da, corpus_indices, device):
        self.embed_size = embed_size
        self.hidden_size = hidden_size
        self.device = device
        self.vocab_size = vocab_size
        self.vocab = vocab
        self.loss_da = loss_da
        self.corpus_indices = corpus_indices
#         self.batch_size = batch_size
#         self.num_steps = num_steps
    
    def data_iter_consecutive(self, corpus_indices, batch_size, num_steps, device=None):
        
        if device is None:
            device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
        corpus_indices = torch.tensor(corpus_indices, device=device)

        data_len = len(corpus_indices)
        batch_len = data_len // batch_size
        indices = corpus_indices[0: batch_size*batch_len].view(batch_size, batch_len)
        epoch_size = (batch_len - 1) // num_steps
        for i in range(epoch_size):
    #         i = i * num_steps
            X = indices[:, i: i + num_steps]
            Y = indices[:, i + num_steps + 1] # 错1位取data
            yield X, Y
            
    def train_rnn(self, model, vocab_size, device,
                                corpus_indices, char_to_idx,
                                num_epochs, num_steps, lr, clipping_theta,
                                batch_size):
#         device = None
#         print(corpus_indices.shape)
        loss = nn.CrossEntropyLoss()
        optimizer = torch.optim.Adam(model.parameters(), lr=lr)
        model.to(device)
        state = None
        for epoch in range(num_epochs):
            l_sum, n, start = 0.0, 0, time.time()
            data_iter = self.data_iter_consecutive(corpus_indices, batch_size, num_steps, device) # 相邻采样
    #         print('data_iter_consecutive is no errors')
            for X, Y in data_iter:
                if state is not None:
                    # 使用detach函数从计算图分离隐藏状态, 这是为了
                    # 使模型参数的梯度计算只依赖一次迭代读取的小批量序列(防止梯度计算开销太大)
                    if isinstance (state, tuple): # LSTM, state:(h, c)  
                        state = (state[0].detach(), state[1].detach())
                    else:   
                        state = state.detach()
                (output, state) = model(X.permute(1,0), state) # change X shape to (seq, batch_size) instead of (batch_size, seq)
                # output 的形状是(batch_size, vocab) Y(batch_size)
                l = loss_da(output, Y.long())

                optimizer.zero_grad()
                l.backward()
                optimizer.step()
                l_sum += l.item() * Y.shape[0]
                n += Y.shape[0]

        corpus = self.predict_rnn_pytorch(corpus_indices, model, vocab_size, device)
        return corpus
    
    def predict_rnn_pytorch(self, corpus, model, vocab_size, device):

        output = []
        for i in range(3, corpus.shape[0], 4):

            X = corpus[i-3:i]
            state = None
            if state is not None:
                if isinstance(state, tuple): # LSTM, state:(h, c)  
                    state = (state[0].to(device), state[1].to(device))
                else:   
                    state = state.to(device)
            X = torch.tensor(torch.unsqueeze(X,0), device=device).view(3, 1) # X is a list need to become a two demission array then could be change to(3,1 )(seq, batch_size)
            (Y, state) = model(X, state)  # Y (batch_size, vocab)
            corpus[i] = int(Y.argmax(dim=1).item())

        return corpus
    
    def main(self):
        
#         if device is None:
#             device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

        start = time.time()
        model = RNNModel(self.vocab_size, self.embed_size, self.hidden_size).to(self.device)
        loss_da = nn.CrossEntropyLoss()
        corpus = self.corpus_indices

        for i in range(corpus.shape[0]):

            corpus_indice, tokens_dic, vocab_size = corpus[i], self.vocab, len(self.vocab)
            num_epochs, batch_size, lr, clipping_theta, num_steps= 20, 8, 1e-3, 1e-2, 4 # 注意这里的学习率设置

            corpus[i] = self.train_rnn(model, vocab_size, self.device,
                                    corpus_indice, tokens_dic, num_epochs, num_steps, 
                                    lr, clipping_theta, batch_size)
            
            if (i+1) % 5 == 0:
                    print('i %d, time %.2f sec' % (
                        i+1, time.time() - start))
        return corpus        



In [185]:
# corpus_indices_ori = Data.TensorDataset(corpus_indices, label)
# corpus_indices_da = Data.TensorDataset(corpus, label)
# corpus_indices_classification = Data.TensorDataset(corpus_indices_valid, label_valid)

## Data classification

In [186]:

# if torch.cuda.is_available():
#     device = torch.device("cuda")
#     use_cuda = True
# else:
#     device = torch.device("cpu")
#     use_cuda = False
# vocabLimit = vocab_size
# input_dim = 300
# max_sequence_len = 500

## Model

In [187]:
# vocab_length = len(vocab)
# vocab_length_valid = len(vocab_valid)

In [188]:

class RNN(nn.Module):
    def __init__(self, vocab_size, embedding_dim, hidden_dim, output_dim):
        
        super().__init__()
        
        self.embedding = nn.Embedding(vocab_size, embedding_dim)
        
        self.rnn = nn.RNN(embedding_dim, hidden_dim)
        
        self.fc = nn.Linear(hidden_dim, output_dim)
        
    def forward(self, text):

        embedded = self.embedding(text)
        
        output, hidden = self.rnn(embedded)
        
        assert torch.equal(output[-1,:,:], hidden.squeeze(0))
        
        return self.fc(hidden.squeeze(0))

In [189]:
# INPUT_DIM = vocab_length
# EMBEDDING_DIM = 100
# HIDDEN_DIM = 256
# OUTPUT_DIM = 1

# model = RNN(INPUT_DIM, EMBEDDING_DIM, HIDDEN_DIM, OUTPUT_DIM)


In [190]:

class classification():
    
    def __init__(self, vocab, vocab_vaild, training_data_ori, training_data_aug, validation_data, num_epochs):
        
        self.vocab_length = len(vocab)
        self.vocab_length_valid = len(vocab_valid)
        self.traing_data_ori = training_data_ori
        self.traing_data_aug = training_data_aug
        self.validation_data = validation_data
        self.num_epochs = num_epochs

    
    
    def binary_accuracy(self, preds, y):
        """
        Returns accuracy per batch, i.e. if you get 8/10 right, this returns 0.8, NOT 8
        """
        #round predictions to the closest integer
        rounded_preds = torch.round(torch.sigmoid(preds))
        correct = (rounded_preds == y).float() #convert into float for division 
        acc = correct.sum() / len(correct)
        return acc

    def train(self, model, corpus_indices, optimizer, criterion):
        
        if torch.cuda.is_available():
            device = torch.device("cuda")
            use_cuda = True
        else:
            device = torch.device("cpu")
            use_cuda = False
    
        epoch_loss = 0
        epoch_acc = 0

        data_iter = Data.DataLoader(corpus_indices, batch_size = 4, shuffle=True)

        for X, Y in data_iter:

            optimizer.zero_grad()
            if use_cuda:
                X = Variable(torch.cuda.LongTensor(X))
            else:
                X = Variable(torch.LongTensor(X))

            y_pred = model(X.permute(1, 0)).squeeze(1)

    #         print('y_pred', y_pred)
            loss = criterion(y_pred,Y.to(device).float())
            acc = self.binary_accuracy(y_pred, Y)

            loss.backward()
            optimizer.step()

            epoch_loss += loss.item()
            epoch_acc += acc.item()
            
        return epoch_loss / len(data_iter), epoch_acc / len(data_iter)
    
    def evaluate(self, model, iterator, optimizer, criterion):

        epoch_loss = 0
        epoch_acc = 0

        model.eval()
        data_iter = Data.DataLoader(iterator, batch_size = 4, shuffle=True)
        
        for X, Y in data_iter:
            optimizer.zero_grad()

            y_pred = model(X.permute(1, 0)).squeeze(1)

            loss = criterion(y_pred,Y.to(device).float())
            acc = self.binary_accuracy(y_pred, Y)

            epoch_loss += loss.item()
            epoch_acc += acc.item()
            
        return epoch_loss / len(data_iter), epoch_acc / len(data_iter)
    
    def epoch_time(self, start_time, end_time):
        
        elapsed_time = end_time - start_time
        elapsed_mins = int(elapsed_time / 60)
        elapsed_secs = int(elapsed_time - (elapsed_mins * 60))
        return elapsed_mins, elapsed_secs
    
    def main(self):
        
        if torch.cuda.is_available():
            device = torch.device("cuda")
            use_cuda = True
        else:
            device = torch.device("cpu")
            use_cuda = False
        
        INPUT_DIM = self.vocab_length
        EMBEDDING_DIM = 100
        HIDDEN_DIM = 256
        OUTPUT_DIM = 1
        criterion = nn.BCEWithLogitsLoss()

        model = RNN(INPUT_DIM, EMBEDDING_DIM, HIDDEN_DIM, OUTPUT_DIM)
        model_for_ori = model
        model_for_aug = model
        model = model.to(device)
        criterion = criterion.to(device)
        optimizer = optim.SGD(model.parameters(), lr=1e-3)
        N_EPOCHS = self.num_epochs

        best_valid_loss = float('inf')

        for epoch in range(N_EPOCHS):

            start_time = time.time()

            train_loss_ori, train_acc_ori = self.train(model_for_ori, self.traing_data_ori, optimizer, criterion)
            valid_loss_ori, valid_acc_ori = self.evaluate(model_for_ori, self.validation_data, optimizer, criterion)
            train_loss_aug, train_acc_aug = self.train(model_for_aug, self.traing_data_aug, optimizer, criterion)
            valid_loss_aug, valid_acc_aug = self.evaluate(model_for_aug, self.validation_data, optimizer, criterion)

            end_time = time.time()

            epoch_mins, epoch_secs = self.epoch_time(start_time, end_time)

            if valid_loss_aug < best_valid_loss:
                best_valid_loss = valid_loss_aug
                torch.save(model.state_dict(), 'tut1-model.pt')

            if (epoch + 1) % 4 == 0:
    #             print('epoch %d, perplexity %f, time %.2f sec' % (
    #                 epoch + 1, perplexity, time.time() - start))

                print(f'Epoch: {epoch+1:2} | Epoch Time: {epoch_mins}m {epoch_secs}s')
                print('Before data augmentation')
                print(f'\tTrain Loss: {train_loss_ori:.3f} | Train Acc: {train_acc_ori*100:.2f}%')
                print(f'\t Val. Loss: {valid_loss_ori:.3f} |  Val. Acc: {valid_acc_ori*100:.2f}%')
                print('After data augmentation')
                print(f'\tTrain Loss: {train_loss_aug:.3f} | Train Acc: {train_acc_aug*100:.2f}%')
                print(f'\t Val. Loss: {valid_loss_aug:.3f} |  Val. Acc: {valid_acc_aug*100:.2f}%')
        

In [191]:
def aug_class(df, embed_size, hidden_size, loss_da): 
    
    df_train, df_test = train_test_split(df, test_size=0.3, random_state=41)
 
    for i in [0.1, 0.2]:
        
        print(" %.0f%% of dataset " % (100 * (i)) )
        df_new = resample(df, n_samples=int(i*len(df)),random_state=1, replace=False)
        vocab, corpus_indices, label = loading_data(300, df_train).main()
        vocab_valid, corpus_indices_valid, label_valid = loading_data(300, df_test).main()
          
        corpus = augmentation(embed_size, hidden_size, vocab_size, loss_da, corpus_indices, device = device).main()
        
        corpus_indices_ori = Data.TensorDataset(corpus_indices, label)
        corpus_indices_classification = Data.TensorDataset(corpus_indices_valid, label_valid)
        corpus_indices_da = Data.TensorDataset(corpus, label)
        
        to_class = classification(vocab, vocab_valid, corpus_indices_ori, corpus_indices_da, corpus_indices_classification, 8)

        to_class.main()
   

In [192]:
     
df = pd.read_csv("./IMDB Dataset.csv", names=['comment', 'label'], header=0, encoding='utf-8')
df['label'] = df['label'].apply(lambda x: 1 if x=='positive' else 0)
embed_size = 5
hidden_size = 256
loss_da = nn.CrossEntropyLoss()

aug_class(df, embed_size, hidden_size, loss_da)

 10% of dataset 




i 5, time 5.03 sec
Epoch:  4 | Epoch Time: 0m 0s
Before data augmentation
	Train Loss: 0.704 | Train Acc: 41.67%
	 Val. Loss: 0.706 |  Val. Acc: 37.50%
After data augmentation
	Train Loss: 0.702 | Train Acc: 41.67%
	 Val. Loss: 0.704 |  Val. Acc: 37.50%
Epoch:  8 | Epoch Time: 0m 0s
Before data augmentation
	Train Loss: 0.698 | Train Acc: 12.50%
	 Val. Loss: 0.690 |  Val. Acc: 62.50%
After data augmentation
	Train Loss: 0.694 | Train Acc: 54.17%
	 Val. Loss: 0.690 |  Val. Acc: 62.50%
 20% of dataset 
i 5, time 5.60 sec
Epoch:  4 | Epoch Time: 0m 0s
Before data augmentation
	Train Loss: 0.693 | Train Acc: 54.17%
	 Val. Loss: 0.687 |  Val. Acc: 62.50%
After data augmentation
	Train Loss: 0.689 | Train Acc: 58.33%
	 Val. Loss: 0.686 |  Val. Acc: 62.50%
Epoch:  8 | Epoch Time: 0m 0s
Before data augmentation
	Train Loss: 0.685 | Train Acc: 58.33%
	 Val. Loss: 0.679 |  Val. Acc: 62.50%
After data augmentation
	Train Loss: 0.685 | Train Acc: 58.33%
	 Val. Loss: 0.678 |  Val. Acc: 62.50%


## Accuracy

In [974]:
def binary_accuracy(preds, y):
    """
    Returns accuracy per batch, i.e. if you get 8/10 right, this returns 0.8, NOT 8
    """
    #round predictions to the closest integer
    rounded_preds = torch.round(torch.sigmoid(preds))
    correct = (rounded_preds == y).float() #convert into float for division 
    acc = correct.sum() / len(correct)
    return acc
criterion = nn.BCEWithLogitsLoss()

In [976]:
def train(model, corpus_indices, optimizer, criterion):
    
    epoch_loss = 0
    epoch_acc = 0

    data_iter = Data.DataLoader(corpus_indices, batch_size = 4, shuffle=True)
        
    for X, Y in data_iter:
        
        optimizer.zero_grad()
        
        if use_cuda:
            X = Variable(torch.cuda.LongTensor(X))
        else:
            X = Variable(torch.LongTensor(X))
        
        y_pred = model(X.permute(1, 0)).squeeze(1)

#         print('y_pred', y_pred)
        loss = criterion(y_pred,Y.to(device).float())
        acc = binary_accuracy(y_pred, Y)
        
        loss.backward()
        optimizer.step()
        
        epoch_loss += loss.item()
        epoch_acc += acc.item()
    return epoch_loss / len(data_iter), epoch_acc / len(data_iter)




In [977]:
def evaluate(model, iterator, criterion):
    
    epoch_loss = 0
    epoch_acc = 0
    
    model.eval()
    data_iter = Data.DataLoader(iterator, batch_size = 4, shuffle=True)
    for X, Y in data_iter:
        optimizer.zero_grad()
        
        y_pred = model(X.permute(1, 0)).squeeze(1)

        loss = criterion(y_pred,Y.to(device).float())
        acc = binary_accuracy(y_pred, Y)
        
        epoch_loss += loss.item()
        epoch_acc += acc.item()
    return epoch_loss / len(data_iter), epoch_acc / len(data_iter)


In [978]:
import time

def epoch_time(start_time, end_time):
    elapsed_time = end_time - start_time
    elapsed_mins = int(elapsed_time / 60)
    elapsed_secs = int(elapsed_time - (elapsed_mins * 60))
    return elapsed_mins, elapsed_secs


## Training

In [163]:
def validation(model_for_ori, model_for_aug, traing_data_ori, traing_data_aug, validation_data, optimizer, criterion, num_epochs):

    N_EPOCHS = num_epochs

    best_valid_loss = float('inf')

    for epoch in range(N_EPOCHS):

        start_time = time.time()

        train_loss_ori, train_acc_ori = train(model_for_ori, traing_data_ori, optimizer, criterion)
        valid_loss_ori, valid_acc_ori = evaluate(model_for_ori, validation_data, criterion)
        train_loss_aug, train_acc_aug = train(model_for_aug, traing_data_aug, optimizer, criterion)
        valid_loss_aug, valid_acc_aug = evaluate(model_for_aug, validation_data, criterion)

        end_time = time.time()

        epoch_mins, epoch_secs = epoch_time(start_time, end_time)

        if valid_loss_aug < best_valid_loss:
            best_valid_loss = valid_loss_aug
            torch.save(model.state_dict(), 'tut1-model.pt')
            
        if (epoch + 1) % 2 == 0:
#             print('epoch %d, perplexity %f, time %.2f sec' % (
#                 epoch + 1, perplexity, time.time() - start))

            print(f'Epoch: {epoch+1:2} | Epoch Time: {epoch_mins}m {epoch_secs}s')
            print('Before data augmentation')
            print(f'\tTrain Loss: {train_loss_ori:.3f} | Train Acc: {train_acc_ori*100:.2f}%')
            print(f'\t Val. Loss: {valid_loss_ori:.3f} |  Val. Acc: {valid_acc_ori*100:.2f}%')
            print('After data augmentation')
            print(f'\tTrain Loss: {train_loss_aug:.3f} | Train Acc: {train_acc_aug*100:.2f}%')
            print(f'\t Val. Loss: {valid_loss_aug:.3f} |  Val. Acc: {valid_acc_aug*100:.2f}%')

In [164]:
model_for_ori = model
model_for_aug = model
validation(model_for_ori, model_for_aug,corpus_indices_ori, corpus_indices_da, 
           corpus_indices_classification, optimizer, 
           criterion , 6)
# validation(model, corpus_indices_da, corpus_indices_classification, optimizer, criterion, 6, 'Data augmentation')


NameError: name 'corpus_indices_ori' is not defined

In [165]:
# train(model, corpus_indices_da, optimizer, criterion)
#     valid_loss, valid_acc = evaluate(model, corpus_indices_classification, criterion)
    train(model, corpus_indices_da, optimizer, criterion)
    valid_loss, valid_acc = evaluate(model, corpus_indices_classification, criterion)
    

IndentationError: unexpected indent (<ipython-input-165-5b902677d7d6>, line 3)