In [517]:
import re
import unicodedata
import torch
import torch.nn as nn
from torch.autograd import Variable
from torch import optim
import torch.nn.functional as F
import pickle
import csv
import torch.utils.data as Data
import pandas as pd
import math
import torchtext.vocab as Vocab
import collections
import time
import numpy as np
import os
from sklearn.model_selection import train_test_split
from torchtext import data, datasets
PAD, BOS, EOS = '<pad>', '<bos>', '<eos>'
os.environ["CUDA_VISIBLE_DEVICES"] = "0"
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
# torch.set_printoptions(profile=8)
print(torch.__version__, device)

1.4.0 cpu


# Data Processing

In [669]:
# 将一个序列中所有的词记录在all_tokens中以便之后构造词典，然后在该序列后面添加PAD直到序列
# 长度变为max_seq_len，然后将序列保存在all_seqs中
def process_one_seq(seq_tokens, all_tokens, all_seqs, max_seq_len):
    all_tokens.extend(seq_tokens) # add seq_tokens into one list, like extend the list, [..., seq_tokens].
#     print('all_tokens', len(all_tokens))
    seq_tokens += [EOS] + [PAD] * (max_seq_len - len(seq_tokens) - 1)
#     print('after add pad seq_tokens: ', seq_tokens)
    all_seqs.append(seq_tokens) # add seq_tokens become a list element, [...,[seq_tokens]].
#     print('all_seqs', all_seqs)

# 使用所有的词来构造词典。并将所有序列中的词变换为词索引后构造Tensor
def build_data(all_tokens, all_seqs):

#   collections.Counter(), A counter is a container that stores elements as dictionary keys, and their counts are stored as dictionary values.
    tokens_dic = collections.Counter(all_tokens) 
#     print('tokens_dic', tokens_dic)
    vocab = Vocab.Vocab(tokens_dic, specials = [PAD, BOS, EOS])
#     print('vocab', vocab)
    indices = [[vocab.stoi[w] for w in seq] for seq in all_seqs] # use its vocab_dic to represent the sentence. 
#     print('length of indices', len(indices))
#     print('indices', torch.tensor(indices))
    return vocab, torch.tensor(indices)

In [670]:
def normalizeString(s):
    s = s.str.lower()
    s = s.str.replace(r"<br />",r" ")
    # s = re.sub(' +',' ',s)
    s = s.str.replace(r'(\W)(?=\1)', '')
    s = s.str.replace(r"([.!?])", r" \1")
    s = s.str.replace(r"[^a-zA-Z.!?]+", r" ")

    return s

In [708]:
def read_data(max_seq_len, data):
    # in和out分别是input和output的缩写
    in_tokens, in_seqs = [], []
    df = data.copy()
    df['comment'] = normalizeString(df['comment'])
    df.reset_index(inplace=True, drop=True)
#     print(df[:10])
    k = 10
    target = []
    for line in range(k):
#         print('line', line)
        in_seq_tokens = df['comment'][line].split(' ')

        if len(in_seq_tokens) > max_seq_len - 1:
            continue  # 如果加上EOS后长于max_seq_len，则忽略掉此样本

        process_one_seq(in_seq_tokens, in_tokens, in_seqs, max_seq_len)
        target.append(df.label[line])
        
    in_vocab, in_data = build_data(in_tokens, in_seqs) # in_tokens is the list where contains every word, in_seqs is a list where its element are the sentence in French.
    return in_vocab, in_data, torch.tensor(target)

In [672]:
df = pd.read_csv("./IMDB Dataset.csv", names=['comment', 'label'], header=0, encoding='utf-8')
df['label'] = df['label'].apply(lambda x: 1 if x=='positive' else 0)
df_train, df_test = train_test_split(df, test_size=0.3, random_state=41)
vocab, corpus_indices, label = read_data(300, df_train)
vocab_valid, corpus_indices_valid, label_valid = read_data(300, df_test)

                                                                                                                                                                                                                                                     comment  \
0  i would have given this movie a but i laughed so hard so many times that i had to give it a little credit in the off off off off chance the film was supposed to be funny . a movie so bad you ll think chimps wrote it . you ll wish chimps had writt...   
1  since was only a toddler when this show originally aired i just recently picked up the dvd set and am wishing there were more episodes filmed . this show was a s version of the poplular s tv series x files but with a bit more of a comedic light h...   
2  the japanese have probably the most sadistic movies around the world and this is one of the strongest examples .with a running time at about an hour it contains enough sexual violence and gore to disgust every single sane person 

In [673]:
pd.set_option('max_colwidth',250)
df_test.loc[30725]['comment'].rfind('<br /><br />')

760

In [674]:
# data = Data.TensorDataset(corpus_indices, label)

In [675]:
corpus_indices.shape
label.shape

torch.Size([7])

## RNN for Data augmentation

In [735]:
class RNNModel(nn.Module):
    def __init__(self, vocab_size, embed_size, hidden_size, drop_prob = 0):
        super(RNNModel, self).__init__()
        
        self.embedding = nn.Embedding(vocab_size, embed_size)
        self.rnn = nn.GRU(embed_size, hidden_size, dropout=drop_prob)
        self.dense = nn.Linear(hidden_size, vocab_size)
        self.state = None

    def forward(self, inputs, state): # inputs: (seq_len, batch), 
        embedding = self.embedding(inputs.to(torch.int64))
#         print('embedding', embedding.shape)
        Y, state = self.rnn(embedding, self.state)
        print('output Y', Y[-1,:,:])
        print('output Y[:,-1,:]', Y[-1,:,:].shape)
        print('shape Y', Y.shape)
        print('self.state', state)
        print('self.state', state.shape)
        output = self.dense(Y[-1,:,:]) # so this is why we need Y[:, -1, :] 最外层的hidden state, because it already contains all the information of last all words.
#         output = self.dense(Y.view(-1, Y.shape[-1])) # why??
#         print('final output', output)
#         print('final output size', output.shape)
        return output, state


In [736]:
embed_size = 5
hidden_size = 256
model = RNNModel(vocab_size, embed_size, hidden_size).to(device)
loss_da = nn.CrossEntropyLoss()

In [737]:
def data_iter_consecutive(corpus_indices, batch_size, num_steps, device=None):
    if device is None:
        device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
    corpus_indices = torch.tensor(corpus_indices, device=device)
#     print('corpus_indices', corpus_indices)
    data_len = len(corpus_indices)
#     print('data_len', data_len)
    batch_len = data_len // batch_size
#     print('batch_len', batch_len)
    indices = corpus_indices[0: batch_size*batch_len].view(batch_size, batch_len)
#     print('indices', indices)
#     print('indices', indices.shape)
    epoch_size = (batch_len - 1) // num_steps
#     print('epoch_size', epoch_size)
    for i in range(epoch_size):
#         i = i * num_steps
        X = indices[:, i: i + num_steps]
        Y = indices[:, i + num_steps + 1] # 错1位取data
        yield X, Y

In [738]:
def train_and_predict_rnn_pytorch(model, vocab_size, device,
                                corpus_indices, char_to_idx,
                                num_epochs, num_steps, lr, clipping_theta,
                                batch_size, pred_period, pred_len, prefixes):
    print(corpus_indices.shape)
    loss = nn.CrossEntropyLoss()
    optimizer = torch.optim.Adam(model.parameters(), lr=lr)
    model.to(device)
    state = None
    for epoch in range(num_epochs):
        l_sum, n, start = 0.0, 0, time.time()
        data_iter = data_iter_consecutive(corpus_indices, batch_size, num_steps, device) # 相邻采样
#         print('data_iter_consecutive is no errors')
        for X, Y in data_iter:
            if state is not None:
                # 使用detach函数从计算图分离隐藏状态, 这是为了
                # 使模型参数的梯度计算只依赖一次迭代读取的小批量序列(防止梯度计算开销太大)
                if isinstance (state, tuple): # LSTM, state:(h, c)  
                    state = (state[0].detach(), state[1].detach())
                else:   
                    state = state.detach()
            print('X is', X)
            print('X_shape is', X.shape)
#             print('Y is', Y)
#             print('Y_shape is', Y.shape)
            (output, state) = model(X.permute(1,0), state) # change X shape to (seq, batch_size) instead of (batch_size, seq)
            # output 的形状是(batch_size, vocab)
            print('output', output)
            print('output_size', output.shape)

            l = loss_da(output, Y.long())
            
            optimizer.zero_grad()
            l.backward()
            optimizer.step()
            l_sum += l.item() * Y.shape[0]
            n += Y.shape[0]
            
    corpus = predict_rnn_pytorch(corpus_indices, pred_len, model, vocab_size, device)
    return corpus

In [743]:
def predict_rnn_pytorch(corpus, num_chars, model, vocab_size, device):
#     print('corpus', corpus.shape[0])
#     print('beginning corpus', corpus)
    output = []
    for i in range(3, corpus.shape[0], 4):
        
        X = corpus[i-3:i]
#         print('X', X)
#         print('X', X.shape)
        state = None
        if state is not None:
            if isinstance(state, tuple): # LSTM, state:(h, c)  
                state = (state[0].to(device), state[1].to(device))
            else:   
                state = state.to(device)
        X = torch.tensor(torch.unsqueeze(X,0), device=device).view(3, 1) # X is a list need to become a two demission array then could be change to(3,1 )(seq, batch_size)
        (Y, state) = model(X, state)  # Y (batch_size, vocab)
        corpus[i] = int(Y.argmax(dim=1).item())

    return corpus

In [744]:
corpus = corpus_indices
corpus_indices.shape

torch.Size([7, 300])

In [745]:
corpus.shape

torch.Size([7, 300])

In [746]:
start = time.time()
for i in range(corpus.shape[0]):
    
#     print('beginnig corpus_indices', corpus_indices)
#     print('beginnig corpus_indices', len(corpus_indices))
    corpus_indice, tokens_dic, vocab_size = corpus[i], vocab, len(vocab)
    print(corpus_indice.shape)
    num_epochs, batch_size, lr, clipping_theta, num_steps= 20, 8, 1e-3, 1e-2, 4 # 注意这里的学习率设置
    pred_period, pred_len, prefixes = 5, 10, ['the burger', 'their pants']
#     print('before corpus', corpus[i])
    corpus[i] = train_and_predict_rnn_pytorch(model, vocab_size, device,
                            corpus_indice, tokens_dic,
                            num_epochs, num_steps, lr, clipping_theta,
                            batch_size, pred_period, pred_len, prefixes)
#     print('after corpus', corpus[i])
    
    if (i+1) % 5 == 0:
            print('i %d, time %.2f sec' % (
                i+1, time.time() - start))
    
#     print('after corpus_indices', corpus_indices)
#     print('beginnig corpus_indices', len(corpus_indices))
    
#     return corpus_indices

torch.Size([300])
torch.Size([300])
X is tensor([[ 10,  97,  16,   3],
        [334,   3,   8,  18],
        [  4,  46, 305, 129],
        [ 90,  14,  68,  33],
        [  4, 513, 312, 134],
        [  0,   0,   0,   0],
        [  0,   0,   0,   0],
        [  0,   0,   0,   0]])
X_shape is torch.Size([8, 4])
output Y tensor([[-0.0385,  0.1145, -0.6340,  ...,  0.3427, -0.0643,  0.4548],
        [ 0.0430,  0.2707,  0.1545,  ...,  0.3980,  0.1179,  0.1248],
        [ 0.6103,  0.7243,  0.5036,  ...,  0.1845,  0.1303, -0.0230],
        ...,
        [ 0.7975,  0.7386, -0.0412,  ...,  0.8416,  0.9845,  0.9354],
        [ 0.7975,  0.7386, -0.0412,  ...,  0.8416,  0.9845,  0.9354],
        [ 0.7975,  0.7386, -0.0412,  ...,  0.8416,  0.9845,  0.9354]],
       grad_fn=<SliceBackward>)
output Y[:,-1,:] torch.Size([8, 256])
shape Y torch.Size([4, 8, 256])
self.state tensor([[[-0.0385,  0.1145, -0.6340,  ...,  0.3427, -0.0643,  0.4548],
         [ 0.0430,  0.2707,  0.1545,  ...,  0.3980,  0.1179, 

  after removing the cwd from sys.path.


tensor([[-0.0083,  0.1867, -0.6818,  ...,  0.3566, -0.0442,  0.4862],
        [ 0.0101,  0.3094,  0.1044,  ...,  0.4263,  0.0457,  0.1235],
        [ 0.6434,  0.7593,  0.5615,  ...,  0.1696, -0.0342, -0.0639],
        ...,
        [ 0.8661,  0.7329, -0.0171,  ...,  0.8845,  0.9828,  0.9702],
        [ 0.8661,  0.7329, -0.0171,  ...,  0.8845,  0.9828,  0.9702],
        [ 0.8661,  0.7329, -0.0171,  ...,  0.8845,  0.9828,  0.9702]],
       grad_fn=<SliceBackward>)
output Y[:,-1,:] torch.Size([8, 256])
shape Y torch.Size([4, 8, 256])
self.state tensor([[[-0.0083,  0.1867, -0.6818,  ...,  0.3566, -0.0442,  0.4862],
         [ 0.0101,  0.3094,  0.1044,  ...,  0.4263,  0.0457,  0.1235],
         [ 0.6434,  0.7593,  0.5615,  ...,  0.1696, -0.0342, -0.0639],
         ...,
         [ 0.8661,  0.7329, -0.0171,  ...,  0.8845,  0.9828,  0.9702],
         [ 0.8661,  0.7329, -0.0171,  ...,  0.8845,  0.9828,  0.9702],
         [ 0.8661,  0.7329, -0.0171,  ...,  0.8845,  0.9828,  0.9702]]],
       grad

  app.launch_new_instance()


output Y tensor([[ 0.4376,  0.3297,  0.0974,  0.1265, -0.0885, -0.8440,  0.6208, -0.9566,
          0.6662, -0.7466, -0.7627, -0.4291, -0.6245,  0.8414, -0.6619,  0.7342,
         -0.3896, -0.4447,  0.2015, -0.4779,  0.6683,  0.6995, -0.1386, -0.2613,
          0.4091, -0.9042,  0.3925, -0.2187, -0.2933, -0.9127, -0.5438,  0.8779,
         -0.9908, -0.2731,  0.6470,  0.4667, -0.0724,  0.9362,  0.0968,  0.6519,
         -0.4834,  0.1284,  0.9360,  0.1745,  0.1951, -0.0540, -0.9633, -0.8369,
         -0.5714,  0.3052, -0.9191, -0.4198,  0.8265,  0.7979,  0.3068, -0.7757,
         -0.8918, -0.3588,  0.7204,  0.4807, -0.4100,  0.0021, -0.0701,  0.8270,
         -0.9474, -0.1054, -0.1285, -0.8983, -0.8923,  0.8480,  0.9346, -0.6214,
         -0.4714, -0.2529,  0.9743, -0.0234, -0.4199,  0.3599, -0.4218, -0.2975,
         -0.9005,  0.1922,  0.6391,  0.3049, -0.4226,  0.8614, -0.0341,  0.0546,
          0.9790, -0.7282, -0.2786, -0.3407, -0.2565,  0.2617, -0.3121, -0.8780,
          0.1956,  

KeyboardInterrupt: 

In [None]:
corpus_indices_da = Data.TensorDataset(corpus, label)

## Data classification

In [626]:
vocab_length = len(vocab)
vocab_length_valid = len(vocab_valid)

In [627]:
torch.tensor([[  4, 171,  76]]).size()[1]

3

In [628]:

if torch.cuda.is_available():
    device = torch.device("cuda")
    use_cuda = True
else:
    device = torch.device("cpu")
    use_cuda = False
vocabLimit = vocab_length
input_dim = 300
# max_sequence_len = 500

## Model

In [629]:
class Model(torch.nn.Module) :
    def __init__(self, input_dim, embedding_dim, hidden_dim) :
        super(Model,self).__init__()
        self.hidden_dim = hidden_dim
        self.embeddings = nn.Embedding(vocabLimit+1, embedding_dim)
        self.lstm = nn.LSTM(embedding_dim,hidden_dim)
        self.linearOut = nn.Linear(hidden_dim,1)
    def forward(self,inputs,hidden):
        print('inputs is', inputs.shape[0])
        assert 'this is '
        x = self.embeddings(inputs) # embeding 太小会报错
        print('after embedding')
        lstm_out,lstm_h = self.lstm(x)
        x = lstm_out[-1]
        x = self.linearOut(x)
# 		x = F.log_softmax(x)
        return x,lstm_h
    def init_hidden(self) :
        if use_cuda:
            return (Variable(torch.zeros(1, 1, self.hidden_dim)).cuda(),Variable(torch.zeros(1, 300, self.hidden_dim)).cuda())
        else:
            return (Variable(torch.zeros(1, 1, self.hidden_dim)),Variable(torch.zeros(1, 300, self.hidden_dim)))

if use_cuda:
	model = Model(input_dim, 50, 100).cuda()
else:
	model = Model(input_dim, 500, 100)

loss_function = nn.NLLLoss()
optimizer = optim.Adam(model.parameters(), lr=1e-3)

epochs = 4

torch.save(model.state_dict(), 'model' + str(0)+'.pth')
print('starting training')

starting training


In [631]:
vocab_length = len(vocab)
vocab_length_valid = len(vocab_valid)

In [632]:

class RNN(nn.Module):
    def __init__(self, vocab_size, embedding_dim, hidden_dim, output_dim):
        
        super().__init__()
        
        self.embedding = nn.Embedding(vocab_size, embedding_dim)
        
        self.rnn = nn.RNN(embedding_dim, hidden_dim)
        
        self.fc = nn.Linear(hidden_dim, output_dim)
        
    def forward(self, text):

        embedded = self.embedding(text)
        
        output, hidden = self.rnn(embedded)
        
        #output = [sent len, batch size, hid dim]
        #hidden = [1, batch size, hid dim]
        
        assert torch.equal(output[-1,:,:], hidden.squeeze(0))
        
        return self.fc(hidden.squeeze(0))

In [633]:
INPUT_DIM = vocab_length
EMBEDDING_DIM = 100
HIDDEN_DIM = 256
OUTPUT_DIM = 1

model = RNN(INPUT_DIM, EMBEDDING_DIM, HIDDEN_DIM, OUTPUT_DIM)


## Accuracy

In [634]:
def binary_accuracy(preds, y):
    """
    Returns accuracy per batch, i.e. if you get 8/10 right, this returns 0.8, NOT 8
    """
    #round predictions to the closest integer
    rounded_preds = torch.round(torch.sigmoid(preds))
    correct = (rounded_preds == y).float() #convert into float for division 
    acc = correct.sum() / len(correct)
    return acc
criterion = nn.BCEWithLogitsLoss()

In [635]:
model = model.to(device)
criterion = criterion.to(device)

import torch.optim as optim

optimizer = optim.SGD(model.parameters(), lr=1e-3)

In [636]:
def train(model, corpus_indices, optimizer, criterion):
    
    epoch_loss = 0
    epoch_acc = 0

    data_iter = Data.DataLoader(corpus_indices, batch_size = 4, shuffle=True)
        
    for X, Y in data_iter:
        
        optimizer.zero_grad()
        if use_cuda:
            X = Variable(torch.cuda.LongTensor(X))
        else:
            X = Variable(torch.LongTensor(X))
        
        y_pred = model(X.permute(1, 0)).squeeze(1)

#         print('y_pred', y_pred)
        loss = criterion(y_pred,Y.to(device).float())
        acc = binary_accuracy(y_pred, Y)
        
        loss.backward()
        optimizer.step()
        
        epoch_loss += loss.item()
        epoch_acc += acc.item()
    return epoch_loss / len(data_iter), epoch_acc / len(data_iter)




In [637]:
def evaluate(model, iterator, criterion):
    
    epoch_loss = 0
    epoch_acc = 0
    
    model.eval()
    data_iter = Data.DataLoader(corpus_indices, batch_size = 4, shuffle=True)
    for X, Y in data_iter:
        optimizer.zero_grad()
        
        y_pred = model(X.permute(1, 0)).squeeze(1)

        loss = criterion(y_pred,Y.to(device).float())
        acc = binary_accuracy(y_pred, Y)
        
        epoch_loss += loss.item()
        epoch_acc += acc.item()
    return epoch_loss / len(data_iter), epoch_acc / len(data_iter)


In [638]:
import time

def epoch_time(start_time, end_time):
    elapsed_time = end_time - start_time
    elapsed_mins = int(elapsed_time / 60)
    elapsed_secs = int(elapsed_time - (elapsed_mins * 60))
    return elapsed_mins, elapsed_secs



## Training

In [642]:
N_EPOCHS = 2

best_valid_loss = float('inf')

for epoch in range(N_EPOCHS):

    start_time = time.time()
    
    train_loss, train_acc = train(model, corpus_indices_da, optimizer, criterion)
    valid_loss, valid_acc = evaluate(model, corpus_indices_valid, criterion)
    
    end_time = time.time()

    epoch_mins, epoch_secs = epoch_time(start_time, end_time)
    
    if valid_loss < best_valid_loss:
        best_valid_loss = valid_loss
        torch.save(model.state_dict(), 'tut1-model.pt')
    
    print(f'Epoch: {epoch+1:2} | Epoch Time: {epoch_mins}m {epoch_secs}s')
    print(f'\tTrain Loss: {train_loss:.3f} | Train Acc: {train_acc*100:.2f}%')
    print(f'\t Val. Loss: {valid_loss:.3f} |  Val. Acc: {valid_acc*100:.2f}%')

Epoch:  1 | Epoch Time: 0m 0s
	Train Loss: 0.681 | Train Acc: 58.33%
	 Val. Loss: 0.690 |  Val. Acc: 54.17%
Epoch:  2 | Epoch Time: 0m 0s
	Train Loss: 0.692 | Train Acc: 54.17%
	 Val. Loss: 0.690 |  Val. Acc: 54.17%
