In [11]:
import re
import unicodedata
import torch
import torch.nn as nn
from torch.autograd import Variable
from torch import optim
import torch.nn.functional as F
import pickle
import csv
import torch.utils.data as Data
import pandas as pd
import torchtext.vocab as Vocab
import collections
import numpy as np
import os
from sklearn.model_selection import train_test_split
from torchtext import data, datasets
PAD, BOS, EOS = '<pad>', '<bos>', '<eos>'
os.environ["CUDA_VISIBLE_DEVICES"] = "0"

In [83]:
# 将一个序列中所有的词记录在all_tokens中以便之后构造词典，然后在该序列后面添加PAD直到序列
# 长度变为max_seq_len，然后将序列保存在all_seqs中
def process_one_seq(seq_tokens, all_tokens, all_seqs, max_seq_len):
#     print('seq_tokens', seq_tokens)
    all_tokens.extend(seq_tokens) # add seq_tokens into one list, like extend the list, [..., seq_tokens].
#     print('all_tokens', len(all_tokens))
    seq_tokens += [EOS] + [PAD] * (max_seq_len - len(seq_tokens) - 1)
#     print('after add pad seq_tokens: ', seq_tokens)
    all_seqs.append(seq_tokens) # add seq_tokens become a list element, [...,[seq_tokens]].
#     print('all_seqs', all_seqs)

# 使用所有的词来构造词典。并将所有序列中的词变换为词索引后构造Tensor
def build_data(all_tokens, all_seqs):
#     print('all_tokens', all_tokens)
    
#   collections.Counter(), A counter is a container that stores elements as dictionary keys, and their counts are stored as dictionary values.
    tokens_dic = collections.Counter(all_tokens) 
#     print('tokens_dic', tokens_dic)
    vocab = Vocab.Vocab(tokens_dic, specials = [PAD, BOS, EOS])
#     print('vocab', vocab)
    indices = [[vocab.stoi[w] for w in seq] for seq in all_seqs] # use its vocab_dic to represent the sentence. 
#     print('length of indices', len(indices))
#     print('indices', torch.tensor(indices))
    return vocab, torch.tensor(indices)

In [85]:
def normalizeString(s):
	s = s.str.lower()
	s = s.str.replace(r"<br />",r" ")
	# s = re.sub(' +',' ',s)
	s = s.str.replace(r'(\W)(?=\1)', '')
	s = s.str.replace(r"([.!?])", r" \1")
	s = s.str.replace(r"[^a-zA-Z.!?]+", r" ")
	
	return s

In [85]:
def read_data(max_seq_len, data):
    # in和out分别是input和output的缩写
    in_tokens, in_seqs = [], []
    df = data.copy()
#     df = pd.read_csv("./IMDB Dataset.csv", names=['comment', 'label'], header=0, encoding='utf-8')
    df['comment'] = normalizeString(df['comment'])
    df.reset_index(inplace=True, drop=True)
    print(df[:10])
    k = 1000
    target = []
    for line in range(k):
#         print('line', line)
        in_seq_tokens = df['comment'][line].split(' ')
#         print('in_seq_tokens', in_seq_tokens)
#         print('len(in_seq_tokens)', len(in_seq_tokens))
#         print('max_seq_len - 1', max_seq_len - 1)
        if len(in_seq_tokens) > max_seq_len - 1:
            continue  # 如果加上EOS后长于max_seq_len，则忽略掉此样本
#         print('here', line)
#         print('df.label[k]', k, df.label[line])
        process_one_seq(in_seq_tokens, in_tokens, in_seqs, max_seq_len)
        target.append(df.label[line])
#     print('target', len(target))
    in_vocab, in_data = build_data(in_tokens, in_seqs) # in_tokens is the list where contains every word, in_seqs is a list where its element are the sentence in French.
#     print('in_data', len(in_data))

    return in_vocab, Data.TensorDataset(in_data, torch.tensor(target))

In [88]:
df = pd.read_csv("./IMDB Dataset.csv", names=['comment', 'label'], header=0, encoding='utf-8')
df['label'] = df['label'].apply(lambda x: 1 if x=='positive' else 0)
df_train, df_test = train_test_split(df, test_size=0.3, random_state=41)
vocab, corpus_indices = read_data(300, df_train)
# vocab_valid, corpus_indices_valid = read_data(300, df_test)


                                             comment  label
0  i would have given this movie a but i laughed ...      0
1  since was only a toddler when this show origin...      1
2  the japanese have probably the most sadistic m...      1
3  to fight against the death penalty is a just c...      1
4  for my humanities quarter project for school i...      1
5  and it s only january still i m sure of it ! b...      0
6  it could have been a marvelous story based on ...      0
7  well what can i say . what the f k ? there rea...      0
8  riotously cheesy lunacy about lava spewing fro...      0
9  woof ! pretty boring and they might as well ha...      0


In [89]:
vocab_length = len(vocab)
vocab_length_valid = len(vocab_valid.freqs)

In [28]:

if torch.cuda.is_available():
    device = torch.device("cuda")
    use_cuda = True
else:
    device = torch.device("cpu")
    use_cuda = False
vocabLimit = vocab_length
input_dim = 300
# max_sequence_len = 500

In [642]:
class Model(torch.nn.Module) :
    def __init__(self, input_dim, embedding_dim, hidden_dim) :
        super(Model,self).__init__()
        self.hidden_dim = hidden_dim
        self.embeddings = nn.Embedding(vocabLimit+1, embedding_dim)
        self.lstm = nn.LSTM(embedding_dim,hidden_dim)
        self.linearOut = nn.Linear(hidden_dim,1)
    def forward(self,inputs,hidden):
        print('inputs is', inputs.shape[0])
        assert 'this is '
        x = self.embeddings(inputs) # embeding 太小会报错
        print('after embedding')
        lstm_out,lstm_h = self.lstm(x)
        x = lstm_out[-1]
        x = self.linearOut(x)
# 		x = F.log_softmax(x)
        return x,lstm_h
    def init_hidden(self) :
        if use_cuda:
            return (Variable(torch.zeros(1, 1, self.hidden_dim)).cuda(),Variable(torch.zeros(1, 300, self.hidden_dim)).cuda())
        else:
            return (Variable(torch.zeros(1, 1, self.hidden_dim)),Variable(torch.zeros(1, 300, self.hidden_dim)))

if use_cuda:
	model = Model(input_dim, 50, 100).cuda()
else:
	model = Model(input_dim, 500, 100)

loss_function = nn.NLLLoss()
optimizer = optim.Adam(model.parameters(), lr=1e-3)

epochs = 4

torch.save(model.state_dict(), 'model' + str(0)+'.pth')
print('starting training')

starting training


In [59]:

class RNN(nn.Module):
    def __init__(self, vocab_size, embedding_dim, hidden_dim, output_dim):
        
        super().__init__()
        
        self.embedding = nn.Embedding(vocab_size, embedding_dim)
        
        self.rnn = nn.RNN(embedding_dim, hidden_dim)
        
        self.fc = nn.Linear(hidden_dim, output_dim)
        
    def forward(self, text):

        embedded = self.embedding(text)
        
        output, hidden = self.rnn(embedded)
        
        #output = [sent len, batch size, hid dim]
        #hidden = [1, batch size, hid dim]
        
        assert torch.equal(output[-1,:,:], hidden.squeeze(0))
        
        return self.fc(hidden.squeeze(0))

In [60]:
INPUT_DIM = vocab_length
EMBEDDING_DIM = 100
HIDDEN_DIM = 256
OUTPUT_DIM = 1

model = RNN(INPUT_DIM, EMBEDDING_DIM, HIDDEN_DIM, OUTPUT_DIM)


In [61]:
def binary_accuracy(preds, y):
    """
    Returns accuracy per batch, i.e. if you get 8/10 right, this returns 0.8, NOT 8
    """
    #round predictions to the closest integer
    rounded_preds = torch.round(torch.sigmoid(preds))
    correct = (rounded_preds == y).float() #convert into float for division 
    acc = correct.sum() / len(correct)
    return acc
criterion = nn.BCEWithLogitsLoss()

In [62]:
model = model.to(device)
criterion = criterion.to(device)

import torch.optim as optim

optimizer = optim.SGD(model.parameters(), lr=1e-3)

In [70]:
def train(model, corpus_indices, optimizer, criterion):
    
    epoch_loss = 0
    epoch_acc = 0

    data_iter = Data.DataLoader(corpus_indices, batch_size = 4, shuffle=True)
        
    for X, Y in data_iter:
        optimizer.zero_grad()
        if use_cuda:
            X = Variable(torch.cuda.LongTensor(X))
        else:
            X = Variable(torch.LongTensor(X))
        
        y_pred = model(X.permute(1, 0)).squeeze(1)

#         print('y_pred', y_pred)
        loss = criterion(y_pred,Y.to(device).float())
        acc = binary_accuracy(y_pred, Y)
        
        loss.backward()
        optimizer.step()
        
        epoch_loss += loss.item()
        epoch_acc += acc.item()
    return epoch_loss / len(data_iter), epoch_acc / len(data_iter)




In [71]:
def evaluate(model, iterator, criterion):
    
    epoch_loss = 0
    epoch_acc = 0
    
    model.eval()
    data_iter = Data.DataLoader(corpus_indices, batch_size = 4, shuffle=True)
    for X, Y in data_iter:
        optimizer.zero_grad()
        
        y_pred = model(X.permute(1, 0)).squeeze(1)

        loss = criterion(y_pred,Y.to(device).float())
        acc = binary_accuracy(y_pred, Y)
        
        epoch_loss += loss.item()
        epoch_acc += acc.item()
    return epoch_loss / len(data_iter), epoch_acc / len(data_iter)


In [72]:
import time

def epoch_time(start_time, end_time):
    elapsed_time = end_time - start_time
    elapsed_mins = int(elapsed_time / 60)
    elapsed_secs = int(elapsed_time - (elapsed_mins * 60))
    return elapsed_mins, elapsed_secs



In [73]:
N_EPOCHS = 2

best_valid_loss = float('inf')

for epoch in range(N_EPOCHS):

    start_time = time.time()
    
    train_loss, train_acc = train(model, corpus_indices, optimizer, criterion)
    valid_loss, valid_acc = evaluate(model, corpus_indices_valid, criterion)
    
    end_time = time.time()

    epoch_mins, epoch_secs = epoch_time(start_time, end_time)
    
    if valid_loss < best_valid_loss:
        best_valid_loss = valid_loss
        torch.save(model.state_dict(), 'tut1-model.pt')
    
    print(f'Epoch: {epoch+1:2} | Epoch Time: {epoch_mins}m {epoch_secs}s')
    print(f'\tTrain Loss: {train_loss:.3f} | Train Acc: {train_acc*100:.2f}%')
    print(f'\t Val. Loss: {valid_loss:.3f} |  Val. Acc: {valid_acc*100:.2f}%')

Epoch:  1 | Epoch Time: 0m 36s
	Train Loss: 0.695 | Train Acc: 49.52%
	 Val. Loss: 0.693 |  Val. Acc: 51.40%
Epoch:  2 | Epoch Time: 0m 34s
	Train Loss: 0.695 | Train Acc: 51.00%
	 Val. Loss: 0.694 |  Val. Acc: 51.18%
