In [0]:
from google.colab import drive
drive.mount("/content/drive/")

Drive already mounted at /content/drive/; to attempt to forcibly remount, call drive.mount("/content/drive/", force_remount=True).


In [0]:
import torch
from torchtext import data , datasets,vocab
import torchtext
import os
from torch import nn
from torch.autograd import Variable
import torch.nn.functional as F
import spacy
import pickle
import glob
import numpy as np
from torch.optim.lr_scheduler import StepLR
spacy.load('en')

<spacy.lang.en.English at 0x7f06445b5588>

In [0]:
with open("/content/drive/My Drive/cs281_final_project/imdb_splitted.pkl","rb") as f:
  data_list = pickle.load(f)

In [0]:
TEXT = data.Field(lower=True,tokenize='spacy')
LABEL = data.Field(sequential=False,)

In [0]:
train = data_list["train"]
val = data_list["validation"]
test = data_list["test"]

In [0]:
class myDataset(torchtext.data.Dataset):
    def __init__(self, df, text_field, label_field,**kwargs):
        fields = [('text', text_field), ('label', label_field)]
        examples = df
        super(myDataset, self).__init__(examples, fields, **kwargs)

+ train/val/test are a list of `Example` object.
+ use `data.Example.fromlist([text, label], fields)` to genenerate every single instance of `Example`. 
+ In the code above, text is a list of string, label is "pos", "neg". Print out or see the `ipynb train_test_valid_split.ipynb` for details.

In [0]:
train_dat = myDataset(train, TEXT,LABEL)
val_dat = myDataset(val, TEXT,LABEL)
test_dat = myDataset(test, TEXT,LABEL)

In [0]:
TEXT.build_vocab(train_dat,vectors='glove.6B.300d',max_size=25000)
LABEL.build_vocab(train_dat,)

In [0]:
bs = 32
train_iter, valid_iter, test_iter = torchtext.data.BucketIterator.splits((train_dat, val_dat, test_dat), 
                                                                         batch_size = bs, repeat = False,
                                                                         sort_key = lambda x : len(x.text))

In [0]:
n_vocab = len(TEXT.vocab)
n_hidden = 300

In [0]:
class IMDBRnn(nn.Module):
    def __init__(self,vocab,hidden_size,n_cat,bs=1,nl=2):
        super().__init__()
        self.hidden_size = hidden_size
        self.bs = bs
        self.nl = nl
        self.e = nn.Embedding(n_vocab,hidden_size)
        self.rnn = nn.LSTM(hidden_size,hidden_size,nl)
        self.bn1 = nn.BatchNorm1d(hidden_size)
        self.fc2 = nn.Linear(hidden_size,n_cat)
        self.bn2 = nn.BatchNorm1d(n_cat)
        self.softmax = nn.LogSoftmax(dim=-1)
        
    def forward(self,inp):
        bs = inp.size()[1]
        if bs != self.bs:
            self.bs = bs
        e_out = self.e(inp)
        h0 = c0 = Variable(e_out.data.new(*(self.nl,self.bs,self.hidden_size)).zero_())
        rnn_o,_ = self.rnn(e_out,(h0,c0)) 
        rnn_o = rnn_o[-1]
        rnn_o = self.bn1(rnn_o)
        fc = F.dropout(self.fc2(rnn_o), 0.30)
        fc = self.bn2(fc)
        return self.softmax(fc)

In [0]:
model = IMDBRnn(n_vocab,n_hidden,2,bs=bs)
model.e.weight.data = TEXT.vocab.vectors

In [0]:
model.load_state_dict(torch.load('/content/drive/My Drive/cs281_final_project/init_state'), strict=False)

In [0]:
model = model.cuda()

In [0]:
is_cuda = torch.cuda.is_available()
optimizer = torch.optim.Adam(model.parameters(),lr=1e-2)
# after every 1 epoch, lr = lr*gamma
scheduler = StepLR(optimizer, step_size=1, gamma = 0.9)

def fit(epoch,model,data_loader,phase='training',volatile=False):
    if phase == 'training':
        model.train()
    if phase == 'validation':
        model.eval()
        volatile=True
    running_loss = 0.0
    running_correct = 0
    for batch_idx , batch in enumerate(data_loader):
        batch.label = batch.label-1
        text , target = batch.text , batch.label
        if is_cuda:
            text,target = text.cuda(),target.cuda()
        
        if phase == 'training':
            optimizer.zero_grad()
        output = model(text)
       
        loss = F.nll_loss(output,target)
        
        running_loss += F.nll_loss(output,target,size_average=False).data
        preds = output.data.max(dim=1,keepdim=True)[1]
        running_correct += preds.eq(target.data.view_as(preds)).cpu().sum()
        if phase == 'training':
            loss.backward()
            optimizer.step()
    
    loss = running_loss/len(data_loader.dataset)
    accuracy = 100. * running_correct/len(data_loader.dataset)
    
    print(f'{phase} loss is {loss:{5}.{2}} and {phase} accuracy is {running_correct}/{len(data_loader.dataset)}{accuracy:{10}.{4}}')
    return loss,accuracy

train_losses , train_accuracy = [],[]
val_losses , val_accuracy = [],[]
best_model_dict = None
for epoch in range(25):
    scheduler.step()
    best_val_acc = 0
    print("epoch {}".format(epoch))
    epoch_loss, epoch_accuracy = fit(epoch,model,train_iter,phase='training')
    val_epoch_loss , val_epoch_accuracy = fit(epoch,model,valid_iter,phase='validation')
    if val_epoch_accuracy > best_val_acc:
      best_model_dict = model.state_dict()
    train_losses.append(epoch_loss)
    train_accuracy.append(epoch_accuracy)
    val_losses.append(val_epoch_loss)
    val_accuracy.append(val_epoch_accuracy)



epoch 0
training loss is   0.7 and training accuracy is 12518/25000     50.07
validation loss is   0.7 and validation accuracy is 9987/20000     49.94
epoch 1
training loss is  0.69 and training accuracy is 12876/25000      51.5
validation loss is   0.7 and validation accuracy is 10380/20000      51.9
epoch 2
training loss is  0.66 and training accuracy is 14777/25000     59.11
validation loss is  0.69 and validation accuracy is 11428/20000     57.14
epoch 3
training loss is  0.56 and training accuracy is 18012/25000     72.05
validation loss is  0.65 and validation accuracy is 12664/20000     63.32
epoch 4
training loss is  0.51 and training accuracy is 19125/25000      76.5
validation loss is   2.7 and validation accuracy is 9991/20000     49.96
epoch 5
training loss is  0.47 and training accuracy is 19765/25000     79.06
validation loss is  0.55 and validation accuracy is 14683/20000     73.42
epoch 6
training loss is  0.38 and training accuracy is 20896/25000     83.58
validation l

In [0]:
torch.save(best_model_dict, '/content/drive/My Drive/cs281_final_project/my_best_lstm2')

In [0]:
best_model = IMDBRnn(n_vocab,n_hidden,2,bs=bs)

In [0]:
best_model.load_state_dict(torch.load('/content/drive/My Drive/cs281_final_project/my_best_lstm2'))

<All keys matched successfully>

In [0]:
best_model = best_model.cuda()

In [0]:
test_loss , test_accuracy = fit(epoch,best_model,test_iter,phase='validation')



validation loss is   1.2 and validation accuracy is 4033/5000     80.66
