In [1]:
import wget, os, gzip, pickle, random, re, sys
import torch
import torch.nn as nn
import torch.nn.functional as F
import torch.optim as optim
from torch.autograd import Variable
import numpy as np

In [2]:
# Location of the pre-processed IMDb archives; '{}' is filled with 'word' or
# 'char' by load_imdb. HTTPS is used because the downloaded file is unpickled,
# so it must not be open to tampering in transit.
IMDB_URL = 'https://dlvu.github.io/data/imdb.{}.pkl.gz'
# Local file name the archive is expected under after download.
IMDB_FILE = 'imdb.{}.pkl.gz'

In [3]:
# Special vocabulary tokens. '.pad' is the token used to pad sequences and
# '.unk' replaces out-of-vocabulary tokens (see fixed_data / load_imdb).
# NOTE(review): START and END are presumably sequence delimiters; they are not
# used in the visible cells.
PAD, START, END, UNK = '.pad', '.start', '.end', '.unk'

In [4]:
def load_imdb(final=False, val=5000, seed=0, voc=None, char=False):
    """Load the IMDb dataset, downloading the archive on first use.

    Args:
        final: If True, return the 'train' and 'test' splits stored in the
            archive. If False, split the 'train' data into a training and a
            validation part instead.
        val: Number of training examples moved to the validation part
            (only used when ``final`` is False).
        seed: Seed for the random choice of validation examples.
        voc: If given and smaller than the stored vocabulary, keep only the
            first ``voc`` vocabulary entries and map every other token index
            to the index of '.unk'.
        char: If True, load the 'char' archive instead of the 'word' one.

    Returns:
        ``(x_train, y_train), (x_other, y_other), (i2w, w2i), 2`` where
        ``x_other``/``y_other`` are the test data when ``final`` is True and
        the validation data otherwise. The trailing constant 2 is bound to
        ``numcls`` by the caller in this notebook.

    Raises:
        KeyError: If '.unk' is not among the first ``voc`` vocabulary entries.
        ValueError: If ``val`` is negative or larger than the training set
            (raised by the sampling call).
    """
    cst = 'char' if char else 'word'

    imdb_url = IMDB_URL.format(cst)
    imdb_file = IMDB_FILE.format(cst)

    # Download only when no local copy exists. This relies on wget saving the
    # file under the URL's base name, which matches imdb_file.
    if not os.path.exists(imdb_file):
        wget.download(imdb_url)

    # SECURITY: pickle.load can execute arbitrary code embedded in the file.
    # Only use this loader with an archive from a source you trust.
    with gzip.open(imdb_file) as file:
        sequences, labels, i2w, w2i = pickle.load(file)

    if voc is not None and voc < len(i2w):
        # Truncate the vocabulary and rebuild the reverse mapping.
        i2w = i2w[:voc]
        w2i = {w: i for i, w in enumerate(i2w)}

        # Every index that fell off the truncated vocabulary becomes '.unk'.
        unk = w2i['.unk']
        sequences = {
            key: [[s if s < voc else unk for s in seq] for seq in seqs]
            for key, seqs in sequences.items()
        }

    if final:
        return (sequences['train'], labels['train']), (sequences['test'], labels['test']), (i2w, w2i), 2

    # Make a validation split.
    # A private generator yields the same sample as seeding the global one,
    # without silently resetting the caller's global `random` state.
    rng = random.Random(seed)
    val_ind = set(rng.sample(range(len(sequences['train'])), k=val))

    x_train, y_train = [], []
    x_val, y_val = [], []

    for i, (s, l) in enumerate(zip(sequences['train'], labels['train'])):
        if i in val_ind:
            x_val.append(s)
            y_val.append(l)
        else:
            x_train.append(s)
            y_train.append(l)

    return (x_train, y_train), \
           (x_val, y_val), \
           (i2w, w2i), 2

In [5]:
# Load the word-level dataset together with its vocabulary mappings.
# NOTE(review): with final=True, load_imdb returns the 'test' split as its
# second tuple, so x_val / y_val hold the test data here rather than a
# validation set carved out of the training data.
(x_train, y_train), (x_val, y_val), (i2w, w2i), numcls = load_imdb(final = True)

In [6]:
def fixed_data(data, pad_index=None):
    """Right-pad every sequence in ``data`` to the length of the longest one.

    Args:
        data: Sequence of token-index lists.
        pad_index: Index appended as padding. When omitted, the index of
            '.pad' in the notebook-level vocabulary ``w2i`` is used.

    Returns:
        A new list of new lists, all of the same length. The input lists are
        left unmodified, so the call can be repeated safely. An empty input
        yields an empty list.
    """
    if not data:
        return []

    if pad_index is None:
        pad_index = w2i['.pad']

    max_len = max(len(sent) for sent in data)
    return [list(sent) + [pad_index] * (max_len - len(sent)) for sent in data]

In [7]:
# Pad every training sequence to the length of the longest training sequence.
fixed_x_train = fixed_data(x_train)

In [8]:
def create_batches(data, batch_size):
    """Split ``data`` into consecutive chunks of at most ``batch_size`` items.

    Args:
        data: Sliceable sequence (e.g. a list) to split.
        batch_size: Maximum number of items per chunk; must be positive.

    Returns:
        A list of slices of ``data`` in their original order. Only the last
        chunk may be shorter than ``batch_size``. Empty input gives ``[]``.

    Raises:
        ValueError: If ``batch_size`` is smaller than 1.
    """
    if batch_size < 1:
        raise ValueError('batch_size must be a positive integer')

    # Step through the data by the requested size instead of a fixed 200.
    return [data[start:start + batch_size]
            for start in range(0, len(data), batch_size)]

In [9]:
# Group the padded training sequences and their labels into chunks of 200.
# Both lists are chunked the same way so that batches[i] lines up with labels[i].
batches = create_batches(fixed_x_train, 200)
labels = create_batches(y_train,200)

In [10]:
import itertools

# Wrap every batch in a 1-tuple. This is why later cells index a batch as
# batches[i][0]: the wrapper adds one extra level of nesting.
# NOTE(review): re-running this cell wraps the batches a second time.
batches = [(batch,) for batch in batches]

In [11]:
# Convert the nested Python lists into int64 tensors.
# torch.tensor needs rectangular nesting, so every batch must have the same size.
# NOTE(review): because each batch is wrapped in a 1-tuple, `batches` presumably
# has shape (num_batches, 1, batch_size, max_len) and `labels`
# (num_batches, batch_size) - confirm.
batches = torch.tensor(batches,dtype = torch.long)
labels = torch.tensor(labels, dtype = torch.long)

In [12]:
class Net(nn.Module):
    """Embedding -> ReLU RNN -> max over time steps -> linear classifier."""

    def __init__(self, vocab_size=99430, emb_dim=300, hidden_dim=300, num_classes=2):
        """Build the layers.

        Args:
            vocab_size: Number of rows in the embedding table; every token
                index fed to the model must be smaller than this.
            emb_dim: Size of each token embedding.
            hidden_dim: Size of the RNN hidden state.
            num_classes: Number of output scores per sequence.

        The defaults reproduce the sizes this notebook originally hard-coded.
        """
        super().__init__()
        self.emb = nn.Embedding(vocab_size, emb_dim)
        self.rnn = nn.RNN(emb_dim, hidden_dim, batch_first = True, nonlinearity = 'relu')
        self.fc2 = nn.Linear(hidden_dim, num_classes)

    def forward(self, x):
        """Return class scores for a batch of token-index sequences.

        Args:
            x: Integer tensor of token indices with the batch dimension first
                (the RNN is built with batch_first=True).

        Returns:
            Tensor with one row of ``num_classes`` unnormalised scores per
            input sequence.
        """
        x = self.emb(x)
        # self.rnn returns (outputs, last_hidden); keep the per-step outputs
        # and take the element-wise maximum over dimension 1 (the time steps).
        x = torch.max(self.rnn(x)[0], 1)[0]
        x = self.fc2(x)

        return x

# Instantiate the classifier.
# NOTE(review): no torch seed is set in the visible cells, so the initial
# weights (and therefore the reported numbers) can differ between runs.
net = Net()

In [13]:
# Adam over all parameters of `net`, with a learning rate of 0.003.
optimizer = optim.Adam(net.parameters(), lr = 0.003)

In [14]:
# Train for one pass over the data, one optimizer step per batch.
log_every = 20  # report the mean loss once per this many batches

for epoch in range(1):
    running_loss = 0.0
    # Iterate over however many batches exist instead of a hard-coded count,
    # so the loop keeps working when the amount of training data changes.
    for i in range(len(batches)):
        inputs = batches[i][0]  # [0] unwraps the 1-tuple around every batch
        label = labels[i]

        optimizer.zero_grad()
        outputs = net(inputs)
        loss = F.cross_entropy(outputs, label)
        loss.backward()
        optimizer.step()

        running_loss += loss.item()
        if i % log_every == log_every - 1:
            print('[%d, %5d] loss: %.3f' %
                  (epoch + 1, i + 1, running_loss / log_every))
            running_loss = 0.0

print('finish training')

[1,    20] loss: 0.766
[1,    40] loss: 0.410
[1,    60] loss: 0.390
[1,    80] loss: 0.363
[1,   100] loss: 0.345
[1,   120] loss: 0.340
finish training


In [15]:
# Prepare the evaluation data the same way as the training data: pad, chunk
# into groups of 200, wrap each chunk in a 1-tuple, and convert to int64 tensors.
fixed_x_val = fixed_data(x_val)
val_batches = torch.tensor(
    [(chunk,) for chunk in create_batches(fixed_x_val, 200)],
    dtype = torch.long,
)
val_labels = torch.tensor(create_batches(y_val, 200), dtype = torch.long)

In [16]:
# Measure classification accuracy on the evaluation batches.
correct = 0
total = 0
with torch.no_grad():  # no gradients are needed for evaluation
    # Iterate over however many batches exist instead of a hard-coded count.
    for i in range(len(val_batches)):
        val_inputs = val_batches[i][0]  # [0] unwraps the 1-tuple around every batch
        val_label = val_labels[i]
        val_outputs = net(val_inputs)
        # The predicted class is the index of the highest score in each row.
        predicted = val_outputs.argmax(dim=1)
        total += val_label.size(0)
        correct += (predicted == val_label).sum().item()

# Report the number of examples actually evaluated rather than a fixed "5000".
print('Accuracy of the network on the %d validation data: %d %%' % (
    total, 100 * correct / total))

Accuracy of the network on the 5000 validation data: 86 %
