In [1]:
import pandas as pd
import numpy as np
import os
import random

In [2]:
import matplotlib.pyplot as plt
%matplotlib inline

In [3]:
import torch
import numpy as np
torch.__version__

'0.4.1'

---
### Data Loading

In [15]:
# Locations of the aclImdb train and test folders. Load_Data reads the
# 'neg' and 'pos' sub-folders found underneath such a folder.
train_dir = 'aclImdb/train'
test_dir = 'aclImdb/test'

In [16]:
def Load_Data(path):
    """Read every review stored under ``path/neg`` and ``path/pos``.

    @param path: directory that contains a 'neg' and a 'pos' sub-directory of
        ``.txt`` files.
    @return: ``(texts, labels)`` -- two lists of equal length where
        ``labels[i]`` is 0 for a review read from 'neg' and 1 for one read
        from 'pos'.
    """
    labels = []
    texts = []
    for label, class_name in enumerate(['neg', 'pos']):
        # Read from the directory that was asked for, not a fixed global one.
        dir_name = os.path.join(path, class_name)
        # os.listdir order is arbitrary; sorting keeps the result reproducible.
        for fname in sorted(os.listdir(dir_name)):
            if not fname.endswith('.txt'):
                # Skipped files must not contribute a label either, otherwise
                # texts and labels fall out of step.
                continue
            with open(os.path.join(dir_name, fname), encoding='utf-8') as f:
                texts.append(f.read())
            labels.append(label)
    return texts, labels

In [17]:
# The held-out test split comes from test_dir; the split that is later
# shuffled and divided into train/validation must come from train_dir.
test_data, test_targets = Load_Data(test_dir)
data, targets = Load_Data(train_dir)

In [18]:
## Shuffle reviews and labels with one shared permutation so that the
## train and val data sets each get a mix of pos and neg examples.
assert len(data) == len(targets), "every review needs exactly one label"
# Size the permutation from the data instead of hard-coding the corpus size.
index = np.random.permutation(len(data))
data = [data[i] for i in index]
targets = [targets[i] for i in index]

In [19]:
# Carve the shuffled data into a training part (first `train_split` examples)
# and a validation part (everything after it).

train_split = 20000
train_data, val_data = data[:train_split], data[train_split:]
train_targets, val_targets = targets[:train_split], targets[train_split:]

print ("Train dataset size is {}".format(len(train_data)))
print ("Val dataset size is {}".format(len(val_data)))
print ("Test dataset size is {}".format(len(test_data)))

Train dataset size is 20000
Val dataset size is 5000
Test dataset size is 25000


In [52]:
# Persist the labels of each split so they stay aligned with this shuffle

for _csv_path, _split_targets in (
    ("train_targets.csv", train_targets),
    ("val_targets.csv", val_targets),
    ("test_targets.csv", test_targets),
):
    pd.DataFrame(_split_targets).to_csv(_csv_path, index=False)

In [20]:
# Let's write the tokenization function 

import spacy
import string

# Load English tokenizer, tagger, parser, NER and word vectors
tokenizer = spacy.load('en_core_web_sm')
# string.punctuation is one str of ASCII punctuation characters, so an `in`
# test against it is a substring check, not a per-character set lookup.
punctuations = string.punctuation

def tokenize(sent):
    """Return the lowercased spaCy tokens of ``sent`` without punctuation tokens.

    A token is discarded when its text occurs inside the ``punctuations``
    string; every other token is kept, in order, as lowercase text.
    """
    kept = []
    for token in tokenizer(sent):
        text = token.text
        if text in punctuations:
            continue
        kept.append(text.lower())
    return kept



In [51]:
# This code cell tokenizes the train/val/test datasets.
# However, it takes about 15-20 minutes to run.
# For convenience, we have provided the preprocessed datasets;
# please see the next code cell.
import pickle as pkl

# helper that builds word n-grams
def generate_ngrams(token_list, n):
    """Return every run of ``n`` consecutive tokens, joined by single spaces.

    @param token_list: sequence of string tokens
    @param n: size of each n-gram
    @return: list of n-gram strings in order of appearance; empty when ``n`` is
        not positive or is larger than the number of tokens
    """
    if n <= 0:
        return []
    window_count = len(token_list) - n + 1
    return [' '.join(token_list[start:start + n]) for start in range(window_count)]

def tokenize_dataset(dataset,n):
    """Turn each raw sample into its list of n-grams.

    @param dataset: iterable of raw text samples
    @param n: n-gram size handed to generate_ngrams
    @return: ``(token_dataset, all_tokens)`` where ``token_dataset`` holds one
        n-gram list per sample and ``all_tokens`` is the concatenation of all
        those lists (used later to build the vocabulary)
    """
    token_dataset = [generate_ngrams(tokenize(sample), n) for sample in dataset]
    all_tokens = [gram for grams in token_dataset for gram in grams]
    return token_dataset, all_tokens

def _dump_pickle(obj, path):
    """Pickle ``obj`` to ``path`` and close the file handle, even if dumping fails."""
    with open(path, "wb") as handle:
        pkl.dump(obj, handle)

# Produce and store the token files for 1-grams up to 4-grams.
# Note: tokenize_dataset re-runs the tokenizer on every sample for each n.
for i in range(1,5):
    N_grams=i

    # val set tokens
    print ("Tokenizing val data")
    val_data_tokens, _ = tokenize_dataset(val_data,N_grams)
    _dump_pickle(val_data_tokens, "val_data_tokens_" + str(N_grams) +".p")

    # test set tokens
    print ("Tokenizing test data")
    test_data_tokens, _ = tokenize_dataset(test_data,N_grams)
    _dump_pickle(test_data_tokens, "test_data_tokens_" + str(N_grams) +".p")

    # train set tokens
    print ("Tokenizing train data")
    train_data_tokens, all_train_tokens = tokenize_dataset(train_data,N_grams)
    _dump_pickle(train_data_tokens, "train_data_tokens_" + str(N_grams) +".p")
    _dump_pickle(all_train_tokens, "all_train_tokens_" + str(N_grams) +".p")
    print(i)

Tokenizing val data
Tokenizing test data
Tokenizing train data
1


In [6]:
# Then, load preprocessed train, val and test datasets
import pickle as pkl

# N-gram size of the preprocessed files to load. The tokenization cell writes
# one set of files per size in 1..4; change this value to switch between them.
NGRAM_SIZE = 1


def _load_pickle(path):
    """Unpickle the file at ``path`` and close the handle afterwards.

    Only use this on the token files written by this notebook: unpickling
    data from an untrusted source can execute arbitrary code.
    """
    with open(path, "rb") as handle:
        return pkl.load(handle)


train_data_tokens = _load_pickle("train_data_tokens_{}.p".format(NGRAM_SIZE))
all_train_tokens = _load_pickle("all_train_tokens_{}.p".format(NGRAM_SIZE))

val_data_tokens = _load_pickle("val_data_tokens_{}.p".format(NGRAM_SIZE))
test_data_tokens = _load_pickle("test_data_tokens_{}.p".format(NGRAM_SIZE))

# double checking
print ("Train dataset size is {}".format(len(train_data_tokens)))
print ("Val dataset size is {}".format(len(val_data_tokens)))
print ("Test dataset size is {}".format(len(test_data_tokens)))

print ("Total number of tokens in train dataset is {}".format(len(all_train_tokens)))

Train dataset size is 20000
Val dataset size is 5000
Test dataset size is 25000
Total number of tokens in train dataset is 4827321


In [7]:
from collections import Counter

max_vocab_size = 10000
# index 0 is reserved for padding and index 1 for unknown (out-of-vocabulary) tokens
PAD_IDX = 0
UNK_IDX = 1

def build_vocab(all_tokens, max_size=None):
    """Build token <-> index lookup tables from a flat list of tokens.

    @param all_tokens: iterable of tokens (may be empty)
    @param max_size: number of most frequent tokens to keep; defaults to the
        module-level ``max_vocab_size`` when omitted
    @return: ``(token2id, id2token)`` where
        id2token: list of tokens, id2token[i] is the token with index i
        token2id: dictionary mapping each token to its index
        Both always contain '<pad>' at PAD_IDX and '<unk>' at UNK_IDX; real
        tokens start at index 2, ordered from most to least frequent.
    """
    if max_size is None:
        max_size = max_vocab_size
    token_counter = Counter(all_tokens)
    # A comprehension (instead of unpacking zip(*...)) also works when there
    # are no tokens at all.
    vocab = [token for token, _ in token_counter.most_common(max_size)]
    id2token = ['<pad>', '<unk>'] + vocab
    token2id = {token: idx for idx, token in enumerate(vocab, start=2)}
    token2id['<pad>'] = PAD_IDX
    token2id['<unk>'] = UNK_IDX
    return token2id, id2token

# The vocabulary is built from the training tokens only.
token2id, id2token = build_vocab(all_train_tokens)

In [8]:
# Sanity check: a randomly drawn index must map to a token that maps back to it
random_token_id = random.randrange(len(id2token))
random_token = id2token[random_token_id]

print ("Token id {} ; token {}".format(random_token_id, random_token))
print ("Token {}; token id {}".format(random_token, token2id[random_token]))

Token id 7014 ; token bleed
Token bleed; token id 7014


In [9]:
def token2index_dataset(tokens_data):
    """Map every token of every sample to its vocabulary index.

    @param tokens_data: iterable of token lists, one per sample
    @return: list of index lists with the same nesting; tokens missing from
        ``token2id`` are replaced by ``UNK_IDX``
    """
    return [
        [token2id.get(token, UNK_IDX) for token in tokens]
        for tokens in tokens_data
    ]

# Convert all three splits from tokens to vocabulary indices
train_data_indices, val_data_indices, test_data_indices = [
    token2index_dataset(split_tokens)
    for split_tokens in (train_data_tokens, val_data_tokens, test_data_tokens)
]

# double checking
print ("Train dataset size is {}".format(len(train_data_indices)))
print ("Val dataset size is {}".format(len(val_data_indices)))
print ("Test dataset size is {}".format(len(test_data_indices)))

Train dataset size is 20000
Val dataset size is 5000
Test dataset size is 25000


In [10]:
MAX_SENTENCE_LENGTH = 200

import numpy as np
import torch
from torch.utils.data import Dataset

class NewsGroupDataset(Dataset):
    """
    PyTorch-readable train/validation/test dataset built from two parallel lists:
    token-index sequences and their targets.
    Subclasses torch.utils.data.Dataset.
    """

    def __init__(self, data_list, target_list):
        """
        @param data_list: list of token-index sequences, one per sample
        @param target_list: list of targets, aligned with data_list

        Both lists must have the same length.
        """
        self.data_list = data_list
        self.target_list = target_list
        assert len(self.target_list) == len(self.data_list)

    def __len__(self):
        """Number of samples in the dataset."""
        return len(self.data_list)

    def __getitem__(self, key):
        """
        Triggered when you call dataset[i].

        @return: [token indices truncated to MAX_SENTENCE_LENGTH,
                  length of the truncated sequence, target]
        """
        truncated = self.data_list[key][:MAX_SENTENCE_LENGTH]
        return [truncated, len(truncated), self.target_list[key]]

def newsgroup_collate_func(batch):
    """
    Customized collate function for DataLoader: right-pads every sample with
    zeros up to MAX_SENTENCE_LENGTH so that all rows have the same length.

    @param batch: list of [token_idx, length, label] items as returned by the
        dataset's __getitem__
    @return: [data, lengths, labels] where data is an int64 tensor with one
        padded row per sample, and lengths / labels are LongTensors
    """
    data_list = []
    label_list = []
    length_list = []
    for token_idx, length, label in batch:
        label_list.append(label)
        length_list.append(length)
        # Fix the dtype explicitly: an empty sample would otherwise become a
        # float64 array and turn the whole batch into a float tensor, and the
        # inferred integer width is platform-dependent.
        padded_vec = np.pad(np.asarray(token_idx, dtype=np.int64),
                            pad_width=(0, MAX_SENTENCE_LENGTH - length),
                            mode="constant", constant_values=0)
        data_list.append(padded_vec)
    return [torch.from_numpy(np.array(data_list)), torch.LongTensor(length_list), torch.LongTensor(label_list)]

BATCH_SIZE = 32

def _build_loader(dataset, shuffle):
    """Wrap ``dataset`` in a DataLoader using the shared batch size and collate function."""
    return torch.utils.data.DataLoader(dataset=dataset,
                                       batch_size=BATCH_SIZE,
                                       collate_fn=newsgroup_collate_func,
                                       shuffle=shuffle)

# Training and validation batches are drawn in shuffled order
train_dataset = NewsGroupDataset(train_data_indices, train_targets)
train_loader = _build_loader(train_dataset, shuffle=True)

val_dataset = NewsGroupDataset(val_data_indices, val_targets)
val_loader = _build_loader(val_dataset, shuffle=True)

# Test batches keep the dataset order
test_dataset = NewsGroupDataset(test_data_indices, test_targets)
test_loader = _build_loader(test_dataset, shuffle=False)

In [11]:
# First import torch related libraries
import torch
import torch.nn as nn
import torch.nn.functional as F

class BagOfWords(nn.Module):
    """
    BagOfWords classification model: averages the embeddings of the tokens in a
    sample and feeds the average through one linear layer.
    """
    def __init__(self, vocab_size, emb_dim, num_classes=20):
        """
        @param vocab_size: size of the vocabulary.
        @param emb_dim: size of the word embedding
        @param num_classes: number of logits produced per sample; defaults to
            20, the width that used to be hard-coded
        """
        super(BagOfWords, self).__init__()
        # padding_idx=0 keeps the embedding of index 0 at zero, so padded
        # positions add nothing to the sum taken in forward()
        self.embed = nn.Embedding(vocab_size, emb_dim, padding_idx=0)
        self.linear = nn.Linear(emb_dim, num_classes)

    def forward(self, data, length):
        """
        @param data: matrix of size (batch_size, max_sentence_length). Each row in data represents a
            review that is represented using n-gram index. Note that they are padded to have same length.
        @param length: an int tensor of size (batch_size), which represents the non-trivial (excludes padding)
            length of each sentences in the data.
        @return: logits of size (batch_size, num_classes)
        """
        out = self.embed(data)
        out = torch.sum(out, dim=1)
        # Clamp to 1 so a zero-length sample yields a zero average instead of
        # NaN from 0/0; the (batch_size, 1) shape broadcasts across emb_dim.
        denom = length.clamp(min=1).view(-1, 1).float()
        out = out / denom

        # return logits
        return self.linear(out)

# Embedding width. The vocabulary size handed to the model is len(id2token),
# which already counts the '<pad>' and '<unk>' entries.
# NOTE(review): BagOfWords ends in a 20-way linear layer while Load_Data only
# emits labels 0 and 1 -- confirm the intended number of classes.
emb_dim = 100
model = BagOfWords(len(id2token), emb_dim)

In [12]:
num_epochs = 10 # number epoch to train
learning_rate = 0.01

# Loss function and optimizer used by the training loop
criterion = nn.CrossEntropyLoss()
optimizer = torch.optim.Adam(params=model.parameters(), lr=learning_rate)

# Function for testing the model
def test_model(loader, model):
    """
    Help function that tests the model's performance on a dataset
    @param: loader - data loader for the dataset to test against
    """
    correct = 0
    total = 0
    model.eval()
    for data, lengths, labels in loader:
        data_batch, length_batch, label_batch = data, lengths, labels
        outputs = F.softmax(model(data_batch, length_batch), dim=1)
        predicted = outputs.max(1, keepdim=True)[1]
        
        total += labels.size(0)
        correct += predicted.eq(labels.view_as(predicted)).sum().item()
    return (100 * correct / total)

# Train for num_epochs passes over train_loader, reporting validation accuracy
# along the way. The loop targets are deliberately named *_batch so they do not
# overwrite the notebook-level `data` list of raw reviews.
for epoch in range(num_epochs):
    for i, (data_batch, length_batch, label_batch) in enumerate(train_loader):
        # test_model() switches the model to eval mode, so re-enable training
        # mode at the start of every step
        model.train()
        optimizer.zero_grad()
        outputs = model(data_batch, length_batch)
        loss = criterion(outputs, label_batch)
        loss.backward()
        optimizer.step()
        # validate every 100 iterations
        if i > 0 and i % 100 == 0:
            # validate
            val_acc = test_model(val_loader, model)
            print('Epoch: [{}/{}], Step: [{}/{}], Validation Acc: {}'.format( 
                       epoch+1, num_epochs, i+1, len(train_loader), val_acc))


Epoch: [1/10], Step: [101/625], Validation Acc: 77.0
Epoch: [1/10], Step: [201/625], Validation Acc: 83.72
Epoch: [1/10], Step: [301/625], Validation Acc: 85.42
Epoch: [1/10], Step: [401/625], Validation Acc: 85.84
Epoch: [1/10], Step: [501/625], Validation Acc: 84.94
Epoch: [1/10], Step: [601/625], Validation Acc: 85.96
Epoch: [2/10], Step: [101/625], Validation Acc: 86.36
Epoch: [2/10], Step: [201/625], Validation Acc: 86.34
Epoch: [2/10], Step: [301/625], Validation Acc: 85.5
Epoch: [2/10], Step: [401/625], Validation Acc: 86.24
Epoch: [2/10], Step: [501/625], Validation Acc: 86.08
Epoch: [2/10], Step: [601/625], Validation Acc: 86.0
Epoch: [3/10], Step: [101/625], Validation Acc: 86.64
Epoch: [3/10], Step: [201/625], Validation Acc: 85.84
Epoch: [3/10], Step: [301/625], Validation Acc: 85.76
Epoch: [3/10], Step: [401/625], Validation Acc: 85.56
Epoch: [3/10], Step: [501/625], Validation Acc: 85.16
Epoch: [3/10], Step: [601/625], Validation Acc: 85.48
Epoch: [4/10], Step: [101/625],

In [13]:
# Report the validation accuracy of the fully trained model
print ("After training for {} epochs".format(num_epochs))
final_val_acc = test_model(val_loader, model)
print ("Val Acc {}".format(final_val_acc))


After training for 10 epochs
Val Acc 83.58
