In [5]:
import pickle
import os

cache_dir = os.path.join("./cache", "sentiment_analysis")  # where to store cache files
os.makedirs(cache_dir, exist_ok=True)  # ensure cache directory exists

def preprocess_data(cache_dir=cache_dir, cache_file="preprocessed_data.pkl"):
    """Convert each review to words; read from cache if available."""

    # If cache_file is not None, try to read from it first
    cache_data = None
    if cache_file is not None:
        try:
            with open(os.path.join(cache_dir, cache_file), "rb") as f:
                cache_data = pickle.load(f)
            print("Read preprocessed data from cache file:", cache_file)
        except:
            pass  # unable to read from cache, but that's okay
    
    # If cache is missing, then do the heavy lifting
    if cache_data is None:
        print("CACHE NOT FOUND")
        words_train = [review_to_words(review) for review in data_train]
        words_test = [review_to_words(review) for review in data_test]
        
        # Write to cache file for future runs
        if cache_file is not None:
            cache_data = dict(words_train=words_train, words_test=words_test,
                              labels_train=labels_train, labels_test=labels_test)
            with open(os.path.join(cache_dir, cache_file), "wb") as f:
                pickle.dump(cache_data, f)
            print("Wrote preprocessed data to cache file:", cache_file)
    else:
        # Unpack data loaded from cache file
        words_train, words_test, labels_train, labels_test = (cache_data['words_train'],
                cache_data['words_test'], cache_data['labels_train'], cache_data['labels_test'])
    
    return words_train, words_test, labels_train, labels_test

In [6]:
train_X, test_X, train_y, test_y = preprocess_data()
type(train_X), type(train_y)
train_X.extend(test_X)
train_y.extend(test_y)
len(train_X), len(train_y)

Read preprocessed data from cache file: preprocessed_data.pkl


(50000, 50000)

In [7]:
import random
type(train_X), type(train_y)
temp = list(zip(train_X, train_y))
random.shuffle(temp)
train_X, train_y = zip(*temp)
# res1 and res2 come out as tuples, and so must be converted to lists.
train_X, train_y = list(train_X), list(train_y)

In [8]:

type(train_X), type(train_y)
print(len(train_X))
train_list = train_X[:30000]
train_labels_list = train_y[:30000]
test_list = train_X[30000:]
test_labels_list = train_y[30000:]
print(len(train_list), len(train_labels_list), len(test_list), len(test_labels_list))
print(train_list[0], test_list[0], train_labels_list[0], test_labels_list[0])

50000
30000 30000 20000 20000
['basic', 'like', 'verhoeven', 'film', 'film', 'enjoy', 'brilliant', 'pscychosexu', 'stori', 'seen', 'basic', 'instinct', 'realli', 'wonder', 'thriller', 'enjoy', 'much', 'obvious', 'watch', 'anoth', 'verhoeven', 'movi', 'well', 'previou', 'direct', 'block', 'buster', 'hit', 'basic', 'instinct', 'much', 'curiou', 'watch', 'movi', 'yeah', 'movi', 'fulfil', 'hope', 'expect', 'movi', 'fourth', 'man', 'brilliant', 'pscychosexu', 'drama', 'lit', 'bit', 'complex', 'audienc', 'stori', 'movi', 'gay', 'writer', 'name', 'reve', 'krabb', 'alcohol', 'person', 'live', 'moral', 'valu', 'see', 'mani', 'vision', 'may', 'warn', 'futur', 'accid', 'end', 'lectur', 'introduc', 'seduct', 'woman', 'name', 'christin', 'mysteri', 'past', 'want', 'reveal', 'reve', 'sex', 'hous', 'boy', 'next', 'morn', 'watch', 'sexi', 'macho', 'boyfriend', 'pictur', 'tabl', 'person', 'met', 'station', 'curiou', 'meet', 'tell', 'christin', 'invit', 'hous', 'want', 'reveal', 'entir', 'stori', 'verho

In [9]:
import numpy as np

def build_dict(data, vocab_size = 5000):
    word_count = {} # A dict storing the words that appear in the reviews along with how often they occur
    for sentence in data:
        for word in sentence:
            if word in word_count.keys():
                word_count[word] = word_count[word] + 1
            else:
                word_count[word] = 1
   
    sorted_words = {k: v for k, v in sorted(word_count.items(), key=lambda item: item[1], reverse=True)}
    sorted_words = [word for word in sorted_words.keys()]
    word_dict = {} # This is what we are building, a dictionary that translates words into integers

    for idx, word in enumerate(sorted_words[:vocab_size - 2]): # The -2 is so that we save room for the 'no word'
        word_dict[word] = idx + 2                              # 'infrequent' labels
        
    return word_dict

def convert_and_pad(word_dict, sentence, pad=500):
    NOWORD = 0 # We will use 0 to represent the 'no word' category
    INFREQ = 1 # and we use 1 to represent the infrequent words, i.e., words not appearing in word_dict
    
    working_sentence = [NOWORD] * pad
    
    for word_index, word in enumerate(sentence[:pad]):
        if word in word_dict:
            working_sentence[word_index] = word_dict[word]
        else:
            working_sentence[word_index] = INFREQ
            
    return working_sentence, min(len(sentence), pad)

def convert_and_pad_data(word_dict, data, pad=500):
    result = []
    lengths = []
    
    for sentence in data:
        converted, leng = convert_and_pad(word_dict, sentence, pad)
        result.append(converted)
        lengths.append(leng)
        
    return np.array(result), np.array(lengths)

In [10]:
word_dict = build_dict(train_list)

In [11]:
data_dir = './data/pytorch' # The folder we will use for storing data
if not os.path.exists(data_dir): # Make sure that the folder exists
    os.makedirs(data_dir)

In [12]:
with open(os.path.join(data_dir, 'word_dict_qrnn.pkl'), "wb") as f:
    pickle.dump(word_dict, f)

In [13]:
train_X, train_X_len = convert_and_pad_data(word_dict, train_list)
test_X, test_X_len = convert_and_pad_data(word_dict, test_list)


In [14]:
print(train_X.shape, len(train_X_len))
print(test_X.shape, len(test_X_len))
print(train_y[0])

(30000, 500) 30000
(20000, 500) 20000
1


In [15]:
import pandas as pd
pd.concat([pd.DataFrame(train_labels_list), pd.DataFrame(train_X_len), pd.DataFrame(train_X)], axis=1).to_csv(os.path.join(data_dir, 'train_qrnn.csv'), header=False, index=False)
pd.concat([pd.DataFrame(test_labels_list), pd.DataFrame(test_X_len), pd.DataFrame(test_X)], axis=1).to_csv(os.path.join(data_dir, 'test_qrnn.csv'), header=False, index=False)

# If Files present start from here

In [1]:
import pandas as pd
import os
import pickle

data_dir = './data/pytorch'
with open(os.path.join(data_dir, 'word_dict_qrnn.pkl'), "rb") as f:
    word_dict = pickle.load(f)

In [2]:
import pandas as pd
import os
train = pd.read_csv(os.path.join(data_dir, 'train_qrnn.csv'), header=None, names=None)
test_sample = pd.read_csv(os.path.join(data_dir, 'test_qrnn.csv'), header=None, names=None)
print(train.shape, test_sample.shape)

(30000, 502) (20000, 502)


In [18]:
from sklearn.model_selection import train_test_split
test, val = train_test_split(test_sample, test_size=0.5)
train.shape, test.shape, val.shape

((30000, 502), (10000, 502), (10000, 502))

In [19]:
import torch
import torch.utils.data

# Turn the input pandas dataframe into tensors
train_y = torch.from_numpy(train[[0]].values).float()
train_X = torch.from_numpy(train.drop([0, 1], axis=1).values).long()

# Build the dataset
train_ds = torch.utils.data.TensorDataset(train_X, train_y)
# Build the dataloader
train_dl = torch.utils.data.DataLoader(train_ds, batch_size=50)

######val data
# Turn the input pandas dataframe into tensors
val_y = torch.from_numpy(val[[0]].values).float()
val_X = torch.from_numpy(val.drop([0, 1], axis=1).values).long()

# Build the dataset
val_ds = torch.utils.data.TensorDataset(val_X, val_y)
# Build the dataloader
val_dl = torch.utils.data.DataLoader(val_ds, batch_size=50)


#### Test data
# Turn the input pandas dataframe into tensors
test_y = torch.from_numpy(test[[0]].values).float()
test_X = torch.from_numpy(test.drop([0, 1], axis=1).values).long()

# Build the dataset
test_ds = torch.utils.data.TensorDataset(test_X, test_y)
# Build the dataloader
test_dl = torch.utils.data.DataLoader(test_ds, batch_size=50)
print(test_y.shape)

torch.Size([10000, 1])


In [22]:
import torch.nn as nn
from torch.autograd import Variable
from torch import optim
import torch.nn.functional as F

class QRNNLayer(nn.Module):
    def __init__(self,batch_size,input_size,n_filters,kernel_size,embed_size,device,dropout):
        super(QRNNLayer,self).__init__()
        self.batch_size = batch_size
        self.input_size = input_size
        self.n_filters = n_filters
        self.kernel_size = kernel_size
        self.embed_size = embed_size
        self.dropout = torch.nn.Dropout(dropout)
        self.device = device
        self.conv1 = torch.nn.Conv1d(self.input_size,self.n_filters,self.kernel_size)
        self.conv2 = torch.nn.Conv1d(self.input_size,self.n_filters,self.kernel_size)
        self.conv3 = torch.nn.Conv1d(self.input_size,self.n_filters,self.kernel_size)
    
    def forward(self,masked_input, h, c):
        Z,F,O = self.masked_conv(masked_input)
        h, c = self.pool(c,Z,F,O)
        masked_input = h
        return masked_input,h,c
    
    def masked_conv(self,x):
        pad = torch.zeros([self.batch_size,1,self.input_size],device=self.device)
        x = torch.cat([pad,x],1).permute(0,2,1)
        Z = torch.tanh((self.conv1(x)))
        F = torch.sigmoid((self.conv2(x)))
        O = torch.sigmoid((self.conv3(x)))
        one_mask = torch.ones_like(F,device=self.device) - F
        F = 1 - self.dropout(one_mask)
        return Z.permute(0,2,1), F.permute(0,2,1), O.permute(0,2,1)
    
    def pool(self, prev_c,Z,F,O):
        c = torch.mul(F,prev_c) + torch.mul(1-F,Z)
        h = torch.mul(O,c)
        return h,c

class QRNN(nn.Module):
    def __init__(self,vocab_size,embed_size,n_filters,kernel_size,batch_size,seq_len,layers,device,dropout):
        super(QRNN,self).__init__()
        self.embed_size = embed_size
        self.n_filters = n_filters
        self.kernel_size = kernel_size
        self.batch_size = batch_size
        self.seq_len = seq_len
        self.num_layer = layers
        self.device = device
        self.embedding = torch.nn.Embedding(vocab_size, embed_size)
        self.dense = torch.nn.Linear(self.seq_len*self.n_filters,1)
        self.QRNN_layers = torch.nn.ModuleList([QRNNLayer(self.batch_size,embed_size if l==0 else n_filters,
                                                         self.n_filters,self.kernel_size,self.embed_size,self.device,
                                                         dropout,) for l in range(self.num_layer)])
        
        
    def forward(self, x, target):
        x = self.embedding(x)
        h = torch.zeros([self.batch_size,self.seq_len,self.n_filters],device=self.device)
        c = torch.zeros_like(h,device=self.device)
        
        masked_input = x
        for l,layer in enumerate(self.QRNN_layers):
            masked_input,h,c = layer(masked_input,h,c)
        dense_input = h.reshape([self.batch_size,-1])
        logits = self.dense(dense_input)
        prediction = torch.sigmoid(logits)
        target = target.view([-1,1])
        correct_pred = torch.eq(torch.round(prediction).type(target.type()),target)
        accuracy = torch.sum(correct_pred)
        return prediction, accuracy

In [24]:
import time
epochs = 10
n_filters = 256
kernel_size = 2
layers= 2
learning_rate = 0.001
print(len(word_dict))
vocab_size = 5000
print(vocab_size)
embed_dims = 32
seq_len = 500
dropout = 0.3
batch_size=50
device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")

model = QRNN(vocab_size, embed_dims, n_filters, kernel_size, batch_size, seq_len, layers, device, dropout).to(device)
criterion = torch.nn.BCELoss().to(device)
optimizer = torch.optim.Adam(model.parameters(),lr=learning_rate)

counter = 0
QRNN_acc = []
QRNN_valacc = []

model.train()
for e in range(epochs):
    start_time = time.time()
    for inputs, labels in train_dl:
    
        inputs, labels = inputs.cuda(), labels.cuda()
        model.zero_grad()
        logits, accuracy = model(inputs,labels)
        loss = criterion(logits,labels.float())
        loss.backward()
        nn.utils.clip_grad_norm_(model.parameters(), 5)
        optimizer.step()
        if counter%10==0:
            print("Epoch: {}/{}".format(e,epochs),
                         "\tIteration: {}".format(counter),
                         "\tTrain Loss: {:.3f}".format(loss.item()),
                         "\tTrain Accuracy: {:.2f}".format(accuracy.item()*100/batch_size))
            QRNN_acc.append(accuracy.item()*100/batch_size)
        if counter%599==0:
                with torch.no_grad():
                    model.eval()
                    val_acc = []
                    val_loss = []
                    for inputs, labels in val_dl:
                        inputs_val, labels_val = inputs.cuda(), labels.cuda()
                        logits_val,accuracy_val = model(inputs_val,labels_val)
                        loss_val = criterion(logits_val,labels_val.float())
                        val_acc.append(accuracy_val.item()*100/batch_size)
                        val_loss.append(loss_val.item())
                    print("Val Loss: {:.3f}".format(np.mean(val_loss)), "\tVal Acc: {:.3f}".format(np.mean(val_acc)))
                    QRNN_valacc.append(np.mean(val_acc))
                    model.train()
        counter += 1
    print("Time to train epoch: {0} s".format(time.time()-start_time)) 

with torch.no_grad():
    model.eval()
    test_acc = []
    test_loss = []
    for inputs, labels in test_dl:
        input_test, labels_test = inputs.cuda(), labels.cuda()
        logits_test, accuracy_test = model(input_test, labels_test)
        loss_test = criterion(logits_test, labels_test.float())
        test_acc.append(accuracy_test.item()*100/batch_size)
        test_loss.append(loss_test.item())
    print("Test Loss: {:.3f}".format(np.mean(test_loss)), "\tTest Acc: {:.3f}".format(np.mean(test_acc)))

4998
5000
Epoch: 0/10 	Iteration: 0 	Train Loss: 0.696 	Train Accuracy: 46.00
Val Loss: 0.726 	Val Acc: 50.610
Epoch: 0/10 	Iteration: 10 	Train Loss: 0.698 	Train Accuracy: 48.00
Epoch: 0/10 	Iteration: 20 	Train Loss: 0.715 	Train Accuracy: 48.00
Epoch: 0/10 	Iteration: 30 	Train Loss: 0.669 	Train Accuracy: 56.00
Epoch: 0/10 	Iteration: 40 	Train Loss: 0.675 	Train Accuracy: 62.00
Epoch: 0/10 	Iteration: 50 	Train Loss: 0.698 	Train Accuracy: 56.00
Epoch: 0/10 	Iteration: 60 	Train Loss: 0.691 	Train Accuracy: 48.00
Epoch: 0/10 	Iteration: 70 	Train Loss: 0.710 	Train Accuracy: 48.00
Epoch: 0/10 	Iteration: 80 	Train Loss: 0.681 	Train Accuracy: 64.00
Epoch: 0/10 	Iteration: 90 	Train Loss: 0.703 	Train Accuracy: 52.00
Epoch: 0/10 	Iteration: 100 	Train Loss: 0.676 	Train Accuracy: 52.00
Epoch: 0/10 	Iteration: 110 	Train Loss: 0.700 	Train Accuracy: 54.00
Epoch: 0/10 	Iteration: 120 	Train Loss: 0.686 	Train Accuracy: 60.00
Epoch: 0/10 	Iteration: 130 	Train Loss: 0.695 	Train Accu

Epoch: 1/10 	Iteration: 1160 	Train Loss: 0.311 	Train Accuracy: 80.00
Epoch: 1/10 	Iteration: 1170 	Train Loss: 0.207 	Train Accuracy: 96.00
Epoch: 1/10 	Iteration: 1180 	Train Loss: 0.335 	Train Accuracy: 88.00
Epoch: 1/10 	Iteration: 1190 	Train Loss: 0.498 	Train Accuracy: 84.00
Val Loss: 0.377 	Val Acc: 83.770
Time to train epoch: 15.074833393096924 s
Epoch: 2/10 	Iteration: 1200 	Train Loss: 0.165 	Train Accuracy: 94.00
Epoch: 2/10 	Iteration: 1210 	Train Loss: 0.250 	Train Accuracy: 88.00
Epoch: 2/10 	Iteration: 1220 	Train Loss: 0.323 	Train Accuracy: 80.00
Epoch: 2/10 	Iteration: 1230 	Train Loss: 0.254 	Train Accuracy: 86.00
Epoch: 2/10 	Iteration: 1240 	Train Loss: 0.266 	Train Accuracy: 90.00
Epoch: 2/10 	Iteration: 1250 	Train Loss: 0.237 	Train Accuracy: 86.00
Epoch: 2/10 	Iteration: 1260 	Train Loss: 0.142 	Train Accuracy: 96.00
Epoch: 2/10 	Iteration: 1270 	Train Loss: 0.323 	Train Accuracy: 86.00
Epoch: 2/10 	Iteration: 1280 	Train Loss: 0.250 	Train Accuracy: 94.00
Ep

Epoch: 3/10 	Iteration: 2300 	Train Loss: 0.100 	Train Accuracy: 96.00
Epoch: 3/10 	Iteration: 2310 	Train Loss: 0.174 	Train Accuracy: 96.00
Epoch: 3/10 	Iteration: 2320 	Train Loss: 0.233 	Train Accuracy: 88.00
Epoch: 3/10 	Iteration: 2330 	Train Loss: 0.280 	Train Accuracy: 90.00
Epoch: 3/10 	Iteration: 2340 	Train Loss: 0.173 	Train Accuracy: 92.00
Epoch: 3/10 	Iteration: 2350 	Train Loss: 0.050 	Train Accuracy: 100.00
Epoch: 3/10 	Iteration: 2360 	Train Loss: 0.152 	Train Accuracy: 94.00
Epoch: 3/10 	Iteration: 2370 	Train Loss: 0.161 	Train Accuracy: 94.00
Epoch: 3/10 	Iteration: 2380 	Train Loss: 0.074 	Train Accuracy: 98.00
Epoch: 3/10 	Iteration: 2390 	Train Loss: 0.205 	Train Accuracy: 90.00
Val Loss: 0.440 	Val Acc: 84.740
Time to train epoch: 15.554868936538696 s
Epoch: 4/10 	Iteration: 2400 	Train Loss: 0.076 	Train Accuracy: 98.00
Epoch: 4/10 	Iteration: 2410 	Train Loss: 0.131 	Train Accuracy: 94.00
Epoch: 4/10 	Iteration: 2420 	Train Loss: 0.092 	Train Accuracy: 98.00
E

Epoch: 5/10 	Iteration: 3430 	Train Loss: 0.040 	Train Accuracy: 100.00
Epoch: 5/10 	Iteration: 3440 	Train Loss: 0.036 	Train Accuracy: 98.00
Epoch: 5/10 	Iteration: 3450 	Train Loss: 0.171 	Train Accuracy: 96.00
Epoch: 5/10 	Iteration: 3460 	Train Loss: 0.062 	Train Accuracy: 96.00
Epoch: 5/10 	Iteration: 3470 	Train Loss: 0.088 	Train Accuracy: 96.00
Epoch: 5/10 	Iteration: 3480 	Train Loss: 0.025 	Train Accuracy: 100.00
Epoch: 5/10 	Iteration: 3490 	Train Loss: 0.115 	Train Accuracy: 92.00
Epoch: 5/10 	Iteration: 3500 	Train Loss: 0.036 	Train Accuracy: 100.00
Epoch: 5/10 	Iteration: 3510 	Train Loss: 0.103 	Train Accuracy: 94.00
Epoch: 5/10 	Iteration: 3520 	Train Loss: 0.100 	Train Accuracy: 94.00
Epoch: 5/10 	Iteration: 3530 	Train Loss: 0.062 	Train Accuracy: 96.00
Epoch: 5/10 	Iteration: 3540 	Train Loss: 0.092 	Train Accuracy: 96.00
Epoch: 5/10 	Iteration: 3550 	Train Loss: 0.028 	Train Accuracy: 100.00
Epoch: 5/10 	Iteration: 3560 	Train Loss: 0.026 	Train Accuracy: 100.00
E

Epoch: 7/10 	Iteration: 4560 	Train Loss: 0.048 	Train Accuracy: 100.00
Epoch: 7/10 	Iteration: 4570 	Train Loss: 0.073 	Train Accuracy: 96.00
Epoch: 7/10 	Iteration: 4580 	Train Loss: 0.044 	Train Accuracy: 98.00
Epoch: 7/10 	Iteration: 4590 	Train Loss: 0.032 	Train Accuracy: 100.00
Epoch: 7/10 	Iteration: 4600 	Train Loss: 0.089 	Train Accuracy: 94.00
Epoch: 7/10 	Iteration: 4610 	Train Loss: 0.061 	Train Accuracy: 98.00
Epoch: 7/10 	Iteration: 4620 	Train Loss: 0.070 	Train Accuracy: 96.00
Epoch: 7/10 	Iteration: 4630 	Train Loss: 0.031 	Train Accuracy: 100.00
Epoch: 7/10 	Iteration: 4640 	Train Loss: 0.010 	Train Accuracy: 100.00
Epoch: 7/10 	Iteration: 4650 	Train Loss: 0.056 	Train Accuracy: 98.00
Epoch: 7/10 	Iteration: 4660 	Train Loss: 0.010 	Train Accuracy: 100.00
Epoch: 7/10 	Iteration: 4670 	Train Loss: 0.022 	Train Accuracy: 100.00
Epoch: 7/10 	Iteration: 4680 	Train Loss: 0.015 	Train Accuracy: 100.00
Epoch: 7/10 	Iteration: 4690 	Train Loss: 0.057 	Train Accuracy: 98.00

Epoch: 9/10 	Iteration: 5690 	Train Loss: 0.053 	Train Accuracy: 98.00
Epoch: 9/10 	Iteration: 5700 	Train Loss: 0.015 	Train Accuracy: 100.00
Epoch: 9/10 	Iteration: 5710 	Train Loss: 0.001 	Train Accuracy: 100.00
Epoch: 9/10 	Iteration: 5720 	Train Loss: 0.010 	Train Accuracy: 100.00
Epoch: 9/10 	Iteration: 5730 	Train Loss: 0.006 	Train Accuracy: 100.00
Epoch: 9/10 	Iteration: 5740 	Train Loss: 0.010 	Train Accuracy: 100.00
Epoch: 9/10 	Iteration: 5750 	Train Loss: 0.011 	Train Accuracy: 100.00
Epoch: 9/10 	Iteration: 5760 	Train Loss: 0.020 	Train Accuracy: 100.00
Epoch: 9/10 	Iteration: 5770 	Train Loss: 0.008 	Train Accuracy: 100.00
Epoch: 9/10 	Iteration: 5780 	Train Loss: 0.008 	Train Accuracy: 100.00
Epoch: 9/10 	Iteration: 5790 	Train Loss: 0.005 	Train Accuracy: 100.00
Epoch: 9/10 	Iteration: 5800 	Train Loss: 0.010 	Train Accuracy: 100.00
Epoch: 9/10 	Iteration: 5810 	Train Loss: 0.034 	Train Accuracy: 98.00
Epoch: 9/10 	Iteration: 5820 	Train Loss: 0.016 	Train Accuracy: 1