In [1]:
import torch
import torch.nn as nn
from torch import optim
import torch.nn.functional as F
import random
import numpy as np
from torch.nn.utils import rnn

In [135]:
from collections import Counter
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from nltk.stem import PorterStemmer

In [136]:
import string

In [138]:
def clean(translator,stop_words,ps,**kwargs): # obtains a word to index mapping as well
    count = 0
    vocabulary = Counter()
    w2i = {}
    for label,fname in kwargs.items():
        with open(fname+'_cleaned','w') as ft:
            with open(fname) as fs:
                for line in fs:
                    count+=1
                    label = line[0]
                    review = line[2:]
                    label = line[0]
                    review = line[2:]
                    sents = review.strip().split('.')
                    sents = [[ps.stem(w) for w in word_tokenize(s.translate(translator)) if w not in stop_words] \
                             for s in sents if len(s)>1]
                    words = sum(sents,[])
                    for w in words:
                        vocabulary[w] = 1
                    rev = '.'.join([' '.join(s) for s in sents])
                    ft.write(label+','+rev)
                    ft.write('\n')
                    if count%1000000==0:  
                        print('cleaned reviews: ',count)
                        
    count = 1
    for w in vocabulary:
        w2i[w]=count
        count+=1
    return w2i    

In [139]:
translator = str.maketrans('', '', string.punctuation)
stop_words = set(stopwords.words('english'))
ps = PorterStemmer()
w2i = clean(translator,stop_words,ps,file='../Data/test_s.csv')

In [140]:
len(w2i)                    

644

In [141]:
def text2tensor(review,w2i):
    out = [[w2i[w] for w in sents.split()] for sents in review if len(sents)>0] 
    return out

In [142]:
def creatingDataset(fname,w2i):  # dictionary of list of tuples (rev,label)
    dataset = {}
    with open(fname+'_cleaned') as fs:
        for line in fs:
            label = int(line[0])
            review = line[2:]
            temp = review.strip().split('.')
            length = len(temp)
            if length not in dataset:
                dataset[length] = []
            encoded_review = text2tensor(temp,w2i)
            dataset[length].append((encoded_review,label))
    return dataset    

In [143]:
train_dataset = creatingDataset('../Data/test_s.csv',w2i)

In [144]:
for key in train_dataset:
    print(key,len(train_dataset[key]))

6 2
3 13
23 1
2 4
9 1
5 3
11 1


In [11]:
train_dataset.keys()

dict_keys([6, 3, 23, 2, 9, 5, 11])

In [150]:
def createBatches(dataset,batch_size): # generator implementation 
    batch = [] # return a batch of datapoints based on batch_size
    lengths = list(dataset.keys())
    lengths.sort(reverse=True)
    size = 0
    sent_length = []
    for l in lengths:
        for doc in dataset[l]:
            if len(batch)>0:
                curr_len = len(batch[-1][0])
            else:
                curr_len = l
            diff = curr_len - len(doc[0]) 
            if diff<=2 and diff>0:
                size+=1
                batch.append(doc)
                sent_length.append(len(doc[0]))
            elif diff==0:
                batch.append(doc)
                sent_length.append(len(doc[0]))
                size+=1
            else:
                if len(batch)>0:
                    yield (batch,sent_length)
                batch = [doc]
                sent_length = [len(doc[0])]
                size = 1
                
            if size==batch_size:
                yield (batch,sent_length)
                batch = []
                sent_length = []
                size = 0
    yield (batch,sent_length)       

In [196]:
def mergeSentences(batch,lengths):
    sent = []
    label = []
    for review,l in batch:
        sent+=review
        label.append(l)
    return sent,label

In [257]:
wordEnc = wordEncoder(644,15,20,15,0)

In [258]:
sentEnc = sentenceEncoder(15,20,15,2)

In [232]:
for batch,lengths in createBatches(train_dataset,4):
    if len(lengths)<3:
        continue
    sent,label = mergeSentences(batch,lengths)
    sentence_length = [len(s) for s in sent]
    sent = np.array(list(itertools.zip_longest(*sent, fillvalue=0))).T
    X = torch.from_numpy(sent)
    X_lengths = torch.LongTensor(sentence_length)
    X,X_lengths,mapped_index = sortbylength(X,X_lengths)
    batch_size = len(sentence_length)
    
    sent_out = wordEnc(X,X_lengths,batch_size)
    sent_out = sent_out.squeeze()[mapped_index,:]
    
    curr_length = lengths[0]
    
    review_batch = torch.Tensor()
    
    r = 0
    c = sent_out.shape[1]
    for l in lengths:
        if l==curr_length:
            review_batch = torch.cat((review_batch,sent_out[r:r+l,:]))
            r+=l
        else:
            diff = curr_length-l
            review_batch = torch.cat((review_batch,sent_out[r:r+l,:],torch.zeros(diff,c)))
            r+=l
            
    review_batch = review_batch.view(len(lengths),-1,c)
    
    output = sentEnc(review_batch,torch.LongTensor(lengths),len(lengths))
    
    print(output)
    
    break

torch.Size([22, 33, 20])
tensor([21, 20, 15,  7, 13, 18,  0,  1,  2,  4,  3, 19, 10,  9, 17, 14,  5, 11,
        16,  8, 12,  6])
torch.Size([1, 22, 15])
torch.Size([22, 15])
tensor([[[0.4688, 0.4144],
         [0.4674, 0.4158],
         [0.4672, 0.4146],
         [0.4668, 0.4159]]], grad_fn=<SigmoidBackward>)


In [219]:
sent_out = sent_out.squeeze(dim=0)

tensor([[ 0.2259,  0.1208, -0.1167,  0.2175,  0.0648, -0.0127, -0.1858, -0.1479,
         -0.2402,  0.0225,  0.1729, -0.0833,  0.0250, -0.1427, -0.0821],
        [ 0.1890,  0.1215, -0.2516,  0.2067,  0.0165,  0.0355, -0.1486, -0.1060,
         -0.3151,  0.0328,  0.1289, -0.0100,  0.0554, -0.0558,  0.0320],
        [ 0.2342,  0.0826, -0.0822,  0.0912, -0.0436, -0.0420, -0.2195, -0.0939,
         -0.3350,  0.0054,  0.1092,  0.0267,  0.0347, -0.0588,  0.1323],
        [ 0.1675,  0.0539, -0.1533,  0.1286,  0.0015, -0.0280, -0.1306, -0.1284,
         -0.2468,  0.1079,  0.0877,  0.0684,  0.0168,  0.0053,  0.0494],
        [ 0.2044,  0.1118, -0.1821,  0.1809, -0.0049,  0.0298, -0.2043, -0.1616,
         -0.2614,  0.0036,  0.1073, -0.0557,  0.0031, -0.0318, -0.0502]],
       grad_fn=<SliceBackward>)

In [177]:
import itertools
sent = np.array(list(itertools.zip_longest(*sent, fillvalue=0))).T

In [179]:
sent = torch.from_numpy(sent)

In [181]:
lengths

[6, 6, 5, 5]

In [14]:
padding_idx = 0
embed = nn.Embedding(5, 10, padding_idx=padding_idx)

In [54]:
w = torch.tensor([1,2],dtype=torch.long)
embed(w)

tensor([[-0.9230,  0.6575, -1.4323, -0.8788,  0.8231,  0.5026,  0.8340,  1.1535,
         -0.9093, -0.5471],
        [-0.3955, -1.6489,  1.7715,  1.4298,  1.9349, -0.1394, -0.9060, -0.1989,
         -0.5451,  0.9496]], grad_fn=<EmbeddingBackward>)

In [293]:
class wordEncoder(nn.Module):
    def __init__(self,input_size,encoding_size,hidden_size,output_size,padding_idx):
        super(wordEncoder,self).__init__()
        self.hidden_size = hidden_size
        self.batch_first = True

        self.embedding = nn.Embedding(input_size, encoding_size, padding_idx=padding_idx)
        self.e2i = nn.Linear(encoding_size, hidden_size)
        self.gru = nn.GRU(hidden_size, hidden_size, batch_first=True, bidirectional=True)
        self.h2o = nn.Linear(2*hidden_size, output_size)
        self.tanh = nn.Tanh()
        self.u_w = nn.Parameter(torch.rand(output_size)) # word context
        self.softmax = nn.Softmax(dim=0)
        
    def forward(self, X, X_lengths, batch_size):
        
        self.hidden = self.initHidden(batch_size)
        X = self.embedding(X)
        X = self.e2i(X)

        X = rnn.pack_padded_sequence(X, X_lengths, batch_first=True)

        X, self.hidden = self.gru(X, self.hidden)

        X, _ = torch.nn.utils.rnn.pad_packed_sequence(X, batch_first=True)

        H = torch.unbind(X,dim=0)
        X = self.h2o(X)
        X = self.tanh(X)
        
        Y = torch.unbind(X,dim=0)
        
        Y_1 = torch.Tensor()
        for i in range(X_lengths.shape[0]):
            x = self.softmax(torch.sum(Y[i][:X_lengths[i].item()]*self.u_w,dim=1)).view(-1,1)
            Y_1 = torch.cat((Y_1,torch.sum(H[i][:X_lengths[i].item()]*x,dim=0).view(1,1,-1)),dim=1)
        
        return Y_1
        
    def initHidden(self,batch_size):
         return torch.zeros(2, batch_size, self.hidden_size)    

In [4]:
inp = torch.LongTensor(4,5).random_(1,10)

In [5]:
length = torch.LongTensor([5,5,4,3])

In [256]:
word_encoder = wordEncoder(10,15,20,15,0)

In [19]:
out = word_encoder(inp,length,4)

torch.Size([4, 5, 20])


In [9]:
out

tensor([[[-0.0998, -0.1925,  0.1959, -0.2869, -0.0759, -0.3332,  0.0517,
           0.2547,  0.2556,  0.0702,  0.2077, -0.2388,  0.0432,  0.2740,
           0.1155],
         [ 0.0053, -0.2452,  0.1693, -0.1561,  0.0096, -0.2666,  0.0875,
           0.1585,  0.2412,  0.1068,  0.1697, -0.2195,  0.1377,  0.0993,
           0.0125],
         [ 0.0769, -0.0422,  0.1295, -0.1372,  0.0156, -0.3321,  0.0245,
           0.2075,  0.1447,  0.1233,  0.1157, -0.2569,  0.2089,  0.0249,
           0.0300],
         [-0.1507, -0.2629,  0.1344, -0.1698, -0.0832, -0.1477,  0.1191,
           0.1807,  0.1812,  0.0761,  0.1205, -0.2088, -0.0027,  0.1596,
           0.0796]]], grad_fn=<CatBackward>)

In [10]:
inp_1 = torch.LongTensor(3,5).random_(1,10)
length_1 = torch.LongTensor([5,5,5]) 

In [11]:
out_1 = word_encoder(inp_1,length_1,3)

In [22]:
out_1.shape

torch.Size([1, 3, 15])

In [30]:
out_1.squeeze().shape

torch.Size([3, 15])

In [31]:
z = torch.zeros(1,15)

In [32]:
z.shape

torch.Size([1, 15])

In [34]:
out_1 = torch.cat((out_1.squeeze(),z)).unsqueeze(dim=0)

In [35]:
out_1.shape

torch.Size([1, 4, 15])

In [36]:
out.shape

torch.Size([1, 4, 15])

In [37]:
input_s = torch.cat((out,out_1),dim=0)

In [38]:
input_s.shape

torch.Size([2, 4, 15])

In [13]:
out_1.shape

torch.Size([1, 3, 15])

In [14]:
out.shape

torch.Size([1, 4, 15])

In [292]:
class sentenceEncoder(nn.Module):
    def __init__(self,input_size,hidden_size,repr_size,output_size):
        super(sentenceEncoder,self).__init__()
        self.input_size = input_size
        self.hidden_size = hidden_size
        
        self.gru = nn.GRU(input_size, hidden_size, batch_first=True, bidirectional=True)
        self.g2r = nn.Linear(2*hidden_size,repr_size)
        self.tanh = nn.Tanh()
        self.u_s = nn.Parameter(torch.rand(repr_size)) # sentence context
        self.softmax = nn.Softmax(dim=0)
        self.r2o = nn.Linear(2*hidden_size,output_size)
        self.sigmoid = nn.Sigmoid()
        
    def forward(self, X, X_lengths, batch_size):
        self.hidden = self.initHidden(batch_size)

        X = rnn.pack_padded_sequence(X, X_lengths, batch_first=True)

        X, self.hidden = self.gru(X, self.hidden)

        X, _ = torch.nn.utils.rnn.pad_packed_sequence(X, batch_first=True)

        H = torch.unbind(X,dim=0) # hidden state obtained from each sentence    
        
        X = self.g2r(X)
        X = self.tanh(X)
        
        Y = torch.unbind(X,dim=0)
        
        Y_1 = torch.Tensor()
        for i in range(X_lengths.shape[0]):
            x = self.softmax(torch.sum(Y[i][:X_lengths[i].item()]*self.u_s,dim=1)).view(-1,1)
            Y_1 = torch.cat((Y_1,torch.sum(H[i][:X_lengths[i].item()]*x,dim=0).view(1,1,-1)),dim=1)
        
        output = self.r2o(Y_1)
        output = self.sigmoid(output)
        
        return output
        
    def initHidden(self,batch_size):
         return torch.zeros(2, batch_size, self.hidden_size)

In [55]:
sent_enc = sentenceEncoder(15,20,15,2)

In [40]:
inp_s_length = torch.LongTensor([4,3])

In [56]:
sent_enc(input_s,inp_s_length,2)

tensor([[[0.5318, 0.5057],
         [0.5416, 0.5014]]], grad_fn=<SigmoidBackward>)

In [57]:
# inp (4,5), inp_1 (3,5), length (5,5,4,3), length_1 (5,5,5), 
len_sent = torch.LongTensor([4,3])

In [58]:
all_sent = torch.cat((inp,inp_1),dim=0)
all_sent_len = torch.cat((length,length_1)) 

In [63]:
all_sent[2,4] = 0
all_sent[3,4] = 0
all_sent[3,3] = 0

In [64]:
all_sent

tensor([[3, 3, 8, 3, 2],
        [8, 7, 7, 5, 8],
        [9, 7, 6, 9, 0],
        [1, 1, 8, 0, 0],
        [8, 2, 2, 4, 3],
        [7, 5, 1, 3, 8],
        [1, 7, 6, 7, 3]])

In [65]:
all_sent_len

tensor([5, 5, 4, 3, 5, 5, 5])

In [82]:
labels = torch.LongTensor([0,1])

In [225]:
# concatenating sentences from all reviews in a batch and then sorting them based on length would change the order
# to keep track of the sequence of the sentences we need to remember the original mapping
# this routine keeps track of the sequence of the sentences among all the reviews
# need when passing to the sentence encoder...

def originalMap(indices):  
    count = 0
    m = {}
    for i in indices:
        m[i.item()] = count
        count+=1

    m_p = []    
    for i,val in sorted(m.items(),key=lambda x:x[0]):
        m_p.append(val)

    return torch.LongTensor(m_p)

In [77]:
def sortbylength(all_sent,all_sent_len):
    sorted_lengths, indices = torch.sort(all_sent_len,descending=True)
    mapped_index = originalMap(indices)
    return all_sent[torch.LongTensor(indices),:],sorted_lengths,mapped_index

In [78]:
X,X_lengths,mapped_index = sortbylength(all_sent,all_sent_len)

In [83]:
data_train = (all_sent,all_sent_len,labels)

In [103]:
all_sent.shape

torch.Size([7, 5])

In [289]:
wordEnc = wordEncoder(644,15,20,15,0)
sentEnc = sentenceEncoder(40,20,15,2)

In [290]:
def train(wordEnc,sentEnc,train_dataset,batch_size=4,epochs=1,learning_rate=0.001):
    
    wordEnc_optimizer = optim.Adam(wordEnc.parameters(), lr=learning_rate)
    sentEnc_optimizer = optim.Adam(sentEnc.parameters(), lr=learning_rate)
    
    criterion = nn.CrossEntropyLoss()
    
    for _ in range(epochs):
        for batch,lengths in createBatches(train_dataset,4):
            if len(lengths)<3:
                continue
            sent,label = mergeSentences(batch,lengths)
            label = torch.LongTensor(label)
            print(label)
            sentence_length = [len(s) for s in sent]
            sent = np.array(list(itertools.zip_longest(*sent, fillvalue=0))).T
            X = torch.from_numpy(sent)
            X_lengths = torch.LongTensor(sentence_length)
            X,X_lengths,mapped_index = sortbylength(X,X_lengths)
            batch_size = len(sentence_length)

            sent_out = wordEnc(X,X_lengths,batch_size)
            print('output from word encoder obtained')
            print(sent_out.shape)
            
            sent_out = sent_out.squeeze()[mapped_index,:]

            curr_length = lengths[0]

            review_batch = torch.Tensor()

            r = 0
            c = sent_out.shape[1]
            for l in lengths:
                if l==curr_length:
                    review_batch = torch.cat((review_batch,sent_out[r:r+l,:]))
                    r+=l
                else:
                    diff = curr_length-l
                    review_batch = torch.cat((review_batch,sent_out[r:r+l,:],torch.zeros(diff,c)))
                    r+=l

            review_batch = review_batch.view(len(lengths),-1,c)

            print('review batch')
            print(review_batch.shape)
            output = sentEnc(review_batch,torch.LongTensor(lengths),len(lengths))
            
            print(output.shape)
            
            loss = criterion(output.squeeze(),label)
            
            print(loss)
            wordEnc_optimizer.zero_grad()
            sentEnc_optimizer.zero_grad()
            loss.backward()
            sentEnc_optimizer.step()
            wordEnc_optimizer.step()
            break

            

In [291]:
train(wordEnc,sentEnc,train_dataset)

tensor([0, 1, 0, 0])
hidden_size
torch.Size([33, 40])
output from word encoder obtained
torch.Size([1, 22, 40])
review batch
torch.Size([4, 6, 40])
output from GRU obtained
size of hidden state
torch.Size([6, 40])
linear and tanh transformation done
torch.Size([4, 6, 15])
torch.Size([1, 4, 2])
tensor(0.6865, grad_fn=<NllLossBackward>)


In [118]:
wordEnc = wordEncoder(10,15,20,15,0)
sentEnc = sentenceEncoder(15,20,15,2)

In [105]:
X,X_lengths,mapped_index = sortbylength(data_train[0],data_train[1])

In [107]:
out = wordEnc(X,X_lengths,7)

torch.Size([7, 5, 20])


In [108]:
out.shape

torch.Size([1, 7, 15])

In [109]:
out_1 = torch.cat((out.squeeze(),torch.zeros(1,15))).unsqueeze(dim=0)

In [110]:
out_1.shape

torch.Size([1, 8, 15])

In [111]:
out_1 = out_1.view(2,-1,15)

In [112]:
out_1.shape

torch.Size([2, 4, 15])

In [116]:
sent_enc(out_1,torch.LongTensor([4,3]),2)

tensor([[[0.5392, 0.5036],
         [0.5387, 0.5031]]], grad_fn=<SigmoidBackward>)

In [131]:
train(wordEnc,sentEnc,data_train)

torch.Size([7, 5, 20])
tensor(0.6927, grad_fn=<NllLoss2DBackward>)


In [1]:
import pickle

In [2]:
with open('word2index.pickle','rb') as fs:
    w2i = pickle.load(fs)

In [4]:
def creatingDataset(fname,w2i):  # dictionary of list of tuples (rev,label)
    dataset = {}
    with open(fname+'_cleaned') as fs:
        for line in fs:
            label = int(line[0])
            review = line[2:]
            temp = review.strip().split('.')
            length = len(temp)
            if length not in dataset:
                dataset[length] = []
            encoded_review = text2tensor(temp,w2i)
            if len(encoded_review)>0:
                dataset[length].append((encoded_review,label))
    return dataset

In [6]:
def text2tensor(review,w2i):
    out = [[w2i[w] for w in sents.split()] for sents in review if len(sents)>0]
    return out

In [7]:
train_dataset = creatingDataset('../Data/train.csv', w2i)

In [34]:
def createBatches(dataset, batch_size):  # generator implementation
    batch = []  # return a batch of datapoints based on batch_size
    lengths = list(dataset.keys())
    lengths.sort()
    size = 0
    sent_length = []
    
    for l in lengths[:100]:
        for doc in dataset[l]:
            batch.append(doc)
            sent_length.append(len(doc[0]))
            size+=1
            if size==batch_size:
                yield(batch,sent_length)
                batch = []
                sent_length = []
                size = 0
                
        yield(batch,sent_length)
        batch = []
        sent_length = []
        size = 0

In [22]:
# distribution of sizes:
keys = list(train_dataset.keys())

In [24]:
keys.sort()

In [31]:
reviews = []
for k in keys:
    print(k,len(train_dataset[k]))
    reviews.append(len(train_dataset[k]))

1 2899549
2 5740726
3 5897990
4 3913050
5 2479805
6 1669319
7 1184796
8 872828
9 663894
10 518487
11 416298
12 337900
13 278558
14 232642
15 195719
16 166574
17 142636
18 122986
19 107156
20 93563
21 81876
22 72085
23 63604
24 56658
25 49760
26 43882
27 39101
28 35149
29 31613
30 27989
31 25111
32 22792
33 20464
34 18696
35 16859
36 15446
37 13678
38 12562
39 11543
40 10479
41 9574
42 8890
43 8033
44 7472
45 6873
46 6376
47 5855
48 5365
49 4987
50 4649
51 4212
52 3897
53 3599
54 3276
55 3075
56 2830
57 2672
58 2459
59 2258
60 2105
61 1957
62 1832
63 1737
64 1633
65 1612
66 1423
67 1276
68 1284
69 1191
70 1057
71 1071
72 989
73 917
74 905
75 815
76 748
77 770
78 647
79 634
80 609
81 566
82 574
83 503
84 472
85 449
86 404
87 387
88 371
89 387
90 378
91 357
92 311
93 315
94 270
95 268
96 280
97 243
98 242
99 220
100 196
101 196
102 173
103 211
104 184
105 159
106 157
107 196
108 149
109 130
110 114
111 152
112 131
113 103
114 115
115 94
116 116
117 117
118 100
119 101
120 98
121 85
122 79

In [33]:
sum(reviews)

28738514

In [37]:
count = 0
for batch,lengths in createBatches(train_dataset,128):
    count+=1
    if count%1000==0:
        print(count,len(batch),len(lengths))

1000 128 128
2000 128 128
3000 128 128
4000 128 128
5000 128 128
6000 128 128
7000 128 128
8000 128 128
9000 128 128
10000 128 128
11000 128 128
12000 128 128
13000 128 128
14000 128 128
15000 128 128
16000 128 128
17000 128 128
18000 128 128
19000 128 128
20000 128 128
21000 128 128
22000 128 128
23000 128 128
24000 128 128
25000 128 128
26000 128 128
27000 128 128
28000 128 128
29000 128 128
30000 128 128
31000 128 128
32000 128 128
33000 128 128
34000 128 128
35000 128 128
36000 128 128
37000 128 128
38000 128 128
39000 128 128
40000 128 128
41000 128 128
42000 128 128
43000 128 128
44000 128 128
45000 128 128
46000 128 128
47000 128 128
48000 128 128
49000 128 128
50000 128 128
51000 128 128
52000 128 128
53000 128 128
54000 128 128
55000 128 128
56000 128 128
57000 128 128
58000 128 128
59000 128 128
60000 128 128
61000 128 128
62000 128 128
63000 128 128
64000 128 128
65000 128 128
66000 128 128
67000 128 128
68000 128 128
69000 128 128
70000 128 128
71000 128 128
72000 128 128
7