In [59]:
import torch
import torch.nn as nn
from torch import optim
import torch.nn.functional as F
import random
import numpy as np

In [3]:
from collections import Counter
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from nltk.stem import PorterStemmer

In [7]:
import string

In [55]:
def clean(translator,stop_words,ps,**kwargs): # obtains a word to index mapping as well
    """Clean labelled review files and build a word-to-index vocabulary.

    Every keyword argument value is a path to a text file in which each line
    is ``<label char><separator char><review text>``. For each such file a
    sibling file ``<path>_cleaned`` is written holding one ``label,review``
    row per non-blank input line. In the cleaned review, sentences are
    separated by ``.`` and tokens by a single space.

    Per-sentence processing: apply ``translator`` via ``str.translate``,
    tokenize with ``word_tokenize``, drop tokens found in ``stop_words``,
    then stem the remaining tokens with ``ps.stem``.

    Args:
        translator: Table accepted by ``str.translate``.
        stop_words: Container of tokens to drop (tested with ``in``).
        ps: Object exposing ``stem(word) -> str``.
        **kwargs: ``name=path`` pairs; only the paths are used.

    Returns:
        dict mapping each distinct stem to a 1-based index, numbered in order
        of first appearance.

    Raises:
        OSError: if an input file cannot be opened. The input is opened
            before the output, so no empty ``_cleaned`` file is left behind.
    """
    count = 0
    vocabulary = Counter()
    for _name,fname in kwargs.items():
        # Source first: a missing input must not create an empty output file.
        with open(fname) as fs, open(fname+'_cleaned','w') as ft:
            for line in fs:
                if not line.strip():
                    # A blank line has neither label nor review; writing it
                    # out would produce a row that cannot be parsed later.
                    continue
                count+=1
                label = line[0]
                review = line[2:]
                # Raw sentences of length <= 1 are discarded before tokenizing.
                # NOTE(review): the stop-word test runs on the raw token, before
                # stemming — confirm that case-sensitive matching is intended.
                sents = [[ps.stem(w) for w in word_tokenize(s.translate(translator)) if w not in stop_words]
                         for s in review.strip().split('.') if len(s)>1]
                for sent in sents:
                    vocabulary.update(sent)
                rev = '.'.join(' '.join(sent) for sent in sents)
                ft.write(label+','+rev)
                ft.write('\n')
                if count%1000000==0:
                    print('cleaned reviews: ',count)

    # Index 0 is left unassigned; numbering starts at 1.
    return {w: idx for idx, w in enumerate(vocabulary, start=1)}

In [56]:
# Translation table that deletes every character in string.punctuation.
translator = str.maketrans('', '', string.punctuation)
# NLTK's English stop-word list, held as a set for fast membership tests.
stop_words = set(stopwords.words('english'))
ps = PorterStemmer()
# Side effect: writes '../Data/test_s.csv_cleaned'; returns the word -> index map.
w2i = clean(translator,stop_words,ps,file='../Data/test_s.csv')

In [57]:
# Vocabulary size: number of distinct stems indexed by clean().
len(w2i)                    

644

In [104]:
def text2tensor(review,w2i):
    """Encode a review (a sequence of sentence strings) as nested index lists.

    Empty sentence strings are skipped. Every other sentence is split on
    whitespace and each token is looked up in ``w2i``.

    Args:
        review: Iterable of sentence strings.
        w2i: Mapping from token to integer index.

    Returns:
        list of lists of indices, one inner list per non-empty sentence.

    Raises:
        KeyError: if a token is missing from ``w2i``.
    """
    encoded = []
    for sentence in review:
        if len(sentence) == 0:
            continue
        encoded.append([w2i[token] for token in sentence.split()])
    return encoded

In [105]:
def creatingDataset(fname,w2i):  # dictionary of list of tuples (rev,label)
    """Load ``fname + '_cleaned'`` and bucket encoded reviews by sentence count.

    Each non-blank line is expected to be ``<label>,<review>`` where the
    review's sentences are separated by ``.``. The review is encoded with
    ``text2tensor``.

    The bucket key is the number of sentences in the *encoded* review. Empty
    sentences are dropped by ``text2tensor``, so counting the raw ``.``-split
    pieces would file a review under a length it does not actually have.

    Args:
        fname: Path of the original file; ``'_cleaned'`` is appended.
        w2i: Mapping from token to integer index.

    Returns:
        dict mapping sentence count -> list of ``(encoded_review, label)``
        tuples, in file order within each bucket.

    Raises:
        ValueError: if the text before the first comma is not an integer.
        KeyError: if a token is missing from ``w2i`` (from ``text2tensor``).
    """
    dataset = {}
    with open(fname+'_cleaned') as fs:
        for line in fs:
            if not line.strip():
                continue  # tolerate blank lines instead of failing on int()
            # Split on the first comma so labels longer than one character work.
            label_text, _, review = line.partition(',')
            label = int(label_text)
            encoded_review = text2tensor(review.strip().split('.'),w2i)
            dataset.setdefault(len(encoded_review), []).append((encoded_review,label))
    return dataset

In [106]:
# Reads '../Data/test_s.csv_cleaned' and buckets the encoded reviews.
# NOTE(review): the variable is named train_dataset but is built from test_s.csv — confirm this is intended.
train_dataset = creatingDataset('../Data/test_s.csv',w2i)

In [107]:
# Bucket overview: one "<bucket key> <number of reviews>" line per bucket.
for key in train_dataset.keys():
    print(f"{key} {len(train_dataset[key])}")

6 2
3 13
23 1
2 4
9 1
5 3
11 1


In [83]:
# Bucket keys present in the dataset, in insertion order.
train_dataset.keys()

dict_keys([6, 3, 23, 2, 9, 5, 11])

In [108]:
def createBatches(dataset,batch_size,max_pad=2,pad_index=0): # generator implementation
    """Yield batches of documents with equal sentence counts.

    Buckets are visited from the largest key to the smallest. A document
    joins the current batch when it is at most ``max_pad`` sentences shorter
    than the batch's current length; the gap is filled with ``[pad_index]``
    sentences. Otherwise the current batch is yielded and the document
    starts a new one. A batch is also yielded as soon as it holds
    ``batch_size`` documents.

    Padding builds a new tuple and leaves ``dataset`` untouched, so the
    generator can be run repeatedly with the same result. Empty batches are
    never yielded.

    Args:
        dataset: dict mapping a length key -> list of documents, where
            ``doc[0]`` is the list of encoded sentences.
        batch_size: Maximum number of documents per batch (expected >= 1).
        max_pad: Largest number of padding sentences added to one document.
        pad_index: Token index placed in each padding sentence.

    Yields:
        list of documents; documents that needed padding are padded copies.
    """
    batch = []
    for length in sorted(dataset, reverse=True):
        for doc in dataset[length]:
            # An empty batch takes the bucket key as its target length.
            target_len = len(batch[-1][0]) if batch else length
            diff = target_len - len(doc[0])
            if 0 <= diff <= max_pad:
                if diff:
                    # Pad a copy; each padding sentence is a distinct list.
                    padded = list(doc[0]) + [[pad_index] for _ in range(diff)]
                    doc = (padded, *doc[1:])
                batch.append(doc)
            else:
                # Too short to pad (or longer than the batch): flush and restart.
                if batch:
                    yield batch
                batch = [doc]

            if len(batch) == batch_size:
                yield batch
                batch = []
    if batch:
        yield batch

In [109]:
# Dump every batch of up to 4 documents: batch size first, then each document
# followed by a dashed rule, then a double rule closing the batch.
for batch in createBatches(train_dataset,4):
    print(len(batch))
    for b in batch:
        print(b, '--------', sep='\n')
    print('==========')

1
([[78, 84, 85, 86, 87, 88, 89, 90, 91, 92, 93, 94, 95, 96, 97], [98, 99, 100, 101, 102, 103, 104, 105, 106, 107, 108, 109], [110, 111, 112, 113, 114, 115, 72, 116, 117, 118, 119, 120, 121, 122], [123, 124, 125, 126, 127, 128, 129, 90, 130], [131, 78, 132, 133, 134, 135, 136, 125, 137, 138, 139, 140, 141], [142, 143, 144, 145, 146, 147], [148, 110, 113, 98, 143, 149, 138, 150, 151, 152], [102, 153, 78, 154, 155, 156, 157, 158, 98], [159, 160, 161, 79, 100, 101, 125, 162, 163, 50, 164], [165, 166, 167, 168, 169, 170, 171, 172], [173, 110, 174, 60, 175, 96, 176, 138, 177, 178, 179, 40, 180], [68, 181, 182, 172], [52, 183, 184, 185, 136, 125, 11, 186, 187, 188, 189, 190, 125], [191, 192, 193, 41, 194, 195, 196, 197, 198, 199, 200, 136, 201, 202], [203, 184, 185, 156, 27, 204, 110, 205, 115], [34, 206, 207, 208, 28, 93, 209, 57, 58, 210], [52, 211, 53, 212, 213], [214, 187, 86, 215], [216, 217, 218, 219, 220, 221, 222, 223, 164, 224, 96, 225], [226, 6, 227, 109, 228, 229, 230, 231, 232], 

In [None]:
class WordEncoder(nn.Module):
    """Layers for a word-level encoder.

    Builds, in order: an embedding table, a linear projection from the
    embedding width to the recurrent width, a bidirectional multi-layer GRU,
    a linear output layer and a sigmoid.
    """
    def __init__(self,input_size,embedding_size,output_size,layers,padding_ids,hidden_size=None):
        """Construct the layers.

        Args:
            input_size: Number of rows in the embedding table.
            embedding_size: Width of each embedding vector.
            output_size: Width produced by ``self.out``.
            layers: Number of stacked GRU layers.
            padding_ids: Index passed to ``nn.Embedding`` as ``padding_idx``.
            hidden_size: Width of the projection and of the GRU state.
                Defaults to ``embedding_size`` when omitted.
                NOTE(review): the default is an assumption — confirm the
                intended recurrent width.
        """
        super(WordEncoder,self).__init__()
        # Previously read from an undefined name; now an explicit argument.
        if hidden_size is None:
            hidden_size = embedding_size
        self.input_size = input_size
        self.embedding_size = embedding_size
        self.hidden_size = hidden_size
        self.output_size = output_size
        self.layers = layers
        self.batch_first = True
        self.embedding = nn.Embedding(input_size, embedding_size, padding_idx=padding_ids)
        self.e2i = nn.Linear(embedding_size, hidden_size)
        self.gru = nn.GRU(hidden_size, hidden_size, batch_first=self.batch_first, num_layers=self.layers, bidirectional=True)
        # NOTE(review): a bidirectional GRU emits 2 * hidden_size features per
        # step; whatever feeds self.out must reduce the two directions to
        # hidden_size first — confirm against the forward pass.
        self.out = nn.Linear(hidden_size, output_size)
        self.sigmoid = nn.Sigmoid()