In [1]:
import codecs
import re
import random
import math
import torch
import torch.nn as nn
import torch.nn.functional as F
from torch.autograd import Variable

use_cuda = torch.cuda.is_available()


In [2]:
def load_data(fpath,label):
    data = []
    with codecs.open(fpath,'r','utf-8',errors='ignore') as f:
        lines = f.readlines()
        for l in lines:
            l = l.rstrip()
            data.append((l.split(' '),label))
    return data
pos = load_data('./dataset/rt-polarity.pos',1)
neg = load_data('./dataset/rt-polarity.neg',0)
data = pos+neg

In [3]:
max_sentence_len = max([len(sentence) for sentence, _ in data])

vocab=[]

for d, _ in data:
    for w in d:
        if w not in vocab:
            vocab.append(w)
#vocab = sorted(vocab)
vocab_size = len(vocab)


w2i = {w:i for i,w in enumerate(vocab)}
i2w = {i:w for i,w in enumerate(vocab)}

In [4]:
div_idx = (int)(len(data)*0.8)
random.shuffle(data)
train_data = data[:div_idx]
test_data = data[div_idx:]

In [5]:
class Net(nn.Module):
    def __init__(self,vocab_size,embd_size,out_chs,filter_heights):
        super(Net,self).__init__()
        self.embedding = nn.Embedding(vocab_size,embd_size)
        self.conv = nn.ModuleList([nn.Conv2d(1,out_chs,(fh,embd_size)) for fh in filter_heights])
        self.dropout = nn.Dropout(.5)
        self.fc1 = nn.Linear(out_chs*len(filter_heights),1)
        
    def forward(self,x):
        x = self.embedding(x)
        x = x.unsqueeze(1)
        x = [F.relu(conv(x)).squeeze(3) for conv in self.conv]
        x = [F.max_pool1d(i,i.size(2)).squeeze(2) for i in x]
        x = torch.cat(x,1)
        x = self.dropout(x)
        x = self.fc1(x)
        probs = F.sigmoid(x)
        return probs
        #return x

In [8]:
def train(model, data, batch_size, n_epoch):
    model.train() 
    if use_cuda:
        model.cuda()
    losses = []
    optimizer = torch.optim.Adadelta(model.parameters(),lr = 0.01)
    for epoch in range(n_epoch):
        epoch_loss = 0.0
        random.shuffle(data)
        for i in range(0, len(data)-batch_size, batch_size): 
            in_data, labels = [], []
            for sentence, label in data[i: i+batch_size]:
                index_vec = [w2i[w] for w in sentence]
                pad_len = max(0, max_sentence_len - len(index_vec))
                index_vec += [0] * pad_len
                index_vec = index_vec[:max_sentence_len] 
                in_data.append(index_vec)
                labels.append(label)
            sent_var = Variable(torch.LongTensor(in_data))
            if use_cuda: sent_var = sent_var.cuda()

            target_var = Variable(torch.Tensor(labels).unsqueeze(1))
            if use_cuda: target_var = target_var.cuda()
            optimizer.zero_grad()
            probs = model(sent_var)
            loss = F.binary_cross_entropy(probs, target_var)
            loss.backward()
            optimizer.step()
            
            epoch_loss += loss.data[0]
        print('epoch: {:d}, loss: {:.3f}'.format(epoch, epoch_loss))
        losses.append(epoch_loss)
    print('Training avg loss: {:.3f}'.format(sum(losses)/len(losses)))
        
    return model, losses

def test(model, data, n_test, min_sentence_len):
    model.eval()
    loss = 0
    correct = 0
    for sentence, label in data[:n_test]:
        if len(sentence) < min_sentence_len:  
            continue
        index_vec = [w2i[w] for w in sentence]
        sent_var = Variable(torch.LongTensor([index_vec]))
        if use_cuda: sent_var = sent_var.cuda()
        out = model(sent_var)
        score = out.data[0][0]
        pred = 1 if score > .5 else 0
        if pred == label:
            correct += 1
        loss += math.pow((label-score), 2)
    print('Test acc: {:.3f} ({:d}/{:d})'.format(correct/n_test, correct, n_test))
    print('Test loss: {:.3f}'.format(loss/n_test))
    
out_ch = 100
embd_size = 300
batch_size = 50
n_epoch = 100
filter_size = [3,4,5]
print('filter : ',filter_size)
model = Net(vocab_size,embd_size,out_ch,filter_size)
model,losses = train(model,train_data,batch_size,n_epoch)
test(model,test_data,len(test_data),max(filter_size))
print('')

filter :  [3, 4, 5]




epoch: 0, loss: 124.552
epoch: 1, loss: 121.787
epoch: 2, loss: 118.910
epoch: 3, loss: 117.775
epoch: 4, loss: 114.830
epoch: 5, loss: 113.177
epoch: 6, loss: 111.654
epoch: 7, loss: 110.186
epoch: 8, loss: 109.919
epoch: 9, loss: 107.810
epoch: 10, loss: 106.595
epoch: 11, loss: 105.173
epoch: 12, loss: 103.547
epoch: 13, loss: 103.499
epoch: 14, loss: 102.050
epoch: 15, loss: 100.896
epoch: 16, loss: 99.516
epoch: 17, loss: 98.870
epoch: 18, loss: 97.758
epoch: 19, loss: 96.275
epoch: 20, loss: 95.927
epoch: 21, loss: 94.593
epoch: 22, loss: 94.019
epoch: 23, loss: 92.675
epoch: 24, loss: 91.722
epoch: 25, loss: 90.320
epoch: 26, loss: 90.041
epoch: 27, loss: 88.505
epoch: 28, loss: 87.740
epoch: 29, loss: 86.762
epoch: 30, loss: 86.267
epoch: 31, loss: 84.732
epoch: 32, loss: 83.893
epoch: 33, loss: 83.227
epoch: 34, loss: 82.342
epoch: 35, loss: 80.500
epoch: 36, loss: 79.936
epoch: 37, loss: 78.745
epoch: 38, loss: 77.818
epoch: 39, loss: 76.878
epoch: 40, loss: 76.183
epoch: 41,