In [13]:
import codecs
import random
import math
import torch
import torch.nn as nn
import torch.nn.functional as F
from torch.autograd import Variable
from sklearn.utils import shuffle
from gensim.models.keyedvectors import KeyedVectors
from gensim.test.utils import datapath
import numpy as np
import argparse
import copy
use_cuda = torch.cuda.is_available()

In [14]:
def load_data(fpath,label):
    data = []
    with codecs.open(fpath,'r','utf-8',errors='ignore') as f:
        lines = f.readlines()
        for l in lines:
            l = l.rstrip()
            data.append((l.split(' '),label))
    return data
pos = load_data('./dataset/rt-polarity.pos',1)
neg = load_data('./dataset/rt-polarity.neg',0)
data = pos+neg

In [15]:
max_sentence_len = max([len(sentence) for sentence, _ in data])

vocab = []

for d, _ in data:
    for w in d:
        if w not in vocab:
            vocab.append(w)
vocab = sorted(vocab)
vocab_size = len(vocab)

w2i = {w:i for i,w in enumerate(vocab)}
i2w = {i:w for i,w in enumerate(vocab)}
word_vectors = KeyedVectors.load_word2vec_format('./dataset/GoogleNews-vectors-negative300.bin', binary=True)
wv_matrix = []
for i in range(len(vocab)):
            word = i2w[i]
            if word in word_vectors.vocab:
                wv_matrix.append(word_vectors.word_vec(word))
            else:
                wv_matrix.append(np.random.uniform(-0.01, 0.01, 300).astype("float32"))

wv_matrix.append(np.random.uniform(-0.01, 0.01, 300).astype("float32"))
wv_matrix.append(np.zeros(300).astype("float32"))
wv_matrix = np.array(wv_matrix)

In [16]:
div_idx = (int)(len(data)*0.8)
random.shuffle(data)
train_data = data[:div_idx]
test_data = data[div_idx:]

In [17]:
class Net(nn.Module):
    def __init__(self,vocab_size,embd_size,out_chs,filter_heights,pretrained_vec):
        super(Net,self).__init__()
        self.embedding = nn.Embedding(vocab_size+2,embd_size)
        self.embedding.weight.data.copy_(torch.from_numpy(pretrained_vec))
        self.embedding.weight.requires_grad = False
        self.conv = nn.ModuleList([nn.Conv2d(1,out_chs,(fh,embd_size)) for fh in filter_heights])
        self.dropout = nn.Dropout(.5)
        self.fc1 = nn.Linear(out_chs*len(filter_heights),1)
        
    def forward(self,x):
        x = self.embedding(x)
        x = x.unsqueeze(1)
        x = [F.relu(conv(x)).squeeze(3) for conv in self.conv]
        x = [F.max_pool1d(i,i.size(2)).squeeze(2) for i in x]
        x = torch.cat(x,1)
        x = self.dropout(x)
        x = self.fc1(x)
        probs = F.sigmoid(x)
        return probs

In [18]:
def train(model,data,batch_size,n_epoch):
    model.train()
    if use_cuda:
        model.cuda()
    losses = []
    optimizer = torch.optim.Adadelta(model.parameters(),lr = 0.1)
    for epoch in range(n_epoch):
        epoch_loss = 0.0
        random.shuffle(data)
        for i in range(0,len(data)-batch_size,batch_size):
            in_data,labels = [],[]
            for sentence, label in data[i:i+batch_size]:
                index_vec = [w2i[w] for w in sentence]
                pad_len = max(0,max_sentence_len - len(index_vec))
                index_vec +=[0]*pad_len
                index_vec = index_vec[:max_sentence_len]
                in_data.append(index_vec)
                labels.append(label)
            sent_var = Variable(torch.LongTensor(in_data))
            if use_cuda:
                sent_var = sent_var.cuda()
            target_var = Variable(torch.Tensor(labels).unsqueeze(1))
            if use_cuda:
                target_var = target_var.cuda()
            optimizer.zero_grad()
            probs = model(sent_var)
            loss = F.binary_cross_entropy(probs,target_var)
            loss.backward()
            optimizer.step()
            
            epoch_loss += loss.data[0]
        
        print('epoch : {:d},loss : {:.3f}'.format(epoch,epoch_loss))
        losses.append(epoch_loss)
    print('Training avg loss : {:.3f}'.format(sum(losses)/len(losses)))
    
    return model, losses

def test(model,data, n_test,min_sentence_len):
    model.eval()
    loss = 0
    correct = 0
    for sentence,label in data[:n_test]:
        if len(sentence) < min_sentence_len:
            continue
        index_vec = [w2i[w] for w in sentence]
        sent_var = Variable(torch.LongTensor([index_vec]))
        if use_cuda:
            sent_var = sent_var.cuda()
        out = model(sent_var)
        score = out.data[0][0]
        pred = 1 if score > .5 else 0
        if pred == label:
            correct += 1
        loss += math.pow((label-score),2)
    print('Test acc : {:.3f} ({:d}/{:d})'.format(correct/n_test,correct,n_test))
    print('Test loss : {:.3f}'.format(loss/n_test))
    
out_ch = 100
embd_size = 300
batch_size = 50
n_epoch = 100
filter_size = [3,4,5]
print('filter : ',filter_size)
model = Net(vocab_size,embd_size,out_ch,filter_size,wv_matrix)
model,losses = train(model,train_data,batch_size,n_epoch)
test(model,test_data,len(test_data),max(filter_size))
print('')

filter :  [3, 4, 5]




epoch : 0,loss : 116.459
epoch : 1,loss : 111.562
epoch : 2,loss : 104.062
epoch : 3,loss : 95.606
epoch : 4,loss : 88.790
epoch : 5,loss : 83.161
epoch : 6,loss : 79.748
epoch : 7,loss : 75.888
epoch : 8,loss : 72.786
epoch : 9,loss : 70.050
epoch : 10,loss : 67.809
epoch : 11,loss : 64.930
epoch : 12,loss : 62.838
epoch : 13,loss : 60.718
epoch : 14,loss : 57.756
epoch : 15,loss : 56.360
epoch : 16,loss : 53.740
epoch : 17,loss : 51.481
epoch : 18,loss : 49.253
epoch : 19,loss : 46.844
epoch : 20,loss : 44.070
epoch : 21,loss : 41.968
epoch : 22,loss : 39.649
epoch : 23,loss : 37.803
epoch : 24,loss : 36.137
epoch : 25,loss : 33.495
epoch : 26,loss : 32.319
epoch : 27,loss : 30.072
epoch : 28,loss : 29.044
epoch : 29,loss : 27.127
epoch : 30,loss : 25.298
epoch : 31,loss : 24.396
epoch : 32,loss : 22.690
epoch : 33,loss : 21.419
epoch : 34,loss : 20.130
epoch : 35,loss : 18.996
epoch : 36,loss : 17.532
epoch : 37,loss : 17.212
epoch : 38,loss : 15.326
epoch : 39,loss : 14.377
epoch :