In [1]:
import numpy as np
import pandas as pd
import torch
import torch.nn as nn
import torch.nn.functional as F
from torch.utils import data

# Run on the first CUDA device when one is available, otherwise on the CPU.
use_cuda = torch.cuda.is_available()
device = torch.device("cuda:0") if use_cuda else torch.device("cpu")

# Training schedule and number of target classes.
EPOCH2, num_class = 20, 5

# Input geometry: token ids per sentence and width of each word vector.
sentence_maxlength, EMBEDDING_SIZE = 60, 100

# Convolution settings: kernel heights, feature maps per height, dropout probability.
filter_size = [2, 3, 4, 5]
num_filter = 128
dropout_rate = 0.5

# Keyword arguments forwarded to the training DataLoader.
params = dict(batch_size=64, shuffle=True)


In [2]:

def load_data(path):
    """Read a tab-separated file and lower-case the text in column 1.

    The file is parsed with its first line as the header and the
    ``PhraseId`` column as the index, so that column is not part of the
    returned array.

    Args:
        path: Location of the ``.tsv`` file.

    Returns:
        A ``(rows, count)`` tuple: ``rows`` is the table as a NumPy array with
        the value at position 1 of every row lower-cased, and ``count`` is the
        number of rows.
    """
    table = pd.read_csv(path, sep='\t', header=0, index_col='PhraseId')
    rows = np.array(table)
    # Iterating a 2-D array yields views, so the assignment updates `rows` in place.
    for row in rows:
        row[1] = row[1].lower()
    return rows, rows.shape[0]
    
def read_pretrain_vector(path):
    """Parse a whitespace-separated word-vector text file.

    Each non-empty line is expected to hold a token followed by its vector
    components. The file is streamed line by line rather than loaded in full,
    and lines that contain only whitespace are skipped.

    Args:
        path: Location of the UTF-8 encoded vector file.

    Returns:
        A ``(vocab, emb, dic)`` tuple where ``vocab`` is the list of tokens in
        file order, ``emb`` is the parallel list of component lists (kept as
        strings, exactly as read), and ``dic`` maps each token to its position
        in ``vocab``/``emb``. If a token occurs more than once, ``dic`` keeps
        the last position.
    """
    emb = []
    vocab = []
    dic = {}
    with open(path, 'r', encoding='UTF-8') as f:
        for line in f:
            row = line.split()
            if not row:
                # Blank line (e.g. a trailing newline): nothing to record.
                continue
            dic[row[0]] = len(vocab)
            vocab.append(row[0])
            emb.append(row[1:])
    return vocab, emb, dic

class CustomDataset(data.Dataset):
    """Pairs two parallel, indexable collections as a map-style dataset."""

    def __init__(self, datas, labels):
        # Stored as given; no copy or length check is performed.
        self.datas, self.labels = datas, labels

    def __len__(self):
        """Number of samples, taken from ``datas``."""
        return len(self.datas)

    def __getitem__(self, index):
        """Return the ``(sample, label)`` pair stored at ``index``."""
        return self.datas[index], self.labels[index]

    
    
def make_dicts(text):
    """Collect the distinct whitespace-separated tokens of all sentences.

    Args:
        text: Iterable of sentence strings.

    Returns:
        A set containing every token that appears in any sentence.
    """
    return {token for sentence in text for token in sentence.split()}
        

def one_hot_vector(value, num):
    """Return a length-``num`` float vector of zeros with position ``value`` set to 1."""
    encoded = np.zeros(num)
    encoded[value] = 1
    return encoded


def make_data(dataset, dictts, train=True, max_len=None):
    """Turn sentences into fixed-length lists of vocabulary ids.

    For every row, the text at position 1 is split on whitespace, each token
    is looked up in ``dictts``, and the resulting id list is truncated or
    right-padded with 0 to exactly ``max_len`` entries.

    Args:
        dataset: Row-indexable collection (e.g. the array returned by
            ``load_data``) whose rows hold the sentence at position 1 and,
            when ``train`` is true, the label at position 2.
        dictts: Mapping from token to integer id.
        train: When true, labels are collected from position 2 of each row.
        max_len: Target length of every id list. Defaults to the module-level
            ``sentence_maxlength`` when omitted.

    Returns:
        A ``(out, label)`` tuple: ``out`` is a list of id lists, each of
        length ``max_len``; ``label`` is the list of labels (empty when
        ``train`` is false).

    Raises:
        KeyError: If a token is missing from ``dictts``.
    """
    if max_len is None:
        max_len = sentence_maxlength

    out = []
    label = []
    for row in dataset:
        ids = [dictts[word] for word in row[1].split()][:max_len]
        # NOTE(review): 0 doubles as the padding id and as a regular
        # vocabulary id if `dictts` assigns 0 to a word — confirm this is intended.
        ids.extend([0] * (max_len - len(ids)))
        out.append(ids)
        if train:
            label.append(row[2])

    return out, label

In [3]:
class CNN(nn.Module):
    """Multi-width convolutional text classifier.

    One convolution block is created per kernel height. Each block applies a
    ``Conv2d`` spanning the full embedding width, then dropout, ReLU and a
    max-pool over all remaining positions, so every block yields one value per
    feature map. The pooled features of all blocks are concatenated, passed
    through dropout, and fed to a linear layer.

    Blocks are registered as ``conv_block1``, ``conv_block2``, ... in the order
    of the kernel heights, so the number of blocks always matches
    ``num_filters_total``.

    Args:
        filter_sizes: Kernel heights; defaults to the global ``filter_size``.
        n_filters: Feature maps per kernel height; defaults to ``num_filter``.
        embedding_size: Width of each word vector; defaults to ``EMBEDDING_SIZE``.
        max_len: Number of positions in every input; defaults to
            ``sentence_maxlength``.
        n_classes: Number of output scores; defaults to ``num_class``.
        dropout: Dropout probability; defaults to ``dropout_rate``.
    """

    def __init__(self, filter_sizes=None, n_filters=None, embedding_size=None,
                 max_len=None, n_classes=None, dropout=None):
        super(CNN, self).__init__()
        # Fall back to the notebook-level configuration for anything not given.
        filter_sizes = filter_size if filter_sizes is None else list(filter_sizes)
        n_filters = num_filter if n_filters is None else n_filters
        embedding_size = EMBEDDING_SIZE if embedding_size is None else embedding_size
        max_len = sentence_maxlength if max_len is None else max_len
        n_classes = num_class if n_classes is None else n_classes
        dropout = dropout_rate if dropout is None else dropout

        self.num_blocks = len(filter_sizes)
        self.num_filters_total = n_filters * self.num_blocks
        for idx, height in enumerate(filter_sizes, start=1):
            block = nn.Sequential(
                nn.Conv2d(1, n_filters, (height, embedding_size), bias=True),
                nn.Dropout(dropout),
                nn.ReLU(),
                # Pool over every position left after the convolution.
                nn.MaxPool2d((max_len - height + 1, 1)),
            )
            # Assigning a Module as an attribute registers it as a submodule.
            setattr(self, 'conv_block{}'.format(idx), block)
        self.dropout = nn.Dropout(dropout)
        self.fc = nn.Linear(self.num_filters_total, n_classes)

    def forward(self, x):
        """Score a batch of embedded sentences.

        Args:
            x: Tensor of shape ``[batch_size, 1, max_len, embedding_size]``.

        Returns:
            Tensor of shape ``[batch_size, n_classes]`` holding the raw
            (un-normalised) class scores.
        """
        pool_out = []
        for idx in range(1, self.num_blocks + 1):
            block = getattr(self, 'conv_block{}'.format(idx))
            # Block output is [batch, n_filters, 1, 1]; move the filters to the last axis.
            pool_out.append(block(x).permute(0, 3, 2, 1))
        h_pool = torch.cat(pool_out, 3)
        h_pool_flat = torch.reshape(h_pool, [-1, self.num_filters_total])
        h_pool_flat = self.dropout(h_pool_flat)
        out = self.fc(h_pool_flat)
        return out
        

In [4]:
# Load both data splits (each call returns the row array and its row count)
# and the pre-trained word vectors: token list, vector list, token -> row map.
train_data, num_train = load_data('train.tsv')
test_data, num_test = load_data('test.tsv')
glove_vocab, glove_emb, dic = read_pretrain_vector('glove.6B.100d.txt')

In [5]:
# Pull the sentence text (position 1 of each row) out of both splits.
train_sentence = [train_data[row][1] for row in range(num_train)]
test_sentence = [test_data[row][1] for row in range(num_test)]

In [6]:
# Vocabulary = every distinct token seen in either split, kept in sorted
# order; a token's id is its position in that sorted list.
train_dict = make_dicts(train_sentence)
test_dict = make_dicts(test_sentence)
tot_dict = sorted(train_dict.union(test_dict))
tot_num_word = len(tot_dict)
dicts = dict(zip(tot_dict, range(tot_num_word)))

In [7]:
# Build the embedding matrix: row i holds the vector for tot_dict[i].
# Words found in the pre-trained table copy their vector; all others get a
# uniform random vector in [0, 1).
pretrain_weight = np.zeros([tot_num_word, EMBEDDING_SIZE])
for i, word in enumerate(tot_dict):
    # `dic` has exactly the tokens of `glove_vocab` as keys, so this is the
    # same membership test as scanning the list, but in constant time.
    if word in dic:
        # Components were read as strings; convert them explicitly.
        pretrain_weight[i] = np.asarray(glove_emb[dic[word]], dtype=np.float64)
    else:
        pretrain_weight[i] = np.random.rand(EMBEDDING_SIZE)

In [8]:
# Exploratory check. The first expression's value is discarded; only the last
# expression of a cell is displayed.
# NOTE(review): 19020 is a hard-coded index and raises IndexError if the
# vocabulary has fewer entries — consider removing this cell.
pretrain_weight.shape[0]
tot_dict[19020]

'whetted'

In [9]:
    # Encode both splits as padded id lists, then convert them to NumPy arrays.
    train_emb, train_label = (
        np.array(part) for part in make_data(train_data, dicts, train=True)
    )
    test_emb, _ = make_data(test_data, dicts, train=False)
    test_emb = np.array(test_emb)

In [10]:
# Wrap the encoded training arrays in a Dataset and batch them using the
# keyword arguments stored in `params`.
train_set = CustomDataset(train_emb, train_label)
train_generator = data.DataLoader(train_set, **params)

In [11]:
# Creates an iterator over the loader; the result is neither stored nor
# consumed, so this only displays the iterator's repr.
# NOTE(review): looks like leftover debugging — consider removing this cell.
iter(train_generator)

<torch.utils.data.dataloader._SingleProcessDataLoaderIter at 0x27c543f5198>

In [13]:
# Model, frozen-by-construction embedding lookup, loss and optimiser.
# Only `net`'s parameters are handed to the optimiser.
net = CNN().to(device)
weight = torch.FloatTensor(pretrain_weight)
embedd = nn.Embedding.from_pretrained(weight).to(device)
criterion = nn.CrossEntropyLoss()
optimizer = torch.optim.Adam(net.parameters(), lr=0.003, weight_decay=0.001)

net.train(mode=True)
for epoch in range(EPOCH2):
    for step, (token_ids, targets) in enumerate(train_generator):
        # ids -> vectors, then add the single input channel expected by the conv layers.
        inputs = embedd(token_ids.long().to(device)).unsqueeze(1)
        targets = targets.to(device)
        logits = net(inputs)
        loss = criterion(logits, targets.long())

        # Report the current batch loss every 1000 steps.
        if step % 1000 == 0:
            print('Epoch:{},  loss:{:.3f}'.format(epoch + 1,  loss.item()))

        optimizer.zero_grad()
        loss.backward()
        optimizer.step()
        

Epoch:1,  loss:1.341
Epoch:1,  loss:1.160
Epoch:1,  loss:1.310
Epoch:2,  loss:1.188
Epoch:2,  loss:1.160
Epoch:2,  loss:1.002
Epoch:3,  loss:1.193
Epoch:3,  loss:1.095
Epoch:3,  loss:1.344
Epoch:4,  loss:1.032
Epoch:4,  loss:1.080
Epoch:4,  loss:1.015
Epoch:5,  loss:0.987
Epoch:5,  loss:1.076
Epoch:5,  loss:1.262
Epoch:6,  loss:1.123
Epoch:6,  loss:1.059
Epoch:6,  loss:1.012
Epoch:7,  loss:0.858
Epoch:7,  loss:1.141
Epoch:7,  loss:1.002
Epoch:8,  loss:0.980
Epoch:8,  loss:1.034
Epoch:8,  loss:0.985
Epoch:9,  loss:1.250
Epoch:9,  loss:1.019
Epoch:9,  loss:1.155
Epoch:10,  loss:1.213
Epoch:10,  loss:1.056
Epoch:10,  loss:0.955
Epoch:11,  loss:1.014
Epoch:11,  loss:0.889
Epoch:11,  loss:1.175
Epoch:12,  loss:1.156
Epoch:12,  loss:1.139
Epoch:12,  loss:1.350
Epoch:13,  loss:0.954
Epoch:13,  loss:1.214
Epoch:13,  loss:1.078
Epoch:14,  loss:1.314
Epoch:14,  loss:1.153
Epoch:14,  loss:0.830
Epoch:15,  loss:1.106
Epoch:15,  loss:1.085
Epoch:15,  loss:1.309
Epoch:16,  loss:0.955
Epoch:16,  loss

In [None]:
# Score the test split in fixed-size chunks and print one frame per chunk.
net.eval()

TEST_CHUNK = 10000             # rows scored per forward pass
# PhraseId given to the first test row; presumably the first id in test.tsv —
# verify against the file, since load_data drops the PhraseId index.
FIRST_TEST_PHRASE_ID = 156061

# Inference only: skip building autograd graphs for these large batches.
with torch.no_grad():
    # Stepping by the chunk size never yields an empty trailing chunk.
    for begin in range(0, num_test, TEST_CHUNK):
        end = min(begin + TEST_CHUNK, num_test)
        mini_test = torch.from_numpy(test_emb[begin:end]).long().to(device)
        mini_test = embedd(mini_test).unsqueeze(1)
        result = net(mini_test)
        # argmax over the raw scores equals argmax over their softmax;
        # tolist() gives plain Python ints for the DataFrame column.
        result_ = result.argmax(dim=1).cpu().tolist()
        num_list = list(range(FIRST_TEST_PHRASE_ID + begin,
                              FIRST_TEST_PHRASE_ID + end))
        dataframe = pd.DataFrame({'PhraseId': num_list, 'Sentiment': result_})
        print(dataframe)
        #dataframe.to_csv('q2_L2_dropout_textcnn_mySubmission%d.csv'
        #                 % (begin // TEST_CHUNK + 1), index=False, sep=',')