In [1]:
from torchtext import data,datasets
from torchtext.vocab import GloVe,FastText,CharNGram
import torch.nn as nn
import torch.optim as optim
import torch.nn.functional as F
from torch.autograd import Variable
import torch
from torchtext.datasets.imdb import IMDB
import sys

In [2]:
# Show both the PyTorch and the Python interpreter versions. A notebook cell
# only displays its final expression, so the two values are combined into one
# tuple; written as separate statements the first value was silently discarded.
torch.__version__, sys.version

'3.6.8 |Anaconda, Inc.| (default, Dec 30 2018, 01:22:34) \n[GCC 7.3.0]'

In [3]:
# Display the interpreter's default string encoding.
sys.getdefaultencoding()

'utf-8'

In [4]:
# Flag recording whether a CUDA device is available to PyTorch.
is_cuda = torch.cuda.is_available()


In [6]:
# Text field: lower-cased tokens, batch-first tensors, fixed length of 40.
TEXT = data.Field(
    fix_length=40,
    batch_first=True,
    lower=True,
)
# Label field: one non-sequential value per example.
LABEL = data.Field(sequential=False)

In [7]:
# Load the IMDB train and test splits, using TEXT for the text and LABEL for the label.
train, test = IMDB.splits(TEXT, LABEL)

downloading aclImdb_v1.tar.gz


aclImdb_v1.tar.gz: 100%|██████████| 84.1M/84.1M [00:08<00:00, 9.58MB/s]


In [8]:
# Inspect the class of the loaded training split.
type(train)

torchtext.datasets.imdb.IMDB

In [9]:
# Quick look at the training split: its fields, its size and its first example.
for caption, value in (
    ('train.fields', train.fields),
    ('len(train)', len(train)),
    ('vars(train[0])', vars(train[0])),
):
    print(caption, value)

train.fields {'text': <torchtext.data.field.Field object at 0x7fc2504bd048>, 'label': <torchtext.data.field.Field object at 0x7fc250497f28>}
len(train) 25000
vars(train[0]) {'text': ['at', 'first,this', 'movie', 'seems', 'so', 'bad', 'that', 'i', 'almost', 'fell', 'in', 'a', 'trance', 'the', 'first', 'time', 'i', 'saw', 'it.it', 'was', 'like', 'a', 'bad', 'dream.a', 'cosmic', 'bore.but', 'i', 'gave', 'it', 'a', 'second', 'chance,then', 'another', 'and', 'another,etc...i', 'finally', 'got', 'addicted', 'to', 'this', 'film,due', 'to', "it's", 'dreamlike', 'slow', 'pace,wonderful', 'natural', 'sets,bathed', 'in', 'a', 'mellow', 'autumn', 'light', 'and', 'especially', 'the', 'musical', 'score,which', 'is', 'made', 'of', 'some', "70's", 'progressive', 'rock', 'and', 'absolute', 'exquisite', 'folk', 'songs', 'by', 'actor/singer/songwriter', 'derek', 'lamb(the', 'troubadour).you', 'should', 'notice', 'the', 'song', 'about', 'hazel', 'wood,silver', 'trout', 'and', 'lady', 'vanishing', 'in', 't

In [10]:
# Pre-trained 300-dimensional GloVe "6B" vectors to attach to the text vocabulary.
glove_vectors = GloVe(name='6B', dim=300)

# Text vocabulary from the training split, limited by max_size=10000 and min_freq=10.
TEXT.build_vocab(train, max_size=10000, min_freq=10, vectors=glove_vectors)
# Label vocabulary from the training split.
LABEL.build_vocab(train)

.vector_cache/glove.6B.zip: 862MB [03:51, 3.73MB/s]                           
100%|█████████▉| 399089/400000 [00:28<00:00, 14456.36it/s]

In [11]:
# Show the label frequency counter stored on the label vocabulary.
LABEL.vocab.freqs

Counter({'pos': 12500, 'neg': 12500})

In [12]:
# Print the training split's fields, size and first example once more.
for caption, value in (
    ('train.fields', train.fields),
    ('len(train)', len(train)),
    ('vars(train[0])', vars(train[0])),
):
    print(caption, value)

train.fields {'text': <torchtext.data.field.Field object at 0x7fc2504bd048>, 'label': <torchtext.data.field.Field object at 0x7fc250497f28>}
len(train) 25000
vars(train[0]) {'text': ['at', 'first,this', 'movie', 'seems', 'so', 'bad', 'that', 'i', 'almost', 'fell', 'in', 'a', 'trance', 'the', 'first', 'time', 'i', 'saw', 'it.it', 'was', 'like', 'a', 'bad', 'dream.a', 'cosmic', 'bore.but', 'i', 'gave', 'it', 'a', 'second', 'chance,then', 'another', 'and', 'another,etc...i', 'finally', 'got', 'addicted', 'to', 'this', 'film,due', 'to', "it's", 'dreamlike', 'slow', 'pace,wonderful', 'natural', 'sets,bathed', 'in', 'a', 'mellow', 'autumn', 'light', 'and', 'especially', 'the', 'musical', 'score,which', 'is', 'made', 'of', 'some', "70's", 'progressive', 'rock', 'and', 'absolute', 'exquisite', 'folk', 'songs', 'by', 'actor/singer/songwriter', 'derek', 'lamb(the', 'troubadour).you', 'should', 'notice', 'the', 'song', 'about', 'hazel', 'wood,silver', 'trout', 'and', 'lady', 'vanishing', 'in', 't

100%|█████████▉| 399089/400000 [00:40<00:00, 14456.36it/s]

In [13]:
# Attribute dictionary of the text vocabulary object, kept for inspection.
d = vars(TEXT.vocab)

In [14]:
# List the attribute names held by the vocabulary object.
d.keys()

dict_keys(['freqs', 'itos', 'stoi', 'vectors'])

In [15]:
# Display the vector tensor stored on the text vocabulary.
TEXT.vocab.vectors



tensor([[ 0.0000,  0.0000,  0.0000,  ...,  0.0000,  0.0000,  0.0000],
        [ 0.0000,  0.0000,  0.0000,  ...,  0.0000,  0.0000,  0.0000],
        [ 0.0466,  0.2132, -0.0074,  ...,  0.0091, -0.2099,  0.0539],
        ...,
        [ 0.0000,  0.0000,  0.0000,  ...,  0.0000,  0.0000,  0.0000],
        [ 0.7724, -0.1800,  0.2072,  ...,  0.6736,  0.2263, -0.2919],
        [ 0.0000,  0.0000,  0.0000,  ...,  0.0000,  0.0000,  0.0000]])

In [16]:
# Number of entries in the vocabulary's string-to-index mapping.
len(TEXT.vocab.stoi)

10002

In [19]:
# Bucketed batch iterators over both splits: 32 examples per batch, tensors on the CPU.
train_iter, test_iter = data.BucketIterator.splits(
    (train, test),
    batch_size=32,
    device='cpu',
)

# Turn off the `repeat` flag on both iterators.
for _iterator in (train_iter, test_iter):
    _iterator.repeat = False

In [20]:
class EmbNet(nn.Module):
    """Embedding lookup followed by one linear layer over the flattened sequence.

    Args:
        emb_size: Number of rows in the embedding table.
        hidden_size1: Dimension of each embedding vector.
        hidden_size2: Input width of the linear layer. ``forward`` flattens all
            embeddings of an example into one vector, so this has to equal the
            number of tokens per example times ``hidden_size1``.
    """

    def __init__(self, emb_size, hidden_size1, hidden_size2=400):
        super().__init__()
        self.embedding = nn.Embedding(num_embeddings=emb_size, embedding_dim=hidden_size1)
        # The classifier emits three scores per example.
        self.fc = nn.Linear(in_features=hidden_size2, out_features=3)

    def forward(self, x):
        """Return log-probabilities over the three outputs for index tensor ``x``."""
        batch_size = x.size(0)
        # Concatenate every token embedding of an example into a single row.
        flattened = self.embedding(x).view(batch_size, -1)
        scores = self.fc(flattened)
        return scores.log_softmax(dim=-1)
        

In [21]:
# Randomly initialised model: one embedding row per vocabulary entry, 10 dimensions each.
model = EmbNet(len(TEXT.vocab.stoi), 10)
# Move to the GPU only when one is present. The unconditional .cuda() call
# raised on CPU-only machines even though `is_cuda` already tracks availability.
if is_cuda:
    model = model.cuda()

In [22]:
# Adam over all model parameters with a learning rate of 0.001.
optimizer = optim.Adam(model.parameters(),lr=0.001)

In [23]:
# Rebuild the bucketed iterators with shuffling: 32 examples per batch, tensors on the CPU.
train_iter, test_iter = data.BucketIterator.splits(
    (train, test),
    batch_size=32,
    device='cpu',
    shuffle=True,
)
# Turn off the `repeat` flag on both iterators.
for _iterator in (train_iter, test_iter):
    _iterator.repeat = False

In [24]:
def fit(epoch, model, data_loader, phase='training', volatile=False, optimizer=None):
    """Run one pass over ``data_loader`` and report the average loss and accuracy.

    Args:
        epoch: Epoch number supplied by the caller; not used inside the function.
        model: Module mapping ``batch.text`` to per-class log-probabilities.
        data_loader: Iterable of batches exposing ``.text`` and ``.label``. It must
            also expose ``.dataset``, whose ``len`` is used as the example count.
        phase: ``'training'`` puts the model in train mode and applies one
            optimizer step per batch; ``'validation'`` puts it in eval mode. Any
            other value leaves the mode untouched and performs no updates.
        volatile: Ignored. Kept only so existing calls keep working.
        optimizer: Optimizer stepped during training. When omitted, the
            notebook-level ``optimizer`` variable is used, as before.

    Returns:
        ``(loss, accuracy)`` as Python floats: the NLL summed over all batches
        divided by ``len(data_loader.dataset)``, and the accuracy in percent.

    Raises:
        ZeroDivisionError: If ``data_loader.dataset`` is empty.
    """
    is_training = phase == 'training'
    if is_training:
        model.train()
        if optimizer is None:
            # Backward compatibility with calls that rely on the global optimizer.
            optimizer = globals()['optimizer']
    elif phase == 'validation':
        model.eval()

    # Send each batch to wherever the model lives instead of trusting a global flag.
    first_param = next(model.parameters(), None)
    device = first_param.device if first_param is not None else None

    # Plain Python numbers: no tensors (or GPU memory) are kept alive between
    # batches, and the totals are well defined even if no batch is yielded.
    running_loss = 0.0
    running_correct = 0

    # Autograd is only needed while weights are being updated.
    with torch.set_grad_enabled(is_training):
        for batch in data_loader:
            text, target = batch.text, batch.label
            if device is not None:
                text, target = text.to(device), target.to(device)

            output = model(text)
            running_loss += F.nll_loss(output, target, reduction='sum').item()
            preds = output.max(dim=1)[1]
            running_correct += preds.eq(target.view_as(preds)).sum().item()

            if is_training:
                optimizer.zero_grad()
                F.nll_loss(output, target).backward()
                optimizer.step()

    loss = running_loss/len(data_loader.dataset)
    accuracy = 100. * running_correct/len(data_loader.dataset)

    print(f'{phase} loss is {loss:{5}.{2}} and {phase} accuracy is {running_correct}/{len(data_loader.dataset)}{accuracy:{10}.{4}}')
    return loss,accuracy

In [25]:
# Per-epoch metric histories for the training and validation phases.
train_losses = []
train_accuracy = []
val_losses = []
val_accuracy = []

In [26]:
%%time
# Start from empty histories so that re-running this cell does not extend old results.
train_losses, train_accuracy = [], []
val_losses, val_accuracy = [], []

# Epochs 1..9: one training pass over train_iter, then one validation pass over test_iter.
for epoch in range(1, 10):
    epoch_loss, epoch_accuracy = fit(epoch, model, train_iter, phase='training')
    val_epoch_loss, val_epoch_accuracy = fit(epoch, model, test_iter, phase='validation')
    for history, value in (
        (train_losses, epoch_loss),
        (train_accuracy, epoch_accuracy),
        (val_losses, val_epoch_loss),
        (val_accuracy, val_epoch_accuracy),
    ):
        history.append(value)

training loss is  0.74 and training accuracy is 12866/25000     51.46
validation loss is   0.7 and validation accuracy is 13510/25000     54.04
training loss is  0.68 and training accuracy is 14360/25000     57.44
validation loss is  0.68 and validation accuracy is 14579/25000     58.32
training loss is  0.64 and training accuracy is 15774/25000      63.1
validation loss is  0.65 and validation accuracy is 15587/25000     62.35
training loss is   0.6 and training accuracy is 16992/25000     67.97
validation loss is  0.62 and validation accuracy is 16436/25000     65.74
training loss is  0.55 and training accuracy is 17849/25000      71.4
validation loss is  0.61 and validation accuracy is 16859/25000     67.44
training loss is  0.51 and training accuracy is 18512/25000     74.05
validation loss is   0.6 and validation accuracy is 17181/25000     68.72
training loss is  0.48 and training accuracy is 19112/25000     76.45
validation loss is   0.6 and validation accuracy is 17374/25000   

## 사전 학습 GloVe 워드 임베딩 사용하기

In [27]:
# Fresh fields for the GloVe experiment: lower-cased, batch-first text with fixed length 40.
TEXT = data.Field(lower=True, batch_first=True, fix_length=40)
LABEL = data.Field(sequential=False)

train, test = IMDB.splits(TEXT, LABEL)

# Build the vocabulary from the training split only. Passing `test` as well let
# test-set token counts decide which words enter the vocabulary, which leaks
# evaluation data into the model and disagrees with the earlier build_vocab cell.
TEXT.build_vocab(train, vectors=GloVe(name='6B', dim=300), max_size=10000, min_freq=10)
LABEL.build_vocab(train)

In [28]:
class EmbNet(nn.Module):
    """Embedding table plus a single linear classifier over the flattened embeddings.

    Args:
        emb_size: Number of rows in the embedding table.
        hidden_size1: Dimension of each embedding vector.
        hidden_size2: Input width of the linear layer. ``forward`` flattens all
            embeddings of an example into one vector, so this has to equal the
            number of tokens per example times ``hidden_size1``.
    """

    def __init__(self, emb_size, hidden_size1, hidden_size2=400):
        super().__init__()
        self.embedding = nn.Embedding(num_embeddings=emb_size, embedding_dim=hidden_size1)
        # The classifier emits three scores per example.
        self.fc1 = nn.Linear(in_features=hidden_size2, out_features=3)

    def forward(self, x):
        """Return log-probabilities over the three outputs for index tensor ``x``."""
        n_rows = x.size(0)
        # Concatenate every token embedding of an example into a single row.
        token_vectors = self.embedding(x)
        scores = self.fc1(token_vectors.view(n_rows, -1))
        return scores.log_softmax(dim=-1)

In [29]:
# Model for the GloVe experiment: 300-dimensional embeddings and a linear layer
# whose input width is 12000.
model = EmbNet(len(TEXT.vocab.stoi), 300, 12000)
# Move to the GPU only when one is present. The unconditional .cuda() call
# raised on CPU-only machines even though `is_cuda` already tracks availability.
if is_cuda:
    model = model.cuda()

In [30]:
# Copy the vocabulary's vectors into the embedding table in place. copy_ works
# whichever device the model is on (no unconditional .cuda() call) and fails
# loudly if the two shapes disagree instead of silently swapping the tensor.
with torch.no_grad():
    model.embedding.weight.copy_(TEXT.vocab.vectors)

In [31]:
# Exclude the embedding weights from gradient computation (frozen embeddings).
model.embedding.weight.requires_grad = False

In [32]:
# Alternative kept for reference: optim.SGD(model.parameters(), lr=0.001)
# Hand Adam only the parameters that still require gradients.
trainable_params = [p for p in model.parameters() if p.requires_grad]
optimizer = optim.Adam(trainable_params, lr=0.001)

In [33]:
# Shuffled, bucketed iterators for the GloVe experiment: 64 examples per batch, tensors on the CPU.
train_iter, test_iter = data.BucketIterator.splits(
    (train, test),
    batch_size=64,
    device='cpu',
    shuffle=True,
)
# Turn off the `repeat` flag on both iterators.
for _iterator in (train_iter, test_iter):
    _iterator.repeat = False

In [34]:
def fit(epoch, model, data_loader, phase='training', volatile=False, optimizer=None):
    """Run one pass over ``data_loader`` and report the average loss and accuracy.

    Args:
        epoch: Epoch number supplied by the caller; not used inside the function.
        model: Module mapping ``batch.text`` to per-class log-probabilities.
        data_loader: Iterable of batches exposing ``.text`` and ``.label``. It must
            also expose ``.dataset``, whose ``len`` is used as the example count.
        phase: ``'training'`` puts the model in train mode and applies one
            optimizer step per batch; ``'validation'`` puts it in eval mode. Any
            other value leaves the mode untouched and performs no updates.
        volatile: Ignored. Kept only so existing calls keep working.
        optimizer: Optimizer stepped during training. When omitted, the
            notebook-level ``optimizer`` variable is used, as before.

    Returns:
        ``(loss, accuracy)`` as Python floats: the NLL summed over all batches
        divided by ``len(data_loader.dataset)``, and the accuracy in percent.

    Raises:
        ZeroDivisionError: If ``data_loader.dataset`` is empty.
    """
    is_training = phase == 'training'
    if is_training:
        model.train()
        if optimizer is None:
            # Backward compatibility with calls that rely on the global optimizer.
            optimizer = globals()['optimizer']
    elif phase == 'validation':
        model.eval()

    # Send each batch to wherever the model lives instead of trusting a global flag.
    first_param = next(model.parameters(), None)
    device = first_param.device if first_param is not None else None

    # Plain Python numbers: no tensors (or GPU memory) are kept alive between
    # batches, and the totals are well defined even if no batch is yielded.
    running_loss = 0.0
    running_correct = 0

    # Autograd is only needed while weights are being updated.
    with torch.set_grad_enabled(is_training):
        for batch in data_loader:
            text, target = batch.text, batch.label
            if device is not None:
                text, target = text.to(device), target.to(device)

            output = model(text)
            running_loss += F.nll_loss(output, target, reduction='sum').item()
            preds = output.max(dim=1)[1]
            running_correct += preds.eq(target.view_as(preds)).sum().item()

            if is_training:
                optimizer.zero_grad()
                F.nll_loss(output, target).backward()
                optimizer.step()

    loss = running_loss/len(data_loader.dataset)
    accuracy = 100. * running_correct/len(data_loader.dataset)

    print(f'{phase} loss is {loss:{5}.{2}} and {phase} accuracy is {running_correct}/{len(data_loader.dataset)}{accuracy:{10}.{4}}')
    return loss,accuracy

In [35]:
%%time
# Epochs 1..9: one training pass over train_iter, then one validation pass over test_iter.
# NOTE(review): the history lists are not reset in this cell, so these values are
# appended after any entries they already hold — confirm that is intended.
for epoch in range(1, 10):
    epoch_loss, epoch_accuracy = fit(epoch, model, train_iter, phase='training')
    val_epoch_loss, val_epoch_accuracy = fit(epoch, model, test_iter, phase='validation')
    for history, value in (
        (train_losses, epoch_loss),
        (train_accuracy, epoch_accuracy),
        (val_losses, val_epoch_loss),
        (val_accuracy, val_epoch_accuracy),
    ):
        history.append(value)

training loss is  0.66 and training accuracy is 15675/25000      62.7
validation loss is  0.68 and validation accuracy is 16011/25000     64.04
training loss is  0.56 and training accuracy is 17788/25000     71.15
validation loss is  0.65 and validation accuracy is 16526/25000      66.1
training loss is  0.53 and training accuracy is 18334/25000     73.34
validation loss is  0.69 and validation accuracy is 16326/25000      65.3
training loss is  0.51 and training accuracy is 18706/25000     74.82
validation loss is  0.71 and validation accuracy is 16275/25000      65.1
training loss is  0.49 and training accuracy is 19012/25000     76.05
validation loss is  0.72 and validation accuracy is 16286/25000     65.14
training loss is  0.48 and training accuracy is 19201/25000      76.8
validation loss is  0.74 and validation accuracy is 16205/25000     64.82
training loss is  0.47 and training accuracy is 19487/25000     77.95
validation loss is  0.75 and validation accuracy is 16319/25000   

In [36]:
%%time
# Nine further passes with the same model; results are appended to the existing histories.
for epoch in range(1, 10):
    epoch_loss, epoch_accuracy = fit(epoch, model, train_iter, phase='training')
    val_epoch_loss, val_epoch_accuracy = fit(epoch, model, test_iter, phase='validation')
    for history, value in (
        (train_losses, epoch_loss),
        (train_accuracy, epoch_accuracy),
        (val_losses, val_epoch_loss),
        (val_accuracy, val_epoch_accuracy),
    ):
        history.append(value)

training loss is  0.44 and training accuracy is 19816/25000     79.26
validation loss is  0.79 and validation accuracy is 16175/25000      64.7
training loss is  0.44 and training accuracy is 19842/25000     79.37
validation loss is  0.82 and validation accuracy is 16073/25000     64.29
training loss is  0.44 and training accuracy is 19830/25000     79.32
validation loss is  0.83 and validation accuracy is 16129/25000     64.52
training loss is  0.43 and training accuracy is 19929/25000     79.72
validation loss is  0.85 and validation accuracy is 16113/25000     64.45
training loss is  0.42 and training accuracy is 20022/25000     80.09
validation loss is  0.87 and validation accuracy is 16064/25000     64.26
training loss is  0.42 and training accuracy is 20128/25000     80.51
validation loss is  0.87 and validation accuracy is 15952/25000     63.81
training loss is  0.42 and training accuracy is 20179/25000     80.72
validation loss is  0.89 and validation accuracy is 16102/25000   