In [1]:
import torchtext
torchtext.__version__

'0.14.1'

In [2]:
!pip install torchtext==0.8.1

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Collecting torchtext==0.8.1
  Downloading torchtext-0.8.1-cp39-cp39-manylinux1_x86_64.whl (6.9 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m6.9/6.9 MB[0m [31m30.7 MB/s[0m eta [36m0:00:00[0m
Collecting torch==1.7.1
  Downloading torch-1.7.1-cp39-cp39-manylinux1_x86_64.whl (776.8 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m776.8/776.8 MB[0m [31m2.0 MB/s[0m eta [36m0:00:00[0m
Installing collected packages: torch, torchtext
  Attempting uninstall: torch
    Found existing installation: torch 1.13.1+cu116
    Uninstalling torch-1.13.1+cu116:
      Successfully uninstalled torch-1.13.1+cu116
  Attempting uninstall: torchtext
    Found existing installation: torchtext 0.14.1
    Uninstalling torchtext-0.14.1:
      Successfully uninstalled torchtext-0.14.1
[31mERROR: pip's dependency resolver does not currently take into account all the pa

In [1]:
from torchtext import data 
from torchtext import datasets

TEXT = data.Field(lower = True, batch_first = True)
LABEL = data.Field(sequential=True)

train, test = datasets.IMDB.splits(TEXT, LABEL)



downloading aclImdb_v1.tar.gz


aclImdb_v1.tar.gz: 100%|██████████| 84.1M/84.1M [00:08<00:00, 10.5MB/s]


In [2]:
import torch 
from torchtext import data, datasets
TEXT = data.Field(batch_first = True, 
                  fix_length = 500,
                  tokenize = str.split,
                  pad_first = True,
                  pad_token = '[PAD]',
                  unk_token='[UNK]')

LABEL = data.LabelField(dtype = torch.float)

train_data, test_data  = datasets.IMDB.splits(text_field = TEXT,
                                             label_field = LABEL)



In [3]:
print(f'Train_data_Length : {len(train_data.examples)}')
print(f'test_data_Length : {len(test_data.examples)}')

Train_data_Length : 25000
test_data_Length : 25000


In [4]:
print(train_data.fields)

{'text': <torchtext.data.field.Field object at 0x7f75a61de2b0>, 'label': <torchtext.data.field.LabelField object at 0x7f75a61de940>}


In [5]:
print('---Data Sample---')
print('\nInput : ')
print(' '.join(vars(train_data.examples[1])['text']), '\\n')
print('\nLabel : ')
print(vars(train_data.examples[1])['label'])

---Data Sample---

Input : 
I was drawn to this movie the moment I saw a preview of it on Oscar night. When I read about Kay Pollak, I was hooked. We Americans are suckers for a comeback kid.<br /><br />I understand this movie was a huge draw in Sweden. As a very provincial American I can only speculate on the reason. Perhaps it is because of the provocative joke that the Lena character makes at the beginning of the movie and other social comment but perhaps it is because of the central message which I believe has the same appeal everywhere in affluent societies.<br /><br />The message of this movie for me is the same as the movie Titanic. Life is short people and as far as anyone really knows it's all we've got. It can be taken away at any time. So isn't it a pity that we spend so much time hiding behind walls separating us from other people because we're so afraid of being hurt? Tearing down the walls is painful but feeling alive lies on the other side of those wretched walls. Feelin

In [12]:
import re

def PreProcessingText(input_sentence) :
    input_sentence = input_sentence.lower()
    # <br /> 처리
    input_sentence = re.sub('<[^>]*>', repl= ' ', string = input_sentence)
    input_sentence = re.sub('[!"#$%&\()*+,-./:;<=>?@[\\]^_\'{|}~]', ' ', input_sentence)
    input_sentence = re.sub('\s+', ' ', input_sentence)
    if input_sentence :
        return input_sentence
for example in train_data.examples :
    vars(example)['text'] = PreProcessingText(' '.join(vars(example)['text'])).split()

for example in test_data.examples :
    vars(example)['text'] = PreProcessingText(' '.join(vars(example)['text'])).split()


In [17]:
TEXT.build_vocab(train_data,
                 min_freq=2,
                 max_size = None,
                 vectors  = 'glove.6B.300d')
LABEL.build_vocab(train_data)

In [18]:
print(f'Vocab Size : {len(TEXT.vocab)}')

print('Vocab Examples : ')
for idx, (k, v) in enumerate(TEXT.vocab.stoi.items()) :
    if idx >= 10 :
        break 
    print('\\t', k, v)

print('---------------------------------------')

print(f'Label Size : {len(LABEL.vocab)}')

for idx, (k, v) in enumerate(LABEL.vocab.stoi.items()) :
    if idx >= 10 :
        break 
    print('\\t', k, v)

Vocab Size : 43348
Vocab Examples : 
\t [UNK] 0
\t [PAD] 1
\t the 2
\t and 3
\t a 4
\t of 5
\t to 6
\t is 7
\t it 8
\t in 9
---------------------------------------
Label Size : 2
\t pos 0
\t neg 1


In [19]:
import random 

train_data, valid_data = train_data.split(random_state= random.seed(0),
                                          split_ratio = 0.8)
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
train_iterator, valid_iterator, test_iterator = data.BucketIterator.splits(datasets = (train_data, valid_data, test_data), batch_size = 32, device = device)



In [20]:
import torch.nn as nn 
class SentenceClassification(nn.Module) :
    def __init__(self, **model_config) :
        super(SentenceClassification, self).__init__()

        if model_config['emb_type'] == 'glove' or 'fasttext' :
            self.emb = nn.Embedding(model_config['vocab_size'],
                                    model_config['emb_dim'],
                                    _weight = TEXT.vocab.vectors)
            
        else :
            self.emb = nn.Embedding(model_config['vocab_size'],
                                    model_config['emb_dim'])
            
        self.bidirectional = model_config['bidirectional']
        self.num_directions = 2 if model_config['bidirectional'] else 1 
        self.model_type = model_config['model_type']

        self.RNN = nn.RNN(input_size = model_config['emb_dim'],
                          hidden_size = model_config['hidden_dim'],
                          dropout = model_config['dropout'],
                          bidirectional = model_config['bidirectional'],
                          batch_first = model_config['batch_first'])
        
        self.LSTM = nn.LSTM(input_size = model_config['emb_dim'],
                          hidden_size = model_config['hidden_dim'],
                          dropout = model_config['dropout'],
                          bidirectional = model_config['bidirectional'],
                          batch_first = model_config['batch_first'])
        
        self.GRU = nn.GRU(input_size = model_config['emb_dim'],
                          hidden_size = model_config['hidden_dim'],
                          dropout = model_config['dropout'],
                          bidirectional = model_config['bidirectional'],
                          batch_first = model_config['batch_first'])
        
        self.fc = nn.Linear(model_config['hidden_dim'] * self.num_directions,
                            model_config['output_dim'])

        self.drop = nn.Dropout(model_config['dropout'])


    def forward(self, x) :
        emb = self.emb(x)

        if self.model_type == 'RNN' :
            output, hidden = self.RNN(emb)
        elif self.model_type == 'LSTM' :
            output, hidden = self.LSTM(emb)
        elif self.model_type == 'GRU' :
            output, hidden = self.GRU(emb)
        else :
            raise NameError('Select model_type in [RNN, LSTM, GUR]')

        last_output = output[:, -1, :]

        return self.fc(self.drop(last_output))

In [21]:
sample_for_check = next(iter(train_iterator))
print(sample_for_check)
print(sample_for_check.text)
print(sample_for_check.label)



KeyError: ignored

In [13]:
next(iter(train_iterator))



KeyError: ignored

In [None]:
# del model

In [None]:
model_config = {'emb_type' : 'glove', 'emb_dim' : 300, 'vocab_size' : len(TEXT.vocab), 'batch_size' : 32}
model_config.update(dict(batch_first = True, 
                         model_type = 'RNN',
                         bidirectional = True,
                         hidden_dim = 128,
                         output_dim = 1,
                         dropout = 0))
model = SentenceClassification(**model_config).to(device)
loss_fn = nn.BCEWithLogitsLoss().to(device) 

def binary_accuracy(pred,y) :
    rounded_preds = torch.round(torch.sigmoid(pred))
    correct = (rounded_preds == y).float()
    acc = correct.sum() / len(correct) 
    return acc 

predictions = model.forward(sample_for_check.text).squeeze()
loss = loss_fn(predictions, sample_for_check.label)
acc = binary_accuracy(predictions, sample_for_check.label)

print(predictions)
print(loss.item(), acc)

In [None]:
sample_for_check.text

In [None]:
model_config

In [None]:
TEXT.vocab.vectors

In [None]:
def train(model, iterator, optimizer, loss_fn, idx_Epoch, **model_params) :
    Epoch_loss = 0
    Epoch_acc = 0

    model.train()
    batch_size = model_params['batch_size']

    for idx, batch in enumerate(iterator) :
        optimizer.zero_grad()
        predictions = model(batch.text).squeeze()
        loss = loss_fn(predictions, batch.label)
        acc = binary_accuracy(predictions, batch.label)

        sys.stdout.write(
            '\r' + f'[Train] Epoch : {idx_Epoch : ^3}'\
            f'[{(idx+1) * batch_size} / {len(iterator) * batch_size}'\
            f'  Loss : {loss.item():.4}'\
            f'  Acc : {acc.item():.4}'\
        )
    
        loss.backward()
        optimizer.step()

        Epoch_loss += loss.item()
        Epoch_acc += acc.item()

    return Epoch_loss / len(iterator), Epoch_acc / len(iterator) 

def evaluate(model, iterator, loss_fn) :
    model.eval()

    Epoch_loss = 0 
    Epoch_acc = 0 

    with torch.no_grad() :
        for batch in iterator :
            predictions = model(batch.text).squeeze()
            loss = loss_fn(predictions, batch.label)
            acc = binary_accuracy(predictions, batch.label)

            Epoch_loss += loss.item()
            Epoch_acc += acc.item()

    return Epoch_loss / len(iterator), Epoch_acc / len(iterator)

In [None]:
del model

In [None]:
model_config.update(dict(batch_first = True, 
                         model_type = 'RNN',
                         bidirectional = True,
                         hidden_dim = 128,
                         output_dim = 1,
                         dropout = 0))
model = SentenceClassification(**model_config).to(device)
optimizer = torch.optim.Adam(model.parameters())
loss_fn = nn.BCEWithLogitsLoss().to(device)

N_EPOCH = 5
best_valid_loss = float('inf')
model_name = f'{"bi-" if model_config["bidirectional"] else ""}{model_config["model_type"]}_{model_config["emb_type"]}'

print('-------------------------------------')
print(f'Model name : {model_name}')
print('-------------------------------------')

import sys
for Epoch in range(1, N_EPOCH+1) :
    train_loss, train_acc = train(model, train_iterator, optimizer, loss_fn, Epoch, **model_config)
    valid_loss, valid_acc = evaluate(model, valid_iterator, loss_fn)
    print('')
    if valid_loss < best_valid_loss :
        best_valid_loss = valid_loss
        torch.save(model.state_dict(), f'./{model_name}.pt')
        print(f'\t saved at  {Epoch}-Epoch')

    print(f'\t Epoch : {Epoch} | Train Loss : {train_loss:.4} | Train_Acc : {train_acc:.4}')
    print(f'\t Epoch : {Epoch} | Valid Loss : {valid_loss:.4} | Valid_Acc : {valid_acc:.4}')