In [1]:
import torchtext
# Display the installed torchtext version; the rest of the notebook uses the
# legacy `torchtext.data.Field` / `BucketIterator` API.
torchtext.__version__

'0.8.1'

In [None]:
!pip install torchtext==0.8.1

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Collecting torchtext==0.8.1
  Downloading torchtext-0.8.1-cp39-cp39-manylinux1_x86_64.whl (6.9 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m6.9/6.9 MB[0m [31m33.6 MB/s[0m eta [36m0:00:00[0m
Collecting torch==1.7.1
  Downloading torch-1.7.1-cp39-cp39-manylinux1_x86_64.whl (776.8 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m776.8/776.8 MB[0m [31m1.7 MB/s[0m eta [36m0:00:00[0m
Installing collected packages: torch, torchtext
  Attempting uninstall: torch
    Found existing installation: torch 1.13.1+cu116
    Uninstalling torch-1.13.1+cu116:


In [2]:
from torchtext import data 
from torchtext import datasets

# First look at the IMDB dataset with default tokenisation.
# NOTE(review): `train` / `test` are not used by the visible cells below; the
# next cell redefines TEXT / LABEL and reloads the data as train_data / test_data.
TEXT = data.Field(lower = True, batch_first = True)
# Labels are single categorical values ('pos' / 'neg'), so use LabelField
# rather than a sequential Field (which would tokenise and pad them).
LABEL = data.LabelField()

train, test = datasets.IMDB.splits(TEXT, LABEL)



downloading aclImdb_v1.tar.gz


aclImdb_v1.tar.gz: 100%|██████████| 84.1M/84.1M [00:05<00:00, 16.5MB/s]


In [3]:
import torch 
from torchtext import data, datasets
# Options for the review text field: whitespace tokenisation, every example
# padded (on the left) to exactly 500 tokens, batch dimension first.
text_field_options = dict(batch_first = True,
                          fix_length = 500,
                          tokenize = str.split,
                          pad_first = True,
                          pad_token = '[PAD]',
                          unk_token = '[UNK]')
TEXT = data.Field(**text_field_options)

# Labels are numericalised as floats.
LABEL = data.LabelField(dtype = torch.float)

# Load the IMDB train / test splits using the two fields above.
train_data, test_data = datasets.IMDB.splits(text_field = TEXT, label_field = LABEL)



In [5]:
# Report how many examples each split holds.
n_train_examples = len(train_data.examples)
n_test_examples = len(test_data.examples)
print(f'Train_data_Length : {n_train_examples}')
print(f'test_data_Length : {n_test_examples}')

Train_data_Length : 25000
test_data_Length : 25000


In [6]:
# Show the field-name -> Field object mapping attached to the dataset
# (keys 'text' and 'label' in the recorded output).
print(train_data.fields)

{'text': <torchtext.data.field.Field object at 0x7f2c2d80a7f0>, 'label': <torchtext.data.field.LabelField object at 0x7f2c2d829730>}


In [9]:
# Print one raw training example (tokens re-joined with spaces) and its label.
sample_fields = vars(train_data.examples[1])

print('---Data Sample---')
print('\nInput : ')
# '\n' (not '\\n') so a real blank line is emitted instead of the literal text "\n".
print(' '.join(sample_fields['text']), '\n')
print('\nLabel : ')
print(sample_fields['label'])

---Data Sample---

Input : 
Certainly any others I have seen pale in comparison. The series gives balanced coverage to all theatres of operation. No one country is given undue credit for the Allied victory. Laurence Olivier brings great weight and dignity to his role as narrator. \n

Label : 
pos


In [34]:
import re

def PreProcessingText(input_sentence) :
    """Normalise a raw review string before tokenisation.

    Steps: lower-case the text, replace HTML tags (e.g. ``<br />``) with a
    space, replace every character outside ``[a-z0-9]`` with a space, then
    collapse whitespace runs into a single space and trim both ends.

    Args:
        input_sentence: the raw text as a ``str``.

    Returns:
        The cleaned string. Always a ``str`` (possibly empty), so callers can
        safely chain ``.split()``; the previous version returned ``None`` for
        empty input, which made ``.split()`` raise ``AttributeError``.
    """
    input_sentence = input_sentence.lower()
    # Handle HTML tags such as <br />
    input_sentence = re.sub(r'<[^>]*>', ' ', input_sentence)
    # Keep only lower-case letters and digits
    input_sentence = re.sub(r'[^a-z0-9]', ' ', input_sentence)
    # Collapse repeated whitespace left behind by the substitutions above
    input_sentence = re.sub(r'\s+', ' ', input_sentence)
    return input_sentence.strip()
# Clean every example of both splits in place: re-join the tokens, normalise
# the text with PreProcessingText, and tokenise again on whitespace.
for dataset in (train_data, test_data) :
    for example in dataset.examples :
        joined_text = ' '.join(example.text)
        example.text = PreProcessingText(joined_text).split()


In [36]:
# Build the token vocabulary from the training split: keep tokens seen at
# least twice, no cap on vocabulary size, and attach GloVe 6B 300-d vectors.
# NOTE(review): this runs before the train/validation split further down, so
# validation tokens also contribute to the vocabulary.
text_vocab_options = dict(min_freq = 2,
                          max_size = None,
                          vectors = 'glove.6B.300d')
TEXT.build_vocab(train_data, **text_vocab_options)

# Build the label vocabulary from the same split.
LABEL.build_vocab(train_data)

In [37]:
# Summarise the text and label vocabularies: size plus the first 10
# token -> index entries of each.
print(f'Vocab Size : {len(TEXT.vocab)}')

print('Vocab Examples : ')
for idx, (k, v) in enumerate(TEXT.vocab.stoi.items()) :
    if idx >= 10 :
        break
    # '\t' (not '\\t') so the row is indented with a real tab instead of
    # printing the two characters "\t".
    print('\t', k, v)

print('---------------------------------------')

print(f'Label Size : {len(LABEL.vocab)}')

for idx, (k, v) in enumerate(LABEL.vocab.stoi.items()) :
    if idx >= 10 :
        break
    print('\t', k, v)

Vocab Size : 340
Vocab Examples : 
\t [UNK] 0
\t [PAD] 1
\t a 2
\t b 3
\t ab 4
\t ba 5
\t 10 6
\t bab 7
\t 2 8
\t bb 9
---------------------------------------
Label Size : 2
\t neg 0
\t pos 1


In [38]:
import random 

# Seed the stdlib RNG explicitly, then hand its state to split().
# `random.seed(0)` returns None, so passing its return value as random_state
# only worked through the side effect of seeding; this makes the intent explicit.
random.seed(0)
# NOTE: `train_data` is rebound to the 80% subset, so re-running this cell
# splits the already reduced training set again.
train_data, valid_data = train_data.split(split_ratio = 0.8,
                                          random_state = random.getstate())

device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

# One iterator per split, all with batch size 32 and tensors placed on `device`.
train_iterator, valid_iterator, test_iterator = data.BucketIterator.splits(
    datasets = (train_data, valid_data, test_data),
    batch_size = 32,
    device = device)



In [67]:
import torch.nn as nn
class SentenceClassification(nn.Module) :
    """Recurrent sentence classifier: embedding -> RNN/LSTM/GRU -> dropout -> linear.

    Keyword arguments (``model_config``):
        emb_type: ``'glove'`` or ``'fasttext'`` to initialise the embedding
            from pretrained vectors; any other value gives a randomly
            initialised embedding.
        pretrained_vectors (optional): tensor of shape
            ``(vocab_size, emb_dim)`` used when ``emb_type`` is pretrained.
            Defaults to the notebook-level ``TEXT.vocab.vectors``.
        vocab_size, emb_dim: embedding table dimensions.
        hidden_dim: hidden size of the recurrent layer.
        output_dim: size of the final linear output.
        dropout: dropout probability (recurrent modules and the head).
        bidirectional: whether the recurrent layer is bidirectional.
        batch_first: whether inputs are ``(batch, seq)`` or ``(seq, batch)``.
        model_type: ``'RNN'``, ``'LSTM'`` or ``'GRU'``; selects which
            recurrent module ``forward`` uses.
    """

    def __init__(self, **model_config) :
        super(SentenceClassification, self).__init__()

        # Membership test: the previous `== 'glove' or 'fasttext'` was always
        # truthy, so the randomly-initialised branch could never be reached.
        if model_config['emb_type'] in ('glove', 'fasttext') :
            vectors = model_config.get('pretrained_vectors')
            if vectors is None :
                vectors = TEXT.vocab.vectors
            if vectors is None :
                raise ValueError('Pretrained embedding requested but no vectors are available')
            expected_shape = (model_config['vocab_size'], model_config['emb_dim'])
            if tuple(vectors.shape) != expected_shape :
                raise ValueError(f'Pretrained vectors have shape {tuple(vectors.shape)}, '
                                 f'expected {expected_shape}')
            # Clone so that fine-tuning does not modify the source vectors in
            # place; freeze=False keeps the embedding trainable.
            self.emb = nn.Embedding.from_pretrained(vectors.clone(), freeze = False)

        else :
            self.emb = nn.Embedding(model_config['vocab_size'],
                                    model_config['emb_dim'])

        self.bidirectional = model_config['bidirectional']
        self.num_directions = 2 if model_config['bidirectional'] else 1
        self.model_type = model_config['model_type']
        self.batch_first = model_config['batch_first']

        # All three recurrent variants are instantiated (attribute names and
        # state_dict keys unchanged); `model_type` picks one at forward time.
        recurrent_kwargs = dict(input_size = model_config['emb_dim'],
                                hidden_size = model_config['hidden_dim'],
                                dropout = model_config['dropout'],
                                bidirectional = model_config['bidirectional'],
                                batch_first = model_config['batch_first'])
        self.RNN = nn.RNN(**recurrent_kwargs)
        self.LSTM = nn.LSTM(**recurrent_kwargs)
        self.GRU = nn.GRU(**recurrent_kwargs)

        self.fc = nn.Linear(model_config['hidden_dim'] * self.num_directions,
                            model_config['output_dim'])

        self.drop = nn.Dropout(model_config['dropout'])


    def forward(self, x) :
        """Return logits of shape ``(batch, output_dim)`` for token indices ``x``.

        Raises:
            NameError: if ``model_type`` is not one of RNN, LSTM, GRU.
        """
        emb = self.emb(x)

        if self.model_type == 'RNN' :
            output, hidden = self.RNN(emb)
        elif self.model_type == 'LSTM' :
            output, hidden = self.LSTM(emb)
        elif self.model_type == 'GRU' :
            output, hidden = self.GRU(emb)
        else :
            raise NameError('Select model_type in [RNN, LSTM, GRU]')

        # Features at the final time step; the time axis depends on batch_first.
        # NOTE(review): for a bidirectional layer the reverse direction has
        # only processed the final token at this position - confirm intended.
        if self.batch_first :
            last_output = output[:, -1, :]
        else :
            last_output = output[-1]

        return self.fc(self.drop(last_output))

In [68]:
# Pull a single batch from the training iterator and show the batch summary,
# its token-index tensor and its label tensor.
batch_stream = iter(train_iterator)
sample_for_check = next(batch_stream)
del batch_stream
for shown in (sample_for_check, sample_for_check.text, sample_for_check.label) :
    print(shown)
del shown


[torchtext.data.batch.Batch of size 32]
	[.text]:[torch.LongTensor of size 32x500]
	[.label]:[torch.FloatTensor of size 32]
tensor([[1, 1, 1,  ..., 2, 2, 2],
        [1, 1, 1,  ..., 2, 2, 2],
        [1, 1, 1,  ..., 2, 2, 3],
        ...,
        [1, 1, 1,  ..., 3, 2, 2],
        [1, 1, 1,  ..., 2, 2, 2],
        [1, 1, 1,  ..., 2, 2, 2]])
tensor([1., 0., 0., 0., 0., 1., 1., 1., 0., 0., 0., 1., 0., 0., 1., 0., 0., 1.,
        1., 0., 0., 1., 1., 1., 1., 1., 1., 1., 0., 0., 0., 1.])




In [69]:
# Drop any model left over from a previous run of the cell below.
# Guarded so that a fresh kernel (where `model` does not exist yet) can still
# run the notebook top to bottom.
try :
    del model
except NameError :
    pass

In [71]:
# Hyper-parameters for the classifier, gathered in a single mapping.
model_config = dict(emb_type = 'glove',
                    emb_dim = 300,
                    vocab_size = len(TEXT.vocab),
                    batch_size = 32,
                    batch_first = True,
                    model_type = 'RNN',
                    bidirectional = True,
                    hidden_dim = 128,
                    output_dim = 1,
                    dropout = 0)

# Instantiate the network and the loss, both moved to the selected device.
model = SentenceClassification(**model_config).to(device)
loss_fn = nn.BCEWithLogitsLoss().to(device)

def binary_accuracy(pred,y) :
    """Fraction of predictions that match the binary targets.

    Args:
        pred: tensor of raw logits.
        y: tensor of 0/1 targets with the same number of elements as ``pred``.

    Returns:
        A 0-d float tensor: mean of ``round(sigmoid(pred)) == y``.

    Raises:
        ValueError: if ``pred`` and ``y`` hold a different number of elements.
    """
    if pred.numel() != y.numel() :
        raise ValueError(f'pred has {pred.numel()} elements but y has {y.numel()}')
    # Flatten both tensors so that (N, 1) logits against (N,) targets are not
    # silently broadcast to an (N, N) comparison, and 0-d inputs are accepted.
    pred = pred.reshape(-1)
    y = y.reshape(-1)
    rounded_preds = torch.round(torch.sigmoid(pred))
    correct = (rounded_preds == y).float()
    acc = correct.mean()
    return acc

# Sanity check on one batch: forward pass, loss and accuracy of the untrained model.
# Call the module itself rather than .forward() so nn.Module hooks run, and
# squeeze only the trailing output dimension so a batch of size 1 keeps its
# batch axis.
predictions = model(sample_for_check.text).squeeze(-1)
loss = loss_fn(predictions, sample_for_check.label)
acc = binary_accuracy(predictions, sample_for_check.label)

print(predictions)
print(loss.item(), acc)

tensor([ 0.0717,  0.0470, -0.1022,  0.2179,  0.0679,  0.0679, -0.1241,  0.1152,
         0.0636, -0.1022,  0.0687,  0.0677,  0.0854,  0.0998,  0.1848,  0.0633,
         0.0679, -0.1022,  0.0679,  0.0357,  0.0697,  0.0534,  0.0679,  0.1529,
         0.2122,  0.1775,  0.1346,  0.0721, -0.0441, -0.0415,  0.0686,  0.0369],
       grad_fn=<SqueezeBackward0>)
0.6852755546569824 tensor(0.5625)


In [52]:
# Inspect the token-index tensor of the sample batch (index 1 is the '[PAD]'
# token per the vocabulary listing, hence the leading 1s from left padding).
sample_for_check.text

tensor([[ 1,  1,  1,  ...,  2,  8,  2],
        [ 1,  1,  1,  ...,  3,  3,  2],
        [ 1,  1,  1,  ...,  3,  2,  3],
        ...,
        [ 1,  1,  1,  ...,  2, 26,  2],
        [ 1,  1,  1,  ...,  2,  2,  2],
        [ 1,  1,  1,  ...,  2,  2,  2]])

In [59]:
# Display the hyper-parameter mapping passed to SentenceClassification.
model_config

{'emb_type': 'glove',
 'emb_dim': 300,
 'vocab_size': 340,
 'batch_first': True,
 'model_type': 'RNN',
 'bidirectional': True,
 'hidden_dim': 128,
 'output_dim': 1,
 'dropout': 0}

In [17]:
# Display the pretrained vectors attached to the text vocabulary
# (one row per vocabulary entry; all-zero rows appear in the recorded output).
TEXT.vocab.vectors

tensor([[ 0.0000,  0.0000,  0.0000,  ...,  0.0000,  0.0000,  0.0000],
        [ 0.0000,  0.0000,  0.0000,  ...,  0.0000,  0.0000,  0.0000],
        [ 0.0466,  0.2132, -0.0074,  ...,  0.0091, -0.2099,  0.0539],
        ...,
        [ 0.5106, -0.1357,  0.2680,  ...,  0.1161, -0.0522,  0.3853],
        [ 0.1081, -0.3056, -0.0545,  ..., -0.0378,  0.0010,  0.7474],
        [ 0.0000,  0.0000,  0.0000,  ...,  0.0000,  0.0000,  0.0000]])