All datasets are subclasses of torchtext.data.Dataset,
which inherits from torch.utils.data.Dataset i.e, they have split and iters methods implemented.


In [4]:
import torchtext
from torchtext import datasets
from torchtext.data import Field
from torchtext.vocab import GloVe

In [None]:
#train_iter, test_iter = datasets.IMDB.iters(batch_size=4)

In [None]:
TEXT = Field(lower=True, include_lengths=True, batch_first=True)
LABEL = Field(sequential=False)

# make splits for data
train, test = datasets.IMDB.splits(TEXT, LABEL)

# build the vocabulary
TEXT.build_vocab(train, vectors=GloVe(name='6B', dim=300))
LABEL.build_vocab(train)



In [5]:
# make iterator for splits
train_iter, test_iter = torchtext.data.BucketIterator.splits(
    (train, test), batch_size=3, device=0)

The `device` argument should be set by using `torch.device` or passing a string as an argument. This behavior will be deprecated soon and currently defaults to cpu.
The `device` argument should be set by using `torch.device` or passing a string as an argument. This behavior will be deprecated soon and currently defaults to cpu.


## From Practical Torchtext

https://github.com/keitakurita/practical-torchtext

In [13]:
!ls data/practical_pytorch/

test.csv  train.csv  valid.csv


In [29]:
import pandas as pd
import numpy as np
import torch
from torchtext.data import TabularDataset
from torchtext.data import Iterator, BucketIterator
use_cuda = torch.cuda.is_available()
device = torch.device("cuda:0" if use_cuda else "cpu")

data_folder = "data/practical_pytorch/"

pd.read_csv(data_folder + "train.csv").head(2)

Unnamed: 0,id,comment_text,toxic,severe_toxic,obscene,threat,insult,identity_hate
0,0000997932d777bf,Explanation\nWhy the edits made under my usern...,0,0,0,0,0,0
1,000103f0d9cfb60f,D'aww! He matches this background colour I'm s...,0,0,0,0,0,0


In [15]:
pd.read_csv(data_folder + "valid.csv").head(2)

Unnamed: 0,id,comment_text,toxic,severe_toxic,obscene,threat,insult,identity_hate
0,000eefc67a2c930f,Radial symmetry \n\nSeveral now extinct lineag...,0,0,0,0,0,0
1,000f35deef84dc4a,There's no need to apologize. A Wikipedia arti...,0,0,0,0,0,0


In [17]:
from torchtext.data import Field
tokenize = lambda x: x.split()
TEXT = Field(sequential=True, tokenize=tokenize, lower=True)
LABEL = Field(sequential=False, use_vocab=False)

tv_datafields = [("id", None), # we won't be needing the id, 
                              # so we pass in None as the field
                 ("comment_text", TEXT), ("toxic", LABEL),
                 ("severe_toxic", LABEL), ("threat", LABEL),
                 ("obscene", LABEL), ("insult", LABEL),
                 ("identity_hate", LABEL)]

trn, vld = TabularDataset.splits(
        path=data_folder, 
        train='train.csv', validation="valid.csv",
        format='csv',
        skip_header=True,
        fields=tv_datafields)


In [18]:
tst_datafields = [("id", None), 
                 ("comment_text", TEXT)
]

tst = TabularDataset(
        path= data_folder + "test.csv", # the file path
        format='csv',
        skip_header=True, 
        fields=tst_datafields)


In [None]:
#

In [21]:
TEXT.build_vocab(trn) #(trx, vectors=GloVe(name='6B', dim=300))

In [22]:
TEXT.vocab.freqs.most_common(10)

[('the', 78),
 ('to', 41),
 ('you', 33),
 ('of', 30),
 ('and', 26),
 ('a', 26),
 ('is', 24),
 ('that', 22),
 ('i', 20),
 ('if', 19)]

In [24]:
trn[0].__dict__.keys()

dict_keys(['comment_text', 'toxic', 'severe_toxic', 'threat', 'obscene', 'insult', 'identity_hate'])

In [26]:
trn[0].comment_text[0:5]

['explanation', 'why', 'the', 'edits', 'made']

In [30]:
train_iter, val_iter = BucketIterator.splits(
        (trn, vld),
        batch_sizes=(64, 64),
        device=device,
        sort_key=lambda x: len(x.comment_text), # the BucketIterator needs to be told what function it should use to group the data.
        sort_within_batch=False,
        repeat=False # we pass repeat=False because we want to wrap this Iterator layer.
)

In [31]:
batch = next(train_iter.__iter__()); batch


[torchtext.data.batch.Batch of size 25]
	[.comment_text]:[torch.cuda.LongTensor of size 494x25 (GPU 0)]
	[.toxic]:[torch.cuda.LongTensor of size 25 (GPU 0)]
	[.severe_toxic]:[torch.cuda.LongTensor of size 25 (GPU 0)]
	[.threat]:[torch.cuda.LongTensor of size 25 (GPU 0)]
	[.obscene]:[torch.cuda.LongTensor of size 25 (GPU 0)]
	[.insult]:[torch.cuda.LongTensor of size 25 (GPU 0)]
	[.identity_hate]:[torch.cuda.LongTensor of size 25 (GPU 0)]

In [32]:
batch.__dict__.keys()

dict_keys(['batch_size', 'dataset', 'fields', 'input_fields', 'target_fields', 'comment_text', 'toxic', 'severe_toxic', 'threat', 'obscene', 'insult', 'identity_hate'])

In [33]:
test_iter = Iterator(tst, batch_size=64, device=device, sort=False,
                     sort_within_batch=False, repeat=False)

In [34]:
class BatchWrapper:
    def __init__(self, dl, x_var, y_vars):
        self.dl, self.x_var, self.y_vars = dl, x_var, y_vars
    
    def __iter__(self):
        for batch in self.dl:
            x = getattr(batch, self.x_var) # we assume only one input in this wrapper
            
            if self.y_vars is not None: # we will concatenate y into a single tensor
                y = torch.cat([getattr(batch, feat).unsqueeze(1) for feat in self.y_vars], dim=1).float()
            else:
                y = torch.zeros((1))

            yield (x, y)
    
    def __len__(self):
        return len(self.dl)
    

train_dl = BatchWrapper(train_iter, "comment_text", 
                        ["toxic", "severe_toxic", "obscene", "threat",
                         "insult", "identity_hate"])
valid_dl = BatchWrapper(val_iter, "comment_text", ["toxic", "severe_toxic",
                                                   "obscene", "threat", "insult",
                                                   "identity_hate"])
test_dl = BatchWrapper(test_iter, "comment_text", None)

In [37]:
import torch.nn as nn
import torch.nn.functional as F
import torch.optim as optim
from torch.autograd import Variable

class SimpleBiLSTMBaseline(nn.Module):
    def __init__(self, hidden_dim, emb_dim=300,
                 spatial_dropout=0.05, recurrent_dropout=0.1, num_linear=1):
        super().__init__() 
        self.embedding = nn.Embedding(len(TEXT.vocab), emb_dim)
        self.encoder = nn.LSTM(emb_dim, hidden_dim, num_layers=1,
                               dropout=recurrent_dropout)
        self.linear_layers = []
        for _ in range(num_linear - 1):
            self.linear_layers.append(nn.Linear(hidden_dim, hidden_dim))
        self.linear_layers = nn.ModuleList(self.linear_layers)
        self.predictor = nn.Linear(hidden_dim, 6)
    
    def forward(self, seq):
        hdn, _ = self.encoder(self.embedding(seq))
        feature = hdn[-1, :, :]
        for layer in self.linear_layers:
            feature = layer(feature)
        preds = self.predictor(feature)
        return preds
    

em_sz = 100
nh = 500
nl = 3
model = SimpleBiLSTMBaseline(nh, emb_dim=em_sz)
model.to(device)
model

SimpleBiLSTMBaseline(
  (embedding): Embedding(784, 100)
  (encoder): LSTM(100, 500, dropout=0.1)
  (linear_layers): ModuleList()
  (predictor): Linear(in_features=500, out_features=6, bias=True)
)

In [47]:
import tqdm


opt = optim.Adam(model.parameters(), lr=1e-2)
loss_func = nn.BCEWithLogitsLoss()
epochs = 30

for epoch in range(1, epochs + 1):
    running_loss = 0.0
    running_corrects = 0
    model.train() # turn on training mode
    for x, y in tqdm.tqdm(train_dl): # thanks to our wrapper, we can intuitively iterate over our data!
        opt.zero_grad()

        preds = model(x)
        loss = loss_func(preds, y)
        loss.backward()
        opt.step()
        
        running_loss += loss.item() + x.size(0)#loss.data[0] * x.size(0)
        
    epoch_loss = running_loss / len(trn)
    
    # calculate the validation loss for this epoch
    val_loss = 0.0
    model.eval() # turn on evaluation mode
    for x, y in valid_dl:
        preds = model(x)
        loss = loss_func(preds, y)
        val_loss += loss.item() * x.size(0)

    val_loss /= len(vld)
    print('Epoch: {}, Training Loss: {:.4f}, Validation Loss: {:.4f}'.format(epoch, epoch_loss, val_loss))


  0%|          | 0/1 [00:00<?, ?it/s][A
100%|██████████| 1/1 [00:00<00:00,  9.67it/s]

100%|██████████| 1/1 [00:00<00:00, 12.34it/s]

100%|██████████| 1/1 [00:00<00:00, 14.87it/s]

  0%|          | 0/1 [00:00<?, ?it/s][A

Epoch: 1, Training Loss: 19.7675, Validation Loss: 2.0425
Epoch: 2, Training Loss: 19.7676, Validation Loss: 1.9519
Epoch: 3, Training Loss: 19.7658, Validation Loss: 2.1259


100%|██████████| 1/1 [00:00<00:00, 15.22it/s]

100%|██████████| 1/1 [00:00<00:00, 17.71it/s]

100%|██████████| 1/1 [00:00<00:00, 17.15it/s]

100%|██████████| 1/1 [00:00<00:00, 17.56it/s]

Epoch: 4, Training Loss: 19.7660, Validation Loss: 2.1997
Epoch: 5, Training Loss: 19.7661, Validation Loss: 2.2132
Epoch: 6, Training Loss: 19.7661, Validation Loss: 2.1882




100%|██████████| 1/1 [00:00<00:00, 17.34it/s]

100%|██████████| 1/1 [00:00<00:00, 17.53it/s]

100%|██████████| 1/1 [00:00<00:00, 17.46it/s]

Epoch: 7, Training Loss: 19.7659, Validation Loss: 2.1762
Epoch: 8, Training Loss: 19.7658, Validation Loss: 2.1911
Epoch: 9, Training Loss: 19.7658, Validation Loss: 2.2463




100%|██████████| 1/1 [00:00<00:00, 17.62it/s]

100%|██████████| 1/1 [00:00<00:00, 17.36it/s]

100%|██████████| 1/1 [00:00<00:00, 17.49it/s]

Epoch: 10, Training Loss: 19.7658, Validation Loss: 2.3242
Epoch: 11, Training Loss: 19.7658, Validation Loss: 2.3830
Epoch: 12, Training Loss: 19.7658, Validation Loss: 2.3815




100%|██████████| 1/1 [00:00<00:00, 17.59it/s]

100%|██████████| 1/1 [00:00<00:00, 17.59it/s]

  0%|          | 0/1 [00:00<?, ?it/s][A

Epoch: 13, Training Loss: 19.7658, Validation Loss: 2.3661
Epoch: 14, Training Loss: 19.7657, Validation Loss: 2.4545
Epoch: 15, Training Loss: 19.7655, Validation Loss: 2.4653


100%|██████████| 1/1 [00:00<00:00, 17.08it/s]

100%|██████████| 1/1 [00:00<00:00, 17.34it/s]

100%|██████████| 1/1 [00:00<00:00, 17.65it/s]

100%|██████████| 1/1 [00:00<00:00, 17.44it/s]

Epoch: 16, Training Loss: 19.7650, Validation Loss: 2.5071
Epoch: 17, Training Loss: 19.7643, Validation Loss: 2.5903
Epoch: 18, Training Loss: 19.7635, Validation Loss: 2.6039




100%|██████████| 1/1 [00:00<00:00, 17.53it/s]

100%|██████████| 1/1 [00:00<00:00, 17.48it/s]

  0%|          | 0/1 [00:00<?, ?it/s][A

Epoch: 19, Training Loss: 19.7626, Validation Loss: 2.6680
Epoch: 20, Training Loss: 19.7624, Validation Loss: 2.8529
Epoch: 21, Training Loss: 19.7622, Validation Loss: 2.4607


100%|██████████| 1/1 [00:00<00:00, 17.55it/s]

100%|██████████| 1/1 [00:00<00:00, 17.49it/s]

100%|██████████| 1/1 [00:00<00:00, 17.30it/s]

  0%|          | 0/1 [00:00<?, ?it/s][A

Epoch: 22, Training Loss: 19.7619, Validation Loss: 2.6596
Epoch: 23, Training Loss: 19.7616, Validation Loss: 2.8700
Epoch: 24, Training Loss: 19.7617, Validation Loss: 3.0955


100%|██████████| 1/1 [00:00<00:00, 17.19it/s]

100%|██████████| 1/1 [00:00<00:00, 17.38it/s]

100%|██████████| 1/1 [00:00<00:00, 17.22it/s]

  0%|          | 0/1 [00:00<?, ?it/s][A

Epoch: 25, Training Loss: 19.7611, Validation Loss: 3.1806
Epoch: 26, Training Loss: 19.7610, Validation Loss: 3.2209
Epoch: 27, Training Loss: 19.7608, Validation Loss: 3.3348


100%|██████████| 1/1 [00:00<00:00, 17.40it/s]

100%|██████████| 1/1 [00:00<00:00, 17.01it/s]

100%|██████████| 1/1 [00:00<00:00, 17.33it/s]

Epoch: 28, Training Loss: 19.7606, Validation Loss: 3.4046
Epoch: 29, Training Loss: 19.7605, Validation Loss: 3.5304
Epoch: 30, Training Loss: 19.7605, Validation Loss: 3.6833





In [44]:
x.size(0)

494