# Torchtext Code Along & Notes

**NB!** The current (30/3/2018) release of PyTorch Torchtext has some bugs preventing execution of this notebook. The updated version of Torchtext used in this notebook can be retrieved via:
```
pip install --upgrade git+https://github.com/pytorch/text
```

-- Wayne Nixalo

---

## Clean Run

### 1. Imports & Paths

In [1]:
import pandas as pd
import numpy as np
import pathlib
import torch
from torchtext.data import Field
from torchtext.data import TabularDataset
from torchtext.data import Iterator, BucketIterator
import torch.nn as nn
import torch.nn.functional as F
import torch.optim as optim
from torch.autograd import Variable
import tqdm

In [2]:
path  = pathlib.Path('../../data/')
comp  = pathlib.Path('competitions/jigsaw-toxic-comment-classification-challenge/')
TRAIN_DATA_FILE = pathlib.Path('train.csv')
TEST_DATA_FILE  = pathlib.Path('test.csv')

device=0 # index of GPU. -1:CPU

### 2. Declare Fields

In [6]:
tokenize = lambda x: x.split()

TEXT = Field(sequential=True, tokenize=tokenize, lower=True)
LABEL = Field(sequential=False, use_vocab=False)

### 3. Construct Dataset

In [9]:
%%time
tv_datafields = [("id", None),      ("comment_text", TEXT), 
                 ("toxic", LABEL),  ("severe_toxic", LABEL), 
                 ("threat", LABEL), ("obscene", LABEL), 
                 ("insult", LABEL), ("identity_hate", LABEL)]
trn, vld = TabularDataset.splits(
        path=path/comp,
        train=TRAIN_DATA_FILE, validation=TRAIN_DATA_FILE,
        format='csv', skip_header=True,
        fields=tv_datafields)

# for the test data we don't have any labels
tst_datafields = [("id", None), ("comment_text", TEXT)]

tst = TabularDataset(
        path=path/comp/TEST_DATA_FILE,
        format='csv',
        skip_header=True,
        fields=tst_datafields)

TEXT.build_vocab(trn)

CPU times: user 21.7 s, sys: 1.25 s, total: 23 s
Wall time: 22.9 s


### 4. Constructing the Iterator

In [16]:
%%time

train_iter, val_iter = BucketIterator.splits(
    (trn, vld),  # datasets for Iterator to draw data from
    batch_sizes=(64,64),
    device=device, # GPU/CPU
    sort_key = lambda x: len(x.comment_text), # fn to group data with (here: comment length)
    sort_within_batch=False,
    repeat=False # this Iterator will be wrapped
)

# test set shouldn't be shuffled ==> use standard Iterator
test_iter = Iterator(
    tst, batch_size=64, device=device, sort=False, 
    sort_within_batch=False, repeat=False
)

### 5. Wrapping the Iterator

In [20]:
class BatchWrapper:
    def __init__(self, dl, x, y):
        self.dl, self.x, self.y = dl, x, y

    def __iter__(self):
        for batch in self.dl:
            x = getattr(batch, self.x) # assuming only 1 input in this wrapper

            if self.y is not None: # we'll cocncat y into a single tensor
                y = torch.cat([getattr(batch, feat).unsqueeze(1) for feat in self.y], dim=1).float()
            else:
                y = torch.zeros((1))

            yield (x,y)

    def __len__(self):
        return len(self.dl)

In [21]:
train_dl = BatchWrapper(train_iter, "comment_text", list(trn.fields.keys())[2:])
valid_dl = BatchWrapper(val_iter, "comment_text", list(trn.fields.keys())[2:])
test_dl  = BatchWrapper(test_iter, "comment_text", None)

### 6. Model Architecture -- LSTM

In [24]:
class SimpleLSTMBaseline(nn.Module):
    def __init__(self, hidden_dim, emb_dim=300,
                 spatial_dropout=0.05, recurrent_dropout=0.1, num_linear=1):
        super().__init__()
        self.embedding = nn.Embedding(len(TEXT.vocab), emb_dim)
        self.encoder = nn.LSTM(emb_dim, hidden_dim, num_layers=1, dropout=recurrent_dropout, bidirectional=False)
        self.linear_layers = []
        for _ in range(num_linear - 1):
            self.linear_layers.append(nn.Linear(hidden_dim, hidden_dim))
        self.linear_layers = nn.ModuleList(self.linear_layers)
        self.predictor = nn.Linear(hidden_dim, 6)
        
    def forward(self, seq):
        hdn, _ = self.encoder(self.embedding(seq))
        feature = hdn[-1, :, :]
        for layer in self.linear_layers:
            feature = layer(feature)
        preds = self.predictor(feature)
        return preds

In [25]:
emb_sz = 100
nh = 500
nl = 3
model = SimpleLSTMBaseline(nh, emb_dim=emb_sz)

if device != -1:
    model.cuda()

model

SimpleLSTMBaseline(
  (embedding): Embedding(470342, 100)
  (encoder): LSTM(100, 500, dropout=0.1)
  (linear_layers): ModuleList(
  )
  (predictor): Linear(in_features=500, out_features=6, bias=True)
)

### 7. Training Loop

In [27]:
%%time

# opt = optim.Adam(model.parameters(), lr=1e-2, weight_decay=0.1)
opt = optim.Adam(model.parameters(), lr=1e-2)
loss_fn = nn.BCEWithLogitsLoss()

epochs = 2

for epoch in range(1, epochs+1):
    running_loss = 0.0
    running_corrects = 0
    model.train() # turn on training mode
    for x, y in tqdm.tqdm(train_dl): # we can easily iterate over our data thanks to our wrapper
        opt.zero_grad()
        preds = model(x)
        loss = loss_fn(preds, y)
        loss.backward()
        opt.step()
        
        running_loss += loss.data[0] * x.size(0)
        
    epoch_loss = running_loss / len(trn)
    
    # calculate validation loss for this epoch
    val_loss = 0.0
    model.eval() # turn on evaluation mode
    for x, y in valid_dl:
        preds = model(x)
        loss = loss_fn(preds, y)
        val_loss += loss.data[0] * x.size(0)
        
    val_loss /= len(vld)
    print(f'Epoch: {epoch}, Training Loss: {epoch_loss:.4f}, Validation Loss: {val_loss:.4f}')

### 8. Writing Predictions

In [29]:
test_preds = []
for x, y in tqdm.tqdm(test_dl):
    preds = model(x)
    # if your data is on the GPU you need to move it back to the CPU
    preds = preds.data.cpu().numpy()
    # preds = preds.data.numpy()
    # the actual outputs of the model are logits, so we need to pass these 
    # values to the sigmoid function
    preds = 1 / (1 + np.exp(-preds))
    test_preds.append(preds)
test_preds = np.concatenate(test_preds, axis=0)

100%|██████████| 2394/2394 [03:59<00:00, 10.02it/s]


In [33]:
df = pd.read_csv(path/comp/"sample_submission.csv")
for i,col in enumerate(["toxic","severe_toxic","threat","obscene","insult","identity_hate"]):
    df[col] = test_preds[:, i]
df.to_csv(path/comp/"submission_torchtext_LSTM_00.csv",index=False)

## Run with Light Notes

In [1]:
import pandas as pd
import numpy as np
import pathlib
import torch

In [2]:
path  = pathlib.Path('../../data/')
comp  = pathlib.Path('competitions/jigsaw-toxic-comment-classification-challenge/')
TRAIN_DATA_FILE = pathlib.Path('train.csv')
TEST_DATA_FILE  = pathlib.Path('test.csv')

***NOTE***: Specify `device=0` to use GPU:0, else `=-1` for CPU.

In [3]:
device=0 # index of GPU. -1:CPU

### ~~Fill Missing Values~~

(not needed in updated torchtext: `torchtext.data.Field` class already specifies `<unk>` token for unknown words. -- Does this handle empty fields? Yes: the Torchtext dataloader automatically zero-pads sequences in batch.

In [4]:
# train = pd.read_csv(path/comp/TRAIN_DATA_FILE)
# test  = pd.read_csv(path/comp/TEST_DATA_FILE)

# train["comment_text"] = train["comment_text"].fillna("_na_").values
# test["comment_text"]  = test["comment_text"].fillna("_na_").values

# # update paths to copies on disk
# TRAIN_DATA_FILE = pathlib.Path('nafill_' + str(TRAIN_DATA_FILE))
# TEST_DATA_FILE  = pathlib.Path('nafill_' + str(TEST_DATA_FILE))

# train.to_csv(path/comp/TRAIN_DATA_FILE)
# test.to_csv(path/comp/TEST_DATA_FILE)

### Declare Fields

`torchtext.data.Field` class determines how data is preprocessed and converted into numeric format.

In [5]:
from torchtext.data import Field

Want "comment_text" field to be lowercase, tokenized on whitespace, and preprocessed.

In [6]:
tokenize = lambda x: x.split()

TEXT = Field(sequential=True, tokenize=tokenize, lower=True)

Labels are already in binary encoding. Just need tell `Field` that they're already processed. Do this with `use_vocab=False`:

In [7]:
LABEL = Field(sequential=False, use_vocab=False)

### Construct Dataset

TabularDataset handles CSV, TSV, and JSON data reading:

In [8]:
from torchtext.data import TabularDataset

In [9]:
%%time
tv_datafields = [("id", None),      ("comment_text", TEXT), 
                 ("toxic", LABEL),  ("severe_toxic", LABEL), 
                 ("threat", LABEL), ("obscene", LABEL), 
                 ("insult", LABEL), ("identity_hate", LABEL)]
trn, vld = TabularDataset.splits(
        path=path/comp,
        train=TRAIN_DATA_FILE, validation=TRAIN_DATA_FILE,
        format='csv', skip_header=True,
        fields=tv_datafields)

# for the test data we don't have any labels
tst_datafields = [("id", None), ("comment_text", TEXT)]

tst = TabularDataset(
        path=path/comp/TEST_DATA_FILE,
#         test=TEST_DATA_FILE,
#         path='../../data/competitions/jigsaw-toxic-comment-classification-challenge/test.csv',
        format='csv',
        skip_header=True,
        fields=tst_datafields)

CPU times: user 21.7 s, sys: 1.25 s, total: 23 s
Wall time: 22.9 s


It's instructive to take a look inside the Dataset. Datasets can be indexed like normal lists, so looking at the first element:

In [10]:
trn[0]

<torchtext.data.example.Example at 0x7f97ef12e7f0>

The `Example` object bundles the attributes of a single datapoint together. At this point the text has been tokenized.

In [11]:
trn[0].__dict__.keys()

dict_keys(['comment_text', 'toxic', 'severe_toxic', 'threat', 'obscene', 'insult', 'identity_hate'])

In [12]:
trn[0].comment_text[:3]

['explanation', 'why', 'the']

For the `TEXT` field to convert words to integers, it needs to be told what the entire vocabulary is. To do this, we run `TEXT.build_vocab`, passing in the dataset to build the vocabulary on.

In [13]:
%%time
TEXT.build_vocab(trn)

CPU times: user 3.88 s, sys: 68 ms, total: 3.95 s
Wall time: 3.95 s


`vocab.freqs` is a `collections.Counter` object, so we can take a look at the most frequent words.

In [14]:
TEXT.vocab.freqs.most_common(10)

[('the', 490031),
 ('to', 294069),
 ('of', 222834),
 ('and', 218120),
 ('a', 211778),
 ('i', 196695),
 ('you', 187782),
 ('is', 170753),
 ('that', 146478),
 ('in', 140540)]

This makes Torchtext go through all the elements in the training set, check the contents corresponding to the `TEXT` field, and register the words in its vocabulary. Torchtext has its own class called `Vocab` for handling the vocabulary. The `Vocab` class holds a mapping frm word to id in its `stoi` attribute and a reverse mapping in its `itos` attribute. In addition to this, it cn automatically build an embedding matrix for you using various pretrained embeddings like word2vec. The `Vocab` class can also take options like `max_size` and `min_freq` that dictate how many words are in the vocabulary or how many times a word has to appear to be registered in the vocabulary. Words that aren't included in the vocabulary will be converted into `<unk>`, the "unknown" token.

Now that we have our data formatted and read into memeory, we turn to the next step: creating an Iterator to pass the data to our model:

### Constructing the Iterator

'Iterator' is the Torchtext DataLoader with some extra NLP-specific functionalty.

In [15]:
from torchtext.data import Iterator, BucketIterator

`BucketIterator` automatically shuffles & buckets input sequences into seqs of similar length. Allows for efficient padding. You *must* tell `BucketIterator` what attribute you want to bucket the data on.

Here we want to bucket based on lengths of `comment_text` field. For test data, we don't want to shuffle since we'll be outputting predictions at end fo training -- that's why we use a standard `Iterator`.

In [16]:
train_iter, val_iter = BucketIterator.splits(
    (trn, vld),  # datasets for Iterator to draw data from
    batch_sizes=(64,64),
    device=device, # GPU/CPU
    sort_key = lambda x: len(x.comment_text), # fn to group data with (here: comment length)
    sort_within_batch=False,
    repeat=False # this Iterator will be wrapped
)

# test set shouldn't be shuffled ==> use standard Iterator
test_iter = Iterator(
    tst, batch_size=64, device=device, sort=False, 
    sort_within_batch=False, repeat=False
)

Output of `BucketIterator`:

In [17]:
batch = next(train_iter.__iter__()); batch


[torchtext.data.batch.Batch of size 64]
	[.comment_text]:[torch.cuda.LongTensor of size 22x64 (GPU 0)]
	[.toxic]:[torch.cuda.LongTensor of size 64 (GPU 0)]
	[.severe_toxic]:[torch.cuda.LongTensor of size 64 (GPU 0)]
	[.threat]:[torch.cuda.LongTensor of size 64 (GPU 0)]
	[.obscene]:[torch.cuda.LongTensor of size 64 (GPU 0)]
	[.insult]:[torch.cuda.LongTensor of size 64 (GPU 0)]
	[.identity_hate]:[torch.cuda.LongTensor of size 64 (GPU 0)]

In [18]:
next(test_iter.__iter__())


[torchtext.data.batch.Batch of size 64]
	[.comment_text]:[torch.cuda.LongTensor of size 290x64 (GPU 0)]

The length of the `.comment_text` tensor is determined by the max-length comment in that minibatch.

The batch has all the fields we passed to the Dataset as attributes. The batch data can be accessed through the attribute with the same name:

In [19]:
batch.__dict__.keys()

dict_keys(['batch_size', 'dataset', 'train', 'fields', 'comment_text', 'toxic', 'severe_toxic', 'threat', 'obscene', 'insult', 'identity_hate'])

### Wrapping the Iterator

The `Iterator` returns a custom datatype: `torch.data.Batch` `Batch` has similar API to `Example`: w/ a batch of data from each field as attributes. We can use a simple wrapper to ease use. $\longrightarrow$ convert `Batch` to tuple: (x, y):

In [20]:
class BatchWrapper:
    def __init__(self, dl, x, y):
        self.dl, self.x, self.y = dl, x, y

    def __iter__(self):
        for batch in self.dl:
            x = getattr(batch, self.x) # assuming only 1 input in this wrapper

            if self.y is not None: # we'll cocncat y into a single tensor
                y = torch.cat([getattr(batch, feat).unsqueeze(1) for feat in self.y], dim=1).float()
            else:
                y = torch.zeros((1))

            yield (x,y)

    def __len__(self):
        return len(self.dl)

In [21]:
train_dl = BatchWrapper(train_iter, "comment_text", list(trn.fields.keys())[2:])

valid_dl = BatchWrapper(val_iter, "comment_text", list(trn.fields.keys())[2:])

test_dl  = BatchWrapper(test_iter, "comment_text", None)


In [22]:
print(trn.fields.keys())
print(list(trn.fields.keys())[2:])

dict_keys(['id', 'comment_text', 'toxic', 'severe_toxic', 'threat', 'obscene', 'insult', 'identity_hate'])
['toxic', 'severe_toxic', 'threat', 'obscene', 'insult', 'identity_hate']


All we're doing here is converting the `Batch` object into a tuple of inputs and outputs:

In [24]:
next(train_dl.__iter__())

(Variable containing:
  4.0099e+05  1.0910e+03  5.5000e+01  ...   1.0690e+03  1.6950e+03  1.2500e+02
  4.6200e+02  1.1468e+05  5.9000e+01  ...   2.7000e+01  2.0000e+00  1.3000e+01
  2.8000e+01  9.0000e+00  1.3600e+02  ...   7.0000e+00  9.9200e+02  4.1500e+02
                 ...                   ⋱                   ...                
  5.4600e+02  2.0000e+00  2.8550e+03  ...   2.6400e+02  1.2000e+01  2.0940e+03
  2.5700e+02  2.4137e+05  3.0416e+04  ...   3.5927e+04  1.8400e+03  1.3000e+01
  1.0000e+00  1.0000e+00  1.0000e+00  ...   1.5373e+04  2.6000e+02  5.8690e+03
 [torch.cuda.LongTensor of size 38x64 (GPU 0)], Variable containing:
     0     0     0     0     0     0
     0     0     0     0     0     0
     0     0     0     0     0     0
     0     0     0     0     0     0
     0     0     0     0     0     0
     0     0     0     0     0     0
     0     0     0     0     0     0
     0     0     0     0     0     0
     0     0     0     0     0     0
     0     0     0     

In [22]:
next(test_dl.__iter__()) # this wont screw up ordering will it?

(Variable containing:
  0.0000e+00  1.8000e+01  3.9310e+03  ...   1.8000e+01  4.4400e+02  1.8000e+01
  3.9500e+02  3.4705e+04  7.0000e+00  ...   1.1371e+04  1.1730e+03  1.9780e+03
  0.0000e+00  7.0000e+00  2.2430e+03  ...   1.0783e+04  2.9600e+02  8.0000e+00
                 ...                   ⋱                   ...                
  1.0000e+00  1.0000e+00  1.0000e+00  ...   1.0000e+00  1.0000e+00  1.0000e+00
  1.0000e+00  1.0000e+00  1.0000e+00  ...   1.0000e+00  1.0000e+00  1.0000e+00
  1.0000e+00  1.0000e+00  1.0000e+00  ...   1.0000e+00  1.0000e+00  1.0000e+00
 [torch.cuda.LongTensor of size 764x64 (GPU 0)], 
  0
 [torch.FloatTensor of size 1])

Now we're ready to start training a model!

### Training a Text Classifier -- LSTM

We'll use a simple LSTM as a baseline example

In [23]:
import torch.nn as nn
import torch.nn.functional as F
import torch.optim as optim
from torch.autograd import Variable

In [24]:
class SimpleLSTMBaseline(nn.Module):
    def __init__(self, hidden_dim, emb_dim=300,
                 spatial_dropout=0.05, recurrent_dropout=0.1, num_linear=1):
        super().__init__()
        self.embedding = nn.Embedding(len(TEXT.vocab), emb_dim)
        self.encoder = nn.LSTM(emb_dim, hidden_dim, num_layers=1, dropout=recurrent_dropout, bidirectional=False)
        self.linear_layers = []
        for _ in range(num_linear - 1):
            self.linear_layers.append(nn.Linear(hidden_dim, hidden_dim))
        self.linear_layers = nn.ModuleList(self.linear_layers)
        self.predictor = nn.Linear(hidden_dim, 6)
        
    def forward(self, seq):
        hdn, _ = self.encoder(self.embedding(seq))
        feature = hdn[-1, :, :]
        for layer in self.linear_layers:
            feature = layer(feature)
        preds = self.predictor(feature)
        return preds

In [25]:
emb_sz = 100
nh = 500
nl = 3
model = SimpleLSTMBaseline(nh, emb_dim=emb_sz)

if device != -1:
    model.cuda()

model

SimpleLSTMBaseline(
  (embedding): Embedding(470342, 100)
  (encoder): LSTM(100, 500, dropout=0.1)
  (linear_layers): ModuleList(
  )
  (predictor): Linear(in_features=500, out_features=6, bias=True)
)

### Training Loop

We can iterate using our wrapped `Iterator`, and the data will automatically be passed to us after being moved to the GPU and numericalized appropriately.

***NB***: RNNs suffer from gradient instability without regularization.

In [26]:
import tqdm

In [27]:
# opt = optim.Adam(model.parameters(), lr=1e-2, weight_decay=0.1)
opt = optim.Adam(model.parameters(), lr=1e-2)
loss_fn = nn.BCEWithLogitsLoss()

epochs = 2

In [28]:
%%time
for epoch in range(1, epochs+1):
    running_loss = 0.0
    running_corrects = 0
    model.train() # turn on training mode
    for x, y in tqdm.tqdm(train_dl): # we can easily iterate over our data thanks to our wrapper
        opt.zero_grad()
        preds = model(x)
        loss = loss_fn(preds, y)
        loss.backward()
        opt.step()
        
        running_loss += loss.data[0] * x.size(0)
        
    epoch_loss = running_loss / len(trn)
    
    # calculate validation loss for this epoch
    val_loss = 0.0
    model.eval() # turn on evaluation mode
    for x, y in valid_dl:
        preds = model(x)
        loss = loss_fn(preds, y)
        val_loss += loss.data[0] * x.size(0)
        
    val_loss /= len(vld)
    print(f'Epoch: {epoch}, Training Loss: {epoch_loss:.4f}, Validation Loss: {val_loss:.4f}')

100%|██████████| 2494/2494 [02:59<00:00, 13.87it/s]
  0%|          | 0/2494 [00:00<?, ?it/s]

Epoch: 1, Training Loss: 0.0740, Validation Loss: 0.0515


100%|██████████| 2494/2494 [03:00<00:00, 13.83it/s]


Epoch: 2, Training Loss: 0.0510, Validation Loss: 0.0467
CPU times: user 5min 8s, sys: 2min 5s, total: 7min 13s
Wall time: 7min 11s


### Writing Predictions

In [31]:
test_dl

<__main__.BatchWrapper at 0x7fb044a6f048>

In [29]:
test_preds = []
for x, y in tqdm.tqdm(test_dl):
    preds = model(x)
    # if your data is on the GPU you need to move it back to the CPU
    preds = preds.data.cpu().numpy()
    # preds = preds.data.numpy()
    # the actual outputs of the model are logits, so we need to pass these 
    # values to the sigmoid function
    preds = 1 / (1 + np.exp(-preds))
    test_preds.append(preds)
test_preds = np.concatenate(test_preds, axis=0)

100%|██████████| 2394/2394 [03:59<00:00, 10.02it/s]


In [30]:
test_preds.shape

(153164, 6)

In [7]:
# import pickle

# quick save
# pickle.dump(test_preds, open(path/comp/'test_preds_temp.pkl', 'wb'))

# quick load
# file = open(path/comp/'test_preds_temp.pkl', 'rb')
# test_preds = pickle.load(file)

In [33]:
df = pd.read_csv(path/comp/"sample_submission.csv")
for i,col in enumerate(["toxic","severe_toxic","threat","obscene","insult","identity_hate"]):
    df[col] = test_preds[:, i]
    
# df.drop("comment_text", axis=1).to_csv(path/comp/"submission_torchtext_LSTM_00.csv", index=False)
df.to_csv(path/comp/"submission_torchtext_LSTM_00.csv",index=False)

In [34]:
df = pd.read_csv(path/comp/"submission_torchtext_LSTM_00.csv")
df.head()

Unnamed: 0,id,toxic,severe_toxic,obscene,threat,insult,identity_hate
0,00001cee341fdb12,0.008045,8.3e-05,1.1e-05,0.000283,0.000305,0.000103
1,0000247867823ef7,0.037026,0.000122,8.2e-05,0.000841,0.001089,0.000864
2,00013b17ad220c46,0.925357,0.018958,0.001791,0.541257,0.468671,0.033942
3,00017563c3f7919a,0.017331,0.000181,6.2e-05,0.000273,0.0005,0.000267
4,00017695ad8997eb,0.003544,3.3e-05,1.3e-05,0.000162,0.000174,0.000116


A previous submission (Keras/TensorFlow LSTM using GloVe embeddings and dropout) for reference:

In [31]:
df = pd.read_csv(path/comp/"submission_LSTM_glove_01.csv")
df.head()

Unnamed: 0,id,toxic,severe_toxic,obscene,threat,insult,identity_hate
0,00001cee341fdb12,0.992614,0.4229196,0.943973,0.068383,0.884356,0.297003
1,0000247867823ef7,0.000673,7.136537e-07,0.000113,2e-06,5.5e-05,2.1e-05
2,00013b17ad220c46,0.001434,3.640881e-06,0.000333,5e-06,0.000125,4.7e-05
3,00017563c3f7919a,0.001091,1.288999e-06,0.00013,6e-06,9.4e-05,1.6e-05
4,00017695ad8997eb,0.006187,1.352115e-05,0.000948,3.6e-05,0.000619,0.000126


This model gets a 0.5 score. Checking above, this is not due to shuffling the test-set, the data is in proper order. Instead the LSTM hasn't 'learned' how to classify the data yet.

That being said, this represents a successful baseline for training and submitting a Torchtext NLP model. With the PyTorch-specific mechanics handled (data preprocessing, loading, etc), the design can easily be improved.

## Checking `test_dl` loading properly:

In [1]:
import pandas as pd
import pathlib
import torch
from torchtext.data import Field
from torchtext.data import TabularDataset
from torchtext.data import Iterator, BucketIterator
import torch.nn as nn
import torch.nn.functional as F
import torch.optim as optim
from torch.autograd import Variable

In [2]:
path  = pathlib.Path('../../data/')
comp  = pathlib.Path('competitions/jigsaw-toxic-comment-classification-challenge/')
TRAIN_DATA_FILE = pathlib.Path('train.csv')
TEST_DATA_FILE  = pathlib.Path('test.csv')

In [3]:
path_to_test_data_file = '../../data/competitions/jigsaw-toxic-comment-classification-challenge/test.csv'
alt_path_to_test_data_file = 'practical-torchtext/data/test.csv'

In [4]:
PATH = {0:path/comp/TEST_DATA_FILE, 1:path_to_test_data_file, 2:alt_path_to_test_data_file}

In [5]:
device=0

In [6]:
class BatchWrapper:
    def __init__(self, dl, x, y):
        self.dl, self.x, self.y = dl, x, y

    def __iter__(self):
        for batch in self.dl:
            x = getattr(batch, self.x) # assuming only 1 input in this wrapper

            if self.y is not None: # we'll cocncat y into a single tensor
                y = torch.cat([getattr(batch, feat).unsqueeze(1) for feat in self.y], dim=1).float()
            else:
                y = torch.zeros((1))

            yield (x,y)

    def __len__(self):
        return len(self.dl)

In [16]:
tokenize = lambda x: x.split()
TEXT = Field(sequential=True, tokenize=tokenize, lower=True)

In [17]:
tst_datafields = [("id",None), ("comment_text", TEXT)]
tst = TabularDataset(path=PATH[2], format='csv', 
                     skip_header=True, fields=tst_datafields)
test_iter = Iterator(tst, batch_size=64, device=device, sort=False,
                     sort_within_batch=False, repeat=False)
test_dl = BatchWrapper(test_iter, "comment_text", None)

In [18]:
# build vocab (numericalize)
TEXT.build_vocab(tst)

In [19]:
next(test_dl.__iter__())

(Variable containing:
   197   193   199  ...     18   664   192
   493   128   294  ...      6    22     5
    75    15     6  ...    118   310    17
        ...          ⋱          ...       
     1     1     1  ...      1     1     1
     1     1     1  ...      1     1     1
     1     1     1  ...      1     1     1
 [torch.cuda.LongTensor of size 158x33 (GPU 0)], 
  0
 [torch.FloatTensor of size 1])

## Note Heavy partial walkthrough

In [None]:
import pandas as pd
import pathlib

In [2]:
path  = pathlib.Path('../../data/')
comp  = pathlib.Path('competitions/jigsaw-toxic-comment-classification-challenge/')
TRAIN = pathlib.Path(path/comp/'train.csv')
TEST  = pathlib.Path(path/comp/'test.csv')

### 1. Overview

1. Read data from disk
2. Tokenize text
3. Create word-unique-integer mappings
4. Convert text to list of integers
5. Load data into format req'd by DL framekwork
6. Pad text so all seqs same len ==> for batch processing

Torchtext follows the basic formula for transforming data into working input for your neural network:

<img src="https://i0.wp.com/mlexplained.com/wp-content/uploads/2018/02/%E3%82%B9%E3%82%AF%E3%83%AA%E3%83%BC%E3%83%B3%E3%82%B7%E3%83%A7%E3%83%83%E3%83%88-2018-02-07-10.32.59.png?w=1500"/>

### 2. Declaring Fields

Torch text takes a declarative approach to laoding its data: you tell torchtext how you want the data to look, and torchtext hands it for you.

The way you do this is by declaring a Field. The Field specifies how you want a certain (you guessed it) field to be processed. Let's look at an example:

In [None]:
from torchtext.data import Field

tokenize = lambda x : x.split()
TEXT = Field(sequential=True, tokenize=tokenize, lower=True)

LABEL = Field(sequential=False, use_vocab=False)

In the Toxic Comment Classification dataset there are 2 kinds of fields: the common text and the labels (toxic, severe toxic, etc..)

In [None]:
pd.read_csv(TRAIN).head(2)

If you're passing a field that's already numericalized by default and not sequential, you should pass `use_vocab=False` and `sequential=False`

For the comment text, we pass in the preprocessing we want the field to do as keyword arguents. We give it the tokenizer we want the field to use, tell it to convert the input to lowercase, and also tell it the input is sequential.

In addition to the keyword arguments mentioned above, the Field class also allows the user to speciy special tokens (the `unk_token` for out-of-vocab words, the `pad_token` for padding, `eos_token` for end-of-sentence, and an optional `init_token` for the start of a sentence), choose whether to make the first dimension the batch or the sequence (the 1st dim is the seq by default), and choose whether to allow the sequence lengths to be decided at runtime or in advance. Fortunately, [the docstrings](https://github.com/pytorch/text/blob/c839a7934930819be7e240ea972e4d600966afdc/torchtext/data/field.py#L61) for the **Field** class are relatively well written, so if you need some advanced preprocessing you should refer to them for more information.

The **Field** class is at the center of torchtext and is what makes preprocessing such an ease. Aside from the standard field class, here's a list of the fields that are currently available (along w/ their use cases):

|Name | Description | Use Case|
|-----|-------------|---------|
|Field|A regular field that defines preprocessing and post processing|Non-text fields and text fields where you don't need to map integers back to words.|
|ReversibleField|An extension of the field that allows reverse mapping of word ids to words|Text fields if you want to map the integers back to natural language (such as in the case of language modeling)|
|NestedField|A field that processes non-tokenized text into a set of smaller fields|Char-based models|
|LabelField (New!)|A regular field with `sequential=False` and no `<unk>` token. Newly added on the master branch.|Label fields in text classification|

### 3. Constructing the Dataset

The fields know what to do when given raw data. Now we need to tell the fields what data they should work on. This is where we use Datasets.

There're various built-in Datasets in torchtext that handle common data formats. For CSV/TSV fiels the **`TabularDataset`** class is convenient. Here's how we'd read data from a CSV file using `TabularDataset`:

In [None]:
from torchtext.data import TabularDataset

tv_datafields = [("id", None), # we won't be needing the id, so we pass None as the field
                 ("comment_text", TEXT), ("toxic", LABEL), 
                 ("severe_toxic", LABEL), ("threat", LABEL), 
                 ("obscene", LABEL), ("insult", LABEL), ("identity_hate", LABEL)]
trn, vld = TabularDataset.splits(
                path=path/comp, # the root directory where the data lies
                train='train.csv', validation='train.csv',
                format='csv',
                skip_header=True, # if your csv has a header, make sure to pass this to ensure it doesn't get processed as data!
                fields=tv_datafields)
tst_datafields = [("id", None), # we won't be needing the id, so we pass in Noen as the field
                  ("comment_text", TEXT)]
tst = TabularDataset(
            path=TRAIN, # the file path
            format='csv',
            skip_header=True,
            fields=tst_datafields)

For the `TabularDataset`, we pass in a list of (name, field) pairs as the fields argument. The fields we pass in must be in the same order as the columns. For the columns we don't use, we pass in a tuple where the field element is None.

The splits method creates a dataset for the train and validation data by applying the same processing. It can also handle the data, but since our test data has a different frmat from the train and validation data, we create a different dataset.

Datasets can mostly be treated in the same way as lists. To understand this, it's instructive to take a look inside our Dataset. Datasets can be indexed and iterated over like normal lists, so let's see what the first element looks like:

In [None]:
trn[0]

In [None]:
trn[1].__dict__.keys()

In [None]:
trn[0].comment_text[:3]

In [None]:
trn[1].comment_text[:3]

Torchtext handles mapping words to integers, but it has to be told the full range of words it should handle. In our case, we probably want to build the vocabulary on the training set only, so we run the following code: `TEXT.build_vocab(trn)`

This makes torchtext go through all the elements in the training set, check the contents corresp----

---

List of currently available datasets and the format of data they take:

|Name|Description|Use Case|
|-|-|-|
|`TabularDataset`|Takes the path to CSV/TSV and JSON files or Python dictionaries as inputs.|Any problem that involves a label (or labels) for each piece of text.|
|`LanguageModelingDataset`|Takes the path to a text file.|Language modeling|
|`TranslationDataset`|Takes a path and extensions to a file for each language. eg: If the files are English: "hoge.en", French: "hoge.fr", path="hoge", exts=("en","fr")|Translation|
|`SequenceTaggingDataset`|Takes a path to a file with the input sequence and output sequence separated by tabs.|Sequence tagging.|

Now that we have our data formatted and read into memory, we turn to the next step: creating an iterator to pass the data to our model:

### 4. Constructing the Iterator

In torchvision and PyTorch, the processing and batching of data is handled by DataLoaders. For some reason torchtext has renamed the objects that do the exact same thing to Iterators. The basic functionality is the same, but Iterators, as we will see, have some convenient functionality that is unique to NLP.

Below is code for how you 'd initialize the Iterators for the train, validation, and test data:

In [None]:
from torchtext.data import Iterator, BucketIterator

train_iter, val_iter = BucketIterator.splits(
    (trn, vld), # we pass in the datasets we want the iterator to draw data from
    batch_sizes=(64,64),
    device=0, # if you want to use the GPU, specify GPU number here
    sort_key=lambda x: len(x.comment_text), # the BucketIterator needs to be told what function it should use to group the data.
    sort_within_batch=False,
    repeat=False # we pass repeat=False because we want to wrap this Iterator layer.
)
test_iter = Iterator(tst, batch_size=64, device=0, sort=False, sort_within_batch=False, repeat=False)


***NOTE***: using the `sort_within_batch` argument, when set to True, sorts the data within each minibatch indecreasing order acc. to the `sort_key`. This is necessary when you want to use `pack_padded_sequence` with the padded sequence data and convert the padded sequence tensor to a `PackedSequence` object.

The `BucketIterator` is one of the most powerful features of torchtext. It automatically shuffles and buckets the input sequences into sequences of similar length.

The reason this is powerful is that we need to pad the input sequennces to be of the same length to enable batch processing. For instance, the sequences:
```
[ [3, 15, 2, 7], 
  [4, 1], 
  [5, 5, 6, 8, 1] ]
```
would need to be padded to become:
```
[ [3, 15, 2, 7, 0],
  [4, 1, 0, 0, 0],
  [5, 5, 6, 8, 1] ]
```

The amount of padding necessary is determined by the longest sequence in the batch. Therefore, padding is most efficient when the sequences are of similar lengths. The BucketIterator does all this behind the scenes. As a word of caution, you need to tell the BucketIterator what attribute you want to bucket the data on. In our case, we want to bucket based on the lengths of the comment_text field, so we pass that in as a keyword argument.

For the test data, we don't want to shuffle the data since we'll be ouputting the predictions at the end of training. This is why we use a standard iterator.

---

List of iterators that torchtext currently implements:

|Name|Description|Use Case|
|-|-|-|
|`Iterator`|Iterates over the data in the order of the dataset.|Test data, or any other data where the order is important.|
|`BucketIterator`|Buckets sequences of similar lengths together.|Text classification, sequence tagging, etc. (use cases where the input is of variable length)|
|`BPTTIterator`|An iterator built especially for language modeling that also generates the input sequence delayed by one timestep. It also varies the BPTT length.|Language modeling|

### 5. Wrapping the Iterator

Currently, the iterator returns a custom datatype called `torchtext.data.Batch`. The **`Batch`** class has a similar API to the `Example` type, with a batch of data from each field as attributes. Unfortunately, this custom datatype makes code reuse difficult (since each time the column names change, we need to modify the code), and makes torchtexk hard to use with other libraries for some use cases (like torchsample and fastai).

In the meantime we'll hack on a simple wrapper to make the batches easy to use. Concretely, we'll convert the batch to a tuple in the form (x, y) where x is the independent variable (input) and y is the dependent variable (labels). Code:

In [None]:
class BatchWrapper:
    def __init__(self, dl, x_var, y_vars):
        self.dl,self.x_var,self.y_vars = dl,x_var,y_vars
        
    def __iter__(self):
        for batch in self.dl:
            x = getattr(batch, self.x_var) # we assume only one input in this wrapper
            
            if self.y_vars is #TODO:
                y = torch.cat([getattr(batch, feat).unsqueeze(1) for feat in self.y_vars], dim=1).float()
            else:
                y = torch.zeroes((1))
            yield (x, y)
    
    def __len__(self):
        return len(self.dl)
    
train_dl = BatchWrapper(train_iter, "comment_text", ["toxic", "severe_toxic", "obscene", "threat", "insult", "identity_hate"])
valid_dl = BatchWrapper(val_iter, "comment_text", ["toxic", "severe_toxic", "obscene", "threat", "insult", "identity_hate"])
test_dl = BatchWrapper(test_iter, "comment_text", None)