In [1]:
%load_ext autoreload
%autoreload 2
%matplotlib inline

In [2]:
import torch
import torch.nn as nn
import torch.optim as optim
import torchtext as tt

import numpy as np
import matplotlib.pylab as plt
import pandas as pd

import pprint

In [3]:
print = pprint.PrettyPrinter().pprint

In [4]:
BATCH_SIZE = 32
device = ('cuda:0' if torch.cuda.is_available() else 'cpu')
print(f'Using device = {device}')

'Using device = cuda:0'


# Load Dataset

We are picking the TREC dataset as a proxy for the text classification problem. This has ~5500 training examples with 7 classes (one class is heavily imbalanced compared to the other 6 classes).

In [5]:
dataset = tt.datasets.TREC

#fields are essentially columns in your dataset. they pack a lot of
#convenient functionality in their definitions

#we have two fields - one for the raw text and one for the labels
text_field = tt.data.Field(sequential = True,
                           lower = True,
                           init_token = '<sos>',
                           eos_token = '<eos>'
                          )

#can also use tt.data.LabelField (exactly the same as below)
label_field = tt.data.Field(sequential = False,
                            unk_token = None)

#parse and split the data into train and test
train, test = dataset.splits(text_field, label_field)

downloading train_5500.label


train_5500.label: 100%|██████████| 336k/336k [00:00<00:00, 3.75MB/s]
TREC_10.label: 100%|██████████| 23.4k/23.4k [00:00<00:00, 1.47MB/s]

downloading TREC_10.label





In [6]:
print(f'Train has {len(train.examples)} examples')
print(f'Test has {len(test.examples)} examples')

'Train has 5452 examples'
'Test has 500 examples'


In [7]:
print(train.examples[0].text)
print(train.examples[0].label)

['how',
 'did',
 'serfdom',
 'develop',
 'in',
 'and',
 'then',
 'leave',
 'russia',
 '?']
'DESC'


In [8]:
unique_labels, unique_counts = np.unique([ex.label for ex in train.examples], return_counts=True)

In [9]:
print(list(zip(unique_labels, unique_counts)))

[('ABBR', 86),
 ('DESC', 1162),
 ('ENTY', 1250),
 ('HUM', 1223),
 ('LOC', 835),
 ('NUM', 896)]


### Build vocabulary

In [10]:
text_field.build_vocab(train, vectors = 'glove.6B.100d', unk_init = torch.Tensor.normal_)
label_field.build_vocab(train)

.vector_cache/glove.6B.zip: 862MB [01:31, 9.40MB/s]                               
100%|█████████▉| 399536/400000 [00:11<00:00, 33234.41it/s]

In [11]:
print(f'Train text has {len(text_field.vocab)} unique words')
print(f'Train label has {len(label_field.vocab)} unique words')

'Train text has 8681 unique words'
'Train label has 6 unique words'


100%|█████████▉| 399536/400000 [00:30<00:00, 33234.41it/s]

In [133]:
label_field.vocab.stoi.keys()

dict_keys(['ENTY', 'HUM', 'DESC', 'NUM', 'LOC', 'ABBR'])

### Build iterators

In [134]:
train_iter, test_iter = tt.data.BucketIterator.splits((train,test), 
                                                      batch_size = BATCH_SIZE,                                                      
                                                      device = device)

# Tf-idf

In [135]:
train_text = [' '.join(t.text) for t in train.examples]
train_labels = [t.label for t in train.examples]

test_text = [' '.join(t.text) for t in test.examples]
test_labels = [t.label for t in test.examples]

In [136]:
from sklearn.feature_extraction.text import TfidfVectorizer

In [137]:
tfidf = TfidfVectorizer()
tfidf.fit(np.array(train_text))

TfidfVectorizer(analyzer='word', binary=False, decode_error='strict',
                dtype=<class 'numpy.float64'>, encoding='utf-8',
                input='content', lowercase=True, max_df=1.0, max_features=None,
                min_df=1, ngram_range=(1, 1), norm='l2', preprocessor=None,
                smooth_idf=True, stop_words=None, strip_accents=None,
                sublinear_tf=False, token_pattern='(?u)\\b\\w\\w+\\b',
                tokenizer=None, use_idf=True, vocabulary=None)

In [138]:
train_text_tfidf = tfidf.transform(train_text)
test_text_tfidf = tfidf.transform(test_text)

In [139]:
from sklearn.ensemble import RandomForestClassifier

In [140]:
rf = RandomForestClassifier(n_estimators=1000, n_jobs=10, max_depth=50)
rf.fit(train_text_tfidf, train_labels)

RandomForestClassifier(bootstrap=True, class_weight=None, criterion='gini',
                       max_depth=50, max_features='auto', max_leaf_nodes=None,
                       min_impurity_decrease=0.0, min_impurity_split=None,
                       min_samples_leaf=1, min_samples_split=2,
                       min_weight_fraction_leaf=0.0, n_estimators=1000,
                       n_jobs=10, oob_score=False, random_state=None, verbose=0,
                       warm_start=False)

In [141]:
train_pred_tfidf = rf.predict(train_text_tfidf)
test_pred_tfidf = rf.predict(test_text_tfidf)

In [142]:
train_acc_tfidf = np.sum(train_pred_tfidf == train_labels) / len(train_labels)
test_acc_tfidf = np.sum(test_pred_tfidf == test_labels) / len(test_labels)

print(f'Train Acc = {train_acc_tfidf}')
print(f'Test  Acc = {test_acc_tfidf}')

'Train Acc = 0.9180117388114454'
'Test  Acc = 0.82'


# Custom PyTorch Model

In [301]:
class TextClassification(nn.Module):
    def __init__(self, 
                 input_dim, #vocab size
                 embed_dim, #user defined
                 hidden_dim, #user defined
                 output_dim, #number of output classes
                 device = None
                ):
        
        super(TextClassification, self).__init__()
    
        self.embedding = nn.Embedding(input_dim, embed_dim)
        
        #can pick LSTM, GRU etc.
        num_layers = 5
        bidirectional = True

        if bidirectional: n_directions = 2
        else: n_directions = 1


        
        '''
        self.rnn = nn.RNN(input_size = embed_dim,
                          hidden_size = hidden_dim,
                          num_layers = num_layers,
                          nonlinearity = 'relu',
                          bias = True,
                          batch_first = False,
                          dropout = 0.5,
                          bidirectional = bidirectional)
        '''
        self.rnn = nn.GRU(input_size = embed_dim,
                          hidden_size = hidden_dim,
                          num_layers = num_layers,
                          #nonlinearity = 'relu',
                          bias = True,
                          batch_first = False,
                          dropout = 0.5,
                          bidirectional = bidirectional)
        
        #output is a linear layer (log probs of belonging to output classes)
        self.output = nn.Linear(num_layers * n_directions * hidden_dim, output_dim)
        
        if device is not None: self.to(device)
        
    def forward(self, string_numerical):
        embed_batch = self.embedding(string_numerical)
        
        output, hidden = self.rnn(embed_batch)
        
        log_prob = self.output(hidden.permute(1,0,2).flatten(start_dim=1, end_dim=2)).squeeze(0)
        
        #log_prob = self.output(hidden).squeeze(0)
        
        return log_prob

In [318]:
class TextClassificationAttention(nn.Module):
    def __init__(self, 
                 input_dim, #vocab size
                 embed_dim, #user defined
                 hidden_dim, #user defined
                 output_dim, #number of output classes
                 #bidirectional = False,
                 device = None
                ):
        
        super(TextClassificationAttention, self).__init__()
    
        self.embedding = nn.Embedding(input_dim, embed_dim)
        
        #can pick LSTM, GRU etc.
        '''
        self.rnn = nn.RNN(input_size = embed_dim,
                          hidden_size = hidden_dim,
                          num_layers = 1,
                          nonlinearity = 'relu',
                          bias = True,
                          batch_first = False,
                          dropout = 0.5,
                          bidirectional = False)
        '''
        
        num_layers = 2
        bidirectional = True
        if bidirectional: n_directions = 2
        else: n_directions = 1

        self.rnn = nn.GRU(input_size = embed_dim,
                          hidden_size = hidden_dim,
                          num_layers = num_layers,
                          #nonlinearity = 'relu',
                          bias = True,
                          batch_first = False,
                          dropout = 0.5,
                          bidirectional = bidirectional)
        
        #output is a linear layer (log probs of belonging to output classes)
        self.output = nn.Linear(num_layers * n_directions * hidden_dim, output_dim)
        
        #attention specific layers
        self.attention_linear = nn.Linear(n_directions * hidden_dim, 1)
        self.attention_softmax = nn.Softmax(dim=0)        
        
        if device is not None: self.to(device)
        
    def forward(self, string_numerical):
        embed_batch = self.embedding(string_numerical)
        
        output, hidden = self.rnn(embed_batch)
        
        attention_weights = self.attention_softmax(self.attention_linear(output).squeeze(2))
        attention_hidden = (attention_weights.unsqueeze(0).permute(1,2,0).expand_as(output) * output).sum(dim=0)
                
        #log_prob = self.output(attention_hidden).squeeze(0)
        log_prob = self.output(attention_hidden.permute(1,0,2).flatten(start_dim=1, end_dim=2)).squeeze(0)
        
        return log_prob

### Experimental: Check data flows through model without problems

This section is used to ensure the forward flow makes sense (dimensionally). Ignore for training new model.

In [164]:
print('Looking at first example...\n')
string = train.examples[0].text
print(string)

'Looking at first example...\n'
['how',
 'did',
 'serfdom',
 'develop',
 'in',
 'and',
 'then',
 'leave',
 'russia',
 '?']


In [165]:
print('Numericalize i.e. replace word by index in vocab...\n')
string_numerical = [text_field.vocab.stoi[word] for word in string]
print(string_numerical)

'Numericalize i.e. replace word by index in vocab...\n'
[11, 23, 7662, 2536, 9, 19, 509, 866, 1160, 4]


In [166]:
print('Fields have function numericalize to do the same thing...\n')
text_field.numericalize([string])

'Fields have function numericalize to do the same thing...\n'


tensor([[  11],
        [  23],
        [7662],
        [2536],
        [   9],
        [  19],
        [ 509],
        [ 866],
        [1160],
        [   4]])

In [167]:
train_iter.create_batches()

In [168]:
print('The model works with batches of text and corresponding labels...\n')
string_batch = next(train_iter.batches)
print(f'Batch size = {len(string_batch)}')

'The model works with batches of text and corresponding labels...\n'
'Batch size = 32'


In [169]:
print('Separate text and labels in each batch\n')
string_batch_text = [s.text for s in string_batch]
string_batch_label = [s.label for s in string_batch]

'Separate text and labels in each batch\n'


In [170]:
print('Each field has process function to do preprocessing, numericalization and post-processing before creating a batch\n')
print('Flow = Data -> Tokenize -> Preprocess Func -> Numericalize -> Example Instance -> Postprocess -> Batch Instance\n')

print(text_field.process(string_batch_text)[:,2])

('Each field has process function to do preprocessing, numericalization and '
 'post-processing before creating a batch\n')
('Flow = Data -> Tokenize -> Preprocess Func -> Numericalize -> Example '
 'Instance -> Postprocess -> Batch Instance\n')
tensor([   2,   49,   17,   51,  315,   27,    5,  774,    4,   49,   17,   51,
         359, 3124,   89, 2177,    4,    3,    1,    1])


In [191]:
print('Size of Text vocabulary...')
len(text_field.vocab)

'Size of Text vocabulary...'


8681

In [192]:
print('Define dimensions...')
input_dim = len(text_field.vocab)
embed_dim = 100
hidden_dim = 128
output_dim = len(unique_labels)

'Define dimensions...'


In [324]:
num_layers = 2
bidirectional = True
if bidirectional: n_directions = 2
else: n_directions = 1

embedding = nn.Embedding(input_dim, embed_dim)
rnn = nn.RNN(input_size = embed_dim,
                          hidden_size = hidden_dim,
                          num_layers = num_layers,
                          nonlinearity = 'relu',
                          bias = True,
                          batch_first = False,
                          dropout = 0.5,
                          bidirectional = bidirectional)
output = nn.Linear(num_layers * n_directions * hidden_dim, output_dim)

In [275]:
print('Step 1: Numericalize strings...')
string_batch_numerical = text_field.process(string_batch_text)
print(string_batch_numerical.shape)

'Step 1: Numericalize strings...'
torch.Size([20, 32])


In [276]:
print('Step 2: Embed tokens...')
embed_batch = embedding(string_batch_numerical)
print(embed_batch.shape)

'Step 2: Embed tokens...'
torch.Size([20, 32, 100])


In [277]:
print('Step 3: Pass through rnn...')
rnn_batch_output, rnn_batch_hidden = rnn(embed_batch)
print(rnn_batch_output.shape)
print(rnn_batch_hidden.shape)
print('Note: hidden tensor size = (num_layers * n_directions, batch_size, hidden_dim)')

'Step 3: Pass through rnn...'
torch.Size([20, 32, 128])
torch.Size([1, 32, 128])
'Note: hidden tensor size = (num_layers * n_directions, batch_size, hidden_dim)'


In [278]:
model_log_prob = output(rnn_batch_hidden.permute(1,0,2).flatten(start_dim=1, end_dim=2)).squeeze(0)

In [279]:
print(model_log_prob.shape)

torch.Size([32, 6])


In [280]:
criterion = nn.NLLLoss()

In [281]:
label_field.process(string_batch_label)

tensor([4, 4, 2, 0, 3, 0, 0, 3, 1, 4, 2, 2, 0, 1, 0, 1, 3, 1, 3, 2, 2, 3, 0, 2,
        1, 2, 4, 0, 1, 0, 0, 3])

In [273]:
string_batch_numerical_label = label_field.process(string_batch_label)
loss = criterion(model_log_prob, string_batch_numerical_label)
print(loss)

tensor(-0.0242, grad_fn=<NllLossBackward>)


### Experimental: Multiple layers

This section is used to ensure the forward flow makes sense (dimensionally). Ignore for training new model.

In [311]:
string_batch_numerical.shape

torch.Size([20, 32])

In [312]:
rnn_batch_hidden.shape

torch.Size([1, 32, 128])

In [316]:
rnn_batch_output.shape

torch.Size([20, 32, 128])

In [317]:
weights = nn.Softmax(dim=0)(nn.Linear(2*128, 1)(rnn_batch_output).squeeze(2))
weights.shape

RuntimeError: size mismatch, m1: [640 x 128], m2: [256 x 1] at /pytorch/aten/src/TH/generic/THTensorMath.cpp:961

In [None]:
rnn_batch_output.shape

In [None]:
rnn_batch_output[:,0,:].shape

In [None]:
weights[:,0].shape

In [None]:
torch.matmul(weights[:,0], rnn_batch_output[:,0,:]).shape

In [None]:
(weights.unsqueeze(0).permute(1,2,0).expand_as(rnn_batch_output) * rnn_batch_output).sum(dim=0).shape

In [331]:
print(string_batch_numerical.shape)

embed_batch = embedding(string_batch_numerical)
print(embed_batch.shape)

output, hidden = rnn(embed_batch)
print(output.shape)
print(hidden.shape)

#attention_weights = nn.Softmax(dim=0)(nn.Linear(num_layers * n_directions, 1)(output).squeeze(2))
#attention_hidden = (attention_weights.unsqueeze(0).permute(1,2,0).expand_as(output) * output).sum(dim=0)

#log_prob = self.output(attention_hidden).squeeze(0)
#log_prob = self.output(attention_hidden.permute(1,0,2).flatten(start_dim=1, end_dim=2)).squeeze(0)


torch.Size([20, 32])
torch.Size([20, 32, 100])
torch.Size([20, 32, 256])
torch.Size([4, 32, 128])


# Training

In [294]:
#helper function for accuracy
def accuracy(pred, labels):
    pred_classes = torch.argmax(pred, dim=1)
    
    acc = (pred_classes==labels).float().mean()
    
    return acc

In [295]:
def train_one_epoch(model, iterator, text_field, label_field, lr=1e-3):
    optimizer = optim.Adam(model.parameters(), lr)
    criterion = nn.CrossEntropyLoss()
    
    model.train()
    
    total_loss = 0
    accuracy_list = []
    
    for string_batch in iterator:
        #separate text and label (already numericalized)
        string_batch_text = string_batch.text
        string_batch_label = string_batch.label
                
        #evaluate loss
        pred_log_prob = model(string_batch_text)
        loss = criterion(pred_log_prob, string_batch_label)
        
        #gradient descent (Adam)
        optimizer.zero_grad()
        loss.backward()
        optimizer.step()
        
        total_loss += loss.item()
                
        #accuracy
        acc = accuracy(pred_log_prob, string_batch_label)
        accuracy_list.append(acc.item())

    total_loss /= len(iterator)
        
    return model, total_loss, accuracy_list

In [296]:
def validate(model, iterator, text_field, label_field):
    criterion = nn.CrossEntropyLoss()
    
    model.eval()

    total_loss = 0
    correct_pred = 0
    
    with torch.no_grad(): #don't compute gradients
        for string_batch in iterator:
            #separate text and label
            string_batch_text = string_batch.text
            string_batch_label = string_batch.label
                                    
            #evaluate loss
            pred_log_prob = model(string_batch_text)
            loss = criterion(pred_log_prob, string_batch_label)
            
            total_loss += loss.item()
            
            pred_classes = torch.argmax(pred_log_prob, dim=1)
            correct_pred += (pred_classes==string_batch_label).float().sum()
            
    acc = correct_pred / len(test)
    
    total_loss /= len(iterator)
    
    return total_loss, acc

In [333]:
input_dim = len(text_field.vocab)
embed_dim = 100
hidden_dim = 128
output_dim = len(unique_labels)

N_epochs = 10

model = TextClassification(input_dim, embed_dim, hidden_dim, output_dim, device)
#model = TextClassificationAttention(input_dim, embed_dim, hidden_dim, output_dim, device)

for epoch in range(N_epochs):
    model, loss, accuracy_list = train_one_epoch(model, train_iter, text_field, label_field, lr=1e-3)
    
    test_loss, test_acc = validate(model, test_iter, text_field, label_field)
    
    if epoch % 1 == 0:
        print(f'Train Loss = {loss:.3f} Median Train Batch Accuracy = {np.median(accuracy_list):.2f} Test Loss = {test_loss:.3f} Test Accuracy = {test_acc:.2f}')    

('Train Loss = 0.993 Median Train Batch Accuracy = 0.62 Test Loss = 0.729 Test '
 'Accuracy = 0.70')
('Train Loss = 0.636 Median Train Batch Accuracy = 0.78 Test Loss = 0.520 Test '
 'Accuracy = 0.83')
('Train Loss = 0.471 Median Train Batch Accuracy = 0.84 Test Loss = 0.468 Test '
 'Accuracy = 0.86')
('Train Loss = 0.331 Median Train Batch Accuracy = 0.88 Test Loss = 0.699 Test '
 'Accuracy = 0.73')
('Train Loss = 0.227 Median Train Batch Accuracy = 0.94 Test Loss = 0.699 Test '
 'Accuracy = 0.75')
('Train Loss = 0.143 Median Train Batch Accuracy = 0.97 Test Loss = 0.628 Test '
 'Accuracy = 0.85')
('Train Loss = 0.105 Median Train Batch Accuracy = 0.97 Test Loss = 0.688 Test '
 'Accuracy = 0.83')
('Train Loss = 0.067 Median Train Batch Accuracy = 1.00 Test Loss = 0.725 Test '
 'Accuracy = 0.85')
('Train Loss = 0.045 Median Train Batch Accuracy = 1.00 Test Loss = 0.853 Test '
 'Accuracy = 0.83')
('Train Loss = 0.034 Median Train Batch Accuracy = 1.00 Test Loss = 0.863 Test '
 'Accuracy

In [148]:
model.to(device)

TextClassification(
  (embedding): Embedding(8681, 100)
  (rnn): GRU(100, 128, dropout=0.5)
  (output): Linear(in_features=128, out_features=6, bias=True)
)

In [149]:
model(text_field.process([s.text for s in train.examples]).to(device))

tensor([[-0.2331, -4.7929,  8.4943, -1.6038, -2.8070, -3.0430],
        [ 6.4501, -0.3446, -2.3021, -4.4463, -3.0333, -2.7899],
        [-0.3929, -5.2317,  8.7567, -2.3056, -1.8869, -2.7242],
        ...,
        [ 1.6309, -5.4118,  1.5285,  9.7031, -5.6745, -2.0729],
        [ 1.5351, -4.5882,  0.7383, 10.1366, -5.7996, -2.5140],
        [ 5.9539, -2.8718, -2.0939, -1.5626, -1.4098, -3.5202]],
       device='cuda:0', grad_fn=<SqueezeBackward1>)

### Get predictions on all test examples to confirm 84% accuracy above

In [150]:
train_pred_classes = torch.argmax(model(text_field.process([s.text for s in train.examples]).to(device)), dim=1)

In [151]:
train_label = label_field.process([s.label for s in train.examples]).to(device)

In [152]:
(train_pred_classes==train_label).sum().item() / len(train)

0.9860601614086574

In [153]:
test_pred_classes = torch.argmax(model(text_field.process([s.text for s in test.examples]).to(device)), dim=1)

In [154]:
test_label = label_field.process([s.label for s in test.examples]).to(device)

In [155]:
(test_pred_classes==test_label).sum().item() / len(test)

0.864

### Compute Precision and Recall for each class

In [156]:
from sklearn.metrics import precision_recall_fscore_support

In [157]:
train_precision, train_recall, train_f1, train_support = precision_recall_fscore_support(train_label.cpu().detach().numpy(), train_pred_classes.cpu().detach().numpy())
test_precision, test_recall, test_f1, test_support = precision_recall_fscore_support(test_label.cpu().detach().numpy(), test_pred_classes.cpu().detach().numpy())

In [158]:
train_metrics = pd.DataFrame({'class': [label_field.vocab.itos[i] for i in range(6)],
                              'precision': train_precision,
                              'recall': train_recall,
                              'f1': train_f1,
                              'N_examples': train_support})

print(np.unique([ex.label for ex in train.examples], return_counts=True))

train_metrics

(array(['ABBR', 'DESC', 'ENTY', 'HUM', 'LOC', 'NUM'], dtype='<U4'),
 array([  86, 1162, 1250, 1223,  835,  896]))


Unnamed: 0,class,precision,recall,f1,N_examples
0,ENTY,0.976172,0.9832,0.979673,1250
1,HUM,0.997494,0.976288,0.986777,1223
2,DESC,0.977215,0.996558,0.986792,1162
3,NUM,0.99115,1.0,0.995556,896
4,LOC,0.992832,0.99521,0.994019,835
5,ABBR,0.971429,0.790698,0.871795,86


In [159]:
test_metrics = pd.DataFrame({'class': [label_field.vocab.itos[i] for i in range(6)],
                             'precision': test_precision,
                             'recall': test_recall,
                             'f1': test_f1,
                             'N_examples': test_support})

print(np.unique([ex.label for ex in test.examples], return_counts=True))

test_metrics

(array(['ABBR', 'DESC', 'ENTY', 'HUM', 'LOC', 'NUM'], dtype='<U4'),
 array([  9, 138,  94,  65,  81, 113]))


Unnamed: 0,class,precision,recall,f1,N_examples
0,ENTY,0.804348,0.787234,0.795699,94
1,HUM,0.963636,0.815385,0.883333,65
2,DESC,0.896296,0.876812,0.886447,138
3,NUM,0.813433,0.964602,0.882591,113
4,LOC,0.883117,0.839506,0.860759,81
5,ABBR,1.0,0.777778,0.875,9


In [12]:
pwd

'/home/sanjay/BrnoTeaching2019/notebooks'