In [1]:
!pip install torchtext

Collecting torchtext
[?25l  Downloading https://files.pythonhosted.org/packages/c6/bc/b28b9efb4653c03e597ed207264eea45862b5260f48e9f010b5068d64db1/torchtext-0.3.1-py3-none-any.whl (62kB)
[K     |████████████████████████████████| 71kB 1.5MB/s eta 0:00:011
Collecting tqdm (from torchtext)
[?25l  Downloading https://files.pythonhosted.org/packages/9f/3d/7a6b68b631d2ab54975f3a4863f3c4e9b26445353264ef01f465dc9b0208/tqdm-4.32.2-py2.py3-none-any.whl (50kB)
[K     |████████████████████████████████| 51kB 7.1MB/s  eta 0:00:01
Installing collected packages: tqdm, torchtext
Successfully installed torchtext-0.3.1 tqdm-4.32.2
You should consider upgrading via the 'pip install --upgrade pip' command.[0m


#### In this demo we will build a machine learning model to classify sms texts as ham or spam

In [1]:
import pandas as pd

from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score

#### SMS Spam Collection Dataset
Source: https://www.kaggle.com/uciml/sms-spam-collection-dataset


The files contain one message per line. Each line is composed by two columns: v1 contains the label (ham or spam) and v2 contains the raw text.

In [2]:
data = pd.read_csv('datasets/ham-spam/spam.csv', encoding='latin-1')

data.head()

Unnamed: 0,v1,v2,Unnamed: 2,Unnamed: 3,Unnamed: 4
0,ham,"Go until jurong point, crazy.. Available only ...",,,
1,ham,Ok lar... Joking wif u oni...,,,
2,spam,Free entry in 2 a wkly comp to win FA Cup fina...,,,
3,ham,U dun say so early hor... U c already then say...,,,
4,ham,"Nah I don't think he goes to usf, he lives aro...",,,


#### Cleaning Data

In [3]:
data = data.drop(columns = ['Unnamed: 2', 'Unnamed: 3', 'Unnamed: 4'], axis = 1)

In [4]:
data = data.rename(index = str, columns = {'v1': 'labels', 'v2': 'text'})

data.head()

Unnamed: 0,labels,text
0,ham,"Go until jurong point, crazy.. Available only ..."
1,ham,Ok lar... Joking wif u oni...
2,spam,Free entry in 2 a wkly comp to win FA Cup fina...
3,ham,U dun say so early hor... U c already then say...
4,ham,"Nah I don't think he goes to usf, he lives aro..."


In [5]:
train, test = train_test_split(data, test_size = 0.2, random_state = 42)

In [6]:
train.reset_index(drop=True), test.reset_index(drop=True)

(     labels                                               text
 0       ham  No I'm in the same boat. Still here at my moms...
 1      spam  (Bank of Granite issues Strong-Buy) EXPLOSIVE ...
 2       ham     They r giving a second chance to rahul dengra.
 3       ham     O i played smash bros  &lt;#&gt;  religiously.
 4      spam  PRIVATE! Your 2003 Account Statement for 07973...
 5       ham   G says you never answer your texts, confirm/deny
 6      spam                  88066 FROM 88066 LOST 3POUND HELP
 7       ham  Okey dokey, iÛ÷ll be over in a bit just sorti...
 8       ham                   Why i come in between you people
 9       ham     Wah lucky man... Then can save money... Hee...
 10      ham                         Much better now thanks lol
 11      ham  Madam,regret disturbance.might receive a refer...
 12      ham                          I'm coming home 4 dinner.
 13      ham                                              Ok...
 14      ham  Can Ì_ all decide faster c

In [7]:
train.head()

Unnamed: 0,labels,text
1978,ham,No I'm in the same boat. Still here at my moms...
3989,spam,(Bank of Granite issues Strong-Buy) EXPLOSIVE ...
3935,ham,They r giving a second chance to rahul dengra.
4078,ham,O i played smash bros &lt;#&gt; religiously.
4086,spam,PRIVATE! Your 2003 Account Statement for 07973...


In [8]:
test.head()

Unnamed: 0,labels,text
3245,ham,"Funny fact Nobody teaches volcanoes 2 erupt, t..."
944,ham,I sent my scores to sophas and i had to do sec...
1044,spam,We know someone who you know that fancies you....
2484,ham,Only if you promise your getting out as SOON a...
812,spam,Congratulations ur awarded either å£500 of CD ...


In [9]:
train.shape, test.shape

((4457, 2), (1115, 2))

Saving Train and test data in csv files

In [10]:
train.to_csv('datasets/ham-spam/train.csv', index=False)
test.to_csv('datasets/ham-spam/test.csv', index=False)

In [11]:
!ls datasets/ham-spam

[31mspam.csv[m[m  [31mtest.csv[m[m  [31mtrain.csv[m[m


In [12]:
import numpy as np

import torch
import torchtext

from torchtext.data import Field, BucketIterator, TabularDataset

#### NLTK provides a function called word_tokenize() for splitting strings into tokens (nominally words). It splits tokens based on white space and punctuation.

In [13]:
import nltk
nltk.download('punkt')

from nltk import word_tokenize

[nltk_data] Downloading package punkt to
[nltk_data]     /Users/jananiravi/nltk_data...
[nltk_data]   Package punkt is already up-to-date!


#### The parameters of a Field specify how the data should be processed.We use the TEXT field to define how the text should be processed, and the LABEL field to process the labels.

In [14]:
TEXT = torchtext.data.Field(tokenize = word_tokenize)

In [15]:
LABEL = torchtext.data.LabelField(dtype = torch.float)

In [16]:
datafields = [("labels", LABEL), ("text", TEXT)]

#### the following code splits data into the canonical train/test splits as torchtext.datasets objects. It process the data using the Fields we have previously defined.

In [17]:
trn, tst = torchtext.data.TabularDataset.splits(path = './datasets/ham-spam', 
                                                train = 'train.csv',
                                                test = 'test.csv' ,    
                                                format = 'csv',
                                                skip_header = True,
                                                fields = datafields)

In [18]:
trn[:5]

[<torchtext.data.example.Example at 0x1a1e5ba240>,
 <torchtext.data.example.Example at 0x1a1e5ba2e8>,
 <torchtext.data.example.Example at 0x1a1e5ba0b8>,
 <torchtext.data.example.Example at 0x1a1e7437b8>,
 <torchtext.data.example.Example at 0x1a1e743b38>]

#### We can see how many examples are in each split by checking their length.

In [19]:
print(f'Number of training examples: {len(trn)}')
print(f'Number of testing examples: {len(tst)}')

Number of training examples: 4457
Number of testing examples: 1115


In [20]:
trn[5].__dict__.keys()

dict_keys(['labels', 'text'])

In [21]:
trn[5].text

['G', 'says', 'you', 'never', 'answer', 'your', 'texts', ',', 'confirm/deny']

In [22]:
trn[5].labels

'ham'

#### We can also check an example.

In [23]:
print(vars(trn.examples[5]))

{'labels': 'ham', 'text': ['G', 'says', 'you', 'never', 'answer', 'your', 'texts', ',', 'confirm/deny']}



#### Next, we have to build a vocabulary. This is a effectively a look up table where every unique word in your data set has a corresponding index (an integer). Each index is used to construct a one-hot vector for each word.
There are two ways effectively cut down our vocabulary, we can either only take the top $n$ most common words or ignore words that appear less than $m$ times. We'll do the former, only keeping the top 10,500 words.
The words that appear in examples but we have cut from the are replaced  with a special unknown  token.

In [24]:
TEXT.build_vocab(trn, max_size = 10500)

In [25]:
LABEL.build_vocab(trn)

The vocab size is 10502 because, one of the addition tokens is the unk token and the other is a pad token.

In [26]:
print(f"Unique tokens in TEXT vocabulary: {len(TEXT.vocab)}")
print(f"Unique tokens in LABEL vocabulary: {len(LABEL.vocab)}")

Unique tokens in TEXT vocabulary: 10502
Unique tokens in LABEL vocabulary: 2


#### We can also view the most common words in the vocabulary and their frequencies.

In [27]:
print(TEXT.vocab.freqs.most_common(50))

[('.', 3890), ('to', 1750), ('I', 1571), (',', 1468), ('you', 1460), ('?', 1256), ('!', 1134), ('a', 1067), ('...', 1007), ('the', 946), ('&', 772), ('i', 743), ('and', 669), ('in', 663), ('is', 646), (';', 641), ('u', 628), ('me', 586), (':', 570), ('for', 527), ('my', 494), ('of', 471), ('your', 461), ('it', 456), ('have', 395), ('on', 393), (')', 393), ('2', 390), ('that', 384), ("'s", 383), ("'m", 320), ('now', 317), ('are', 316), ('do', 311), ('call', 307), ('at', 301), ('or', 298), ('U', 295), ('not', 294), ("n't", 281), ('be', 275), ('lt', 267), ('gt', 267), ('with', 267), ('get', 265), ('will', 263), ('so', 252), ('#', 245), ('can', 243), ('ur', 237)]


#### We can also see the vocabulary directly using either the stoi (string to int) or itos (int to string) method.

In [28]:
print(TEXT.vocab.itos[:10])

['<unk>', '<pad>', '.', 'to', 'I', ',', 'you', '?', '!', 'a']


In [29]:
print(LABEL.vocab.stoi)

defaultdict(<function _default_unk_index at 0x1a1ce08ea0>, {'ham': 0, 'spam': 1})


Now, we will create iterators that will iterate over these in the training/evaluation loop, and they return a batch of examples (indexed and converted into tensors) at each iteration.
#### We'll use a BucketIterator which is a special type of iterator that will return a batch of examples where each example is of a similar length, minimizing the amount of padding per example.

In [30]:
batch_size = 64

train_iterator, test_iterator = torchtext.data.BucketIterator.splits(
   (trn, tst),
    batch_size = batch_size,
    sort_key = lambda x: len(x.text), 
    sort_within_batch = False)

#### Build The Model

- <b>The embedding layer</b> is used to transform our sparse one-hot vector (sparse as most of the elements are 0) into a dense embedding vector
- The RNN layer is our RNN which takes in our dense vector and the previous hidden state $h_{t-1}$, which it uses to calculate the next hidden state, $h_t$
- Finally, the linear layer takes the final hidden state and feeds it through a fully connected layer, $f(h_T)$, transforming it to the correct output dimension.


The RNN returns 2 tensors, output of size [sentence length, batch size, hidden dim] and hidden of size [1, batch size, hidden dim]. output is the concatenation of the hidden state from every time step, whereas hidden is simply the final hidden state. We verify this using the assert statement. Note the squeeze method, which is used to remove a dimension of size 1. Finally, we feed the last hidden state, hidden, through the linear layer, fc, to produce a prediction.

In [31]:
import torch.nn as nn

In [40]:
class RNN(nn.Module):
    
    def __init__(self, input_dim, embedding_dim, hidden_dim, output_dim):
        
        super().__init__()
        
        self.embedding = nn.Embedding(input_dim, embedding_dim)
  
        self.rnn = nn.RNN(embedding_dim, hidden_dim)
    
        self.fc = nn.Linear(hidden_dim, output_dim)
        
    def forward(self, text):
        
        embedded = self.embedding(text)
        
        output, hidden = self.rnn(embedded)
        
        hidden_1D = hidden.squeeze(0)
        
        assert torch.equal(output[-1, :, :], hidden_1D)
        
        return self.fc(hidden_1D)

In [50]:
class RNN(nn.Module):
    
    def __init__(self, input_dim, embedding_dim, hidden_dim, output_dim):
        
        super().__init__()
        
        self.embedding = nn.Embedding(input_dim, embedding_dim)
  
        self.rnn = nn.LSTM(embedding_dim, hidden_dim)
        
        self.fc = nn.Linear(hidden_dim, output_dim)
        
    def forward(self, text):
        
        embedded = self.embedding(text)
        
        output, (hidden, _) = self.rnn(embedded)
        
        hidden_1D = hidden.squeeze(0)
        
        assert torch.equal(output[-1, :, :], hidden_1D)
        
        return self.fc(hidden_1D)

In [60]:
class RNN(nn.Module):
    
    def __init__(self, input_dim, embedding_dim, hidden_dim, output_dim):
        
        super().__init__()
        
        self.embedding = nn.Embedding(input_dim, embedding_dim)
  
        self.rnn = nn.LSTM(embedding_dim, hidden_dim)
        
        self.fc = nn.Linear(hidden_dim, output_dim)
        
        self.dropout = nn.Dropout(0.3)
        
    def forward(self, text):
        
        embedded = self.embedding(text)
        
        embedded_dropout = self.dropout(embedded)
        
        output, (hidden, _) = self.rnn(embedded_dropout)
        
        hidden_1D = hidden.squeeze(0)
        
        assert torch.equal(output[-1, :, :], hidden_1D)
        
        return self.fc(hidden_1D)

#### We now create an instance of our RNN class.

- The input dimension is the dimension of the one-hot vectors, which is equal to the vocabulary size.
- The embedding dimension is the size of the dense word vectors.
- The hidden dimension is the size of the hidden states
- The output dimension is usually the number of classes, however in the case of only 2 classes the output value is between 0 and 1 and thus can be 1-dimensional, i.e. a single scalar real number.

In [61]:
input_dim = len(TEXT.vocab)

embedding_dim = 100

hidden_dim = 256

output_dim = 1

In [62]:
model = RNN(input_dim, embedding_dim, hidden_dim, output_dim)

In [63]:
import torch.optim as optim

optimizer = optim.Adam(model.parameters(), lr = 1e-6)

#### we will use BCEWithLogitsLoss loss as our loss function - Creates a criterion that measures the Binary Cross Entropy between the target and the output
This loss combines a Sigmoid layer and the BCELoss in one single class.

In [64]:
criterion = nn.BCEWithLogitsLoss()

#### Training
- For each batch, we first zero the gradients. Each parameter in a model has a grad attribute which stores the gradient calculated by the criterion.
- We then feed the batch of sentences, batch.text, into the model
- The loss and accuracy are then calculated using our predictions and the labels, batch.labels, with the loss being averaged over all examples in the batch.
- We calculate the gradient of each parameter and then update the parameters using the gradients and optimizer algorithm
- Finally, we return the loss and accuracy

##### Calculating Accuracy 
We first feeds the predictions through a sigmoid layer, squashing the values between 0 and 1, we then round them to the nearest integer. This rounds any value greater than 0.5 to 1 (spam) and the rest to 0 (ham).

We then calculate how many rounded predictions equal the actual labels and average it across the batch.

In [65]:
def train(model, iterator, optimizer, criterion):
    
    epoch_loss = 0
    epoch_acc = 0
    
    model.train()
    
    for batch in iterator:
        
        optimizer.zero_grad()
                
        predictions = model(batch.text).squeeze(1)
        
        loss = criterion(predictions, batch.labels)
        
        rounded_preds = torch.round(torch.sigmoid(predictions))
        correct = (rounded_preds == batch.labels).float() 
        
        acc = correct.sum() / len(correct)
        
        loss.backward()
        
        optimizer.step()
        
        epoch_loss += loss.item()
        epoch_acc += acc.item()
        
    return epoch_loss / len(iterator), epoch_acc / len(iterator)

#### the loss is decreasing with each epoch and we get a final accuracy of ~85%

In [66]:
num_epochs = 5

for epoch in range(num_epochs):

    train_loss, train_acc = train(model, train_iterator, optimizer, criterion)
    
    print(f'| Epoch: {epoch+1:02} | Train Loss: {train_loss:.3f} | Train Acc: {train_acc*100:.2f}% ')

| Epoch: 01 | Train Loss: 0.625 | Train Acc: 85.87% 
| Epoch: 02 | Train Loss: 0.612 | Train Acc: 85.86% 
| Epoch: 03 | Train Loss: 0.601 | Train Acc: 85.95% 
| Epoch: 04 | Train Loss: 0.590 | Train Acc: 86.02% 
| Epoch: 05 | Train Loss: 0.579 | Train Acc: 85.98% 


evaluate is similar to train, with a few modifications as you don't want to update the parameters when evaluating.

In [67]:
epoch_loss = 0
epoch_acc = 0

In [68]:
model.eval()

RNN(
  (embedding): Embedding(10502, 100)
  (rnn): LSTM(100, 256)
  (fc): Linear(in_features=256, out_features=1, bias=True)
  (dropout): Dropout(p=0.3)
)

In [69]:
with torch.no_grad():

    for batch in test_iterator:

        predictions = model(batch.text).squeeze(1)

        loss = criterion(predictions, batch.labels)

        rounded_preds = torch.round(torch.sigmoid(predictions))
        
        correct = (rounded_preds == batch.labels).float() 
        acc = correct.sum() / len(correct)

        epoch_loss += loss.item()
        epoch_acc += acc.item()

test_loss = epoch_loss / len(test_iterator)
test_acc  = epoch_acc / len(test_iterator)

print(f'| Test Loss: {test_loss:.3f} | Test Acc: {test_acc*100:.2f}% |')

| Test Loss: 0.649 | Test Acc: 75.23% |
