# How to painlessly transform an NLP model in Jupyter to a production API?

# [SMS Spam Collection Data Set](https://archive.ics.uci.edu/ml/datasets/sms+spam+collection)

<img src="https://archive.ics.uci.edu/ml/assets/logo.gif" align='left' />

# Fine-tuning DistilBERT for text classification

<a href="https://colab.research.google.com/github/Paulescu/practical-nlp-2021/blob/main/spam_detection/noteboooks/model.ipynb">
  <img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab" align="left"/>
</a>

### Required setup if you run the notebook in Google Colab

In [1]:
!pip install -q transformers

# Step 1. Download data and split into train, validation and test

The dataset can be found [here](https://archive.ics.uci.edu/ml/datasets/sms+spam+collection)

### Download raw data

In [90]:
!wget https://archive.ics.uci.edu/ml/machine-learning-databases/00228/smsspamcollection.zip
!tar -xf smsspamcollection.zip

--2020-12-14 18:39:15--  https://archive.ics.uci.edu/ml/machine-learning-databases/00228/smsspamcollection.zip
Resolving archive.ics.uci.edu (archive.ics.uci.edu)... 128.195.10.252
Connecting to archive.ics.uci.edu (archive.ics.uci.edu)|128.195.10.252|:443... connected.
HTTP request sent, awaiting response... 200 OK
Length: 203415 (199K) [application/x-httpd-php]
Saving to: ‘smsspamcollection.zip.2’


2020-12-14 18:39:17 (269 KB/s) - ‘smsspamcollection.zip.2’ saved [203415/203415]



### Quick data exploration

In [2]:
import pandas as pd

data = pd.read_csv('SMSSpamCollection', sep='\t', header=None)
data.columns = ['label', 'text']

In [3]:
data.head()

Unnamed: 0,label,text
0,ham,"Go until jurong point, crazy.. Available only ..."
1,ham,Ok lar... Joking wif u oni...
2,spam,Free entry in 2 a wkly comp to win FA Cup fina...
3,ham,U dun say so early hor... U c already then say...
4,ham,"Nah I don't think he goes to usf, he lives aro..."


In [4]:
data['label'].value_counts(normalize=True)

ham     0.865937
spam    0.134063
Name: label, dtype: float64

### Add numeric column for the label

In [5]:
IDX_TO_LABEL = {
    0: 'ham',
    1: 'spam',
}

LABEL_TO_IDX = {
    'ham': 0,
    'spam': 1,
}

data['label_int'] = data['label'].apply(lambda x: LABEL_TO_IDX[x])
data.head()

Unnamed: 0,label,text,label_int
0,ham,"Go until jurong point, crazy.. Available only ...",0
1,ham,Ok lar... Joking wif u oni...,0
2,spam,Free entry in 2 a wkly comp to win FA Cup fina...,1
3,ham,U dun say so early hor... U c already then say...,0
4,ham,"Nah I don't think he goes to usf, he lives aro...",0


### Split data into files `train.csv` , `validation.csv`, `test.csv`

In [6]:
from sklearn.model_selection import train_test_split

train_data, test_data = train_test_split(data, test_size=0.20, random_state=123,)
train_data, validation_data = train_test_split(train_data, test_size=0.20, random_state=123)

print('train_data: ', len(train_data))
print('validation_data: ', len(validation_data))
print('test_data: ', len(test_data))

train_data[['label_int', 'text']].to_csv('train.csv', index=False, header=False)
validation_data[['label_int', 'text']].to_csv('validation.csv', index=False, header=False)
test_data[['label_int', 'text']].to_csv('test.csv', index=False, header=False)

train_data:  3565
validation_data:  892
test_data:  1115


# Step 2. Define PyTorch `DataLoader`s for train, validation, and test.

In [1]:
import pandas as pd

# train_texts, train_labels
train_data = pd.read_csv('train.csv', header=None)
train_data.columns = ['label', 'text']
train_texts = train_data['text'].tolist()
train_labels = train_data['label'].tolist()

# validation_texts, validation_labels
validation_data = pd.read_csv('validation.csv', header=None)
validation_data.columns = ['label', 'text']
validation_texts = validation_data['text'].tolist()
validation_labels = validation_data['label'].tolist()

# test_texts, test_labels
test_data = pd.read_csv('test.csv', header=None)
test_data.columns = ['label', 'text']
test_texts = test_data['text'].tolist()
test_labels = test_data['label'].tolist()

## HuggingFace `tokenizer`s

In [2]:
from transformers import DistilBertTokenizerFast
tokenizer = DistilBertTokenizerFast.from_pretrained('distilbert-base-uncased')

train_encodings = tokenizer(train_texts, truncation=True, padding=True)
validation_encodings = tokenizer(validation_texts, truncation=True, padding=True)
test_encodings = tokenizer(test_texts, truncation=True, padding=True)

## PyTorch `Dataset`s

In [3]:
import torch

class SpamDetectionDataset(torch.utils.data.Dataset):
    def __init__(self, encodings, labels):
        self.encodings = encodings
        self.labels = labels

    def __getitem__(self, idx):
        item = {key: torch.tensor(val[idx]) for key, val in self.encodings.items()}
        item['labels'] = torch.tensor(self.labels[idx])
        return item

    def __len__(self):
        return len(self.labels)

train_dataset = SpamDetectionDataset(train_encodings, train_labels)
validation_dataset = SpamDetectionDataset(validation_encodings, validation_labels)
test_dataset = SpamDetectionDataset(test_encodings, test_labels)

## PyTorch `DataLoader`s

In [None]:
from torch.utils.data import DataLoader

# data loaders
BATCH_SIZE = 16
train_loader = DataLoader(train_dataset, batch_size=BATCH_SIZE, shuffle=True)
validation_loader = DataLoader(validation_dataset, batch_size=BATCH_SIZE, shuffle=False)
test_loader = DataLoader(test_dataset, batch_size=BATCH_SIZE, shuffle=False)

#### Check output from `Dataloader`

In [98]:
train_input = next(iter(train_iter))

print(train_input.text)
print(train_input.label)

tensor([[  68,    2,   34,  ...,   39,  349,    2],
        [ 156,   21,    3,  ..., 1984,   37,  197],
        [   0,  178,   46,  ...,  179,   57,    8],
        ...,
        [ 110,   99,  959,  ...,  443,    8,    1],
        [1963,   73,    0,  ...,  691,   10,    1],
        [  63,  299,    9,  ...,   21,    8,    1]])
tensor([0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
        0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
        0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
        0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
        0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
        0, 0, 0, 0, 0, 0, 0, 0])




In [99]:
for i in range(10):
    print('text: ', train[i].text)
    print('label: ', train[i].label)
    print('---')

text:  ['mom', 'wants', 'to', 'know', 'where', 'you', 'at']
label:  0
---
text:  ['boy', ';', 'i', 'love', 'u', 'grl', ':', 'hogolo', 'boy', ':', 'gold', 'chain', 'kodstini', 'grl', ':', 'agalla', 'boy', ':', 'necklace', 'madstini', 'grl', ':', 'agalla', 'boy', ':', 'hogli', '1', 'mutai', 'eerulli', 'kodthini', '!', 'grl', ':', 'i', 'love', 'u', 'kano;-', ')']
label:  0
---
text:  ['its', 'on', 'in', 'engalnd', '!', 'but', 'telly', 'has', 'decided', 'it', 'wo', "n't", 'let', 'me', 'watch', 'it', 'and', 'mia', 'and', 'elliot', 'were', 'kissing', '!', 'damn', 'it', '!']
label:  0
---
text:  ['your', 'gon', 'na', 'have', 'to', 'pick', 'up', 'a', '$', '1', 'burger', 'for', 'yourself', 'on', 'your', 'way', 'home', '.', 'i', 'ca', "n't", 'even', 'move', '.', 'pain', 'is', 'killing', 'me', '.']
label:  0
---
text:  ['no', 'no:)this', 'is', 'kallis', 'home', 'ground.amla', 'home', 'town', 'is', 'durban', ':', ')']
label:  0
---
text:  ['i', 'am', 'seeking', 'a', 'lady', 'in', 'the', 'street', 

# Step 3. Define the neural net model

In [100]:
# TODO: add diagram here

In [9]:
from transformers import DistilBertForSequenceClassification, AdamW

device = torch.device('cuda') if torch.cuda.is_available() else torch.device('cpu')

model = DistilBertForSequenceClassification.from_pretrained('distilbert-base-uncased')
model.to(device)

Some weights of the model checkpoint at distilbert-base-uncased were not used when initializing DistilBertForSequenceClassification: ['vocab_transform.weight', 'vocab_transform.bias', 'vocab_layer_norm.weight', 'vocab_layer_norm.bias', 'vocab_projector.weight', 'vocab_projector.bias']
- This IS expected if you are initializing DistilBertForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing DistilBertForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of DistilBertForSequenceClassification were not initialized from the model checkpoint at distilbert-base-uncased and are newly initialized: ['pre_classifier.weight', 'pre_classifier.bias', 'classi

DistilBertForSequenceClassification(
  (distilbert): DistilBertModel(
    (embeddings): Embeddings(
      (word_embeddings): Embedding(30522, 768, padding_idx=0)
      (position_embeddings): Embedding(512, 768)
      (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
      (dropout): Dropout(p=0.1, inplace=False)
    )
    (transformer): Transformer(
      (layer): ModuleList(
        (0): TransformerBlock(
          (attention): MultiHeadSelfAttention(
            (dropout): Dropout(p=0.1, inplace=False)
            (q_lin): Linear(in_features=768, out_features=768, bias=True)
            (k_lin): Linear(in_features=768, out_features=768, bias=True)
            (v_lin): Linear(in_features=768, out_features=768, bias=True)
            (out_lin): Linear(in_features=768, out_features=768, bias=True)
          )
          (sa_layer_norm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
          (ffn): FFN(
            (dropout): Dropout(p=0.1, inplace=False)
       

# Step 4. Train the model

### Launch Tensorboard

In [103]:
%load_ext tensorboard
%tensorboard --logdir runs

Reusing TensorBoard on port 6008 (pid 79472), started 1:33:04 ago. (Use '!kill 79472' to kill it.)

### Train loop

In [8]:
# Setup logging to Tensorboard
from torch.utils.tensorboard import SummaryWriter
from datetime import datetime

now = datetime.now()
now = now.strftime("%Y-%m-%d-%H:%M:%S")
MODEL_NAME = 'fine_tuning_bert'
log_file = f'./runs/{MODEL_NAME}/{now}'
writer = SummaryWriter(log_file)

# Train lopp
from tqdm import tqdm

# optimizer
optimizer = AdamW(model.parameters(), lr=5e-5)

N_EPOCHS = 150
for epoch in range(N_EPOCHS):
    
    # train
    running_loss = 0.0
    model.train()
    train_size = 0
    running_accuracy = 0.0
    
    for batch in tqdm(train_loader):
        
        # forward pass to compute the batch loss       
        input_ids = batch['input_ids'].to(device)
        attention_mask = batch['attention_mask'].to(device)
        labels = batch['labels'].to(device)
        outputs = model(input_ids, attention_mask=attention_mask, labels=labels)
        loss = outputs[0]
        predictions = outputs[1]
        
        # backward pass to update model parameters
        optimizer.zero_grad()
        loss.backward()
        optimizer.step()
        
        # compute train metrics
        running_loss += loss.data * input_ids.size(0)
        _, predicted_classes = torch.max(predictions, 1)
        running_accuracy += predicted_classes.eq(labels.data).sum().item()
        train_size += input_ids.size(0)
        
    epoch_loss = running_loss / train_size
    epoch_accuracy = running_accuracy / train_size
    
    # validation
    val_loss = 0.0
    model.eval()
    val_size = 0
    val_accuracy = 0
    with torch.no_grad():
        for batch in validation_loader:
            
            # forward pass
            input_ids = batch['input_ids'].to(device)
            attention_mask = batch['attention_mask'].to(device)
            labels = batch['labels'].to(device)
            outputs = model(input_ids, attention_mask=attention_mask, labels=labels)
            loss = outputs[0]
            predictions = outputs[1]
            
            # compute validation metrics
            val_loss += loss.data * input_ids.size(0)
            _, predicted_classes = torch.max(predictions, 1)
            val_accuracy += predicted_classes.eq(labels.data).sum().item()           
            val_size += input_ids.size(0)
            
        val_loss /= val_size
        val_accuracy /= val_size
        
        print('\nEpoch: {}'.format(epoch))
        print('Loss \t Train: {:.4f} \t Validation: {:.4f}'.format(epoch_loss, val_loss))
        print('Acc: \t Train: {:.4f} \t Validation: {:.4f}'.format(epoch_accuracy, val_accuracy))

    # log metrics to tensorboard
    writer.add_scalars('Loss', {'train': epoch_loss, 'validation': val_loss}, epoch + 1)
    writer.add_scalars('Accuracy', {'train': epoch_accuracy, 'validation': val_accuracy}, epoch + 1)
    
writer.close()

Some weights of the model checkpoint at distilbert-base-uncased were not used when initializing DistilBertForSequenceClassification: ['vocab_transform.weight', 'vocab_transform.bias', 'vocab_layer_norm.weight', 'vocab_layer_norm.bias', 'vocab_projector.weight', 'vocab_projector.bias']
- This IS expected if you are initializing DistilBertForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing DistilBertForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of DistilBertForSequenceClassification were not initialized from the model checkpoint at distilbert-base-uncased and are newly initialized: ['pre_classifier.weight', 'pre_classifier.bias', 'classi

KeyboardInterrupt: 

# Step 5. Test the model

In [105]:
test_accuracy = 0.0
test_size = 0
with torch.no_grad():
    for batch in test_loader:
        # forward pass
        x = batch.text
        y = batch.label.long()
        predictions = model(x)        
        loss = criterion(predictions, y)

        # compute accuracy
        _, predicted_classes = torch.max(predictions, 1)
        test_accuracy += predicted_classes.eq(y.data).sum().item()
        test_size += x.size(0)

test_accuracy /= test_size
print('Test accuracy: {:.4f}'.format(test_accuracy))

Test accuracy: 0.9785


# Extra. Interact with the model
Pay attention how pre-processing and post-processing are necessary to be able to use the model at inference time.

https://github.com/bentrevett/pytorch-sentiment-analysis/issues/40

In [19]:
sentences = [
    'This is your friend Carl. Come to the Casino and get a discount!',
    'This is your friend Carl, do you want to meet later?',
    'Would you be interested in buying a car for nothing?',
    'Send your card details today and get a prize!',
    'I won two tickets to the show, do you want to come with me?',
    'I won two tickets to the show, just send an SMS to this number and get them',
]

for s in sentences:
    # pre-process text into integer tokens
    model_input = [TEXT.vocab.stoi[token] for token in tokenizer_fn(s)]
    # add 0-dimension
    model_input = torch.LongTensor(model_input).unsqueeze(0)
    
    # run model prediction
    predictions = model(model_input)
    
    # post-processing
    _, predicted_class = torch.max(predictions, 1)
    predicted_class = predicted_class.item()
    predicted_class_str = IDX_TO_LABEL[predicted_class]
    
    # print
    print(s)
    print(predicted_class_str)
    print('------')

This is your friend Carl. Come to the Casino and get a discount!
spam
------
This is your friend Carl, do you want to meet later?
ham
------
Would you be interested in buying a car for nothing?
ham
------
Send your card details today and get a prize!
spam
------
I won two tickets to the show, do you want to come with me?
ham
------
I won two tickets to the show, just send an SMS to this number and get them
ham
------


# Extra: Visualize the learned word embeddings with the [Embedding Projector](https://projector.tensorflow.org/)

### Extract embedding parameters

In [None]:
for name, parameter in model.named_parameters():
    if name == 'embed.weight':
        embeddings = parameter

print(embeddings.shape)

### Generate tsv files

In [None]:
import io

embeddings = embeddings.cpu().detach().numpy()
vocab = TEXT.vocab.itos

out_v = io.open('vectors.tsv', 'w', encoding='utf-8')
out_m = io.open('metadata.tsv', 'w', encoding='utf-8')

for index, word in enumerate(vocab):
    if index in [0, 1]:
        # skip 0, it's the unknown token.
        # skip 1, it's the padding token.
        continue
        
    vec = embeddings[index, :] 
    out_v.write('\t'.join([str(x) for x in vec]) + "\n")
    out_m.write(word + "\n")

out_v.close()
out_m.close()

### Download files to your local computer (in case you are running this notebook in Google Colab)

In [None]:
try:
    from google.colab import files
    files.download('vectors.tsv')
    files.download('metadata.tsv')
except Exception as e:
    pass