## Intent Classification on CLINC150 dataset

#### Classification of 20 in-scope intent classes performed using BERT

##### HuggingFace Transformers v3.5.0 used. This project has been done using a GPU on Google Colaboratory

In [1]:
#connecting and mounting Google Drive folder
from google.colab import drive
drive.mount('/content/drive/')

Mounted at /content/drive/


In [2]:
import os
os.chdir('/content/drive/My Drive/research_projects/Loyal AI')

In [3]:
!pip install transformers==3.5.0

Collecting transformers==3.5.0
[?25l  Downloading https://files.pythonhosted.org/packages/9c/34/fb092588df61bf33f113ade030d1cbe74fb73a0353648f8dd938a223dce7/transformers-3.5.0-py3-none-any.whl (1.3MB)
[K     |████████████████████████████████| 1.3MB 9.0MB/s 
[?25hCollecting sacremoses
[?25l  Downloading https://files.pythonhosted.org/packages/7d/34/09d19aff26edcc8eb2a01bed8e98f13a1537005d31e95233fd48216eed10/sacremoses-0.0.43.tar.gz (883kB)
[K     |████████████████████████████████| 890kB 27.1MB/s 
Collecting sentencepiece==0.1.91
[?25l  Downloading https://files.pythonhosted.org/packages/d4/a4/d0a884c4300004a78cca907a6ff9a5e9fe4f090f5d95ab341c53d28cbc58/sentencepiece-0.1.91-cp36-cp36m-manylinux1_x86_64.whl (1.1MB)
[K     |████████████████████████████████| 1.1MB 40.5MB/s 
Collecting tokenizers==0.9.3
[?25l  Downloading https://files.pythonhosted.org/packages/4c/34/b39eb9994bc3c999270b69c9eea40ecc6f0e97991dba28282b9fd32d44ee/tokenizers-0.9.3-cp36-cp36m-manylinux1_x86_64.whl (2.9MB

In [4]:
import torch
from torch.utils.data import TensorDataset, DataLoader, RandomSampler, SequentialSampler
from torch.utils.data import DataLoader, RandomSampler, SequentialSampler
from transformers import BertForSequenceClassification, AdamW, BertConfig, BertTokenizer
from transformers import get_linear_schedule_with_warmup
import numpy as np
import random
from random import shuffle
import time
import datetime
import json
import sklearn

In [5]:
with open('data_full.json', 'r') as f:
  data= json.load(f)

#### Selecting 20 classes and preparing training, validation and testing datasets for the 20 selected in-scope classes

In [8]:


def loop_over(data, classes):
  select_data, select_labels = [], []
  shuffle(data)
  for row in data:
    if row[1] in classes:
      select_data.append(row[0])
      select_labels.append(row[1])
  return select_data, select_labels


def prepare_splits(data):

  s = set()
  for row in data['train']:
    s.add(row[-1])
  
  classes = list(s)[:20]
  

  train_data, train_labels = loop_over(data['train'], classes)
  val_data, val_labels = loop_over(data['val'], classes)
  test_data, test_labels = loop_over(data['test'], classes)

  return classes, [train_data, train_labels], [val_data, val_labels], [test_data, test_labels]

label_set, train, val, test = prepare_splits(data)

In [9]:
#Doing a stat check on the data in the train, validation and testing splits
print(len(train[0]), len(train[1]))
print(len(val[0]), len(val[1]))
print(len(test[0]), len(test[1]))

2000 2000
400 400
600 600


In [10]:
print('Printing some samples data from the train set')
for i, row in enumerate(train[0][:10]):
  print(row)
  print(train[1][i])
  print()

Printing some samples data from the train set
do you know of any good restaurants
restaurant_suggestion

create alarm 6am
alarm

i would like a block put on my chase account asap
freeze_account

will you tell me who made the ai
who_made_you

im sorry can you repeat yourself
repeat

how much as my taxes by the way
taxes

do i need to get vaccines before my trip
vaccines

how much gas is in my gas tank
gas

could you set one alarm for 8am saturday and one for 9am sunday
alarm

start a countdown for 20 minutes
timer



In [11]:
#Function to convert labels into indices
def labels2idx_func(labels, labels2idx):
  
  idx_labels = []
  for label in labels:
    idx_labels.append(labels2idx[label])
  return idx_labels


#### Preparing dataset for training and testing. Converting labels to indices using labels2idx_func

In [12]:
labels2idx = {label:i for i, label in enumerate(label_set) }

train_data, train_labels = train[0], labels2idx_func(train[1], labels2idx)
val_data, val_labels = val[0], labels2idx_func(val[1], labels2idx)
test_data, test_labels = test[0], labels2idx_func(test[1], labels2idx)


#### Labels with their corresponding indices and printing some sample data from the train set with their indexed label

In [14]:
print('Labels with their corresponding indices')
print(labels2idx)
print()
for i, sent in enumerate(train_data[:10]):
  print(sent, train_labels[i])

Labels with their corresponding indices
{'timezone': 0, 'vaccines': 1, 'reminder_update': 2, 'credit_limit_change': 3, 'restaurant_suggestion': 4, 'redeem_rewards': 5, 'meeting_schedule': 6, 'alarm': 7, 'taxes': 8, 'who_made_you': 9, 'timer': 10, 'food_last': 11, 'fun_fact': 12, 'restaurant_reservation': 13, 'meaning_of_life': 14, 'freeze_account': 15, 'gas': 16, 'tire_pressure': 17, 'repeat': 18, 'card_declined': 19}

do you know of any good restaurants 4
create alarm 6am 7
i would like a block put on my chase account asap 15
will you tell me who made the ai 9
im sorry can you repeat yourself 18
how much as my taxes by the way 8
do i need to get vaccines before my trip 1
how much gas is in my gas tank 16
could you set one alarm for 8am saturday and one for 9am sunday 7
start a countdown for 20 minutes 10


In [15]:
# Function to encode sentences and preparing them for BERT model input. More details are in the text cell below
def encode_sentences(data, tokenizer):
  input_ids = []
  attention_masks = []
  for sent in data:
      encoded = tokenizer.encode_plus(sent, add_special_tokens=True, max_length=64, pad_to_max_length=True, return_attention_mask=True, return_tensors='pt')
      input_ids.append(encoded['input_ids'])
      attention_masks.append(encoded['attention_mask'])
  input_ids = torch.cat(input_ids, dim=0)
  attention_masks = torch.cat(attention_masks, dim=0)
  return input_ids, attention_masks

#### Here BertTokenizer is being used. For the sentences to be used for training, validation and testing, they need to be prepared appropriately.

Given the dataset provided by the authors [link](https://github.com/clinc/oos-eval) talk about preprocessing the dataset [Section 2.3] (lower case sentences, punctuation removal and maintaining train/test distribution of data, explicit preprocessing becomes easier.

Preprocessing such as normalization (stemming) or stopword removal could be done, but there is a possibility that the sentences could get shorter and the stemmed tokens could get assigned new token ids rather than getting mapped onto token ids already in the BertTokenizer vocabulary. And very short sentences would mean that the intent could be classified based on specific tokens rather than the contextual meaning of the entire sentence. Hence, the current existing state of sentences currently provided by the authors are good enough to be used as input to the BERT model.

Each sentence needs to be tokenized, [CLS] and [SEP] tokens to be appended to the beginning and end of the sentences and tokens then have to be mapped to appropriate token ids in the vocabulary. Then sentences need to be padded or truncation to maintain uniformity in sequence length and attention masks need to be generated for the BERT model to only consider the actual tokenized words and ignore the pad tokens. BertTokenizer provides the encode_plus function for use to perform all these tasks.

Maximum sequence length has been set to 64.

In [16]:
#generating input ids and attention masks for the training, validation and testing splits.
tokenizer = BertTokenizer.from_pretrained('bert-base-uncased', do_lower_case=True)
train_input_ids, train_mask = encode_sentences(train_data, tokenizer)
val_input_ids, val_mask = encode_sentences(val_data, tokenizer)
test_input_ids, test_mask = encode_sentences(test_data, tokenizer)

HBox(children=(FloatProgress(value=0.0, description='Downloading', max=231508.0, style=ProgressStyle(descripti…

Truncation was not explicitly activated but `max_length` is provided a specific value, please use `truncation=True` to explicitly truncate examples to max length. Defaulting to 'longest_first' truncation strategy. If you encode pairs of sequences (GLUE-style) with the tokenizer you can select this strategy more precisely by providing a specific strategy to `truncation`.







In [17]:
#Using TensorDataset to align input ids, attention mask and corresponding labels.
train_dataset = TensorDataset(train_input_ids, train_mask, torch.tensor(train_labels))
val_dataset = TensorDataset(val_input_ids, val_mask, torch.tensor(val_labels))
test_dataset = TensorDataset(test_input_ids, test_mask, torch.tensor(test_labels))

#Batch size set as 64 for training and testing and this hyperparameter could be changed as per need or GPU space
batch_size = 64

#Batching and bucketing samples. Batches for training set also shuffled while training. Not need for validation or testing set.
train_dataloader = DataLoader(train_dataset, sampler = RandomSampler(train_dataset), batch_size=batch_size)
validation_dataloader = DataLoader(val_dataset, sampler = SequentialSampler(val_dataset), batch_size=batch_size)
test_dataloader = DataLoader(test_dataset, sampler = SequentialSampler(test_dataset), batch_size=batch_size)


#### BertForSequenceClassification model provided by HuggingFace Transformers. 

It contains a linear head on top the BertModel for multi-label sequence classification. Also using pre-trained Bert base uncased weights and fine tuning them for intent classification using the training set prepared above.

The Bert model is pre-trained on English corpora using masked language modeling and next sentence prediction. The BERT model used here has a hidden state of 768 length and is the base model pre-trained on uncased English sentences. It consists of 12 layers with 12 heads in total and includes 110M parameters to make up the model.

In [18]:
model = BertForSequenceClassification.from_pretrained('bert-base-uncased', num_labels=20, output_attentions=False, output_hidden_states=False)

#loading model to gpu if available or cpu if not
device = 'cuda' if torch.cuda.is_available() else 'cpu'
model.to(device)

HBox(children=(FloatProgress(value=0.0, description='Downloading', max=433.0, style=ProgressStyle(description_…




HBox(children=(FloatProgress(value=0.0, description='Downloading', max=440473133.0, style=ProgressStyle(descri…




Some weights of the model checkpoint at bert-base-uncased were not used when initializing BertForSequenceClassification: ['cls.predictions.bias', 'cls.predictions.transform.dense.weight', 'cls.predictions.transform.dense.bias', 'cls.predictions.decoder.weight', 'cls.seq_relationship.weight', 'cls.seq_relationship.bias', 'cls.predictions.transform.LayerNorm.weight', 'cls.predictions.transform.LayerNorm.bias']
- This IS expected if you are initializing BertForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of BertForSequenceClassification were not initialized from the model checkpoint at

BertForSequenceClassification(
  (bert): BertModel(
    (embeddings): BertEmbeddings(
      (word_embeddings): Embedding(30522, 768, padding_idx=0)
      (position_embeddings): Embedding(512, 768)
      (token_type_embeddings): Embedding(2, 768)
      (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
      (dropout): Dropout(p=0.1, inplace=False)
    )
    (encoder): BertEncoder(
      (layer): ModuleList(
        (0): BertLayer(
          (attention): BertAttention(
            (self): BertSelfAttention(
              (query): Linear(in_features=768, out_features=768, bias=True)
              (key): Linear(in_features=768, out_features=768, bias=True)
              (value): Linear(in_features=768, out_features=768, bias=True)
              (dropout): Dropout(p=0.1, inplace=False)
            )
            (output): BertSelfOutput(
              (dense): Linear(in_features=768, out_features=768, bias=True)
              (LayerNorm): LayerNorm((768,), eps=1e-12, element

Using AdamW optimizer and scheduler for controlling the learning rate. Scheduler used to warmup the model so that the learning rate is not drastically changed during the warmup steps as loss maybe very high during warmup.

Epochs 5 selected. Too many epochs may lead to model overfitting. 

Warm step 100. This could also be changed as needed.

In [19]:
optimizer = AdamW(model.parameters(), lr = 2e-5, eps = 1e-8)
epochs = 5
total_steps = len(train_dataloader) * epochs
scheduler = get_linear_schedule_with_warmup(optimizer, num_warmup_steps = 100, num_training_steps = total_steps)

In [20]:
#function to calculate accuracy during prediction
def flat_accuracy(preds, labels):
    pred_flat = np.argmax(preds, axis=1).flatten()
    labels_flat = labels.flatten()
    return np.sum(pred_flat == labels_flat) / len(labels_flat)

In [21]:
#function to calculate time elapsed
def format_time(elapsed):
    elapsed_rounded = int(round((elapsed)))
    return str(datetime.timedelta(seconds=elapsed_rounded))

#### Model training

In [22]:
seed_val = 42

random.seed(seed_val)
np.random.seed(seed_val)
torch.manual_seed(seed_val)

#comment out if gpu not present
torch.cuda.manual_seed_all(seed_val)

training_stats = []
total_t0 = time.time()
for epoch_i in range(0, epochs):
    print('======== Epoch {:} / {:} ========'.format(epoch_i + 1, epochs))
    print('Training...')
    t0 = time.time()
    total_train_loss = 0
    model.train()
    for step, batch in enumerate(train_dataloader):

        if step % 40 == 0 and not step == 0:
            elapsed = format_time(time.time() - t0)
            print('  Batch {:>5,}  of  {:>5,}.    Elapsed: {:}.'.format(step, len(train_dataloader), elapsed))

        #loading data to whichever device available
        b_input_ids = batch[0].to(device)
        b_input_mask = batch[1].to(device)
        b_labels = batch[2].to(device)

        model.zero_grad()     

        #Sending data to the model and storing loss and logits generated by the model.   
        loss, logits = model(b_input_ids, token_type_ids=None, attention_mask=b_input_mask, labels=b_labels)
        total_train_loss += loss.item()

        loss.backward() #backpropagating after loss calculation

        torch.nn.utils.clip_grad_norm_(model.parameters(), 1.0) #gradient clipping in case of gradient explosion

        #running the optimizer and the scheduler for the learning rate
        optimizer.step()
        scheduler.step()

    avg_train_loss = total_train_loss / len(train_dataloader)            
    
    training_time = format_time(time.time() - t0)
    print("  Average training loss: {0:.2f}".format(avg_train_loss))
    print("  Training epcoh took: {:}".format(training_time))
    print("Running Validation...")

    t0 = time.time()


    #Validation
    model.eval()
    total_eval_accuracy = 0
    total_eval_loss = 0
    nb_eval_steps = 0

    for batch in validation_dataloader:

        b_input_ids = batch[0].to(device)
        b_input_mask = batch[1].to(device)
        b_labels = batch[2].to(device)

        with torch.no_grad():       # no need of loss propagation 

            (loss, logits) = model(b_input_ids, token_type_ids=None, attention_mask=b_input_mask,labels=b_labels)
 
        total_eval_loss += loss.item()

        logits = logits.detach().cpu().numpy()
        label_ids = b_labels.to('cpu').numpy()

        total_eval_accuracy += flat_accuracy(logits, label_ids)

    avg_val_accuracy = total_eval_accuracy / len(validation_dataloader)
    print("  Accuracy: {0:.2f}".format(avg_val_accuracy))
    avg_val_loss = total_eval_loss / len(validation_dataloader)
    validation_time = format_time(time.time() - t0)
    
    print("  Validation Loss: {0:.2f}".format(avg_val_loss))
    print("  Validation took: {:}".format(validation_time))
    training_stats.append(
        {
            'epoch': epoch_i + 1,
            'Training Loss': avg_train_loss,
            'Valid. Loss': avg_val_loss,
            'Valid. Accur.': avg_val_accuracy,
            'Training Time': training_time,
            'Validation Time': validation_time
        }
    )

print("")
print("Training complete!")
print("Total training took {:} (h:mm:ss)".format(format_time(time.time()-total_t0)))

Training...
  Average training loss: 3.01
  Training epcoh took: 0:00:12
Running Validation...
  Accuracy: 0.13
  Validation Loss: 2.93
  Validation took: 0:00:01
Training...
  Average training loss: 2.74
  Training epcoh took: 0:00:12
Running Validation...
  Accuracy: 0.50
  Validation Loss: 2.46
  Validation took: 0:00:01
Training...
  Average training loss: 2.21
  Training epcoh took: 0:00:12
Running Validation...
  Accuracy: 0.82
  Validation Loss: 1.77
  Validation took: 0:00:01
Training...
  Average training loss: 1.48
  Training epcoh took: 0:00:12
Running Validation...
  Accuracy: 0.98
  Validation Loss: 1.07
  Validation took: 0:00:01
Training...
  Average training loss: 1.00
  Training epcoh took: 0:00:12
Running Validation...
  Accuracy: 0.98
  Validation Loss: 0.85
  Validation took: 0:00:01

Training complete!
Total training took 0:01:04 (h:mm:ss)


#### Testing phase. Using the model and predicting on the test set.

In [23]:
print('Predicting labels')

model.eval()

predictions , true_labels = [], []

for batch in test_dataloader:
  batch = tuple(t.to(device) for t in batch)
  b_input_ids, b_input_mask, b_labels = batch
  
  with torch.no_grad():
      outputs = model(b_input_ids, token_type_ids=None, attention_mask=b_input_mask)
                      
  logits = outputs[0]
  logits = logits.detach().cpu().numpy()
  label_ids = b_labels.to('cpu').numpy()

  predictions.append(logits)
  true_labels.append(label_ids)


Predicting labels


The predictions provided by the model are in the form of logits. The model tries to maximize the negative log likehood score across the number of classes present. Hence passing the logits passed through a softmax function would give you a probability distribution over the number of classes you have. And the highest probability given to a particular class, is the class predicted by the model.

In [29]:
softmax = torch.nn.Softmax()

labels_flat = []
pred_flat = []
for i, batch in enumerate(predictions):
  for j, logits in enumerate(batch):
    pred_flat.append(np.argmax(softmax(torch.tensor(logits))))
    labels_flat.append(true_labels[i][j])


  """


In [33]:
cnt=0
for i, val in enumerate(pred_flat):
  if val == labels_flat[i]:
    cnt+=1
print('Accuracy of the model to the true labels')
print(cnt/len(labels_flat))

Accuracy of the model to the true labels
0.9783333333333334


In [34]:
print('Macro averaged F1 score')
sklearn.metrics.f1_score(labels_flat, pred_flat, average='macro')

Macro averaged F1 score


0.9782846298635111

#### Saving the model to an output directory

In [35]:
output_dir = 'output_dir'

model_to_save = model.module if hasattr(model, 'module') else model  
model_to_save.save_pretrained(output_dir)
tokenizer.save_pretrained(output_dir)

('output_dir/tokenizer_config.json',
 'output_dir/special_tokens_map.json',
 'output_dir/vocab.txt',
 'output_dir/added_tokens.json')

In [36]:
!pip freeze > requirements.txt

In [37]:
!ls

data_full.json	output_dir  requirements.txt
