In [90]:
import pandas as pd

import tensorflow as tf

from sklearn.model_selection import train_test_split

from transformers import BertTokenizer, TFBertForSequenceClassification

from tensorflow.keras.optimizers import Adam

Load the data file

In [91]:
data = pd.read_json('Intent.json')

In [92]:
data.head()

Unnamed: 0,intents
0,"{'intent': 'greetings', 'text': ['hello', 'hey..."
1,"{'intent': 'goodbye', 'text': ['cya', 'see you..."
2,"{'intent': 'age', 'text': ['how old', 'how old..."
3,"{'intent': 'name', 'text': ['what is your name..."
4,"{'intent': 'common cold symptoms', 'text': ['R..."


sentences: This list comprehension flattens the list of sentences from the JSON file. Each sublist['text'] contains multiple sentences, so we iterate through each sublist and extract all the sentences.

In [93]:
sentences = [item for sublist in data['intents'] for item in sublist['text']]

In [94]:
sentences

['hello',
 'hey',
 'hi',
 'good day',
 'greetings',
 "what's up?",
 'how is it going',
 'cya',
 'see you later',
 'goodbye',
 'have a good day',
 'bye',
 'cao',
 'see ya',
 'how old',
 'how old are you?',
 'what is your age',
 'how old are you',
 'age?',
 'what is your name',
 'what should i call you',
 "what's your name?",
 'who are you?',
 'can you tell me your name',
 'Runny or stuffy nose',
 'Sore throat',
 'Cough',
 'Congestion',
 'Slight body aches or a mild headache',
 'Sneezing',
 'Low-grade fever',
 'Generally feeling unwell (malaise)',
 'Sweating',
 'Chills and ',
 'Headache',
 'Muscle aches',
 'Loss of appetite',
 'Irritability',
 'Dehydration',
 'General weakness',
 'I feel a sharp, burning sensation when I urinate.',
 'It feels like my bladder is on fire every time I go to the bathroom',
 "I can't help but flinch because of the intense burning when I pee",
 'My urine feels so hot and painful that I dread going to the bathroom',
 "There's a stinging, uncomfortable feeling t

labels: This list comprehension assigns each sentence a label based on the index (idx) of the corresponding intent. If a sentence belongs to the nth intent, it is assigned the label n.

In [95]:
labels = [idx for idx, sublist in enumerate(data['intents']) for _ in sublist['text']]

In [96]:
labels

[0,
 0,
 0,
 0,
 0,
 0,
 0,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 2,
 2,
 2,
 2,
 2,
 3,
 3,
 3,
 3,
 3,
 4,
 4,
 4,
 4,
 4,
 4,
 4,
 4,
 5,
 5,
 5,
 5,
 5,
 5,
 5,
 5,
 6,
 6,
 6,
 6,
 6,
 6,
 6,
 6,
 6,
 6,
 6,
 6,
 6,
 6,
 6,
 6,
 6,
 6,
 6,
 6,
 6,
 6,
 6,
 6,
 6,
 6,
 6,
 6,
 6,
 6,
 7,
 7,
 7,
 7,
 7,
 7,
 7,
 7,
 7,
 7,
 7,
 7,
 7,
 7,
 7,
 7,
 7,
 7,
 7,
 7,
 7,
 7,
 7,
 7,
 7,
 7,
 7,
 7,
 7,
 7,
 8,
 8,
 8,
 8,
 8,
 8,
 8,
 8,
 8,
 8,
 8,
 8,
 8,
 8,
 8,
 8,
 8,
 8,
 8,
 8,
 9,
 9,
 9,
 9,
 9,
 9,
 9,
 9,
 9,
 9,
 9,
 9,
 9,
 9,
 9,
 9,
 9,
 9,
 9,
 9,
 10,
 10,
 10,
 10,
 10,
 10,
 10,
 10,
 10,
 10,
 10,
 10,
 10,
 10,
 10,
 10,
 10,
 10,
 10,
 10,
 11,
 11,
 11,
 11,
 11,
 11,
 11,
 11,
 11,
 11,
 11,
 11,
 11,
 11,
 11,
 11,
 11,
 11,
 11,
 11,
 12,
 12,
 12,
 12,
 12,
 12,
 12,
 12,
 12,
 12,
 12,
 12,
 12,
 12,
 12,
 12,
 12,
 12,
 12,
 12,
 13,
 13,
 13,
 13,
 13,
 13,
 13,
 13,
 13,
 13,
 13,
 13,
 13,
 13,
 13,
 13,
 13,
 13,
 13,
 13,
 14,
 14,
 14,
 14,
 14,
 14,
 14,
 14,


Encode labels

num_labels: The total number of unique intents (or classes) is determined by the length of the data['intents'] list.

In [97]:
num_labels = len(data['intents'])

In [98]:
num_labels

36

to_categorical(labels, num_classes=num_labels): This converts the integer labels into one-hot encoded vectors. Each label is represented as a binary vector where the index of the intent is marked with 1, and all others are 0.

In [99]:
labels = tf.keras.utils.to_categorical(labels, num_classes = num_labels)

In [100]:
labels

array([[1., 0., 0., ..., 0., 0., 0.],
       [1., 0., 0., ..., 0., 0., 0.],
       [1., 0., 0., ..., 0., 0., 0.],
       ...,
       [0., 0., 0., ..., 0., 0., 1.],
       [0., 0., 0., ..., 0., 0., 1.],
       [0., 0., 0., ..., 0., 0., 1.]])

split the data

In [101]:
train_texts, val_texts, train_labels, val_labels = train_test_split(sentences, labels, test_size=0.2)

In [110]:
train_texts

['I don’t want to eat anything, not even a snack.',
 'These body aches are making me feel miserable.',
 'It’s like a constant cycle of nausea and vomiting.',
 'I’m physically tired and mentally worn out.',
 'I can feel my fever rising again.',
 'I’m keeping track of my fever because it doesn’t seem to be normal.',
 'I feel like I’m sneezing every minute, and it’s really starting to get frustrating.',
 'My head is pounding, and I can feel that my temperature is too high.',
 'I’ve had a fever for the past few hours, and my temperature keeps rising.',
 'I feel so tired, even after getting a full night’s sleep.',
 "I keep  even when I’m in a warm environment, and it feels like I can't control it.",
 'No matter how often I wipe it, my nose keeps running and dripping.',
 'I’m burning up, and I can’t seem to cool down no matter what I do.',
 'No matter how much I try to breathe deeply, my nose remains blocked.',
 'My nose feels constantly irritated, and it triggers sneezing every few seconds.

In [111]:
train_labels

array([[0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.],
       ...,
       [0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 1., 0., 0.]])

Tokenize

BertTokenizer.from_pretrained('bert-base-uncased'): Loads the pre-trained BERT tokenizer (bert-base-uncased). This tokenizer is designed to handle lowercase English text.


train_encodings and val_encodings: These encode the training and validation texts using the BERT tokenizer.

truncation=True: If the text exceeds the maximum length (128 tokens), it will be truncated.

padding=True: Shorter texts will be padded to the maximum length (128 tokens).

max_length=128: The maximum length of tokenized input.
return_tensors="tf": Returns the result as TensorFlow tensors.

In [102]:
tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')
train_encodings = tokenizer(train_texts, truncation = True, padding = True, max_length=128, return_tensors = "tf")
val_encodings = tokenizer(val_texts, truncation=True, padding=True, max_length=128, return_tensors="tf")

In [112]:
train_encodings

{'input_ids': <tf.Tensor: shape=(528, 28), dtype=int32, numpy=
array([[ 101, 1045, 2123, ...,    0,    0,    0],
       [ 101, 2122, 2303, ...,    0,    0,    0],
       [ 101, 2009, 1521, ...,    0,    0,    0],
       ...,
       [ 101, 1996, 3147, ...,    0,    0,    0],
       [ 101, 1045, 1521, ...,    0,    0,    0],
       [ 101, 2009, 1521, ...,    0,    0,    0]])>, 'token_type_ids': <tf.Tensor: shape=(528, 28), dtype=int32, numpy=
array([[0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0],
       ...,
       [0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0]])>, 'attention_mask': <tf.Tensor: shape=(528, 28), dtype=int32, numpy=
array([[1, 1, 1, ..., 0, 0, 0],
       [1, 1, 1, ..., 0, 0, 0],
       [1, 1, 1, ..., 0, 0, 0],
       ...,
       [1, 1, 1, ..., 0, 0, 0],
       [1, 1, 1, ..., 0, 0, 0],
       [1, 1, 1, ..., 0, 0, 0]])>}

load the model

TFBertForSequenceClassification.from_pretrained: Loads a pre-trained BERT model for sequence classification. The model is pre-configured for classification tasks and will output logits for each class. The num_labels parameter is used to specify the number of unique classes.

In [103]:
model = TFBertForSequenceClassification.from_pretrained('bert-base-uncased', num_labels=num_labels)

All PyTorch model weights were used when initializing TFBertForSequenceClassification.

Some weights or buffers of the TF 2.0 model TFBertForSequenceClassification were not initialized from the PyTorch model and are newly initialized: ['classifier.weight', 'classifier.bias']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Optimizer and loss function

Adam(learning_rate=5e-5): Sets up the Adam optimizer with a learning rate of 5e-5, which is a common choice for fine-tuning BERT models.

CategoricalCrossentropy(from_logits=True): The loss function used for multi-class classification. from_logits=True indicates that the model outputs logits (unnormalized predictions), not probabilities, so the loss function will apply the softmax internally.

In [104]:
optimizer = Adam(learning_rate=5e-5)
loss_fn = tf.keras.losses.CategoricalCrossentropy(from_logits=True)

prepare the db

from_tensor_slices: Converts the training and validation data into TensorFlow datasets.

For each batch, the input consists of two parts:

input_ids: The tokenized IDs of the input sentences.

attention_mask: A mask that tells the model which tokens are actual words and which are padding.
The labels are provided as targets.

.shuffle(len(train_texts)): Shuffles the training data to ensure the model sees different batches in each epoch.

.batch(16): Batches the data into groups of 16 samples per batch.

In [105]:
train_dataset = tf.data.Dataset.from_tensor_slices((
    {'input_ids' : train_encodings['input_ids'], 'attention_mask' : train_encodings['attention_mask']},
    train_labels
)).shuffle(len(train_texts)).batch(16)


val_dataset = tf.data.Dataset.from_tensor_slices((
    {'input_ids' : val_encodings['input_ids'], 'attention_mask' : val_encodings['attention_mask']},
    val_labels
)).batch(16)

In [106]:
# Training loop
epochs = 5
for epoch in range(epochs):
    print(f"Epoch {epoch + 1}/{epochs}")
    # Training
    for batch in train_dataset:
        with tf.GradientTape() as tape:
            outputs = model(batch[0], training=True)
            loss = loss_fn(batch[1], outputs.logits)
        gradients = tape.gradient(loss, model.trainable_variables)
        optimizer.apply_gradients(zip(gradients, model.trainable_variables))
    print(f"Training loss: {loss.numpy()}")

    # Validation
    val_loss = 0
    for batch in val_dataset:
        outputs = model(batch[0], training=False)
        val_loss += loss_fn(batch[1], outputs.logits).numpy()
    val_loss /= len(val_dataset)
    print(f"Validation loss: {val_loss}")

Epoch 1/5
Training loss: 3.260148048400879
Validation loss: 3.0880062845018177
Epoch 2/5
Training loss: 2.2589402198791504
Validation loss: 2.2174277305603027
Epoch 3/5


KeyboardInterrupt: 