In [52]:
import pandas as pd

import tensorflow as tf

from sklearn.model_selection import train_test_split

from transformers import BertTokenizer, TFBertForSequenceClassification

from tensorflow.keras.optimizers import Adam

import nltk
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer

import re

In [53]:
lemmatizer = WordNetLemmatizer()

In [54]:
custom_stopwords = set(stopwords.words('english')).union({'doctor', 'feel', 'feeling', 'experience', 'experiencing', 'sensation', 'really', 'get', 'got', 'just'})

In [55]:
# Text Preprocessing Function
def preprocess_text(text):
    # Lowercase text
    text = text.lower()

    # Remove special characters and numbers (optional)
    text = re.sub(r'[^a-z\s]', '', text)

    # Tokenize and remove stopwords, then lemmatize
    words = text.split()
    filtered_words = [lemmatizer.lemmatize(word) for word in words if word not in custom_stopwords]

    # Join words back into a sentence
    return ' '.join(filtered_words)

Load the data file

In [56]:
data = pd.read_json('Intent.json')

In [57]:
data.head()

Unnamed: 0,intents
0,"{'intent': 'greetings', 'text': ['hello', 'hey..."
1,"{'intent': 'goodbye', 'text': ['cya', 'see you..."
2,"{'intent': 'age', 'text': ['how old', 'how old..."
3,"{'intent': 'name', 'text': ['what is your name..."
4,"{'intent': 'common cold symptoms', 'text': ['R..."


sentences: This list comprehension flattens the list of sentences from the JSON file. Each sublist['text'] contains multiple sentences, so we iterate through each sublist and extract all the sentences.

In [58]:
sentences = [item for sublist in data['intents'] for item in sublist['text']]
#sentences = [preprocess_text(item) for sublist in data['intents'] for item in sublist['text']]

In [59]:
sentences

['hello',
 'hey',
 'hi',
 'good day',
 'greeting',
 'whats',
 'going',
 'cya',
 'see later',
 'goodbye',
 'good day',
 'bye',
 'cao',
 'see ya',
 'old',
 'old',
 'age',
 'old',
 'age',
 'name',
 'call',
 'whats name',
 '',
 'tell name',
 'runny stuffy nose',
 'sore throat',
 'cough',
 'congestion',
 'slight body ache mild headache',
 'sneezing',
 'lowgrade fever',
 'generally unwell malaise',
 'sweating',
 'chill',
 'headache',
 'muscle ache',
 'loss appetite',
 'irritability',
 'dehydration',
 'general weakness',
 'sharp burning urinate',
 'feel like bladder fire every time go bathroom',
 'cant help flinch intense burning pee',
 'urine feel hot painful dread going bathroom',
 'there stinging uncomfortable last throughout urination',
 'every time try urinate feel like inside burned',
 'unbearable pain end urination wont go away',
 'burning constant get worse drink fluid',
 'pee there heat make want stop midstream',
 'feel like urinary tract infection constant burning',
 'pain urinating

labels: This list comprehension assigns each sentence a label based on the index (idx) of the corresponding intent. If a sentence belongs to the nth intent, it is assigned the label n.

This line is extracting the corresponding label for each sentence. Each intent corresponds to a specific category or class in the classification task. The label for a sentence is determined by its position in the intents list. For example, the first intent (at index 0) would have a label of 0 for all its sentences, the second intent would have a label of 1 for all its sentences, and so on.

In [60]:
labels = [idx for idx, sublist in enumerate(data['intents']) for _ in sublist['text']]

In [61]:
labels

[0,
 0,
 0,
 0,
 0,
 0,
 0,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 2,
 2,
 2,
 2,
 2,
 3,
 3,
 3,
 3,
 3,
 4,
 4,
 4,
 4,
 4,
 4,
 4,
 4,
 5,
 5,
 5,
 5,
 5,
 5,
 5,
 5,
 6,
 6,
 6,
 6,
 6,
 6,
 6,
 6,
 6,
 6,
 6,
 6,
 6,
 6,
 6,
 6,
 6,
 6,
 6,
 6,
 6,
 6,
 6,
 6,
 6,
 6,
 6,
 6,
 6,
 6,
 7,
 7,
 7,
 7,
 7,
 7,
 7,
 7,
 7,
 7,
 7,
 7,
 7,
 7,
 7,
 7,
 7,
 7,
 7,
 7,
 7,
 7,
 7,
 7,
 7,
 7,
 7,
 7,
 7,
 7,
 8,
 8,
 8,
 8,
 8,
 8,
 8,
 8,
 8,
 8,
 8,
 8,
 8,
 8,
 8,
 8,
 8,
 8,
 8,
 8,
 9,
 9,
 9,
 9,
 9,
 9,
 9,
 9,
 9,
 9,
 9,
 9,
 9,
 9,
 9,
 9,
 9,
 9,
 9,
 9,
 10,
 10,
 10,
 10,
 10,
 10,
 10,
 10,
 10,
 10,
 10,
 10,
 10,
 10,
 10,
 10,
 10,
 10,
 10,
 10,
 11,
 11,
 11,
 11,
 11,
 11,
 11,
 11,
 11,
 11,
 11,
 11,
 11,
 11,
 11,
 11,
 11,
 11,
 11,
 11,
 12,
 12,
 12,
 12,
 12,
 12,
 12,
 12,
 12,
 12,
 12,
 12,
 12,
 12,
 12,
 12,
 12,
 12,
 12,
 12,
 13,
 13,
 13,
 13,
 13,
 13,
 13,
 13,
 13,
 13,
 13,
 13,
 13,
 13,
 13,
 13,
 13,
 13,
 13,
 13,
 14,
 14,
 14,
 14,
 14,
 14,
 14,
 14,


Encode labels

num_labels: The total number of unique intents (or classes) is determined by the length of the data['intents'] list.

Label encoding is necessary for training machine learning models, especially neural networks. Most machine learning algorithms (including neural networks) expect the labels to be represented as numerical values, not as strings (like 'greeting', 'goodbye', etc.). This is because neural networks work with numeric data for efficient computation and optimization.

One-hot encoding means that each label is converted into a vector where all elements are 0 except the index corresponding to the class. For example, if you have 3 classes (0, 1, 2), the label for class 0 will be [1, 0, 0], for class 1 it will be [0, 1, 0], and for class 2 it will be [0, 0, 1].

Why use one-hot encoding? This is used because neural networks typically perform better when labels are represented in this form. One-hot encoding avoids treating the label values as continuous (ordinal), so it doesn’t imply any hierarchy or distance between the different classes. Each class is treated as independent and equally distant from one another, which is a common assumption in classification tasks.

num_classes: This refers to the total number of unique intents (classes) in the dataset. It ensures that the resulting encoded labels have the right length (i.e., as many output nodes as classes).

In [62]:
num_labels = len(data['intents'])

In [63]:
num_labels

36

to_categorical(labels, num_classes=num_labels): This converts the integer labels into one-hot encoded vectors. Each label is represented as a binary vector where the index of the intent is marked with 1, and all others are 0.

In [64]:
labels = tf.keras.utils.to_categorical(labels, num_classes = num_labels)

In [65]:
labels

array([[1., 0., 0., ..., 0., 0., 0.],
       [1., 0., 0., ..., 0., 0., 0.],
       [1., 0., 0., ..., 0., 0., 0.],
       ...,
       [0., 0., 0., ..., 0., 0., 1.],
       [0., 0., 0., ..., 0., 0., 1.],
       [0., 0., 0., ..., 0., 0., 1.]])

split the data

In [66]:
train_texts, val_texts, train_labels, val_labels = train_test_split(sentences, labels, test_size=0.2)

In [67]:
train_texts

['nose runny im worried might run tissue',
 'feel like lung struggling enough air',
 'sore throat making uncomfortable eat drink anything',
 'feel like lump throat painful try talk',
 'ive putting hot sauce everything still taste flavorless',
 'weak vomiting',
 'there constant queasy stomach',
 'ive coughing persistently day doesnt seem better',
 'breathing difficult feel like there something blocking airway',
 'dont want eat anything even snack',
 'pain deep inside muscle bone',
 'wheeze even im anything strenuous',
 'constant dry cough thats making throat sore',
 'disconnected cant enjoy smell used surround',
 'im struggling enough air lung',
 'every breath feel shallow like im getting enough oxygen',
 'ive vomiting night long',
 'feel like stomach tying knot',
 'like nose leaking difficult manage',
 'keep checking temperature high',
 'pain urinating feel like something sharp inside',
 'ive sneezing continuously past hour exhausting',
 'voice feel raspy barely speak whisper',
 'cant 

In [68]:
train_labels

array([[0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 1., 0.],
       [0., 0., 0., ..., 0., 0., 0.],
       ...,
       [0., 0., 0., ..., 1., 0., 0.],
       [0., 0., 0., ..., 0., 0., 1.],
       [0., 0., 0., ..., 0., 0., 0.]])

Tokenize

BertTokenizer.from_pretrained('bert-base-uncased'): Loads the pre-trained BERT tokenizer (bert-base-uncased). This tokenizer is designed to handle lowercase English text.


train_encodings and val_encodings: These encode the training and validation texts using the BERT tokenizer.

truncation=True: If the text exceeds the maximum length (128 tokens), it will be truncated.

padding=True: Shorter texts will be padded to the maximum length (128 tokens).

max_length=128: The maximum length of tokenized input.
return_tensors="tf": Returns the result as TensorFlow tensors.

In [69]:
tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')
train_encodings = tokenizer(train_texts, truncation = True, padding = True, max_length=128, return_tensors = "tf")
val_encodings = tokenizer(val_texts, truncation=True, padding=True, max_length=128, return_tensors="tf")

In [70]:
train_encodings

{'input_ids': <tf.Tensor: shape=(528, 16), dtype=int32, numpy=
array([[  101,  4451,  2448, ...,     0,     0,     0],
       [  101,  2514,  2066, ...,     0,     0,     0],
       [  101, 14699,  3759, ...,     0,     0,     0],
       ...,
       [  101, 16701,  2130, ...,     0,     0,     0],
       [  101,  5505,  2514, ...,     0,     0,     0],
       [  101,  4318, 11969, ...,     0,     0,     0]])>, 'token_type_ids': <tf.Tensor: shape=(528, 16), dtype=int32, numpy=
array([[0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0],
       ...,
       [0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0]])>, 'attention_mask': <tf.Tensor: shape=(528, 16), dtype=int32, numpy=
array([[1, 1, 1, ..., 0, 0, 0],
       [1, 1, 1, ..., 0, 0, 0],
       [1, 1, 1, ..., 0, 0, 0],
       ...,
       [1, 1, 1, ..., 0, 0, 0],
       [1, 1, 1, ..., 0, 0, 0],
       [1, 1, 1, ..., 0, 0, 0]])>}

load the model

TFBertForSequenceClassification.from_pretrained: Loads a pre-trained BERT model for sequence classification. The model is pre-configured for classification tasks and will output logits for each class. The num_labels parameter is used to specify the number of unique classes.

In [71]:
model = TFBertForSequenceClassification.from_pretrained('bert-base-uncased', num_labels=num_labels)

All PyTorch model weights were used when initializing TFBertForSequenceClassification.

Some weights or buffers of the TF 2.0 model TFBertForSequenceClassification were not initialized from the PyTorch model and are newly initialized: ['classifier.weight', 'classifier.bias']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Optimizer and loss function

Adam(learning_rate=5e-5): Sets up the Adam optimizer with a learning rate of 5e-5, which is a common choice for fine-tuning BERT models.

CategoricalCrossentropy(from_logits=True): The loss function used for multi-class classification. from_logits=True indicates that the model outputs logits (unnormalized predictions), not probabilities, so the loss function will apply the softmax internally.

In [72]:
optimizer = Adam(learning_rate=5e-5)
loss_fn = tf.keras.losses.CategoricalCrossentropy(from_logits=True)

prepare the db

from_tensor_slices: Converts the training and validation data into TensorFlow datasets.

For each batch, the input consists of two parts:

input_ids: The tokenized IDs of the input sentences.

attention_mask: A mask that tells the model which tokens are actual words and which are padding.
The labels are provided as targets.

.shuffle(len(train_texts)): Shuffles the training data to ensure the model sees different batches in each epoch.

.batch(16): Batches the data into groups of 16 samples per batch.

In [73]:
train_dataset = tf.data.Dataset.from_tensor_slices((
    {'input_ids' : train_encodings['input_ids'], 'attention_mask' : train_encodings['attention_mask']},
    train_labels
)).shuffle(len(train_texts)).batch(16)


val_dataset = tf.data.Dataset.from_tensor_slices((
    {'input_ids' : val_encodings['input_ids'], 'attention_mask' : val_encodings['attention_mask']},
    val_labels
)).batch(16)

In [74]:
train_dataset

<_BatchDataset element_spec=({'input_ids': TensorSpec(shape=(None, 16), dtype=tf.int32, name=None), 'attention_mask': TensorSpec(shape=(None, 16), dtype=tf.int32, name=None)}, TensorSpec(shape=(None, 36), dtype=tf.float64, name=None))>

In [75]:
# Training loop
epochs = 5
for epoch in range(epochs):
    print(f"Epoch {epoch + 1}/{epochs}")
    # Training
    for batch in train_dataset:
        with tf.GradientTape() as tape:
            outputs = model(batch[0], training=True)
            loss = loss_fn(batch[1], outputs.logits)
        gradients = tape.gradient(loss, model.trainable_variables)
        optimizer.apply_gradients(zip(gradients, model.trainable_variables))
    print(f"Training loss: {loss.numpy()}")

    # Validation
    val_loss = 0
    for batch in val_dataset:
        outputs = model(batch[0], training=False)
        val_loss += loss_fn(batch[1], outputs.logits).numpy()
    val_loss /= len(val_dataset)
    print(f"Validation loss: {val_loss}")

Epoch 1/5
Training loss: 3.5850777626037598
Validation loss: 3.551460372077094
Epoch 2/5
Training loss: 3.1158227920532227
Validation loss: 3.2183582252926297
Epoch 3/5
Training loss: 2.2687430381774902
Validation loss: 2.348859945933024
Epoch 4/5
Training loss: 1.7854042053222656
Validation loss: 1.6483122110366821
Epoch 5/5
Training loss: 0.9484887719154358
Validation loss: 1.2405961354573567


Validation loss: 0.9055526322788663, Accuracy: 0.7803030303030303


In [76]:
# Validation
val_loss = 0
correct_predictions = 0
total_predictions = 0

for batch in val_dataset:
    outputs = model(batch[0], training=False)
    val_loss += loss_fn(batch[1], outputs.logits).numpy()

    # Compute predictions and accuracy
    predictions = tf.argmax(outputs.logits, axis=1)
    labels = tf.argmax(batch[1], axis=1)
    correct_predictions += tf.reduce_sum(tf.cast(predictions == labels, tf.float32)).numpy()
    total_predictions += labels.shape[0]

val_loss /= len(val_dataset)
accuracy = correct_predictions / total_predictions
print(f"Validation loss: {val_loss}, Accuracy: {accuracy}")


Validation loss: 1.2405961354573567, Accuracy: 0.7272727272727273


In [77]:
# Function to test the model with a single input
def test_model(sentence):
    # Tokenize the input
    encoding = tokenizer(sentence, truncation=True, padding=True, max_length=128, return_tensors="tf")
    outputs = model(encoding)
    logits = outputs.logits

    # Get predicted class
    predicted_class = tf.argmax(logits, axis=1).numpy()[0]
    return predicted_class

# Test an example sentence
example_sentence = "The lack of taste is making it hard to enjoy anything, even drinks."
predicted_class = test_model(example_sentence)

# Map the predicted class to the intent
intent = data['intents'][predicted_class]['intent']
print(f"Predicted intent: {intent}")


Predicted intent: Loss of Taste
