In [23]:
import pandas as pd

import tensorflow as tf

from sklearn.model_selection import train_test_split

from transformers import BertTokenizer, TFBertForSequenceClassification

from tensorflow.keras.optimizers import Adam

Load the data file

In [24]:
data = pd.read_json('DB/intense.json')

In [25]:
data.head()

Unnamed: 0,intents
0,"{'intent': 'Continuous Sneezing', 'text': ['I ..."
1,"{'intent': 'headache', 'text': ['I have a head..."
2,"{'intent': 'Shivering', 'text': ['I’m shaking ..."
3,"{'intent': 'Chills', 'text': ['I suddenly felt..."
4,"{'intent': 'Watering Eyes', 'text': ['My eyes ..."


sentences: This list comprehension flattens the list of sentences from the JSON file. Each sublist['text'] contains multiple sentences, so we iterate through each sublist and extract all the sentences.

In [26]:
sentences = [item for sublist in data['intents'] for item in sublist['text']]

In [27]:
sentences

['I can’t stop sneezing, it’s happening almost every minute.',
 'It feels like I’m sneezing non-stop, no matter what I do.',
 'Every time I breathe in, I end up sneezing, it’s been going on for hours.',
 'My nose won’t stop itching, and I keep sneezing over and over again.',
 'It’s so frustrating because the sneezing won’t stop, and I feel like I’m getting no relief.',
 'I’ve been sneezing continuously for the past few hours, and it’s exhausting.',
 'Every time I try to speak, I end up sneezing instead, it’s embarrassing.',
 'I feel like my sneezing fits are triggered by the slightest bit of dust.',
 'I sneeze so much that my nose is sore and red from all the irritation.',
 "I’m sneezing uncontrollably and it's affecting my daily routine.",
 'I feel like I’m sneezing every minute, and it’s really starting to get frustrating.',
 'My nose is itching, and every time I scratch it, I end up sneezing again.',
 'I keep sneezing uncontrollably, and it feels like it’s never going to stop.',
 'I

labels: This list comprehension assigns each sentence a label based on the index (idx) of the corresponding intent. If a sentence belongs to the nth intent, it is assigned the label n.

In [28]:
labels = [idx for idx, sublist in enumerate(data['intents']) for _ in sublist['text']]

In [29]:
labels

[0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 2,
 2,
 2,
 2,
 2,
 2,
 2,
 2,
 2,
 2,
 2,
 2,
 2,
 2,
 2,
 2,
 2,
 2,
 2,
 2,
 3,
 3,
 3,
 3,
 3,
 3,
 3,
 3,
 3,
 3,
 3,
 3,
 3,
 3,
 3,
 3,
 3,
 3,
 3,
 3,
 4,
 4,
 4,
 4,
 4,
 4,
 4,
 4,
 4,
 4,
 4,
 4,
 4,
 4,
 4,
 4,
 4,
 4,
 4,
 4,
 5,
 5,
 5,
 5,
 5,
 5,
 5,
 5,
 5,
 5,
 5,
 5,
 5,
 5,
 5,
 5,
 5,
 5,
 5,
 5,
 6,
 6,
 6,
 6,
 6,
 6,
 6,
 6,
 6,
 6,
 6,
 6,
 6,
 6,
 6,
 6,
 6,
 6,
 6,
 6,
 7,
 7,
 7,
 7,
 7,
 7,
 7,
 7,
 7,
 7,
 7,
 7,
 7,
 7,
 7,
 7,
 7,
 7,
 7,
 7,
 8,
 8,
 8,
 8,
 8,
 8,
 8,
 8,
 8,
 8,
 8,
 8,
 8,
 8,
 8,
 8,
 8,
 8,
 8,
 8,
 9,
 9,
 9,
 9,
 9,
 9,
 9,
 9,
 9,
 9,
 9,
 9,
 9,
 9,
 9,
 9,
 9,
 9,
 9,
 9,
 10,
 10,
 10,
 10,
 10,
 10,
 10,
 10,
 10,
 10,
 10,
 10,
 10,
 10,
 10,
 10,
 10,
 10,
 10,
 10,
 11,
 11,
 11,
 11,
 11,
 11,
 11,
 11,
 11,
 11,
 11,
 11,
 11,
 11,
 11,
 11,
 11,
 11,
 11,
 11,


Encode labels

num_labels: The total number of unique intents (or classes) is determined by the length of the data['intents'] list.

In [30]:
num_labels = len(data['intents'])

In [31]:
num_labels

94

to_categorical(labels, num_classes=num_labels): This converts the integer labels into one-hot encoded vectors. Each label is represented as a binary vector where the index of the intent is marked with 1, and all others are 0.

In [32]:
labels = tf.keras.utils.to_categorical(labels, num_classes = num_labels)

In [33]:
labels

array([[1., 0., 0., ..., 0., 0., 0.],
       [1., 0., 0., ..., 0., 0., 0.],
       [1., 0., 0., ..., 0., 0., 0.],
       ...,
       [0., 0., 0., ..., 0., 0., 1.],
       [0., 0., 0., ..., 0., 0., 1.],
       [0., 0., 0., ..., 0., 0., 1.]])

split the data

In [34]:
train_texts, val_texts, train_labels, val_labels = train_test_split(sentences, labels, test_size=0.2)

In [35]:
train_texts

["I'm experiencing headaches and cold hands and feet.",
 'I have a persistent cough that produces phlegm.',
 "I'm experiencing excessive skin peeling.",
 "I can't turn my head easily.",
 "I'm experiencing redness and warmth around my joint.",
 "I'm experiencing fatigue and yellowing of my eyes.",
 'If you could invent something, what would it be?',
 "I'm concerned about my cognitive decline.",
 'Shortness of breath is making it hard to move around.',
 "What is something you're proud of?",
 'I feel sore and stiff, like I’ve been working out too hard, but I haven’t.',
 'My teeth are chattering and I feel like I can’t warm up, even with a hot drink.',
 "I'm experiencing cold intolerance.",
 "I'm experiencing confusion and yellowing of my eyes.",
 'I have excess body fat.',
 'The ringing in my ears is making it difficult to concentrate.',
 'My breathing feels labored.',
 'I feel breathless even though I haven’t been doing anything physically demanding.',
 'I can’t get comfortable because o

In [36]:
train_labels

array([[0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.],
       ...,
       [0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.]])

Tokenize

BertTokenizer.from_pretrained('bert-base-uncased'): Loads the pre-trained BERT tokenizer (bert-base-uncased). This tokenizer is designed to handle lowercase English text.


train_encodings and val_encodings: These encode the training and validation texts using the BERT tokenizer.

truncation=True: If the text exceeds the maximum length (128 tokens), it will be truncated.

padding=True: Shorter texts will be padded to the maximum length (128 tokens).

max_length=128: The maximum length of tokenized input.
return_tensors="tf": Returns the result as TensorFlow tensors.

In [37]:
tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')
train_encodings = tokenizer(train_texts, truncation = True, padding = True, max_length=128, return_tensors = "tf")
val_encodings = tokenizer(val_texts, truncation=True, padding=True, max_length=128, return_tensors="tf")

In [38]:
train_encodings

{'input_ids': <tf.Tensor: shape=(1575, 29), dtype=int32, numpy=
array([[ 101, 1045, 1005, ...,    0,    0,    0],
       [ 101, 1045, 2031, ...,    0,    0,    0],
       [ 101, 1045, 1005, ...,    0,    0,    0],
       ...,
       [ 101, 1996, 2448, ...,    0,    0,    0],
       [ 101, 2130, 2295, ...,    0,    0,    0],
       [ 101, 1996, 2282, ...,    0,    0,    0]])>, 'token_type_ids': <tf.Tensor: shape=(1575, 29), dtype=int32, numpy=
array([[0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0],
       ...,
       [0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0]])>, 'attention_mask': <tf.Tensor: shape=(1575, 29), dtype=int32, numpy=
array([[1, 1, 1, ..., 0, 0, 0],
       [1, 1, 1, ..., 0, 0, 0],
       [1, 1, 1, ..., 0, 0, 0],
       ...,
       [1, 1, 1, ..., 0, 0, 0],
       [1, 1, 1, ..., 0, 0, 0],
       [1, 1, 1, ..., 0, 0, 0]])>}

load the model

TFBertForSequenceClassification.from_pretrained: Loads a pre-trained BERT model for sequence classification. The model is pre-configured for classification tasks and will output logits for each class. The num_labels parameter is used to specify the number of unique classes.

In [39]:
model = TFBertForSequenceClassification.from_pretrained('bert-base-uncased', num_labels=num_labels)

All PyTorch model weights were used when initializing TFBertForSequenceClassification.

Some weights or buffers of the TF 2.0 model TFBertForSequenceClassification were not initialized from the PyTorch model and are newly initialized: ['classifier.weight', 'classifier.bias']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Optimizer and loss function

Adam(learning_rate=5e-5): Sets up the Adam optimizer with a learning rate of 5e-5, which is a common choice for fine-tuning BERT models.

CategoricalCrossentropy(from_logits=True): The loss function used for multi-class classification. from_logits=True indicates that the model outputs logits (unnormalized predictions), not probabilities, so the loss function will apply the softmax internally.

In [40]:
optimizer = Adam(learning_rate=5e-5)
loss_fn = tf.keras.losses.CategoricalCrossentropy(from_logits=True)

prepare the db

from_tensor_slices: Converts the training and validation data into TensorFlow datasets.

For each batch, the input consists of two parts:

input_ids: The tokenized IDs of the input sentences.

attention_mask: A mask that tells the model which tokens are actual words and which are padding.
The labels are provided as targets.

.shuffle(len(train_texts)): Shuffles the training data to ensure the model sees different batches in each epoch.

.batch(16): Batches the data into groups of 16 samples per batch.

In [41]:
train_dataset = tf.data.Dataset.from_tensor_slices((
    {'input_ids' : train_encodings['input_ids'], 'attention_mask' : train_encodings['attention_mask']},
    train_labels
)).shuffle(len(train_texts)).batch(16)


val_dataset = tf.data.Dataset.from_tensor_slices((
    {'input_ids' : val_encodings['input_ids'], 'attention_mask' : val_encodings['attention_mask']},
    val_labels
)).batch(16)

In [42]:
# Training loop
epochs = 5
for epoch in range(epochs):
    print(f"Epoch {epoch + 1}/{epochs}")
    # Training
    for batch in train_dataset:
        with tf.GradientTape() as tape:
            outputs = model(batch[0], training=True)
            loss = loss_fn(batch[1], outputs.logits)
        gradients = tape.gradient(loss, model.trainable_variables)
        optimizer.apply_gradients(zip(gradients, model.trainable_variables))
    print(f"Training loss: {loss.numpy()}")

    # Validation
    val_loss = 0
    for batch in val_dataset:
        outputs = model(batch[0], training=False)
        val_loss += loss_fn(batch[1], outputs.logits).numpy()
    val_loss /= len(val_dataset)
    print(f"Validation loss: {val_loss}")

Epoch 1/5
Training loss: 3.9808788299560547
Validation loss: 3.8134920978546143
Epoch 2/5
Training loss: 2.291400671005249
Validation loss: 2.7953037452697753
Epoch 3/5
Training loss: 1.8984429836273193
Validation loss: 2.0059783601760866
Epoch 4/5
Training loss: 0.7830457091331482
Validation loss: 1.5532015657424927
Epoch 5/5
Training loss: 0.9211347699165344
Validation loss: 1.2507192301750183


In [44]:
# Validation
val_loss = 0
correct_predictions = 0
total_predictions = 0

for batch in val_dataset:
    outputs = model(batch[0], training=False)
    val_loss += loss_fn(batch[1], outputs.logits).numpy()

    # Compute predictions and accuracy
    predictions = tf.argmax(outputs.logits, axis=1)
    labels = tf.argmax(batch[1], axis=1)
    correct_predictions += tf.reduce_sum(tf.cast(predictions == labels, tf.float32)).numpy()
    total_predictions += labels.shape[0]

val_loss /= len(val_dataset)
accuracy = correct_predictions / total_predictions
print(f"Validation loss: {val_loss}, Accuracy: {accuracy}")


Validation loss: 1.2507192301750183, Accuracy: 0.7030456852791879


In [45]:
# Function to test the model with a single input
def test_model(sentence):
    # Tokenize the input
    encoding = tokenizer(sentence, truncation=True, padding=True, max_length=128, return_tensors="tf")
    outputs = model(encoding)
    logits = outputs.logits

    # Get predicted class
    predicted_class = tf.argmax(logits, axis=1).numpy()[0]
    return predicted_class

# Test an example sentence
example_sentence = "Why this is not working"
predicted_class = test_model(example_sentence)

# Map the predicted class to the intent
intent = data['intents'][predicted_class]['intent']
print(f"Predicted intent: {intent}")


Predicted intent: irrelevent


In [46]:
import pickle

In [47]:
# Map the predicted class to the intent
intent = data['intents'][predicted_class]['intent']
print(f"Predicted intent: {intent}")

# Save the model and tokenizer
model.save_pretrained('./saved_model2')
tokenizer.save_pretrained('./saved_model2')

# Pickle the label mappings
label_mapping = {i: intent['intent'] for i, intent in enumerate(data['intents'])}
with open('label_mapping.pkl', 'wb') as f:
    pickle.dump(label_mapping, f)

print("Model and label mapping saved successfully!")

Predicted intent: irrelevent
Model and label mapping saved successfully!
