In [32]:
import pandas as pd
import tensorflow as tf
from transformers import DistilBertTokenizer, TFDistilBertForSequenceClassification, create_optimizer
from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import train_test_split
import json

In [4]:
file = "intent_examples_1000_multilingual.xlsx"
df = pd.read_excel(file)

In [7]:
df['Example Query (Spanish)'] = df['Example Query (Spanish)'].str.lstrip('¿')
df.to_excel("cleaned_intent_examples_1000_multilingual.xlsx", index = False)

In [8]:
dataset = "cleaned_intent_examples_1000_multilingual.xlsx"
ds = pd.read_excel(dataset)

In [10]:
# Extracting columns
df1 = ds[['Example Query (English)', 'Example Query (Arabic)', 'Example Query (Spanish)', 'Intent Category']]
df1.columns = ['English', 'Arabic', 'Spanish', 'Category']

In [14]:
# Changing data format from wide to long
ndf = df1.melt(id_vars = ['Category'], var_name = 'Language', value_name = 'Sample')
ndf = ndf[['Sample', 'Category']]
ndf.to_excel("Final Samples.xlsx")

In [13]:
print(ndf.head())

                             Sample        Category
0             Play some rock music.       Media (M)
1                   Pause the song.       Media (M)
2           Skip to the next track.       Media (M)
3              Increase the volume.       Media (M)
4  Take me to the nearest pharmacy.  Navigation (N)


In [18]:
# Encoding Categories
encoder = LabelEncoder()
ndf['Category'] = encoder.fit_transform(ndf['Category'])

In [19]:
num_labels = len(encoder.classes_)
num_labels

9

In [20]:
ndf.head()

Unnamed: 0,Sample,Category
0,Play some rock music.,4
1,Pause the song.,4
2,Skip to the next track.,4
3,Increase the volume.,4
4,Take me to the nearest pharmacy.,5


In [21]:
tokenizer = DistilBertTokenizer.from_pretrained("distilbert-base-uncased")

tokenizer_config.json:   0%|          | 0.00/48.0 [00:00<?, ?B/s]

To support symlinks on Windows, you either need to activate Developer Mode or to run Python as an administrator. In order to activate developer mode, see this article: https://docs.microsoft.com/en-us/windows/apps/get-started/enable-your-device-for-development


vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/466k [00:00<?, ?B/s]

config.json:   0%|          | 0.00/483 [00:00<?, ?B/s]

In [33]:
def tokenize_function(samples, categories):

    encodings = tokenizer(samples.tolist(), padding = True, truncation = True, return_tensors = 'tf')
    return encodings, tf.keras.utils.to_categorical(categories, num_classes = num_labels)

In [43]:
train_samples, test_samples, train_categories, test_categories = train_test_split(ndf['Sample'], ndf['Category'], test_size = 0.2, random_state = 41, shuffle = True)
train_encodings, train_categories = tokenize_function(train_samples, train_categories)
test_encodings, test_categories = tokenize_function(test_samples, test_categories)
print(test_categories)

[[0. 0. 1. ... 0. 0. 0.]
 [0. 0. 0. ... 0. 0. 0.]
 [1. 0. 0. ... 0. 0. 0.]
 ...
 [0. 0. 0. ... 0. 0. 0.]
 [0. 0. 0. ... 0. 0. 0.]
 [0. 0. 0. ... 0. 1. 0.]]


In [35]:
train_dataset = tf.data.Dataset.from_tensor_slices(({"input_ids": train_encodings["input_ids"], "attention_mask": train_encodings["attention_mask"]}, train_categories)).batch(16)
test_dataset = tf.data.Dataset.from_tensor_slices(({"input_ids": test_encodings["input_ids"], "attention_mask": test_encodings["attention_mask"]}, test_categories)).batch(16)

In [36]:
model = TFDistilBertForSequenceClassification.from_pretrained("distilbert-base-uncased", num_labels = num_labels)

Some weights of the PyTorch model were not used when initializing the TF 2.0 model TFDistilBertForSequenceClassification: ['vocab_transform.bias', 'vocab_transform.weight', 'vocab_projector.bias', 'vocab_layer_norm.weight', 'vocab_layer_norm.bias']
- This IS expected if you are initializing TFDistilBertForSequenceClassification from a PyTorch model trained on another task or with another architecture (e.g. initializing a TFBertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing TFDistilBertForSequenceClassification from a PyTorch model that you expect to be exactly identical (e.g. initializing a TFBertForSequenceClassification model from a BertForSequenceClassification model).
Some weights or buffers of the TF 2.0 model TFDistilBertForSequenceClassification were not initialized from the PyTorch model and are newly initialized: ['pre_classifier.weight', 'pre_classifier.bias', 'classifier.weight', 'classifier.bias']
You should 

In [37]:
training_steps = len(train_dataset) * 10
optimizer, schedule = create_optimizer(init_lr = 3e-5, num_warmup_steps = 0, num_train_steps = training_steps)
loss = tf.keras.losses.CategoricalCrossentropy(from_logits = True)
metrics = [tf.keras.metrics.CategoricalAccuracy(name = 'accuracy'), 
           tf.keras.metrics.Precision(name = 'precision'),
           tf.keras.metrics.Recall(name = 'recall')]
model.compile(optimizer = optimizer, loss = loss, metrics = metrics)

In [38]:
model.fit(train_dataset, validation_data = test_dataset, epochs = 10)

Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10


<tf_keras.src.callbacks.History at 0x189c0702240>

In [39]:
test_loss, test_accuracy, test_precision, test_recall = model.evaluate(test_dataset)

print(f"Test Loss: {test_loss}")
print(f"Test Accuracy: {test_accuracy}")
print(f"Test Precision: {test_precision}")
print(f"Test Recall: {test_recall}")

Test Loss: 0.002900490304455161
Test Accuracy: 1.0
Test Precision: 1.0
Test Recall: 1.0


In [44]:
true_labels = tf.argmax(test_categories, axis = -1).numpy()
true_labels = encoder.inverse_transform(true_labels)    

prediction = model.predict(test_dataset)

predicted_labels = tf.argmax(prediction.logits, axis = -1).numpy()
predicted_labels = encoder.inverse_transform(predicted_labels)

for i in range(10):
    print(f"Predicted: {predicted_labels[i]}, True: {true_labels[i]}")

Predicted: 2, True: 2
Predicted: 4, True: 4
Predicted: 0, True: 0
Predicted: 2, True: 2
Predicted: 3, True: 3
Predicted: 1, True: 1
Predicted: 7, True: 7
Predicted: 6, True: 6
Predicted: 4, True: 4
Predicted: 0, True: 0
