In [None]:
# Setup cell: fetch the Kaggle data source this notebook depends on.
# Run it once to import the data; it can be deleted afterwards.
# This environment is not identical to Kaggle's Python environment, so some
# libraries the notebook relies on may be missing here.
import kagglehub

scodepy_customer_support_intent_dataset_path = kagglehub.dataset_download(
    'scodepy/customer-support-intent-dataset'
)

print('Data source import complete.')


Data source import complete.


In [None]:
# Install gensim into the kernel's environment; `gensim.downloader` is imported
# further down to load the pre-trained GloVe vectors.
# NOTE(review): no version is pinned here, so the installed release may differ between runs.
%pip install gensim



In [None]:
import kagglehub

# Download the dataset and keep the returned location in `path`.
path = kagglehub.dataset_download(
    "scodepy/customer-support-intent-dataset"
)

print("Path to dataset files:", path)

Path to dataset files: /kaggle/input/customer-support-intent-dataset


In [None]:
import os

import gensim.downloader as api
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import seaborn as sns
import tensorflow as tf
from sklearn.metrics import confusion_matrix
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
from tensorflow.keras.layers import Embedding, LSTM, Dense
from tensorflow.keras.layers import Bidirectional, Dropout, GlobalMaxPool1D, BatchNormalization
from tensorflow.keras.models import Sequential, load_model
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.utils import pad_sequences

In [None]:
def _locate_csv(filename):
    """Return the location of one of the dataset CSV files.

    The original hard-coded location ``/content/<filename>`` is tried first, so
    environments where the files were placed there keep working unchanged.
    If the file is not there, the directory returned by
    ``kagglehub.dataset_download`` (the notebook-level ``path`` variable) is
    searched recursively, which lets the notebook run from a fresh kernel
    without copying files by hand.

    Args:
        filename: Bare file name of the CSV to find.

    Returns:
        Path of the first matching file.

    Raises:
        FileNotFoundError: If the file is in neither location.
    """
    legacy_location = os.path.join("/content", filename)
    if os.path.isfile(legacy_location):
        return legacy_location

    # os.walk yields nothing for a missing directory, so this falls through
    # to the explicit error below instead of failing obscurely.
    for root, _subdirs, files in os.walk(path):
        if filename in files:
            return os.path.join(root, filename)

    raise FileNotFoundError(
        f"{filename!r} not found in '/content' or under the downloaded dataset directory {path!r}"
    )


# Load the three pre-defined splits of the dataset.
train = pd.read_csv(_locate_csv("Bitext_Sample_Customer_Service_Training_Dataset.csv"))
validation = pd.read_csv(_locate_csv("Bitext_Sample_Customer_Service_Validation_Dataset.csv"))
test = pd.read_csv(_locate_csv("Bitext_Sample_Customer_Service_Testing_Dataset.csv"))

In [None]:
# Encode the string intent labels as integer class ids.
#
# The encoder is fitted exactly once, on the training labels, and that single
# fitted encoder is reused for every split. Re-fitting it per DataFrame would
# let the same intent receive different ids in different splits whenever their
# label sets differ, and would leave `le` fitted on whichever frame came last
# rather than on the labels the model is trained with.
#
# NOTE: the 'intent' column is overwritten in place, so reload the CSVs before
# re-running this cell; otherwise the encoder would be fitted on integer ids.
data_frames = [train, test, validation]

le = LabelEncoder()
le.fit(train['intent'])
for df in data_frames:
    # transform() raises ValueError for a label that never occurs in the
    # training data, surfacing the mismatch instead of silently mis-encoding it.
    df['intent'] = le.transform(df['intent'])

# Split each frame into features (utterance text) and target (encoded intent)
X_train, X_test, X_validation = train['utterance'], test['utterance'], validation['utterance']
y_train, y_test, y_validation = train['intent'], test['intent'], validation['intent']
# Fixed sequence length: longer utterances are cut at the end, shorter ones
# are padded at the end.
maxlen = 100

# The vocabulary is learned from the training utterances only.
tokenizer = Tokenizer()
tokenizer.fit_on_texts(X_train)

# Text -> integer id sequences -> fixed-length arrays, for train and validation
X_train_sequences = tokenizer.texts_to_sequences(X_train)
X_train_padded = pad_sequences(
    X_train_sequences,
    maxlen=maxlen,
    padding='post',
    truncating='post',
)

X_validation_sequences = tokenizer.texts_to_sequences(X_validation)
X_validation_padded = pad_sequences(
    X_validation_sequences,
    maxlen=maxlen,
    padding='post',
    truncating='post',
)

# Pre-trained 100-dimensional GloVe vectors, fetched through gensim's downloader
word_vectors = api.load('glove-wiki-gigaword-100')

# One row per tokenizer index; rows start as zeros and stay zero for any word
# that has no GloVe vector.
embedding_dim = 100
vocab_size = len(tokenizer.word_index) + 1
embedding_matrix = np.zeros((vocab_size, embedding_dim))
for token, row in tokenizer.word_index.items():
    if token not in word_vectors:
        continue
    embedding_matrix[row] = word_vectors[token]

# Frozen embedding layer initialised from the GloVe matrix
embedding_layer = Embedding(
    vocab_size,
    embedding_dim,
    weights=[embedding_matrix],
    input_length=maxlen,
    trainable=False,
)





In [None]:
# One output unit per distinct intent id in the training labels
num_classes = train['intent'].nunique()

# Classifier: frozen embedding -> bidirectional LSTM -> max-pool over time ->
# a stack of dense layers with dropout -> softmax over the intents.
model = Sequential([
    embedding_layer,
    Bidirectional(LSTM(100, return_sequences=True, recurrent_dropout=0.2)),
    GlobalMaxPool1D(),
    BatchNormalization(),
    Dropout(0.5),
    Dense(32, activation="relu"),
    Dense(64, activation="relu"),
    Dropout(0.5),
    Dense(128, activation="relu"),
    Dropout(0.5),
    Dense(256, activation="relu"),
    Dense(512, activation="relu"),
    Dropout(0.5),
    Dense(num_classes, activation='softmax'),
])

# The targets are integer class ids, hence the sparse categorical loss.
model.compile(
    optimizer='adam',
    loss='sparse_categorical_crossentropy',
    metrics=['accuracy'],
)

# Fit on the training split and monitor the validation split after each epoch
model.fit(
    X_train_padded,
    y_train,
    validation_data=(X_validation_padded, y_validation),
    batch_size=64,
    epochs=10,
)

Epoch 1/10
[1m103/103[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m60s[0m 476ms/step - accuracy: 0.0528 - loss: 3.3129 - val_accuracy: 0.2665 - val_loss: 3.1870
Epoch 2/10
[1m103/103[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m46s[0m 447ms/step - accuracy: 0.2509 - loss: 2.3986 - val_accuracy: 0.5355 - val_loss: 2.1454
Epoch 3/10
[1m103/103[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m84s[0m 463ms/step - accuracy: 0.5217 - loss: 1.3626 - val_accuracy: 0.8325 - val_loss: 1.0670
Epoch 4/10
[1m103/103[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m46s[0m 448ms/step - accuracy: 0.6552 - loss: 0.9531 - val_accuracy: 0.8900 - val_loss: 0.4281
Epoch 5/10
[1m103/103[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m48s[0m 460ms/step - accuracy: 0.7294 - loss: 0.7553 - val_accuracy: 0.9022 - val_loss: 0.2309
Epoch 6/10
[1m103/103[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m81s[0m 453ms/step - accuracy: 0.7652 - loss: 0.6341 - val_accuracy: 0.9633 - val_loss: 0.1520
Epoch 7/10

<keras.src.callbacks.history.History at 0x7e703d3e2610>

In [None]:
# Encode the test utterances with the already-fitted tokenizer and bring them
# to the same fixed length as the training inputs.
X_test_sequences = tokenizer.texts_to_sequences(X_test)
X_test_padded = pad_sequences(
    X_test_sequences,
    maxlen=maxlen,
    padding='post',
    truncating='post',
)

In [None]:
# Score the trained model on the held-out test split; the cell displays the
# value returned by evaluate().
model.evaluate(x=X_test_padded, y=y_test)

[1m26/26[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m2s[0m 76ms/step - accuracy: 0.9849 - loss: 0.0838


[0.08710349351167679, 0.9865525960922241]

In [None]:
# Persist the trained model to the Kaggle working directory as an HDF5 (.h5) file
model.save(filepath='/kaggle/working/intent_model.h5')



In [None]:
# Collect every integer intent id that occurs in the training labels and ask
# the label encoder for the category name behind each one.
encoded_categories = train['intent'].unique()
decoded_categories = le.inverse_transform(encoded_categories)

# id -> category name, in the order the ids first appear in the training set
mapping = {code: name for code, name in zip(encoded_categories, decoded_categories)}

print("Number-to-Category Mapping:")
for number in mapping:
    print(f"{number} -> {mapping[number]}")

Number-to-Category Mapping:
0 -> cancel_order
1 -> change_order
2 -> change_shipping_address
3 -> check_cancellation_fee
4 -> check_invoice
5 -> check_payment_methods
6 -> check_refund_policy
7 -> complaint
8 -> contact_customer_service
9 -> contact_human_agent
10 -> create_account
11 -> delete_account
12 -> delivery_options
13 -> delivery_period
14 -> edit_account
15 -> get_invoice
16 -> get_refund
17 -> newsletter_subscription
18 -> payment_issue
19 -> place_order
20 -> recover_password
21 -> registration_problems
22 -> review
23 -> set_up_shipping_address
24 -> switch_account
25 -> track_order
26 -> track_refund


In [14]:
# Classify a single free-text query with the saved model.
# `load_model` is imported in the notebook's import cell.

# Reload the model that was written to disk earlier in the notebook
model = load_model('/kaggle/working/intent_model.h5')

# Example single query
single_query = "I'm having trouble with my electric bill"

# Tokenize the query (wrapped in a list so the result stays 2D: one row per query)
single_query_sequence = tokenizer.texts_to_sequences([single_query])

# If no word of the query is in the tokenizer's vocabulary the sequence is
# empty, and the model would be fed nothing but padding. Say so explicitly
# rather than presenting that prediction as if it were informative.
if not single_query_sequence[0]:
    print("Warning: none of the words in the query are in the tokenizer vocabulary; "
          "the prediction below is not meaningful.")

# Pad to the same fixed length the model was trained with
single_query_padded = pad_sequences(single_query_sequence, padding='post', truncating='post', maxlen=maxlen)

# Class probabilities for the query
prediction = model.predict(single_query_padded)

# Index of the most probable class for the (only) row
predicted_class = prediction.argmax(axis=-1)[0]

# Convert the numeric class back to its original category label
predicted_label = le.inverse_transform([predicted_class])[0]

print("Intenção identificada:", predicted_label)



[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 1s/step
Intenção identificada: payment_issue
