# Data Analysis Chatbot

In [None]:

import pandas as pd
import json
from transformers import BertTokenizer
import seaborn as sns
import matplotlib.pyplot as plt
from sklearn.preprocessing import LabelBinarizer
from pylab import rcParams
import tensorflow as tf
# import tensorflow_text
import tensorflow_hub as hub
import tensorflow_text as text
import keras_nlp
# Make every tf.function execute eagerly (op by op) instead of as a traced graph.
# NOTE(review): this is usually a debugging aid and slows training — confirm it is still required.
tf.config.run_functions_eagerly(True)


In [None]:
import warnings

# Only TensorFlow messages at ERROR level are logged.
tf.get_logger().setLevel('ERROR')

# Colours used for the charts in this notebook.
HAPPY_COLORS_PALETTE = [
    "#01BEFE",
    "#FFDD00",
    "#FF7D00",
    "#FF006D",
    "#ADFF02",
    "#8F00FF",
]

# Global seaborn / matplotlib look: theme first, then the custom palette and figure size.
sns.set_theme(style='whitegrid', palette='muted', font_scale=1.2)
sns.set_palette(sns.color_palette(HAPPY_COLORS_PALETTE))
rcParams['figure.figsize'] = (12, 8)

# Hide all Python warnings for the remainder of the session.
warnings.filterwarnings("ignore")

In [None]:
# Relative paths of the two labelled intent files used by this notebook.
train_file = "../data/intent.json"  # source of the train / validation split
test_file = "../data/test.json"  # held-out examples used for evaluation

## Data Preprocessing and Feature Extraction

In [None]:
from sklearn.model_selection import train_test_split

# Read the labelled intents. The encoding is pinned to UTF-8 (the JSON interchange
# encoding) so the file is decoded the same way on every platform instead of with the
# locale-dependent default of open().
with open(train_file, encoding="utf-8") as f:
    data = json.load(f)

intent_df = pd.DataFrame(data['intents'])

# Keep only the label and its example texts, then expand list-like 'patterns' entries
# into one row per pattern.
intent_df = intent_df[['intent', 'patterns']].explode('patterns')

# Hold out 20% of the rows for validation; the fixed random_state makes the split repeatable.
train_df, valid_df = train_test_split(intent_df, test_size=0.2, random_state=42)

train_df.head()

In [None]:
# (rows, columns) of the training split.
train_df.shape

In [None]:
# Read the held-out test intents; UTF-8 is pinned so decoding does not depend on the
# platform's default encoding.
with open(test_file, encoding="utf-8") as f:
    data = json.load(f)

test_df = pd.DataFrame(data['intents'])

# Same column selection as the training frame: only the label and the text survive, so any
# additional field in the JSON cannot end up in the feature array built from this frame.
test_df = test_df[['intent', 'patterns']].explode('patterns')
test_df.head()

In [None]:
# Separate the training frame into the label (a one-column DataFrame named 'intent')
# and the remaining input column(s); train_df itself is left untouched.
train_labels = train_df[['intent']].copy()
train_features = train_df.drop(columns=['intent'])



In [None]:
# NumPy view of the model inputs. The array is rebuilt from train_df rather than from
# train_features itself so this cell is idempotent: taking `.values` of the name it
# rebinds would raise AttributeError the second time the cell is run.
train_features = train_df.drop(columns=['intent']).to_numpy()



In [None]:
# Class-balance check: number of training rows per intent.
chart = sns.countplot(data=train_labels, x="intent", palette=HAPPY_COLORS_PALETTE)
chart.set_title("Number of texts per intent")
# Rotate the intent names so neighbouring tick labels do not overlap.
chart.set_xticklabels(
    chart.get_xticklabels(),
    rotation=30,
    horizontalalignment='right',
);

### One-hot encode the labels

In [None]:
# Fit the encoder on the training intents and one-hot encode them.
# The labels are taken from train_df (not from train_labels, which this cell rebinds to an
# ndarray) so the cell can be re-run, and are passed as a 1-D array of labels.
binarizer = LabelBinarizer()
train_labels = binarizer.fit_transform(train_df['intent'].to_numpy())


In [None]:
# (rows, encoded label columns) of the binarized training labels.
train_labels.shape


In [None]:
# Test split: every column except 'intent' becomes the input array, and the intents are
# encoded with the binarizer that was fitted on the training labels.
test_features = test_df.drop(columns="intent").values
test_labels = binarizer.transform(test_df["intent"].values)

# Validation split: identical treatment.
valid_features = valid_df.drop(columns="intent").values
valid_labels = binarizer.transform(valid_df["intent"].values)



## Loading models from TensorFlow Hub

In [None]:
# Encoder name -> TF-Hub URL of the encoder.
map_name_to_handle = {
    'small_bert/bert_en_uncased_L-2_H-128_A-2':
        'https://tfhub.dev/tensorflow/small_bert/bert_en_uncased_L-2_H-128_A-2/1',
}

# Encoder name -> TF-Hub URL of the preprocessing model to pair with it.
map_model_to_preprocess = {
    'small_bert/bert_en_uncased_L-2_H-128_A-2':
        'https://tfhub.dev/tensorflow/bert_en_uncased_preprocess/2',
}

# Pick the encoder once; both handles are looked up from that single name.
bert_model_name = 'small_bert/bert_en_uncased_L-2_H-128_A-2'
tfhub_handle_encoder = map_name_to_handle[bert_model_name]
tfhub_handle_preprocess = map_model_to_preprocess[bert_model_name]

print(f'BERT model selected           : {tfhub_handle_encoder}')
print(f'Preprocess model auto-selected: {tfhub_handle_preprocess}')

## Preprocessing Model

In [None]:
# Wrap the selected TF-Hub preprocessing model as a Keras layer (standalone copy used for the demo below).
bert_preprocess_model = hub.KerasLayer(tfhub_handle_preprocess)

In [None]:
# First row of the training input array, used as the sample text in the next cells.
train_features[0]

In [None]:
# Run the preprocessing layer on the first training example to inspect what it produces.
text_test = train_features[0]
text_preprocessed = bert_preprocess_model(text_test)

# The result is a dict of tensors; print its keys, the shape of the word-id tensor, and the
# first 12 positions of each tensor for the first example.
print(f'Keys       : {list(text_preprocessed.keys())}')
print(f'Shape      : {text_preprocessed["input_word_ids"].shape}')
print(f'Word Ids   : {text_preprocessed["input_word_ids"][0, :12]}')
print(f'Input Mask : {text_preprocessed["input_mask"][0, :12]}')
print(f'Type Ids   : {text_preprocessed["input_type_ids"][0, :12]}')

## BERT Model

In [None]:
# Wrap the selected TF-Hub BERT encoder as a Keras layer (standalone copy used for the demo below).
bert_model = hub.KerasLayer(tfhub_handle_encoder)

In [None]:
# Feed the preprocessed sample through the encoder to inspect its outputs.
bert_results = bert_model(text_preprocessed)

# Show the shape of the 'pooled_output' and 'sequence_output' entries and, for the first
# example, the first 12 entries along the next axis of each.
print(f'Loaded BERT: {tfhub_handle_encoder}')
print(f'Pooled Outputs Shape:{bert_results["pooled_output"].shape}')
print(f'Pooled Outputs Values:{bert_results["pooled_output"][0, :12]}')
print(f'Sequence Outputs Shape:{bert_results["sequence_output"].shape}')
print(f'Sequence Outputs Values:{bert_results["sequence_output"][0, :12]}')

## Define the Model

In [None]:
# !pip install keras

In [None]:
# def build_classifier_model():
#     text_input = tf.keras.layers.Input(shape=(), dtype=tf.string, name='text')

#     # Preprocess input text using BertPreprocessor
#     preprocessing_layer = keras_nlp.models.BertPreprocessor.from_preset("bert_base_en_uncased", trainable=True)
#     encoder_inputs = preprocessing_layer(text_input)

#     # Get the BertBackbone model
#     encoder = keras_nlp.models.BertBackbone.from_preset("bert_base_en_uncased")

#     # Pass the preprocessed inputs through the backbone
#     outputs = encoder(encoder_inputs)
#     net = outputs['pooled_output']

#     # Additional layers for classification
#     net = tf.keras.layers.Dropout(0.1)(net)
#     net = tf.keras.layers.Dense(7, activation=None, name='classifier')(net)

#     return tf.keras.Model(text_input, net)

In [None]:
def build_classifier_model(num_classes: int = 12):
    """Build the intent classifier: raw text -> preprocessing -> BERT -> dropout -> logits.

    Args:
        num_classes: Number of units in the final Dense layer, i.e. how many intent
            classes are scored. Defaults to 12, the width that used to be hard-coded,
            so existing ``build_classifier_model()`` calls behave as before.

    Returns:
        An uncompiled ``tf.keras.Model`` taking a batch of strings and returning
        ``num_classes`` unnormalised scores (logits) for each string.

    Raises:
        ValueError: If ``num_classes`` is smaller than 1.
    """
    if num_classes < 1:
        raise ValueError(f"num_classes must be a positive integer, got {num_classes!r}")

    text_input = tf.keras.layers.Input(shape=(), dtype=tf.string, name='text')
    # Preprocessing is part of the model, so callers pass plain strings.
    preprocessing_layer = hub.KerasLayer(tfhub_handle_preprocess, name='preprocessing')
    encoder_inputs = preprocessing_layer(text_input)
    # trainable=True: the encoder weights are updated together with the classifier head.
    encoder = hub.KerasLayer(tfhub_handle_encoder, trainable=True, name='BERT_encoder')
    outputs = encoder(encoder_inputs)
    net = outputs['pooled_output']
    net = tf.keras.layers.Dropout(0.1)(net)
    # activation=None: the layer emits logits, so it must be paired with a from_logits=True loss.
    net = tf.keras.layers.Dense(num_classes, activation=None, name='classifier')(net)
    return tf.keras.Model(text_input, net)

In [None]:
# Build the (still untrained) classifier and run it once on the first training example.
classifier_model = build_classifier_model()
bert_raw_result = classifier_model(tf.constant(train_features[0]))
# The model outputs raw scores; softmax turns them into a probability distribution for display.
print(tf.keras.activations.softmax(bert_raw_result))

In [None]:
# Layer-by-layer overview of the classifier.
classifier_model.summary()

## Model Training

In [None]:
# from_logits=True because the classifier's final Dense layer has no activation and
# therefore outputs raw scores; the labels are one-hot, hence the categorical variants.
loss = tf.keras.losses.CategoricalCrossentropy(from_logits=True)
metrics = [tf.metrics.CategoricalAccuracy()]
metrics

## Compiling and training the BERT classifier

In [None]:
# Training hyper-parameters: number of passes over the training set and the Adam step size.
epochs = 5
optimizer = tf.keras.optimizers.Adam(learning_rate=0.0005)

# Attach the optimizer together with the loss and metric objects created above.
classifier_model.compile(optimizer=optimizer, loss=loss, metrics=metrics)


In [None]:
print(f'Training model with {tfhub_handle_encoder}')

# Train in mini-batches of 5 examples, scoring the validation split after every epoch.
# The returned History object is kept for the learning-curve plots.
history = classifier_model.fit(
    x=train_features,
    y=train_labels,
    validation_data=(valid_features, valid_labels),
    batch_size=5,
    epochs=epochs,
)

In [None]:
# Score the trained model on the held-out test set.
# The results get their own names: unpacking into `loss` would overwrite the loss object
# passed to compile(), and re-running the compile cell afterwards would then hand Keras a float.
test_loss, test_accuracy = classifier_model.evaluate(test_features, test_labels)

print(f'Loss: {test_loss}')
print(f'Accuracy: {test_accuracy}')

In [None]:
import numpy as np
from sklearn.metrics import confusion_matrix

# Intent names in the column order of the one-hot encoding (position i <-> class index i).
class_names = binarizer.classes_

# Class index with the highest score for every test example.
predicted_labels = np.argmax(classifier_model.predict(test_features), axis=1)
# Turn the one-hot test labels back into class indices.
# NOTE(review): an intent the binarizer never saw encodes as an all-zero row and would be
# counted as class 0 here — confirm the test file only contains training intents.
true_labels = np.argmax(test_labels, axis=1)

# Pin the label set so the matrix always has one row and column per known intent, even
# when some intents are absent from the test set; otherwise its axes would silently shrink.
conf_matrix = confusion_matrix(
    true_labels,
    predicted_labels,
    labels=np.arange(len(class_names)),
)

# Heatmap with intent names on both axes instead of bare class indices.
plt.figure(figsize=(10, 8))
sns.heatmap(
    conf_matrix,
    annot=True,
    fmt='d',
    cmap='Blues',
    xticklabels=class_names,
    yticklabels=class_names,
)
plt.xlabel('Predicted labels')
plt.ylabel('True labels')
plt.title('Confusion Matrix')
plt.show()

In [None]:
true_labels = np.argmax(test_labels, axis=1)

# Per-example comparison of the test patterns with their actual and predicted class.
# - 'Pattern' is passed as a plain array: the index test_df inherits from explode() repeats
#   the same label for every pattern of an intent, so dropping it gives the table a clean
#   0..n-1 index.
# - The '... intent' columns translate the class indices back into intent names.
data = {
    'Pattern': test_df['patterns'].to_numpy(),
    'Actual': true_labels,
    'Predicted': predicted_labels,
    'Actual intent': binarizer.classes_[true_labels],
    'Predicted intent': binarizer.classes_[predicted_labels],
}
df = pd.DataFrame(data)

# Display the DataFrame
df

In [None]:
invoice_text = "Show me a heatmap of order by customer"

# Score the single utterance; predict() returns one row of class scores per input string.
predicted_label = classifier_model.predict([invoice_text])

# Highest-scoring class of that single row, then its intent name from the fitted binarizer
# (classes_ is ordered like the one-hot columns the model was trained on).
predicted_class_index = int(np.argmax(predicted_label, axis=1)[0])
predicted_class_label = binarizer.classes_[predicted_class_index]

# Report the readable intent together with its class index.
print("Predicted class:", predicted_class_label, f"(index {predicted_class_index})")

In [None]:
history_dict = history.history
print(history_dict.keys())

acc = history_dict['categorical_accuracy']
val_acc = history_dict['val_categorical_accuracy']
# Stored as train_loss / epoch_range so the `loss` object and the `epochs` count used by
# the compile and fit cells are not overwritten by plotting data.
train_loss = history_dict['loss']
val_loss = history_dict['val_loss']

epoch_range = range(1, len(acc) + 1)
fig, (loss_ax, acc_ax) = plt.subplots(2, 1, figsize=(10, 8))

# Top panel: loss per epoch (red = training, blue = validation).
loss_ax.plot(epoch_range, train_loss, 'r', label='Training loss')
loss_ax.plot(epoch_range, val_loss, 'b', label='Validation loss')
loss_ax.set_title('Training and validation loss')
loss_ax.grid(True)
loss_ax.set_ylabel('Loss')
loss_ax.legend()

# Bottom panel: accuracy per epoch, same colour coding.
acc_ax.plot(epoch_range, acc, 'r', label='Training acc')
acc_ax.plot(epoch_range, val_acc, 'b', label='Validation acc')
acc_ax.set_title('Training and validation accuracy')
acc_ax.grid(True)
acc_ax.set_xlabel('Epochs')
acc_ax.set_ylabel('Accuracy')
acc_ax.legend(loc='lower right')

# Adjust spacing only after both panels exist; calling it on the empty figure has nothing to arrange.
fig.tight_layout()

In [None]:
# Persist the trained classifier under ../data/intent_model (reloaded from the same path further down).
classifier_model.save("../data/intent_model")

In [None]:
import pickle
# Store the fitted LabelBinarizer inside the saved-model directory.
# Presumably so that class indices can be mapped back to intent names at inference time — confirm with the consumer.
# SECURITY: only unpickle this file from a trusted location; pickle.load can execute arbitrary code.
with open('../data/intent_model/label_binarizer.pkl', 'wb') as file:
    pickle.dump(binarizer, file)


In [41]:
# Round-trip check: reload the saved model from disk and score one new utterance.
loaded_model = tf.keras.models.load_model("../data/intent_model")
intent_prediction = loaded_model.predict(["Please show me a scatter plot of country and product type"])
# The printed values are the classifier layer's raw scores (no activation is applied), one per class.
print(intent_prediction)

[[-0.532727   -0.8623593   1.390384   -0.10985241  0.2148691   0.13875294
   4.5822864  -1.2159488  -3.3323498  -0.7280443  -0.6408461  -1.2203573 ]]
