# Closed Domain Chatbot Maker

![chatbot-gif](https://media2.giphy.com/media/S60CrN9iMxFlyp7uM8/giphy.gif)

## Data & Preprocessing

In [75]:
# Data & preprocessing: read the tagged questions, fit the text vectorizer,
# persist its vocabulary, and build the padded / one-hot train-test split.
#
# Available tag files (keep `vocab_size` below in sync with the one chosen):
#   bilingual  --> tags_bilingual.txt
#   portuguese --> tags_pt.txt
#   english    --> tags_en.txt

# 614 p

import os

from keras.layers import TextVectorization
from keras.utils import to_categorical
from keras_preprocessing.sequence import pad_sequences
from sklearn.model_selection import train_test_split

# Build paths with os.path.join so they resolve on every OS; a hard-coded
# "\\" separator is only understood as a directory separator on Windows.
tags_path = os.path.join('data', 'tags_bilingual.txt')
vocabulary_path = os.path.join('pre_trained_aira', 'vocabulary_bilingual.txt')

# Each line is "<sentence> <tag>": everything before the last space is the
# sentence, the final space-separated token is the tag. The file is read once
# and blank lines are skipped so X and Y stay aligned and no empty tag reaches
# the integer conversion below.
X, Y = [], []
with open(tags_path, encoding='utf8') as file_in:
    for raw_line in file_in:
        stripped = raw_line.strip()
        if not stripped:
            continue
        sentence, _, tag = stripped.rpartition(' ')
        X.append(sentence)
        Y.append(tag)

vocab_size = 2500 #2500 for bilingual or 1500 for english or portuguese
embed_size = 256
max_len = 10

# Unigram + bigram integer encoding, capped at `vocab_size` tokens.
text_vectorization = TextVectorization(max_tokens=vocab_size, output_mode="int", ngrams=2)
text_vectorization.adapt(X)
vocabulary = text_vectorization.get_vocabulary()
inverse_vocab = dict(enumerate(vocabulary))

# Make sure the output directory exists before writing into it.
os.makedirs(os.path.dirname(vocabulary_path), exist_ok=True)
# NOTE(review): the file is written with the platform default encoding, and the
# "Load Model" cell reads it back the same way. If an explicit encoding is ever
# added, it must be added to both the writer and the reader together.
with open(vocabulary_path, 'w') as fp:
    fp.writelines(f"{word}\n" for word in vocabulary)

encoded_X = text_vectorization(X)

# Left-pad (pad_sequences default) and cut anything past `max_len` from the end.
encoded_X_padded = pad_sequences(encoded_X, maxlen=max_len, truncating='post')

# Column 0 of the one-hot matrix is dropped, so class index i corresponds to
# tag i + 1 (assumes tags are 1-based integers -- TODO confirm against the data).
one_hot_encoded_Y = to_categorical([int(tag) for tag in Y])[:, 1:]

x_train, x_test, y_train, y_test = train_test_split(
    encoded_X_padded,
    one_hot_encoded_Y,
    test_size=0.1,
    random_state=42,
)


## Model

In [None]:
# Model: build, train and evaluate a two-layer bidirectional LSTM classifier.
# Relies on `vocab_size`, `embed_size`, `x_train`, `y_train`, `x_test` and
# `y_test` produced by the "Data & Preprocessing" cell.

import os

import tensorflow as tf
import keras

# Portable checkpoint location (a literal "\\" separator only works on Windows).
checkpoint_path = os.path.join("pre_trained_aira", "Aira_bilingual.keras")
os.makedirs(os.path.dirname(checkpoint_path), exist_ok=True)

# Size the output layer from the one-hot targets instead of a hard-coded 142,
# so the model always matches the tag file that was actually loaded.
num_classes = y_train.shape[1]

inputs = tf.keras.Input(shape=(None,), dtype="int32")

# mask_zero=True makes the recurrent layers skip the zero padding.
embedded = tf.keras.layers.Embedding(input_dim=vocab_size, output_dim=embed_size, mask_zero=True)(inputs)

x = tf.keras.layers.Bidirectional(
    tf.keras.layers.LSTM(128, return_sequences=True))(embedded)
x = tf.keras.layers.Bidirectional(tf.keras.layers.LSTM(128))(x)
x = tf.keras.layers.Dropout(0.8)(x)

outputs = tf.keras.layers.Dense(num_classes, activation="softmax")(x)
model = tf.keras.Model(inputs, outputs)

model.compile(optimizer='adam',
              loss='categorical_crossentropy',
              metrics=['categorical_accuracy'])

print("Version: ", tf.__version__)
print("Eager mode: ", tf.executing_eagerly())
print("GPU is", "available" if tf.config.list_physical_devices('GPU') else "NOT AVAILABLE")
model.summary()

# `patience` and `restore_best_weights` are EarlyStopping options, not
# ModelCheckpoint options, so each callback now receives only its own arguments:
#   - ModelCheckpoint keeps the best model (lowest val_loss) on disk;
#   - EarlyStopping halts training after 10 epochs without val_loss improvement
#     and restores the best weights.
callbacks = [
    keras.callbacks.ModelCheckpoint(checkpoint_path,
                                    save_best_only=True,
                                    monitor="val_loss"),
    keras.callbacks.EarlyStopping(monitor="val_loss",
                                  patience=10,
                                  restore_best_weights=True),
]

model.fit(x_train,
          y_train,
          validation_split = 0.2,
          epochs=100,
          batch_size=8,
          verbose=1,
          callbacks=callbacks)

# Evaluate the best checkpoint, not whatever the last epoch produced.
model = keras.models.load_model(checkpoint_path)
test_loss_score, test_acc_score = model.evaluate(x_test, y_test)

print(f'Final Loss: {round(test_loss_score, 1)}.')
print(f'Final Performance: {round(test_acc_score * 100, 2)} %.')


## Load Model

In [76]:
# Load Model: restore the trained classifier, the saved vocabulary and the
# answer list. This cell is self-contained -- it no longer needs `vocab_size`
# from the preprocessing cell, so it runs on a fresh kernel.

import os

import keras
import numpy as np
from keras.layers import TextVectorization
from keras_preprocessing.sequence import pad_sequences

# Portable paths (a literal "\\" separator only works on Windows).
model_dir = 'pre_trained_aira'
model = keras.models.load_model(os.path.join(model_dir, 'Aira_bilingual.keras'))

# One token per line. Only the line terminator is removed: the first entry is
# the empty padding token, and bigram tokens contain inner spaces, so a plain
# strip() would be wrong. rstrip('\n') also copes with a missing final newline,
# where line[:-1] would have chopped a real character.
# NOTE(review): read with the platform default encoding, matching how the
# preprocessing cell writes the file; change both together if an encoding is added.
with open(os.path.join(model_dir, 'vocabulary_bilingual.txt'), 'r') as fp:
    vocabulary = [line.rstrip('\n') for line in fp]

# answers[i] is the reply for predicted class index i.
with open(os.path.join('data', 'answers_en.txt'), encoding='utf8') as file_in:
    answers = [line.strip() for line in file_in]

# The vocabulary is supplied explicitly, so no `max_tokens` cap (and therefore
# no dependency on `vocab_size` from another cell) is required here.
text_vectorization = TextVectorization(output_mode="int",
                                        ngrams=2,
                                        vocabulary=vocabulary)


## Testing

In [77]:
import string

# Question to send to the model. Other sample questions (uncomment one to try it):
#text = '''What is the problem of alignment?'''
#text = '''O que é Interpretabilidade?'''
#text = '''O que é o problema de Alinhamento?'''
#text = '''What is Machine Learning?'''
#text = '''O que é Ética das Virtudes?'''
#text = '''What is your name?'''
#text = '''Qual é o seu nome?'''
#text = '''O que é SGD?'''
#text = '''What is Stochastic Gradient Descent?'''
text = '''what is Interpretability?'''

print(f'Questions: {text}\n')

# Lower-case the question and drop ASCII punctuation before vectorizing it.
punctuation_remover = str.maketrans('', '', string.punctuation)
normalized_text = text.lower().translate(punctuation_remover)
encoded_sentence = text_vectorization(normalized_text)
print(f'Encoded Sentence: {encoded_sentence}\n')

# Wrap the single sentence in a list to form a batch of one, then pad/cut it to 10 tokens.
sentence_batch = [encoded_sentence]
encoded_sentence_padded = pad_sequences(sentence_batch, maxlen=10, truncating='post')
print(f'Padded Encoded Sentence: {encoded_sentence_padded}\n')

# First (and only) row of the prediction: one score per answer class.
preds = model.predict(encoded_sentence_padded,verbose=0)[0]
best_class = np.argmax(preds)
confidence = max(preds)
output = answers[best_class]
print(f'Answers: {output} [Confidence: {confidence * 100: .2f} %]')

Questions: what is Interpretability?

Encoded Sentence: [   3    7 1281    8 1975]

Padded Encoded Sentence: [[   0    0    0    0    0    3    7 1281    8 1975]]

Answers: Interpretability is the ability to explain or present the reasoning of an ML model in terms understandable to a human. [Confidence:  100.00 %]
