## Load Dataset

In [1]:
import json

# Read the intents definition file. The encoding is stated explicitly so the
# result does not depend on the platform's default text encoding (JSON
# interchange files are UTF-8).
with open("./data/intents.json", encoding="utf-8") as data_file:
    data = data_file.read()

# Parse the raw JSON text into a Python dictionary
intents = json.loads(data)

## Data Preprocessing

In [2]:
# Import necessary libraries
import nltk
from nltk.stem import WordNetLemmatizer

# Module-level lemmatizer instance; later cells call lemmatizer.lemmatize()
# on lowercased tokens when building the vocabulary and the feature vectors.
lemmatizer = WordNetLemmatizer()

# Fetch the "punkt" and "wordnet" NLTK packages (the recorded output shows the
# downloader reporting both as already up-to-date).
# NOTE(review): presumably "punkt" backs nltk.word_tokenize and "wordnet" backs
# the lemmatizer — confirm the required package names for the installed NLTK.
nltk.download("punkt")
nltk.download("wordnet")

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package wordnet to /root/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


True

In [3]:
# Containers filled while walking the intents file:
#   words     - every token seen across all patterns (duplicates still present)
#   classes   - distinct intent tags, in order of first appearance
#   documents - (token_list, tag) pairs, one per pattern
words, classes, documents = [], [], []

# Punctuation tokens that are excluded when the vocabulary is built
ignore_words = ["?", "!"]

for intent in intents["intents"]:
    for pattern in intent["patterns"]:
        # Split the pattern into individual tokens
        tokens = nltk.word_tokenize(pattern)
        words += tokens

        # Remember which tag this tokenized pattern belongs to
        tag = intent["tag"]
        documents.append((tokens, tag))

        # Register the tag the first time it is encountered
        if tag not in classes:
            classes.append(tag)

### Lemmatize, Convert to lowercase, and Remove duplicates

In [4]:
# Build the vocabulary: skip ignored punctuation, lowercase and lemmatize each
# remaining token, then collapse duplicates into an alphabetically sorted list
lemmatized_tokens = {
    lemmatizer.lemmatize(token.lower())
    for token in words
    if token not in ignore_words
}
words = sorted(lemmatized_tokens)

# De-duplicate and sort the intent tags as well
classes = sorted(set(classes))

# Report the size of the dataset
print(len(documents), "documents")  # one document per (pattern, intent) pair
print(len(classes), "classes", classes)  # distinct intent tags
print(len(words), "unique lemmatized words", words)  # vocabulary

21 documents
7 classes ['goodbye', 'greeting', 'java_course_recommendation', 'javascript_course_recommendation', 'options', 'python_course_recommendation', 'thanks']
35 unique lemmatized words ['a', 'any', 'appreciate', 'bye', 'can', 'course', 'do', 'find', 'for', 'goodbye', 'have', 'hello', 'help', 'hey', 'hi', 'how', 'i', 'it', 'java', 'javascript', 'later', 'learn', 'offer', 'python', 'see', 'service', 'suggestion', 'thank', 'thanks', 'to', 'tutorial', 'want', 'what', 'where', 'you']


### Save words and classes list

In [5]:
import pickle

# Persist the vocabulary and the class list as pickle files for later use
for target_path, payload in (
    ("./models/words.pkl", words),
    ("./models/classes.pkl", classes),
):
    with open(target_path, "wb") as pickle_file:
        pickle.dump(payload, pickle_file)

## Training Data Preparation (Bag-of-Words Features and One-Hot Labels)

In [6]:
import random

# Build the training set: one [bag_of_words, output_row] pair per document
training_data = []

for pattern_tokens, tag in documents:
    # Normalise the pattern's tokens the same way the vocabulary was built
    # (lowercase, then lemmatize). A set makes the membership test below
    # constant-time instead of a scan per vocabulary word.
    pattern_words = {lemmatizer.lemmatize(token.lower()) for token in pattern_tokens}

    # Binary bag of words: 1 where the vocabulary word occurs in the pattern
    bag_of_words = [1 if word in pattern_words else 0 for word in words]

    # One-hot label: 0 for every class except the document's own tag
    output_row = [0] * len(classes)
    output_row[classes.index(tag)] = 1

    training_data.append([bag_of_words, output_row])

# Shuffle the rows in place. No seed is set in this cell, so the resulting
# order is not fixed by anything here.
random.shuffle(training_data)

# Separate features and labels, keeping the shuffled order aligned
train_x = [features for features, _ in training_data]  # Features
train_y = [label for _, label in training_data]  # Labels

# Signal that the training data is ready
print("Training data created")

Training data created


## Model Building

In [7]:
from keras.layers import Dense, Dropout
from keras.models import Sequential
from keras.optimizers import SGD

# Layer sizes are taken from the training data: one input per vocabulary word,
# one softmax output per intent class
n_features = len(train_x[0])
n_classes = len(train_y[0])

# Two hidden ReLU layers (128 and 64 units), each followed by 50% dropout
model = Sequential(
    [
        Dense(128, input_shape=(n_features,), activation="relu"),
        Dropout(0.5),
        Dense(64, activation="relu"),
        Dropout(0.5),
        Dense(n_classes, activation="softmax"),
    ]
)

# SGD with Nesterov momentum; categorical cross-entropy matches the one-hot labels
sgd = SGD(learning_rate=0.01, momentum=0.9, nesterov=True)
model.compile(loss="categorical_crossentropy", optimizer=sgd, metrics=["accuracy"])
model.summary()

Model: "sequential"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 dense (Dense)               (None, 128)               4608      
                                                                 
 dropout (Dropout)           (None, 128)               0         
                                                                 
 dense_1 (Dense)             (None, 64)                8256      
                                                                 
 dropout_1 (Dropout)         (None, 64)                0         
                                                                 
 dense_2 (Dense)             (None, 7)                 455       
                                                                 
Total params: 13319 (52.03 KB)
Trainable params: 13319 (52.03 KB)
Non-trainable params: 0 (0.00 Byte)
_________________________________________________________________


In [8]:
import numpy as np

# Train for 200 epochs in mini-batches of 5 with per-epoch logging; the value
# returned by fit() is kept in `hist`
hist = model.fit(
    x=np.array(train_x),
    y=np.array(train_y),
    batch_size=5,
    epochs=200,
    verbose=1,
)

Epoch 1/200
Epoch 2/200
Epoch 3/200
Epoch 4/200
Epoch 5/200
Epoch 6/200
Epoch 7/200
Epoch 8/200
Epoch 9/200
Epoch 10/200
Epoch 11/200
Epoch 12/200
Epoch 13/200
Epoch 14/200
Epoch 15/200
Epoch 16/200
Epoch 17/200
Epoch 18/200
Epoch 19/200
Epoch 20/200
Epoch 21/200
Epoch 22/200
Epoch 23/200
Epoch 24/200
Epoch 25/200
Epoch 26/200
Epoch 27/200
Epoch 28/200
Epoch 29/200
Epoch 30/200
Epoch 31/200
Epoch 32/200
Epoch 33/200
Epoch 34/200
Epoch 35/200
Epoch 36/200
Epoch 37/200
Epoch 38/200
Epoch 39/200
Epoch 40/200
Epoch 41/200
Epoch 42/200
Epoch 43/200
Epoch 44/200
Epoch 45/200
Epoch 46/200
Epoch 47/200
Epoch 48/200
Epoch 49/200
Epoch 50/200
Epoch 51/200
Epoch 52/200
Epoch 53/200
Epoch 54/200
Epoch 55/200
Epoch 56/200
Epoch 57/200
Epoch 58/200
Epoch 59/200
Epoch 60/200
Epoch 61/200
Epoch 62/200
Epoch 63/200
Epoch 64/200
Epoch 65/200
Epoch 66/200
Epoch 67/200
Epoch 68/200
Epoch 69/200
Epoch 70/200
Epoch 71/200
Epoch 72/200
Epoch 73/200
Epoch 74/200
Epoch 75/200
Epoch 76/200
Epoch 77/200
Epoch 78

In [9]:
# Save the trained model. `hist` is deliberately not passed: the second
# positional parameter of Model.save() is `overwrite`, so the history object
# was only being interpreted as that flag and was never written to disk.
model.save("./models/model.keras")

In [10]:
import random
import nltk
import numpy as np

# Define your functions


def clean_up_sentence(sentence: str) -> list:
    """
    Split a sentence into tokens and normalise each one.

    Every token produced by ``nltk.word_tokenize`` is lowercased and then
    passed through the module-level ``lemmatizer``.

    Args:
    sentence (str): Raw input sentence.

    Returns:
    list: The lowercased, lemmatized tokens in their original order.
    """
    return [
        lemmatizer.lemmatize(token.lower()) for token in nltk.word_tokenize(sentence)
    ]


def bow(sentence: str, words: list, show_details: bool = True) -> np.ndarray:
    """
    Converts a sentence into a binary bag-of-words vector.

    The sentence is normalised with ``clean_up_sentence`` and every position
    of ``words`` that equals one of the resulting tokens is set to 1.

    Args:
    sentence (str): Input sentence.
    words (list): Vocabulary; the output has one entry per element, in order.
    show_details (bool): When True, print a line for every vocabulary match.

    Returns:
    np.ndarray: Array of 0/1 values with the same length as ``words``.
    """
    sentence_words = clean_up_sentence(sentence)
    bag = [0] * len(words)
    for token in sentence_words:
        for index, vocab_word in enumerate(words):
            if vocab_word != token:
                continue
            bag[index] = 1
            if show_details:
                print("found in bag: %s" % vocab_word)
    return np.array(bag)


def predict_class(sentence: str, model, error_threshold: float = 0.25) -> list:
    """
    Predicts the intent of the input sentence.

    The sentence is encoded with ``bow`` against the module-level ``words``
    vocabulary and passed to ``model.predict``. Classes whose score is not
    strictly greater than ``error_threshold`` are discarded.

    Args:
    sentence (str): Input sentence.
    model: Trained model exposing ``predict``.
    error_threshold (float): Minimum score (exclusive) for an intent to be
        reported. Defaults to 0.25.

    Returns:
    list: Dicts with keys ``"intent"`` (tag from the module-level ``classes``)
        and ``"probability"`` (score as a string), highest score first. The
        list is empty when no score exceeds the threshold.
    """
    features = bow(sentence, words, show_details=False)
    # predict() receives a batch of one; take the single row of scores
    scores = model.predict(np.array([features]))[0]

    candidates = [
        (index, score) for index, score in enumerate(scores) if score > error_threshold
    ]
    candidates.sort(key=lambda item: item[1], reverse=True)

    return [
        {"intent": classes[index], "probability": str(score)}
        for index, score in candidates
    ]


def get_response(
    ints: list,
    intents_json: dict,
    fallback: str = "Sorry, I didn't understand that. Could you rephrase?",
) -> str:
    """
    Retrieves a response based on the predicted intent.

    The top-ranked entry of ``ints`` selects the intent; one of that intent's
    responses is chosen at random. ``fallback`` is returned instead of raising
    when there is no prediction, no intent with the predicted tag, or the
    matching intent has no responses.

    Args:
    ints (list): Predicted intents (dicts with an ``"intent"`` key), best first.
    intents_json (dict): Object with an ``"intents"`` list whose items carry
        ``"tag"`` and ``"responses"``.
    fallback (str): Message returned when no response can be selected.

    Returns:
    str: Response message.
    """
    # Nothing was predicted, so there is no tag to look up
    if not ints:
        return fallback

    tag = ints[0]["intent"]
    for intent in intents_json["intents"]:
        if intent["tag"] == tag:
            responses = intent["responses"]
            # random.choice raises on an empty sequence, so guard it
            return random.choice(responses) if responses else fallback

    # The predicted tag is not present in the intents definition
    return fallback


def chatbot_response(text: str) -> str:
    """
    Produces the chatbot's reply to a piece of user text.

    The text is classified with ``predict_class`` using the module-level
    ``model``, and the reply is picked by ``get_response`` from the
    module-level ``intents``.

    Args:
    text (str): Input text from the user.

    Returns:
    str: Response message from the chatbot.
    """
    return get_response(predict_class(text, model), intents)

In [None]:
!pip install -U gradio
import gradio as gr


def chatbot(message, history):
    """
    Chat callback handed to the Gradio interface.

    Args:
    message (str): Text submitted by the user.
    history: Conversation history supplied by the interface (unused).

    Returns:
    str: The chatbot's reply, or a short prompt when the message is blank.
    """
    # Blank input previously fell through and returned None implicitly
    if not message.strip():
        return "Please type a message."
    return chatbot_response(message)


# Create the display components first (same order in which they would be
# evaluated as inline arguments), then hand them to the chat interface
chat_window = gr.Chatbot(height=300)
message_box = gr.Textbox(
    placeholder="Ask me a yes or no question", container=False, scale=7
)

# NOTE(review): retry_btn / undo_btn / clear_btn are version-dependent
# ChatInterface arguments — confirm they exist in the Gradio release pulled in
# by the unpinned `pip install -U gradio` at the top of this cell.
iface = gr.ChatInterface(
    chatbot,
    chatbot=chat_window,
    textbox=message_box,
    title="Chat with Bot",
    description="Ask questions about Jobs & Courses",
    theme="soft",
    examples=["Do you have any suggestions for a Python course?"],
    cache_examples=True,
    retry_btn=None,
    undo_btn="Delete Previous",
    clear_btn="Clear",
)

# Start the web UI
iface.launch()