In [1]:
import random
import json
import pickle
import numpy as np
import tensorflow as tf
import nltk
from nltk.stem import WordNetLemmatizer
from nltk.tokenize import word_tokenize

In [2]:
!pip install datasets

Collecting datasets
  Downloading datasets-3.2.0-py3-none-any.whl.metadata (20 kB)
Collecting dill<0.3.9,>=0.3.0 (from datasets)
  Downloading dill-0.3.8-py3-none-any.whl.metadata (10 kB)
Collecting xxhash (from datasets)
  Downloading xxhash-3.5.0-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (12 kB)
Collecting multiprocess<0.70.17 (from datasets)
  Downloading multiprocess-0.70.16-py311-none-any.whl.metadata (7.2 kB)
Collecting fsspec<=2024.9.0,>=2023.1.0 (from fsspec[http]<=2024.9.0,>=2023.1.0->datasets)
  Downloading fsspec-2024.9.0-py3-none-any.whl.metadata (11 kB)
Downloading datasets-3.2.0-py3-none-any.whl (480 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m480.6/480.6 kB[0m [31m9.2 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading dill-0.3.8-py3-none-any.whl (116 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m116.3/116.3 kB[0m [31m8.0 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading fsspec-2024.9.0-py3-none-any.whl (1

In [3]:
from datasets import load_dataset
dataset = load_dataset("json", data_files="intents.json")


Generating train split: 0 examples [00:00, ? examples/s]

In [10]:
print(dataset['train']['intents'])

[[{'tag': 'greeting', 'patterns': ['Hi', 'Hello', 'Hey', 'Good morning', 'Good evening', 'How are you?', 'Hi there', 'Hello there', 'Hey there', 'Greetings'], 'response': ['Hello!', 'Hi, how can I assist you?', 'Good to see you!', 'Hello there!', 'Hey, how can I help?', 'Hi! What can I do for you?', 'Welcome!', 'Greetings!', 'Nice to meet you!', 'Hi, what’s up?'], 'context': ['']}, {'tag': 'goodbye', 'patterns': ['Bye', 'See you', 'Goodbye', 'Take care', 'Catch you later', 'See you soon', 'I’m leaving', 'Farewell', 'Talk to you later', 'Later'], 'response': ['Goodbye!', 'See you soon!', 'Take care!', 'Bye! Have a great day!', 'Catch you later!', 'Farewell!', 'See you again!', 'Have a good one!', 'Take care, bye!', 'Talk to you later!'], 'context': ['']}, {'tag': 'thanks', 'patterns': ['Thanks', 'Thank you', 'I appreciate it', 'Thanks a lot', 'Many thanks', 'Thanks so much', 'Thanks for your help', 'Much obliged', 'Thank you kindly', 'Thanks a million'], 'response': ["You're welcome!", 

In [11]:
with open("intents.json", "r") as file:
    intents = json.load(file)

In [12]:
# Initialize data structures
words = []
classes = []
documents = []
ignore_letters = ['?', '!', '.', ',']

In [14]:
# Download necessary NLTK datasets if you haven't already
nltk.download('punkt')
nltk.download('wordnet')

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.
[nltk_data] Downloading package wordnet to /root/nltk_data...


True

In [16]:
for intent in intents['intents']:
    for pattern in intent['patterns']:
        # Tokenize using split()
        word_list = pattern.split()
        words.extend(word_list)
        documents.append((word_list, intent['tag']))
        if intent['tag'] not in classes:
            classes.append(intent['tag'])

In [17]:
from transformers import AutoTokenizer

# Initialize a tokenizer (you can use any model, e.g., "bert-base-uncased")
tokenizer = AutoTokenizer.from_pretrained("bert-base-uncased")

for intent in intents['intents']:
    for pattern in intent['patterns']:
        # Tokenize using transformers tokenizer
        word_list = tokenizer.tokenize(pattern)
        words.extend(word_list)
        documents.append((word_list, intent['tag']))
        if intent['tag'] not in classes:
            classes.append(intent['tag'])


The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


tokenizer_config.json:   0%|          | 0.00/48.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/570 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/466k [00:00<?, ?B/s]

In [19]:
# Initialize the lemmatizer
lemmatizer = WordNetLemmatizer()

In [20]:
# Lemmatize and clean up words
words = [lemmatizer.lemmatize(word.lower()) for word in words if word not in ignore_letters]
words = sorted(set(words))

classes = sorted(set(classes))

In [21]:
# Save words and classes for future use
pickle.dump(words, open('words.pkl', 'wb'))
pickle.dump(classes, open('classes.pkl', 'wb'))

In [22]:
# Create training data
training = []
output_empty = [0] * len(classes)

In [23]:
for document in documents:
    bag = []
    word_patterns = document[0]
    word_patterns = [lemmatizer.lemmatize(word.lower()) for word in word_patterns]
    for word in words:
        bag.append(1) if word in word_patterns else bag.append(0)

    output_row = list(output_empty)
    output_row[classes.index(document[1])] = 1
    training.append(bag + output_row)

In [24]:
# Shuffle and convert training data to NumPy arrays
random.shuffle(training)
training = np.array(training)

train_x = training[:, :len(words)]
train_y = training[:, len(words):]

In [25]:
# Build the model
model = tf.keras.Sequential([
    tf.keras.layers.Dense(128, input_shape=(len(train_x[0]),), activation='relu'),
    tf.keras.layers.Dropout(0.5),
    tf.keras.layers.Dense(64, activation='relu'),
    tf.keras.layers.Dense(len(train_y[0]), activation='softmax')
])

  super().__init__(activity_regularizer=activity_regularizer, **kwargs)


In [26]:
# Compile the model
sgd = tf.keras.optimizers.SGD(learning_rate=0.01, momentum=0.9, nesterov=True)
model.compile(loss='categorical_crossentropy', optimizer=sgd, metrics=['accuracy'])


In [27]:
# Train the model
hist = model.fit(np.array(train_x), np.array(train_y), epochs=200, batch_size=5, verbose=1)


Epoch 1/200
[1m40/40[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m2s[0m 4ms/step - accuracy: 0.1831 - loss: 2.2835
Epoch 2/200
[1m40/40[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 3ms/step - accuracy: 0.3166 - loss: 2.1687
Epoch 3/200
[1m40/40[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 3ms/step - accuracy: 0.4648 - loss: 1.9991
Epoch 4/200
[1m40/40[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 2ms/step - accuracy: 0.5266 - loss: 1.8087
Epoch 5/200
[1m40/40[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 3ms/step - accuracy: 0.7392 - loss: 1.3960
Epoch 6/200
[1m40/40[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 3ms/step - accuracy: 0.7784 - loss: 0.9633
Epoch 7/200
[1m40/40[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 3ms/step - accuracy: 0.7994 - loss: 0.7408
Epoch 8/200
[1m40/40[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 3ms/step - accuracy: 0.8860 - loss: 0.5575
Epoch 9/200
[1m40/40[0m [32m━━━━━━━━━━━━━━━━━

In [28]:
# Save the model
model.save('chatbot_model.h5', hist)
print("Model training complete and saved!")



Model training complete and saved!
