In [1]:
# Mount Google Drive into the Colab filesystem (the google.colab package is only available on Colab).
from google.colab import drive
drive.mount("/content/drive")
# Drive folder that holds the project files; intents.json is read from here later.
data_root= '/content/drive/My Drive/ChatBot'

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [2]:
#Import the libraries
import json
import string
import random
import nltk
import numpy as np
from nltk.stem import WordNetLemmatizer


In [3]:
# Tokenizer data for nltk.word_tokenize. Recent NLTK releases (3.9+) look up the
# 'punkt_tab' resource instead of the pickled 'punkt' models, so request both to
# keep the tokenization cell working on old and new runtimes alike. On an older
# NLTK the extra download just reports that the package is unknown.
nltk.download('punkt')
nltk.download('punkt_tab')
# WordNet corpus; kept as the last statement so the cell still shows its return value.
nltk.download('wordnet')

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package wordnet to /root/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


True

In [4]:
#Load the dataset
# Read the intents file through a context manager so the handle is closed as soon
# as the text is in memory, and pin the encoding (JSON text is UTF-8) so the
# result does not depend on the platform default.
with open(data_root + '/intents.json', encoding='utf-8') as intents_file:
    data_file = intents_file.read()
# Parsed intents: a dict with an 'intents' list (each entry has 'tag' and 'patterns').
data=json.loads(data_file)

**Dataset Loading and Exploration:**

In [5]:
# Echo the parsed intents dictionary to inspect its structure (prints the whole file).
data

{'intents': [{'tag': 'hello',
   'patterns': ['Hey there',
    'Greetings',
    'Good evening',
    'Hello, how can I help?',
    "Hi, what's new?",
    'Hey, good to see you',
    "What's happening?",
    'Greetings and salutations',
    'Hiya!',
    'Hello, nice to meet you',
    'Hola!',
    'Hey, how are you doing?',
    'Good day!',
    'Hello, lovely people',
    "Hi, how's your day?",
    'Greetings, Earthling!',
    'Hello, world!',
    "Hey, what's going on?",
    'Hi, how can I assist you?',
    'Hello, friend!',
    'Hi there, any news?',
    'Greetings, fellow human',
    'Hello, sunshine!',
    'Good to see you',
    'Hi, any updates?',
    "Hey, how's life?",
    'Hello, beautiful souls',
    'Hi, need any help?',
    "Hey, what's the word?",
    'Hello, fantastic person',
    "Hi, what's the buzz?",
    "Hey, how's everything?",
    'Hello, anyone home?',
    "Hi, what's cooking?",
    "Hey, how's your mood?",
    'Hello, dear friend',
    "Hi, what's the scoop?",
    'G

In [6]:
import pandas as pd

# Flatten the nested intents into (pattern, tag) pairs: one pair per example utterance
patterns_and_tags = [
    (pattern, intent['tag'])
    for intent in data['intents']
    for pattern in intent['patterns']
]

# Tabulate the pairs: 'Pattern' holds the utterance text, 'Tag' its intent label
df = pd.DataFrame(
    patterns_and_tags,
    columns=['Pattern', 'Tag'],
)

# Show the resulting table
df



Unnamed: 0,Pattern,Tag
0,Hey there,hello
1,Greetings,hello
2,Good evening,hello
3,"Hello, how can I help?",hello
4,"Hi, what's new?",hello
...,...,...
1138,Recommend a health book that explores the holi...,health
1139,How does proper hydration impact skin health a...,health
1140,Tell me about the benefits of engaging in hobb...,health
1141,What are some evidence-based strategies for im...,health


In [7]:
# Randomise the row order with a fixed seed, then renumber the rows from 0.
# Note: this rebinds `df`, so re-running the cell shuffles the already-shuffled frame.
df = (
    df.sample(frac=1, random_state=42)
      .reset_index(drop=True)
)

# Show the shuffled table
df

Unnamed: 0,Pattern,Tag
0,How long have you been alive,age
1,How do scientists study the origins of the uni...,science
2,Going through a tough day,bad
3,Ciao for now,goodbye
4,Tell me about your abilities and skills,actions
...,...,...
1138,Tell me about the significance of spices in di...,food
1139,Share tips for maintaining a balanced and heal...,health
1140,Share insights on the benefits of laughter for...,health
1141,What's the future of virtual reality technology?,technology


In [8]:
# Per-column summary: row count, number of distinct values, most frequent value and its frequency.
df.describe()

Unnamed: 0,Pattern,Tag
count,1143,1143
unique,1140,24
top,Until we meet again,still there
freq,2,51


In [9]:
# Column dtypes and non-null counts, to confirm there are no missing patterns or tags.
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1143 entries, 0 to 1142
Data columns (total 2 columns):
 #   Column   Non-Null Count  Dtype 
---  ------   --------------  ----- 
 0   Pattern  1143 non-null   object
 1   Tag      1143 non-null   object
dtypes: object(2)
memory usage: 18.0+ KB


In [10]:
# Number of example patterns per tag, to check how balanced the classes are.
df['Tag'].value_counts()

still there    51
wait           48
goodbye        48
weather        48
men            48
women          48
health         48
thanks         48
science        48
food           48
creator        48
good           48
programming    47
feeling        47
city           47
technology     47
age            47
hello          47
music          47
leisure        47
job            47
actions        47
bad            47
travel         47
Name: Tag, dtype: int64

**Text Preprocessing:**

In [11]:
#Lowercase the pattern
# Normalise case so the same word in different casing becomes a single term.
# NOTE: overwrites df['Pattern']; the original casing is not kept anywhere.
df['Pattern']= df['Pattern'].str.lower()
df

Unnamed: 0,Pattern,Tag
0,how long have you been alive,age
1,how do scientists study the origins of the uni...,science
2,going through a tough day,bad
3,ciao for now,goodbye
4,tell me about your abilities and skills,actions
...,...,...
1138,tell me about the significance of spices in di...,food
1139,share tips for maintaining a balanced and heal...,health
1140,share insights on the benefits of laughter for...,health
1141,what's the future of virtual reality technology?,technology


In [12]:
import string
# Characters to strip from every pattern: the ASCII punctuation set from the string module.
exclude=string.punctuation
exclude  # no visible effect here: only a cell's final expression is displayed

#Remove Punctuations
def remove_punc1(text, exclude):
    """Return ``text`` with every character that occurs in ``exclude`` removed.

    Args:
        text: String to clean.
        exclude: String whose individual characters are dropped from ``text``.

    Returns:
        A new string made of the remaining characters in their original order.
    """
    kept_chars = [ch for ch in text if ch not in exclude]
    return ''.join(kept_chars)


# Strip punctuation from every pattern; `exclude` is forwarded to remove_punc1 as a keyword argument.
# Apostrophes are removed too, so e.g. "what's" becomes "whats".
df['Pattern']=df['Pattern'].apply(remove_punc1, exclude=exclude)
df

Unnamed: 0,Pattern,Tag
0,how long have you been alive,age
1,how do scientists study the origins of the uni...,science
2,going through a tough day,bad
3,ciao for now,goodbye
4,tell me about your abilities and skills,actions
...,...,...
1138,tell me about the significance of spices in di...,food
1139,share tips for maintaining a balanced and heal...,health
1140,share insights on the benefits of laughter for...,health
1141,whats the future of virtual reality technology,technology


In [13]:
#Tokenization
import nltk
from nltk.tokenize import word_tokenize
# Replace each pattern string with its list of word tokens.
# word_tokenize relies on the NLTK tokenizer data downloaded in the earlier nltk.download cell.
df['Pattern']=df['Pattern'].apply(word_tokenize)
df

Unnamed: 0,Pattern,Tag
0,"[how, long, have, you, been, alive]",age
1,"[how, do, scientists, study, the, origins, of,...",science
2,"[going, through, a, tough, day]",bad
3,"[ciao, for, now]",goodbye
4,"[tell, me, about, your, abilities, and, skills]",actions
...,...,...
1138,"[tell, me, about, the, significance, of, spice...",food
1139,"[share, tips, for, maintaining, a, balanced, a...",health
1140,"[share, insights, on, the, benefits, of, laugh...",health
1141,"[whats, the, future, of, virtual, reality, tec...",technology


In [14]:
#Apply Lemmatization to the Pattern column
# Install spaCy and its small English pipeline into the runtime (shell commands).
# NOTE(review): the versions are unpinned, and the installer output advises a
# runtime restart before the freshly installed package can be loaded.
!pip install spacy
!python -m spacy download en_core_web_sm

import spacy

# Load the spaCy language model
# `nlp` is a notebook-level global used by lemmatize_tokens and by the inference cell.
nlp = spacy.load('en_core_web_sm')


def lemmatize_tokens(tokens):
    """Lemmatize a token list with the notebook-level spaCy pipeline ``nlp``.

    The tokens are joined with single spaces and parsed as one sentence, so
    spaCy applies its own tokenization; the returned list can therefore have a
    different length than ``tokens``.

    Args:
        tokens: Iterable of token strings.

    Returns:
        List with the ``lemma_`` of every spaCy token in the joined text.
    """
    doc = nlp(" ".join(tokens))
    return [tok.lemma_ for tok in doc]

# Apply lemmatization to the tokenized 'Pattern' column (each cell is a list of tokens)
df['Pattern'] = df['Pattern'].apply(lemmatize_tokens)
df

Collecting en-core-web-sm==3.7.1
  Using cached https://github.com/explosion/spacy-models/releases/download/en_core_web_sm-3.7.1/en_core_web_sm-3.7.1-py3-none-any.whl (12.8 MB)
[38;5;2m✔ Download and installation successful[0m
You can now load the package via spacy.load('en_core_web_sm')
[38;5;3m⚠ Restart to reload dependencies[0m
If you are in a Jupyter or Colab notebook, you may need to restart Python in
order to load all the package's dependencies. You can do this by selecting the
'Restart kernel' or 'Restart runtime' option.


Unnamed: 0,Pattern,Tag
0,"[how, long, have, you, be, alive]",age
1,"[how, do, scientist, study, the, origin, of, t...",science
2,"[go, through, a, tough, day]",bad
3,"[ciao, for, now]",goodbye
4,"[tell, I, about, your, ability, and, skill]",actions
...,...,...
1138,"[tell, I, about, the, significance, of, spice,...",food
1139,"[share, tip, for, maintain, a, balanced, and, ...",health
1140,"[share, insight, on, the, benefit, of, laughte...",health
1141,"[what, s, the, future, of, virtual, reality, t...",technology


**TF-IDF representation of Pattern:**

In [15]:
#TF-IDF representation
from sklearn.feature_extraction.text import TfidfVectorizer
# NOTE: this join is not idempotent. Running the cell a second time joins the
# characters of the already-joined strings, so re-run from the top after edits.
df['Pattern'] = df['Pattern'].apply(' '.join) #Convert the Pattern from a list to a string format
# Default TfidfVectorizer settings; its default token pattern only keeps tokens of
# two or more characters, so one-letter lemmas such as "I", "a" or "s" are dropped.
tfidf=TfidfVectorizer()
# NOTE(review): the vectorizer is fitted on ALL rows before the train/test split
# below, so the vocabulary and IDF weights also reflect the test patterns.
# Consider fitting on the training split only.
x=tfidf.fit_transform(df['Pattern'])

In [16]:
# Print the sparse TF-IDF matrix as (row, column) weight entries.
print(x)

  (0, 28)	0.5634406364718932
  (0, 80)	0.2297792640511401
  (0, 1022)	0.20625238045003955
  (0, 419)	0.3987161470098498
  (0, 535)	0.5634406364718932
  (0, 449)	0.33280050755478713
  (1, 955)	0.4504386797865168
  (1, 626)	0.1877276366495164
  (1, 641)	0.43767222268603445
  (1, 907)	0.30751373654823533
  (1, 873)	0.42687833700608213
  (1, 802)	0.39522959072621666
  (1, 271)	0.2480500665007692
  (1, 449)	0.2752839952333749
  (2, 234)	0.4232843742779995
  (2, 928)	0.5499475091745847
  (2, 916)	0.5343607454296905
  (2, 397)	0.48254188355890737
  (3, 621)	0.5939893970037836
  (3, 362)	0.2990158617238271
  (3, 161)	0.7468374057883275
  (4, 831)	0.557221794999077
  (4, 36)	0.3578033064233121
  (4, 0)	0.5918485202865275
  (4, 1023)	0.24017526315477283
  :	:
  (1140, 819)	0.21714608313237413
  (1140, 629)	0.2340944377116377
  (1140, 421)	0.302051031565027
  (1140, 572)	0.34300113736885474
  (1140, 89)	0.32999836965640955
  (1140, 36)	0.23079778806739581
  (1140, 362)	0.1842598607740269
  (1140,

**Label Encode the Tag:**

In [17]:
import joblib
from sklearn.preprocessing import LabelEncoder

# Fit an encoder that maps every distinct tag string to an integer class id
label_encoder = LabelEncoder().fit(df['Tag'])

# Integer target vector: one class id per row of df
y = label_encoder.transform(df['Tag'])
# Persist the fitted encoder so class ids can be decoded back to tags later
joblib.dump(label_encoder, 'label_encoder.pkl')




['label_encoder.pkl']

In [18]:
# Print the encoded class ids (one integer per pattern).
print(y)

[ 1 16  2 ...  9 18  9]


**Split into training and testing sets:**

In [19]:
from sklearn.model_selection import train_test_split

# Split the data into training and testing sets (80% training, 20% testing)
# random_state fixes the partition so the same rows land in each split on every run.
# NOTE(review): the split is not stratified (no `stratify=y`), so per-tag
# proportions may differ between the two splits.
X_train, X_test, y_train, y_test = train_test_split(x, y, test_size=0.2, random_state=42)

# Print the shapes of the resulting sets
# X_* are sparse TF-IDF matrices (rows = patterns, columns = vocabulary terms); y_* are 1-D class-id arrays.
print("X_train shape:", X_train.shape)
print("X_test shape:", X_test.shape)
print("y_train shape:", y_train.shape)
print("y_test shape:", y_test.shape)


X_train shape: (914, 1026)
X_test shape: (229, 1026)
y_train shape: (914,)
y_test shape: (229,)


**Model Building and Training:**

In [20]:
import tensorflow as tf
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense

# Seed the Python, NumPy and TensorFlow random generators so that weight
# initialisation and batch shuffling (and therefore the reported metrics) are
# repeatable from run to run. This also seeds NumPy's global generator.
tf.keras.utils.set_random_seed(42)

# Convert sparse matrix to dense array
# The Dense layers below are fed plain arrays; the matrices are small enough here
# (about a thousand rows by about a thousand terms) to densify in memory.
X_train_dense = X_train.toarray()
X_test_dense = X_test.toarray()

# Build the model
# Feed-forward classifier: TF-IDF vector -> 128 -> 64 -> one softmax unit per tag.
model = Sequential()
model.add(Dense(128, input_shape=(X_train_dense.shape[1],), activation='relu'))
model.add(Dense(64, activation='relu'))
# One output unit per distinct class id present in y
model.add(Dense(len(set(y)), activation='softmax'))

# Compile the model
# sparse_categorical_crossentropy takes the integer class ids directly (no one-hot encoding).
model.compile(optimizer='adam', loss='sparse_categorical_crossentropy', metrics=['accuracy'])

# Train the model
# NOTE(review): the held-out test split is also passed as validation data, so the
# per-epoch validation figures and the final test score come from the same rows.
# Use a separate validation split if these figures drive any tuning.
model.fit(X_train_dense, y_train, epochs=45, batch_size=32, validation_data=(X_test_dense, y_test))


Epoch 1/45
Epoch 2/45
Epoch 3/45
Epoch 4/45
Epoch 5/45
Epoch 6/45
Epoch 7/45
Epoch 8/45
Epoch 9/45
Epoch 10/45
Epoch 11/45
Epoch 12/45
Epoch 13/45
Epoch 14/45
Epoch 15/45
Epoch 16/45
Epoch 17/45
Epoch 18/45
Epoch 19/45
Epoch 20/45
Epoch 21/45
Epoch 22/45
Epoch 23/45
Epoch 24/45
Epoch 25/45
Epoch 26/45
Epoch 27/45
Epoch 28/45
Epoch 29/45
Epoch 30/45
Epoch 31/45
Epoch 32/45
Epoch 33/45
Epoch 34/45
Epoch 35/45
Epoch 36/45
Epoch 37/45
Epoch 38/45
Epoch 39/45
Epoch 40/45
Epoch 41/45
Epoch 42/45
Epoch 43/45
Epoch 44/45
Epoch 45/45


<keras.src.callbacks.History at 0x7aa58415d090>

**Model Evaluation:**

In [21]:
# Evaluate the model on the test set
# These are the same rows that were passed as validation_data during training.
eval_results = model.evaluate(X_test_dense, y_test)

# Print the evaluation results
# eval_results is [loss, accuracy], following the loss and metrics given to compile().
print("Test Loss:", eval_results[0])
print("Test Accuracy:", eval_results[1])


Test Loss: 0.26431751251220703
Test Accuracy: 0.903930127620697


In [22]:
# Save the model
# Written to the current working directory in HDF5 (.h5) format; the inference
# cell loads it back under this exact file name.
model.save('chatbot_model.h5')
# Save the TF-IDF vectorizer
# The fitted vocabulary and IDF weights are needed to encode user input at inference time.
joblib.dump(tfidf, 'tfidf_vectorizer.pkl')



  saving_api.save_model(


['tfidf_vectorizer.pkl']

**User Input and Interaction with the ChatBot:**

In [24]:
import string
import nltk
import joblib
from nltk.tokenize import word_tokenize
import spacy
from sklearn.feature_extraction.text import TfidfVectorizer
import tensorflow as tf
from tensorflow.keras.models import load_model
import numpy as np

# Load the spaCy language model
nlp = spacy.load('en_core_web_sm')
# Load the pre-trained model
loaded_model = load_model('chatbot_model.h5')
# Load the TF-IDF vectorizer
tfidf_vectorizer = joblib.load('tfidf_vectorizer.pkl')
# Load the label encoder saved during training, so decoding predictions does not
# depend on the training cells having been run in this kernel session.
# (joblib files are pickles: only load artefacts produced by this notebook.)
label_encoder = joblib.load('label_encoder.pkl')

# Function to preprocess user input for prediction
def preprocess_input_for_prediction(vectorizer, user_input):
    """Turn raw user text into the dense feature row expected by the model.

    Repeats the training-time preprocessing: lowercase, strip ASCII
    punctuation, NLTK word tokenization, spaCy lemmatization (via the
    notebook-level ``nlp``), then vectorization.

    Args:
        vectorizer: Fitted vectorizer exposing ``transform``.
        user_input: Raw text typed by the user.

    Returns:
        Dense array with a single row, obtained from ``vectorizer.transform``.
    """
    punctuation_table = str.maketrans('', '', string.punctuation)
    cleaned_text = user_input.lower().translate(punctuation_table)

    # Tokenize with NLTK, then let spaCy lemmatize the space-joined tokens
    tokens = word_tokenize(cleaned_text)
    lemmas = [tok.lemma_ for tok in nlp(" ".join(tokens))]

    # Vectorize the single document and densify the sparse result
    features = vectorizer.transform([' '.join(lemmas)])
    return features.toarray()

# Function to get chatbot response
def get_response(model, vectorizer, label_encoder, user_input):
    """Pick a reply for ``user_input`` by classifying it into an intent tag.

    Args:
        model: Trained classifier; ``model.predict`` is called on a one-row
            dense feature array.
        vectorizer: Fitted vectorizer passed on to
            ``preprocess_input_for_prediction``.
        label_encoder: Fitted encoder whose ``inverse_transform`` maps the
            predicted class index back to its tag string.
        user_input: Raw text typed by the user.

    Returns:
        str: A response drawn at random from the first intent in the
        notebook-level ``data`` dict whose tag matches the prediction. A fixed
        apology is returned instead when the input contains no known
        vocabulary, or when the matched intent has no responses.
    """
    fallback = "I'm sorry, I don't understand that."

    # Preprocess user input for prediction
    input_dense = preprocess_input_for_prediction(vectorizer, user_input)

    # An all-zero feature row means the input was blank or contained no word from
    # the training vocabulary; a prediction on it would be arbitrary.
    if not input_dense.any():
        return fallback

    # Index of the highest-scoring class for the single input row
    predicted_class = np.argmax(model.predict(input_dense))
    # Decode the predicted class to get the original tag
    predicted_tag = label_encoder.inverse_transform([predicted_class])[0]

    for intent in data['intents']:
        if intent['tag'] == predicted_tag:
            responses = intent.get('responses')
            # Guard the inner list: np.random.choice raises on an empty sequence,
            # and an intent may have no 'responses' entry at all.
            if responses:
                # str() turns the NumPy string scalar into a plain Python str
                return str(np.random.choice(responses))
            break  # only the first matching intent is considered

    return fallback

# User interaction loop: read a line, answer it, repeat until the user types 'exit'
while True:
    # Surrounding whitespace is ignored, so "exit " also ends the chat
    user_input = input("You: ").strip()

    if user_input.lower() == 'exit':
        break
    # Nothing typed: prompt again instead of asking the model to classify ""
    if not user_input:
        continue

    chatbot_response = get_response(loaded_model, tfidf_vectorizer, label_encoder, user_input)
    print("Chatbot:", chatbot_response)




You: Hey
Chatbot: Hey!
You: What can you do?
Chatbot: I am a Chat bot who can do some basic chat with you. I am not trained enough to do problem solving. I am created using basic NLP techniques and trained on a small amount of data.
You: How old are you?
Chatbot: I was born in 2024. I was made by Tarun Kumar Behera.
You: What are your hobbies?
Chatbot: I enjoy various leisure activities such as reading, listening to music, and learning new things during my free time.
You: Who created you?
Chatbot: I was created by Tarun Kumar Behera in 2024 as a beginner-level NLP project. He has completed his B.Tech in Biomedical Engineering from National Institute of Technology, Rourkela. He is skilled in Data Science, Machine Learning, and Artificial Intelligence. Check out his [GitHub](https://github.com/Tarun304) and [LinkedIn](https://www.linkedin.com/in/tarun-kumar-behera-b943541ba/) profiles for more details.
You: Tell me about Music.
Chatbot: Music is a universal language that I appreciate. Ho