# Importing Libraries

In [3]:
import tensorflow as tf
import numpy as np
import pandas as pd
import json
import string
import nltk
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
from sklearn.preprocessing import LabelEncoder
from tensorflow.keras.layers import Input, Embedding, LSTM, Dense, Bidirectional, Dropout, Flatten
from tensorflow.keras.models import Model
from tensorflow.keras.optimizers import Adam
import random

# Content loading

The content is a JSON file in which each intent has a tag, a list of patterns (example user inputs), and a list of responses.

In [4]:
# Load the intent definitions (tags, patterns, responses) from the JSON file.
# The file contains Bangla text, so the encoding is pinned to UTF-8 rather than
# relying on the platform default, which is not UTF-8 on every system.
with open("/content/BanglaHealthcareChatbotData.json", encoding="utf-8") as BanglaHealthcareChatbotData:
  intents = json.load(BanglaHealthcareChatbotData)

In [5]:
# Display the parsed JSON to inspect its structure.
intents

{'intents': [{'tag': 'Greetings',
   'patterns': ['এই যে', 'হেলো', 'হ্যালো', 'হাই', 'হে', 'হায়'],
   'responses': ['হাই  %% USER %% ! আমি কীভাবে সাহায্য করতে পারি?',
    'হ্যালো %% USER %%, আমি আপনাকে কীভাবে সাহায্য করতে পারি?',
    'হ্যালো %% USER %% !  আমি আপনার জন্য কী করতে পারি?',
    'হাই %% USER %%, আমি আপনাকে কীভাবে সাহায্য করতে পারি?',
    'হাই  %% USER %%, আমি আপনার জন্য কী করতে পারি?'],
   'context': ['']},
  {'tag': 'CourtesyGreeting',
   'patterns': ['আপনি কেমন আছেন?',
    'হাই, আপনি কেমন আছেন?',
    'হ্যালো, আপনি কেমন আছেন?',
    'কি অবস্থা , আপনার?',
    'কি খবর , আপনার?',
    'কি খবর ?',
    'কি অবস্থা ?',
    'তুমি কেমন আছো?',
    'কেমন আছো?',
    'আপনি ভাল করছেন আশা করি?',
    'ভাল আছো আশা করি?',
    'ভাল আছো ?',
    'হ্যালো আশা করি আপনি ভাল করছেন?'],
   'responses': ['আমি খুব ভালো, আপনি কেমন আছেন? ',
    'হ্যালো, আপনি কেমন আছেন? আমি ভালো ',
    'হ্যালো, আমি ভাল আছি আপনাকে ধন্যবাদ',
    'হাই, আমি ভাল আছি আপনাকে ধন্যবাদ, কেমন আছেন?',
    'হাই, ভাল আপনাকে ধন্যবাদ, আপনি ক

In [6]:
# Flatten the intents into training data:
#   responses - maps each tag to its list of candidate replies
#   patterns  - every example utterance, in file order
#   tags      - the tag of the intent each pattern came from (parallel to patterns)
responses = {intent['tag']: intent["responses"] for intent in intents['intents']}
patterns = [line for intent in intents['intents'] for line in intent['patterns']]
tags = [intent['tag'] for intent in intents['intents'] for _ in intent['patterns']]

print("this is tags", tags)
print("this is input", patterns)
print("this is responses", responses)

this is tags ['Greetings', 'Greetings', 'Greetings', 'Greetings', 'Greetings', 'Greetings', 'CourtesyGreeting', 'CourtesyGreeting', 'CourtesyGreeting', 'CourtesyGreeting', 'CourtesyGreeting', 'CourtesyGreeting', 'CourtesyGreeting', 'CourtesyGreeting', 'CourtesyGreeting', 'CourtesyGreeting', 'CourtesyGreeting', 'CourtesyGreeting', 'CourtesyGreeting', 'CurrentHumanQuery', 'CurrentHumanQuery', 'CurrentHumanQuery', 'CurrentHumanQuery', 'CurrentHumanQuery', 'CurrentHumanQuery', 'CurrentHumanQuery', 'NameQuery', 'NameQuery', 'NameQuery', 'NameQuery', 'NameQuery', 'NameQuery', 'RealNameQuery', 'RealNameQuery', 'RealNameQuery', 'RealNameQuery', 'RealNameQuery', 'RealNameQuery', 'RealNameQuery', 'TimeQuery', 'TimeQuery', 'TimeQuery', 'TimeQuery', 'TimeQuery', 'TimeQuery', 'TimeQuery', 'Thanks', 'Thanks', 'Thanks', 'Thanks', 'Thanks', 'Thanks', 'NotTalking2U', 'NotTalking2U', 'NotTalking2U', 'NotTalking2U', 'NotTalking2U', 'NotTalking2U', 'NotTalking2U', 'Jokes', 'Jokes', 'Jokes', 'Jokes', 'Joke

# Data Preprocessing

Now we construct a DataFrame consisting of the patterns and their respective tags.

In [7]:
# One row per training example: the utterance and the intent tag it belongs to.
data = pd.DataFrame(dict(inputs=patterns, tags=tags))

In [8]:
# Show the assembled pattern/tag table.
data

Unnamed: 0,inputs,tags
0,এই যে,Greetings
1,হেলো,Greetings
2,হ্যালো,Greetings
3,হাই,Greetings
4,হে,Greetings
...,...,...
484,রিপোর্ট জানান,Result
485,রেজাল্ট বলুন,Result
486,রেজাল্ট বলুন আমাকে,Result
487,দয়া করে বলুন আমার রেজাল্ট কি,Result


# Tokenizing & Padding

In [9]:
# Word-level tokenizer; num_words caps how many of the most frequent words are
# kept when texts are converted to integer sequences.
tokenizer = Tokenizer(num_words = 2000)

In [10]:
# Build the word index from the training patterns.
tokenizer.fit_on_texts(data["inputs"])

In [11]:
# Convert each pattern into a list of integer word indices.
train = tokenizer.texts_to_sequences(data["inputs"])

In [12]:
# Inspect the integer sequences (one variable-length list per pattern).
train

[[248, 183],
 [249],
 [145],
 [184],
 [250],
 [251],
 [24, 92, 146],
 [184, 24, 92, 146],
 [145, 24, 92, 146],
 [6, 185, 20],
 [6, 186, 20],
 [6, 186],
 [6, 185],
 [58, 92, 110],
 [92, 110],
 [24, 21, 187, 147, 111],
 [21, 110, 147, 111],
 [21, 110],
 [145, 147, 111, 24, 21, 187],
 [22, 37, 6],
 [58, 59, 93, 188],
 [24, 40, 41, 111, 252],
 [24, 6, 41, 253, 40, 6],
 [254, 19, 42, 255],
 [24, 59, 256, 37, 257],
 [22, 37, 60],
 [20, 37, 6],
 [40, 258, 93, 94, 148],
 [40, 6, 112, 259, 149, 148],
 [20, 260, 112, 93, 188],
 [58, 261],
 [59, 150, 37, 262],
 [20, 67, 37, 6],
 [20, 67, 37, 6, 151, 26],
 [150, 67, 37, 6],
 [20, 67, 37, 60],
 [20, 67, 37],
 [20, 67, 37, 151, 26],
 [20, 67, 37, 151, 26],
 [263, 264, 113],
 [152, 189, 113],
 [58, 6, 190, 152, 265, 113],
 [58, 6, 190, 183, 152, 189, 113],
 [24, 59, 68, 94, 266],
 [60, 6, 68],
 [68],
 [153, 33, 46],
 [153, 33, 46],
 [153, 33],
 [46],
 [46],
 [267, 268],
 [40, 20, 19, 42, 114, 2],
 [40, 150, 19, 42, 2, 95],
 [20, 19, 42, 114, 2],
 [20

In [13]:
# Pad all sequences to the length of the longest one so they form a 2-D array;
# pad_sequences adds the zeros at the front by default.
x_train = pad_sequences(train)

In [14]:
# Inspect the padded input matrix.
x_train

array([[  0,   0,   0, ...,   0, 248, 183],
       [  0,   0,   0, ...,   0,   0, 249],
       [  0,   0,   0, ...,   0,   0, 145],
       ...,
       [  0,   0,   0, ..., 144,  60,  59],
       [  0,   0,   0, ...,  22, 144,   6],
       [  0,   0,   0, ...,  22, 143,   6]], dtype=int32)

# Encoding the output

In [15]:
# Encoder that maps intent tag strings to integer class ids (and back).
le = LabelEncoder()

In [16]:
# Learn the tag -> id mapping and encode every row's tag as its integer id.
y_train = le.fit_transform(data["tags"])

In [17]:
# Inspect the encoded labels.
y_train

array([10, 10, 10, 10, 10, 10,  5,  5,  5,  5,  5,  5,  5,  5,  5,  5,  5,
        5,  5,  6,  6,  6,  6,  6,  6,  6, 15, 15, 15, 15, 15, 15, 19, 19,
       19, 19, 19, 19, 19, 23, 23, 23, 23, 23, 23, 23, 22, 22, 22, 22, 22,
       22, 17, 17, 17, 17, 17, 17, 17, 13, 13, 13, 13, 13, 25, 25, 25, 25,
       25, 25, 21, 21, 21, 21, 21, 21, 21,  9,  9,  9,  9,  9,  9,  4,  4,
        4,  4,  4,  4,  3,  3,  3,  3,  3,  3,  2,  2,  2,  2,  2,  2,  2,
       12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12,
       12, 12, 12, 12, 12, 12, 12, 12, 12, 12,  0,  0,  0,  0,  0,  0,  0,
        0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0, 16, 16, 16,
       16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  8,  8,
        8,  8,  8,  8,  8,  8,  8,  8,  8,  8,  8,  8,  8,  8,  8,  8,  8,
        8,  8,  8,  8,  8,  8,  8,  8,  8,  8,  8,  8,  8,  8,  8,  8,  8,
        8,  8,  8,  8,  8,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,
        1,  1,  1,  1,  1

In [18]:
# Padded sequence length (number of columns in x_train); reused as the model's
# input length and when padding user input at chat time.
input_shape = x_train.shape[1]
print(input_shape)

11


In [19]:
# Number of distinct words the tokenizer indexed (sizes the Embedding layer).
unique_words = len(tokenizer.word_index)
# Number of distinct intent tags (sizes the softmax output layer).
output_length = le.classes_.shape[0]
print("Number of unique words: ", unique_words)
print("Output size: ", output_length)

Number of unique words:  401
Output size:  26


## Constructing a Neural Network


The initial layer is an Embedding layer, facilitating the transformation of input tokens into dense vectors of fixed size. Subsequently, two Bidirectional Long Short-Term Memory (LSTM) layers are configured to return sequences. A Dropout layer follows, providing regularization to prevent overfitting. The model then flattens the output and passes it through a Dense layer. Finally, the output layer consists of a Dense layer with output_length units and a softmax activation function.

In [20]:
# Intent classifier: embedding -> two stacked BiLSTMs -> dropout -> flatten ->
# small dense layer -> softmax over the intent tags.
model = tf.keras.Sequential([
    # unique_words + 1 rows: word indices start at 1 and 0 is the padding value.
    Embedding(unique_words + 1, 50, input_length=input_shape),
    # Both recurrent layers return the full sequence so that Flatten below
    # receives one feature vector per time step.
    Bidirectional(LSTM(10, return_sequences=True)),
    Bidirectional(LSTM(10, return_sequences=True)),
    Dropout(0.5),
    Flatten(),
    Dense(units=10, activation='relu'),
    # One probability per intent tag.
    Dense(units=output_length, activation='softmax'),
])

In [21]:
# Sparse categorical cross-entropy is used because y_train holds integer class
# ids rather than one-hot vectors.
model.compile(loss = "sparse_categorical_crossentropy", optimizer ='adam', metrics=['accuracy'])

# Training the Model

Here, we train the neural network. Training runs on the dataset for 400 epochs.

In [27]:
# Train on every example (no validation data is passed). BanglaBot is the object
# returned by fit(); its .history dict is read in a later cell to report accuracy.
BanglaBot = model.fit(x_train, y_train, epochs=400)


Epoch 2/400
Epoch 3/400
Epoch 4/400
Epoch 5/400
Epoch 6/400
Epoch 7/400
Epoch 8/400
Epoch 9/400
Epoch 10/400
Epoch 11/400
Epoch 12/400
Epoch 13/400
Epoch 14/400
Epoch 15/400
Epoch 16/400
Epoch 17/400
Epoch 18/400
Epoch 19/400
Epoch 20/400
Epoch 21/400
Epoch 22/400
Epoch 23/400
Epoch 24/400
Epoch 25/400
Epoch 26/400
Epoch 27/400
Epoch 28/400
Epoch 29/400
Epoch 30/400
Epoch 31/400
Epoch 32/400
Epoch 33/400
Epoch 34/400
Epoch 35/400
Epoch 36/400
Epoch 37/400
Epoch 38/400
Epoch 39/400
Epoch 40/400
Epoch 41/400
Epoch 42/400
Epoch 43/400
Epoch 44/400
Epoch 45/400
Epoch 46/400
Epoch 47/400
Epoch 48/400
Epoch 49/400
Epoch 50/400
Epoch 51/400
Epoch 52/400
Epoch 53/400
Epoch 54/400
Epoch 55/400
Epoch 56/400
Epoch 57/400
Epoch 58/400
Epoch 59/400
Epoch 60/400
Epoch 61/400
Epoch 62/400
Epoch 63/400
Epoch 64/400
Epoch 65/400
Epoch 66/400
Epoch 67/400
Epoch 68/400
Epoch 69/400
Epoch 70/400
Epoch 71/400
Epoch 72/400
Epoch 73/400
Epoch 74/400
Epoch 75/400
Epoch 76/400
Epoch 77/400
Epoch 78/400
Epoch 7

In [28]:
# Accuracy of the final epoch. This is measured on the same data the model was
# fit on, so it is a training accuracy, not a held-out estimate.
print("Accuracy: ",BanglaBot.history['accuracy'][-1])

Accuracy:  0.8179959058761597


# Chatting with bot

In [None]:

# Interactive chat loop: read a message, classify its intent with the trained
# model, and reply with a random response registered for that intent. The loop
# ends when the 'GoodBye' intent is predicted or when input is closed/interrupted.
print("BanglaBot: হ্যালো ! আমি Bangla bot । আপনাকে কিভাবে সাহায্য করতে পারি ?")
while True:
    try:
        user_input = input("You: ")
    except (EOFError, KeyboardInterrupt):
        # Input stream closed or the cell was interrupted: stop without a traceback.
        break

    # texts_to_sequences expects a list of texts and returns one index list per text.
    sequences = tokenizer.texts_to_sequences([user_input])
    if not sequences[0]:
        # Blank input or no word known to the tokenizer. Padding this would feed an
        # all-zero row to the model and produce an arbitrary intent, so ask the user
        # to rephrase instead ("Sorry, I did not understand. Please say it another way.").
        print("BanglaBot: ", "দুঃখিত, আমি বুঝতে পারিনি। অনুগ্রহ করে অন্যভাবে বলুন।")
        continue

    # Pad/truncate to the sequence length the model was trained with.
    prediction_input = pad_sequences(sequences, maxlen=input_shape)

    # verbose=0 keeps Keras' progress bar out of the conversation.
    output = model.predict(prediction_input, verbose=0)
    # One row of class probabilities per input; take the best class of row 0.
    output = output.argmax(axis=-1)[0]

    response_tag = le.inverse_transform([output])[0]
    print("BanglaBot: ", random.choice(responses[response_tag]))
    if response_tag == 'GoodBye':
        break

BanglaBot: হ্যালো ! আমি Bangla bot । আপনাকে কিভাবে সাহায্য করতে পারি ?
