In [1]:
import numpy as np
import nltk
import pandas as pd
import json
import re

In [2]:
from nltk.stem import WordNetLemmatizer

### Load data and preprocessing

In [3]:
file=open('../intents.json')
json_file=json.load(file)

In [4]:
dictionary_tag_pattern={}
dictionary_tag_response={}

In [5]:
all_tags=list(set([dic['tag'].lower().strip() for dic in json_file['intents']]))

In [55]:
all_tags

['appreciate',
 'haha',
 'exclaim',
 'greeting',
 'greetreply',
 'whatsup',
 'noanswer',
 'goodbye',
 'age',
 'inspire',
 'thanks',
 'contact',
 'activity',
 'timer',
 'programmer',
 'song',
 'covid19',
 'datetime',
 'cricket',
 'riddle',
 'options',
 'no',
 'insult',
 'karan',
 'suggest',
 'weather',
 'news',
 'jokes',
 'identity',
 'nicetty',
 'google']

In [6]:
for tag in all_tags:
    dictionary_tag_pattern[tag]=[]
    dictionary_tag_response[tag]=[]

In [7]:
for dic in json_file['intents']:
    tag=dic['tag'].lower().strip()
    
    for pattern in dic['patterns']:
        dictionary_tag_pattern[tag].append(pattern)
    for response in dic['responses']:
        dictionary_tag_response[tag].append(response)
    

In [8]:
DataF=pd.DataFrame(columns=['pattern','tag'])

In [9]:
for tag in dictionary_tag_pattern.keys():
    for pattern in dictionary_tag_pattern[tag]:
        DataF.loc[len(DataF.index)]=[pattern,tag]

In [10]:
DataF.head(10)

Unnamed: 0,pattern,tag
0,You are awesome,appreciate
1,you are the best,appreciate
2,you are great,appreciate
3,you are good,appreciate
4,haha,haha
5,lol,haha
6,rofl,haha
7,lmao,haha
8,thats funny,haha
9,Awesome,exclaim


In [11]:
X=DataF['pattern'].values
Y=DataF['tag'].values

In [12]:
def cleanText(text:str)->str:
    wordlem=WordNetLemmatizer()
    clenanedText=''
    pattern1=r'[^A-Za-z\s]'
    pattern2=r'\s+'
    text=text.lower().strip()
    text=re.sub(pattern1,'',text)
    text=re.sub(pattern2,' ',text)
    for word in text.split():
        word=wordlem.lemmatize(word,pos='v')
        clenanedText=clenanedText+word+" "
    clenanedText=clenanedText.strip()
    return clenanedText

In [13]:
cleandedX=[]
for sen in X:
    cleandedX.append(cleanText(sen))

In [14]:
cleandedX

['you be awesome',
 'you be the best',
 'you be great',
 'you be good',
 'haha',
 'lol',
 'rofl',
 'lmao',
 'thats funny',
 'awesome',
 'great',
 'i know',
 'ok',
 'yeah',
 'hi there',
 'how be you',
 'be anyone there',
 'hey',
 'hola',
 'hello',
 'good day',
 'namaste',
 'yo',
 'i be good',
 'im good',
 'i be fine',
 'im fine',
 'good',
 'whats up',
 'wazzup',
 'how be you',
 'sup',
 'how you do',
 'bye',
 'see you later',
 'goodbye',
 'get lose',
 'till next time',
 'bbye',
 'how old be you',
 'when be you make',
 'what be your age',
 'who inspire you',
 'who be your inspiration',
 'who motivate you',
 'thank',
 'thank you',
 'thats helpful',
 'awesome thank',
 'thank for help me',
 'contact developer',
 'contact karan',
 'contact programmer',
 'contact creator',
 'what be you do',
 'what be you upto',
 'set a timer',
 'who make you',
 'who design you',
 'who program you',
 'top songs',
 'best songs',
 'hot songs',
 'top songs',
 'top ten songs',
 'covid',
 'what be the time',
 'what

#### Encoding the target variable and create a vector space for training data

In [15]:
from sklearn.preprocessing import LabelEncoder
from sklearn.feature_extraction.text import TfidfVectorizer

In [16]:
X=np.array(X)
Y=np.array(Y)

In [17]:
Y

array(['appreciate', 'appreciate', 'appreciate', 'appreciate', 'haha',
       'haha', 'haha', 'haha', 'haha', 'exclaim', 'exclaim', 'exclaim',
       'exclaim', 'exclaim', 'greeting', 'greeting', 'greeting',
       'greeting', 'greeting', 'greeting', 'greeting', 'greeting',
       'greeting', 'greetreply', 'greetreply', 'greetreply', 'greetreply',
       'greetreply', 'whatsup', 'whatsup', 'whatsup', 'whatsup',
       'whatsup', 'goodbye', 'goodbye', 'goodbye', 'goodbye', 'goodbye',
       'goodbye', 'age', 'age', 'age', 'inspire', 'inspire', 'inspire',
       'thanks', 'thanks', 'thanks', 'thanks', 'thanks', 'contact',
       'contact', 'contact', 'contact', 'activity', 'activity', 'timer',
       'programmer', 'programmer', 'programmer', 'song', 'song', 'song',
       'song', 'song', 'covid19', 'datetime', 'datetime', 'datetime',
       'datetime', 'datetime', 'datetime', 'datetime', 'cricket',
       'cricket', 'riddle', 'riddle', 'riddle', 'options', 'options',
       'options', 'o

In [18]:
Encoder=LabelEncoder()
encodedY=Encoder.fit_transform(Y)

In [19]:
tfIdf=TfidfVectorizer()
tfIdf.fit(X)

In [20]:
vectorizedX=tfIdf.transform(X)

In [21]:
tfIdf.get_feature_names()



['10',
 '19',
 'age',
 'am',
 'anyone',
 'are',
 'ask',
 'awesome',
 'bad',
 'bbye',
 'be',
 'best',
 'bye',
 'can',
 'contact',
 'could',
 'covid',
 'creator',
 'cricket',
 'current',
 'date',
 'day',
 'designed',
 'developer',
 'do',
 'doing',
 'dumb',
 'fine',
 'for',
 'funny',
 'get',
 'good',
 'goodbye',
 'google',
 'great',
 'haha',
 'he',
 'hello',
 'help',
 'helpful',
 'helping',
 'hey',
 'hi',
 'hola',
 'hot',
 'how',
 'idiot',
 'india',
 'inspiration',
 'inspires',
 'internet',
 'is',
 'it',
 'joke',
 'karan',
 'know',
 'later',
 'latest',
 'laugh',
 'lmao',
 'lol',
 'lost',
 'made',
 'make',
 'malik',
 'matches',
 'me',
 'motivates',
 'namaste',
 'news',
 'next',
 'nice',
 'no',
 'nope',
 'offered',
 'ok',
 'old',
 'programmed',
 'programmer',
 'provide',
 'question',
 'riddle',
 'rofl',
 'score',
 'search',
 'see',
 'set',
 'shut',
 'songs',
 'suggest',
 'suggestions',
 'sup',
 'support',
 'talk',
 'talking',
 'tell',
 'temperature',
 'ten',
 'thank',
 'thanks',
 'that',
 '

#### machine learning model creation and parameter tuning 

In [22]:
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import cross_val_score,GridSearchCV

In [23]:
randomForestModel=RandomForestClassifier()
score=cross_val_score(randomForestModel,vectorizedX,encodedY,cv=4)



In [24]:
score

array([0.44827586, 0.35714286, 0.35714286, 0.32142857])

In [25]:
param_grid = {  'bootstrap': [True,False], 'max_depth': [5, 10, 15], 'max_features': ['auto', 'log2'], 'n_estimators': [5, 6, 7, 8, 9, 10, 11, 12, 13, 15,20,25,40]}

gridSearch=GridSearchCV(randomForestModel,param_grid)

In [26]:
gridSearch.fit(vectorizedX,encodedY)

  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(


In [27]:
gridSearch.best_score_

0.44980237154150193

In [28]:
gridSearch.best_estimator_

#### DeepLearning Model

In [32]:
import tensorflow as tf

In [33]:
oov_token='<OOV>'
Tokenizer=tf.keras.preprocessing.text.Tokenizer(oov_token=oov_token)

In [34]:
Tokenizer.fit_on_texts(cleandedX)

In [35]:
len(Tokenizer.word_index)

117

In [36]:
TrainSeq=Tokenizer.texts_to_sequences(cleandedX)

In [38]:
Trainpad=tf.keras.preprocessing.sequence.pad_sequences(TrainSeq,padding='post')

In [40]:
Trainpad.shape

(113, 6)

In [56]:
deepModel=tf.keras.Sequential(
[
    tf.keras.layers.Embedding(len(Tokenizer.word_index)+1,32,input_length=Trainpad.shape[1]),
    tf.keras.layers.Bidirectional(tf.keras.layers.LSTM(100,dropout=0.2,return_sequences=True)),
    tf.keras.layers.Bidirectional(tf.keras.layers.LSTM(100,dropout=0.2,return_sequences=False)),
    tf.keras.layers.Flatten(),
    tf.keras.layers.Dense(50,activation='relu'),
    tf.keras.layers.Dropout(0.01),
    tf.keras.layers.Dense(20, activation = "relu"),
    tf.keras.layers.Dense(31,activation='softmax')
    
])


In [57]:
deepModel.summary()

Model: "sequential_2"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 embedding_2 (Embedding)     (None, 6, 32)             3776      
                                                                 
 bidirectional_4 (Bidirectio  (None, 6, 200)           106400    
 nal)                                                            
                                                                 
 bidirectional_5 (Bidirectio  (None, 200)              240800    
 nal)                                                            
                                                                 
 flatten_2 (Flatten)         (None, 200)               0         
                                                                 
 dense_6 (Dense)             (None, 50)                10050     
                                                                 
 dropout_2 (Dropout)         (None, 50)               

In [76]:
deepModel.compile(optimizer='adam',loss='sparse_categorical_crossentropy',metrics=['accuracy'])

In [77]:
encodedNY=np.array(encodedY)

In [78]:
history=deepModel.fit(Trainpad,encodedNY,epochs=60)

Epoch 1/60
Epoch 2/60
Epoch 3/60
Epoch 4/60
Epoch 5/60
Epoch 6/60
Epoch 7/60
Epoch 8/60
Epoch 9/60
Epoch 10/60
Epoch 11/60
Epoch 12/60
Epoch 13/60
Epoch 14/60
Epoch 15/60
Epoch 16/60
Epoch 17/60
Epoch 18/60
Epoch 19/60
Epoch 20/60
Epoch 21/60
Epoch 22/60
Epoch 23/60
Epoch 24/60
Epoch 25/60
Epoch 26/60
Epoch 27/60
Epoch 28/60
Epoch 29/60
Epoch 30/60
Epoch 31/60
Epoch 32/60
Epoch 33/60
Epoch 34/60
Epoch 35/60
Epoch 36/60
Epoch 37/60
Epoch 38/60
Epoch 39/60
Epoch 40/60
Epoch 41/60
Epoch 42/60
Epoch 43/60
Epoch 44/60
Epoch 45/60
Epoch 46/60
Epoch 47/60
Epoch 48/60
Epoch 49/60
Epoch 50/60
Epoch 51/60
Epoch 52/60
Epoch 53/60
Epoch 54/60
Epoch 55/60
Epoch 56/60
Epoch 57/60
Epoch 58/60
Epoch 59/60
Epoch 60/60
