In [1]:
import numpy as np
import nltk
import pandas as pd
import json
import re

In [2]:
from nltk.stem import WordNetLemmatizer

### Load data and preprocessing

In [3]:
file=open('../intents.json')
json_file=json.load(file)

In [4]:
dictionary_tag_pattern={}
dictionary_tag_response={}

In [5]:
all_tags=list(set([dic['tag'].lower().strip() for dic in json_file['intents']]))

In [6]:
all_tags

['greeting',
 'cricket',
 'noanswer',
 'age',
 'karan',
 'google',
 'identity',
 'suggest',
 'datetime',
 'news',
 'weather',
 'insult',
 'options',
 'exclaim',
 'contact',
 'appreciate',
 'jokes',
 'haha',
 'no',
 'thanks',
 'timer',
 'nicetty',
 'covid19',
 'greetreply',
 'programmer',
 'song',
 'activity',
 'whatsup',
 'inspire',
 'goodbye',
 'riddle']

In [7]:
for tag in all_tags:
    dictionary_tag_pattern[tag]=[]
    dictionary_tag_response[tag]=[]

In [8]:
for dic in json_file['intents']:
    tag=dic['tag'].lower().strip()
    
    for pattern in dic['patterns']:
        dictionary_tag_pattern[tag].append(pattern)
    for response in dic['responses']:
        dictionary_tag_response[tag].append(response)
    

In [9]:
DataF=pd.DataFrame(columns=['pattern','tag'])

In [10]:
for tag in dictionary_tag_pattern.keys():
    for pattern in dictionary_tag_pattern[tag]:
        DataF.loc[len(DataF.index)]=[pattern,tag]

In [11]:
DataF.head(10)

Unnamed: 0,pattern,tag
0,Hi there,greeting
1,How are you,greeting
2,Is anyone there?,greeting
3,Hey,greeting
4,Hola,greeting
5,Hello,greeting
6,Good day,greeting
7,Namaste,greeting
8,yo,greeting
9,current cricket matches,cricket


In [12]:
X=DataF['pattern'].values
Y=DataF['tag'].values

In [13]:
def cleanText(text:str)->str:
    wordlem=WordNetLemmatizer()
    clenanedText=''
    pattern1=r'[^A-Za-z\s]'
    pattern2=r'\s+'
    text=text.lower().strip()
    text=re.sub(pattern1,'',text)
    text=re.sub(pattern2,' ',text)
    for word in text.split():
        word=wordlem.lemmatize(word,pos='v')
        clenanedText=clenanedText+word+" "
    clenanedText=clenanedText.strip()
    return clenanedText

In [14]:
cleandedX=[]
for sen in X:
    cleandedX.append(cleanText(sen))

In [15]:
cleandedX

['hi there',
 'how be you',
 'be anyone there',
 'hey',
 'hola',
 'hello',
 'good day',
 'namaste',
 'yo',
 'current cricket match',
 'cricket score',
 'how old be you',
 'when be you make',
 'what be your age',
 'who be he',
 'who be that',
 'who be karan',
 'karan malik',
 'google',
 'search',
 'internet',
 'who be you',
 'what be you',
 'you be useless',
 'useless',
 'suggest',
 'suggestions',
 'you be bad',
 'what be the time',
 'what be the date',
 'date',
 'time',
 'tell me the date',
 'day',
 'what day be be today',
 'news',
 'latest news',
 'india news',
 'temperature',
 'weather',
 'how hot be it',
 'you be dumb',
 'shut up',
 'idiot',
 'how you could help me',
 'what you can do',
 'what help you provide',
 'how you can be helpful',
 'what support be offer',
 'awesome',
 'great',
 'i know',
 'ok',
 'yeah',
 'contact developer',
 'contact karan',
 'contact programmer',
 'contact creator',
 'you be awesome',
 'you be the best',
 'you be great',
 'you be good',
 'tell me a joke',

#### Encoding the target variable and create a vector space for training data

In [16]:
from sklearn.preprocessing import LabelEncoder
from sklearn.feature_extraction.text import TfidfVectorizer

In [17]:
X=np.array(X)
Y=np.array(Y)

In [18]:
Y

array(['greeting', 'greeting', 'greeting', 'greeting', 'greeting',
       'greeting', 'greeting', 'greeting', 'greeting', 'cricket',
       'cricket', 'age', 'age', 'age', 'karan', 'karan', 'karan', 'karan',
       'google', 'google', 'google', 'identity', 'identity', 'suggest',
       'suggest', 'suggest', 'suggest', 'suggest', 'datetime', 'datetime',
       'datetime', 'datetime', 'datetime', 'datetime', 'datetime', 'news',
       'news', 'news', 'weather', 'weather', 'weather', 'insult',
       'insult', 'insult', 'options', 'options', 'options', 'options',
       'options', 'exclaim', 'exclaim', 'exclaim', 'exclaim', 'exclaim',
       'contact', 'contact', 'contact', 'contact', 'appreciate',
       'appreciate', 'appreciate', 'appreciate', 'jokes', 'jokes',
       'jokes', 'haha', 'haha', 'haha', 'haha', 'haha', 'no', 'no',
       'thanks', 'thanks', 'thanks', 'thanks', 'thanks', 'timer',
       'nicetty', 'nicetty', 'covid19', 'greetreply', 'greetreply',
       'greetreply', 'gree

In [19]:
Encoder=LabelEncoder()
encodedY=Encoder.fit_transform(Y)

In [20]:
tfIdf=TfidfVectorizer()
tfIdf.fit(X)

In [21]:
vectorizedX=tfIdf.transform(X)

In [22]:
tfIdf.get_feature_names()



['10',
 '19',
 'age',
 'am',
 'anyone',
 'are',
 'ask',
 'awesome',
 'bad',
 'bbye',
 'be',
 'best',
 'bye',
 'can',
 'contact',
 'could',
 'covid',
 'creator',
 'cricket',
 'current',
 'date',
 'day',
 'designed',
 'developer',
 'do',
 'doing',
 'dumb',
 'fine',
 'for',
 'funny',
 'get',
 'good',
 'goodbye',
 'google',
 'great',
 'haha',
 'he',
 'hello',
 'help',
 'helpful',
 'helping',
 'hey',
 'hi',
 'hola',
 'hot',
 'how',
 'idiot',
 'india',
 'inspiration',
 'inspires',
 'internet',
 'is',
 'it',
 'joke',
 'karan',
 'know',
 'later',
 'latest',
 'laugh',
 'lmao',
 'lol',
 'lost',
 'made',
 'make',
 'malik',
 'matches',
 'me',
 'motivates',
 'namaste',
 'news',
 'next',
 'nice',
 'no',
 'nope',
 'offered',
 'ok',
 'old',
 'programmed',
 'programmer',
 'provide',
 'question',
 'riddle',
 'rofl',
 'score',
 'search',
 'see',
 'set',
 'shut',
 'songs',
 'suggest',
 'suggestions',
 'sup',
 'support',
 'talk',
 'talking',
 'tell',
 'temperature',
 'ten',
 'thank',
 'thanks',
 'that',
 '

#### machine learning model creation and parameter tuning 

In [23]:
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import cross_val_score,GridSearchCV

In [24]:
randomForestModel=RandomForestClassifier()
score=cross_val_score(randomForestModel,vectorizedX,encodedY,cv=4)



In [25]:
score

array([0.51724138, 0.39285714, 0.32142857, 0.28571429])

In [26]:
param_grid = {  'bootstrap': [True,False], 'max_depth': [5, 10, 15], 'max_features': ['auto', 'log2'], 'n_estimators': [5, 6, 7, 8, 9, 10, 11, 12, 13, 15,20,25,40]}

gridSearch=GridSearchCV(randomForestModel,param_grid)

In [None]:
gridSearch.fit(vectorizedX,encodedY)

In [28]:
gridSearch.best_score_

0.4320158102766799

In [29]:
gridSearch.best_estimator_

#### DeepLearning Model

In [30]:
import tensorflow as tf

In [31]:
oov_token='<OOV>'
Tokenizer=tf.keras.preprocessing.text.Tokenizer(oov_token=oov_token)

In [69]:
Tokenizer.fit_on_texts(cleandedX)

In [70]:
len(Tokenizer.word_index)

117

In [71]:
TrainSeq=Tokenizer.texts_to_sequences(cleandedX)

In [72]:
Trainpad=tf.keras.preprocessing.sequence.pad_sequences(TrainSeq,padding='post')

In [36]:
Trainpad.shape

(113, 6)

In [37]:
deepModel=tf.keras.Sequential(
[
    tf.keras.layers.Embedding(len(Tokenizer.word_index)+1,32,input_length=Trainpad.shape[1]),
    tf.keras.layers.Bidirectional(tf.keras.layers.LSTM(100,dropout=0.2,return_sequences=True)),
    tf.keras.layers.Bidirectional(tf.keras.layers.LSTM(100,dropout=0.2,return_sequences=False)),
    tf.keras.layers.Flatten(),
    tf.keras.layers.Dense(50,activation='relu'),
    tf.keras.layers.Dropout(0.01),
    tf.keras.layers.Dense(20, activation = "relu"),
    tf.keras.layers.Dense(31,activation='softmax')
    
])


In [38]:
deepModel.summary()

Model: "sequential"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 embedding (Embedding)       (None, 6, 32)             3776      
                                                                 
 bidirectional (Bidirectiona  (None, 6, 200)           106400    
 l)                                                              
                                                                 
 bidirectional_1 (Bidirectio  (None, 200)              240800    
 nal)                                                            
                                                                 
 flatten (Flatten)           (None, 200)               0         
                                                                 
 dense (Dense)               (None, 50)                10050     
                                                                 
 dropout (Dropout)           (None, 50)                0

In [49]:
deepModel.compile(optimizer='adam',loss='sparse_categorical_crossentropy',metrics=['accuracy'])

In [46]:
encodedNY=np.array(encodedY)

In [47]:
from sklearn.model_selection import train_test_split
X_train,X_test,Y_train,Y_test=train_test_split(Trainpad,encodedNY,test_size=0.3,random_state=0)

In [51]:
history=deepModel.fit(Trainpad,encodedNY,epochs=40)

Epoch 1/40
Epoch 2/40
Epoch 3/40
Epoch 4/40
Epoch 5/40
Epoch 6/40
Epoch 7/40
Epoch 8/40
Epoch 9/40
Epoch 10/40
Epoch 11/40
Epoch 12/40
Epoch 13/40
Epoch 14/40
Epoch 15/40
Epoch 16/40
Epoch 17/40
Epoch 18/40
Epoch 19/40
Epoch 20/40
Epoch 21/40
Epoch 22/40
Epoch 23/40
Epoch 24/40
Epoch 25/40
Epoch 26/40
Epoch 27/40
Epoch 28/40
Epoch 29/40
Epoch 30/40
Epoch 31/40
Epoch 32/40
Epoch 33/40
Epoch 34/40
Epoch 35/40
Epoch 36/40
Epoch 37/40
Epoch 38/40
Epoch 39/40
Epoch 40/40


In [52]:
deepModel.save('deepModel.h5')

#### Testing the model

In [92]:
query=''
query=cleanText(query)
seq=Tokenizer.texts_to_sequences([query])
padSeq=tf.keras.preprocessing.sequence.pad_sequences(seq,padding='post',maxlen=6)
output=deepModel.predict(padSeq)
Encoder.inverse_transform([output.argmax()])



array(['greeting'], dtype=object)