In [1]:
import numpy as np
import nltk
import pandas as pd
import json
import re

In [2]:
from nltk.stem import WordNetLemmatizer

### Load data and preprocessing

In [3]:
# Read the intent definitions (tags, example patterns, canned responses).
# The context manager closes the file handle as soon as parsing is done;
# the original left the handle open for the lifetime of the kernel.
with open('../intents.json') as file:
    json_file=json.load(file)

In [4]:
# tag -> list of example user patterns, and tag -> list of possible bot responses.
dictionary_tag_pattern,dictionary_tag_response=dict(),dict()

In [5]:
# Distinct intent tags, normalised to lower case without surrounding whitespace.
all_tags=list({intent['tag'].lower().strip() for intent in json_file['intents']})

In [6]:
# Show the distinct, normalised intent tags.
all_tags

['inspire',
 'age',
 'identity',
 'nicetty',
 'noanswer',
 'haha',
 'datetime',
 'whatsup',
 'goodbye',
 'appreciate',
 'greeting',
 'karan',
 'contact',
 'cricket',
 'exclaim',
 'song',
 'insult',
 'no',
 'timer',
 'greetreply',
 'covid19',
 'activity',
 'options',
 'google',
 'news',
 'weather',
 'programmer',
 'riddle',
 'thanks',
 'suggest',
 'jokes']

In [7]:
# Give every known tag its own (separate) empty list in both lookup tables.
dictionary_tag_pattern.update({name:[] for name in all_tags})
dictionary_tag_response.update({name:[] for name in all_tags})

In [8]:
# Collect, per normalised tag, all example patterns and all responses from the JSON.
for intent in json_file['intents']:
    key=intent['tag'].lower().strip()
    dictionary_tag_pattern[key].extend(intent['patterns'])
    dictionary_tag_response[key].extend(intent['responses'])
    

In [9]:
# Table of training examples with one (pattern, tag) pair per row; created here with only its column names.
DataF=pd.DataFrame(columns=['pattern','tag'])

In [10]:
# Build the (pattern, tag) table in a single constructor call.
# Growing the frame one row at a time with .loc re-allocates on every insert,
# and re-running that cell appended every row a second time; constructing the
# frame from a list of records is linear and idempotent.
DataF=pd.DataFrame(
    [(pattern,tag) for tag,patterns in dictionary_tag_pattern.items() for pattern in patterns],
    columns=['pattern','tag'],
)

In [11]:
# Peek at the first ten (pattern, tag) rows.
DataF.head(10)

Unnamed: 0,pattern,tag
0,who inspires you,inspire
1,who is your inspiration,inspire
2,who motivates you,inspire
3,how old are you,age
4,when were you made,age
5,what is your age,age
6,Who are you,identity
7,what are you,identity
8,it was nice talking to you,nicetty
9,good talk,nicetty


In [12]:
# Raw sentences (X) and their intent tags (Y) as arrays taken from the frame.
X,Y=(DataF[column].values for column in ('pattern','tag'))

In [13]:
def cleanText(text:str)->str:
    """Normalise one sentence for the vectorizers and models.

    Steps, in order: lower-case and trim the input, delete every character
    that is neither an ASCII letter nor whitespace, collapse whitespace runs
    to a single space, then lemmatize each word as a verb (``pos='v'``).

    Args:
        text: Raw sentence.

    Returns:
        The lemmatized words joined by single spaces ('' for empty input).
    """
    lemmatizer=WordNetLemmatizer()
    lettersOnly=re.sub(r'[^A-Za-z\s]','',text.lower().strip())
    singleSpaced=re.sub(r'\s+',' ',lettersOnly)
    lemmas=[lemmatizer.lemmatize(token,pos='v') for token in singleSpaced.split()]
    return ' '.join(lemmas)

In [39]:
# Clean every sentence exactly once (the original called cleanText twice per
# sentence), then derive the token lists Word2Vec expects from the cleaned text.
cleandedX=[cleanText(sen) for sen in X]
cleandedXforWord2Vec=[cleaned.split() for cleaned in cleandedX]

In [None]:
# Preview only the first few tokenised sentences instead of dumping the whole list.
cleandedXforWord2Vec[:5]

#### Encoding the target variable and creating a vector space for the training data

In [41]:
from sklearn.preprocessing import LabelEncoder
from sklearn.feature_extraction.text import TfidfVectorizer
from gensim.models import Word2Vec

In [23]:
# X and Y already come from DataF[...].values; np.array() rebinds them to array copies.
X=np.array(X)
Y=np.array(Y)

###### Encode using Word2Vec model

In [42]:
# Train 25-dimensional word vectors on the tokenised sentences; min_count=1 keeps
# every word because the corpus is tiny.
# A fixed seed plus a single worker thread makes the training repeatable
# (gensim additionally requires a fixed PYTHONHASHSEED for bit-identical runs
# across interpreter launches).
Word2VecModel=Word2Vec(sentences=cleandedXforWord2Vec,min_count=1,vector_size=25,seed=0,workers=1)

In [47]:
# Sanity check: ask the embedding for the two words most similar to 'songs'.
Word2VecModel.wv.similar_by_word('songs',topn=2)

[('ten', 0.5823303461074829), ('be', 0.5737924575805664)]

In [78]:
def sentToVec(sent,model=None):
    """Embed a sentence as the average of its word vectors.

    Args:
        sent: Whitespace-separated sentence (normally already cleaned).
        model: Object exposing ``wv.vector_size`` and ``wv.get_vector(word)``.
            Defaults to the notebook-level ``Word2VecModel``.

    Returns:
        A 1-D numpy array of length ``vector_size``. Words missing from the
        vocabulary add nothing to the sum but still count in the divisor,
        exactly as in the original implementation. An empty sentence yields
        the zero vector instead of dividing by zero.
    """
    if model is None:
        model=Word2VecModel
    keyedVectors=model.wv
    words=sent.split()
    vector=np.zeros(keyedVectors.vector_size)
    if not words:
        # Nothing to average; avoid 0/0 producing NaNs.
        return vector
    for word in words:
        try:
            vector+=keyedVectors.get_vector(word)
        except KeyError:
            # Out-of-vocabulary word: contributes a zero vector.
            continue
    # The original computed this value but never returned it.
    return vector/len(words)
    

In [30]:
# Learn the tag -> integer mapping, then encode every training label with it.
Encoder=LabelEncoder().fit(Y)
encodedY=Encoder.transform(Y)

###### Encode using TF-IDF

In [31]:
# Fit the TF-IDF vocabulary and weights on the cleaned sentences (fit returns the vectorizer itself).
tfIdf=TfidfVectorizer().fit(cleandedX)

In [32]:
# Turn every cleaned sentence into its TF-IDF row using the vocabulary fitted above.
vectorizedX=tfIdf.transform(cleandedX)

In [None]:
# List the learned vocabulary. get_feature_names() was deprecated and later removed
# from scikit-learn in favour of get_feature_names_out(); fall back only on old versions.
tfIdf.get_feature_names_out() if hasattr(tfIdf,'get_feature_names_out') else tfIdf.get_feature_names()

#### Machine learning model creation and parameter tuning

In [23]:
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import cross_val_score,GridSearchCV

In [24]:
# Baseline: random forest on the TF-IDF features, scored with 4-fold cross-validation.
# random_state pins the forest's randomness so the fold scores are repeatable.
randomForestModel=RandomForestClassifier(random_state=0)
score=cross_val_score(randomForestModel,vectorizedX,encodedY,cv=4)



In [25]:
# One score per fold (cv=4) returned by cross_val_score.
score

array([0.51724138, 0.39285714, 0.32142857, 0.28571429])

In [26]:
# Hyper-parameter grid for the random forest.
# 'sqrt' replaces 'auto': for classifiers 'auto' meant sqrt(n_features), and
# newer scikit-learn releases deprecated and then removed the 'auto' spelling.
param_grid = {
    'bootstrap': [True,False],
    'max_depth': [5, 10, 15],
    'max_features': ['sqrt', 'log2'],
    'n_estimators': [5, 6, 7, 8, 9, 10, 11, 12, 13, 15,20,25,40],
}

# Exhaustive search over every combination in param_grid.
gridSearch=GridSearchCV(randomForestModel,param_grid)

In [None]:
# Run the search over param_grid on the TF-IDF features and encoded tags.
gridSearch.fit(vectorizedX,encodedY)

In [28]:
# Best mean cross-validated score found by the grid search.
gridSearch.best_score_

0.4320158102766799

In [29]:
# The estimator configured with the best-scoring parameter combination.
gridSearch.best_estimator_

#### Deep learning model

In [28]:
import tensorflow as tf

In [29]:
# Placeholder token passed to the tokenizer for out-of-vocabulary words.
oov_token='<OOV>'
# NOTE: this instance is named `Tokenizer`, the same as the Keras class it is built from.
Tokenizer=tf.keras.preprocessing.text.Tokenizer(oov_token=oov_token)

In [30]:
# Build the word index from the cleaned training sentences.
Tokenizer.fit_on_texts(cleandedX)

In [31]:
# Number of entries in the fitted tokenizer's word index.
len(Tokenizer.word_index)

117

In [32]:
# Replace the words of every cleaned sentence by their integer indices.
TrainSeq=Tokenizer.texts_to_sequences(cleandedX)

In [33]:
# Pad at the end ('post') so that all sequences end up with the same length.
Trainpad=tf.keras.preprocessing.sequence.pad_sequences(TrainSeq,padding='post')

In [34]:
# (number of sentences, padded sequence length)
Trainpad.shape

(113, 6)

In [58]:
# Intent classifier: embedding -> two stacked bidirectional LSTMs -> dense head.
deepModel=tf.keras.models.Sequential([
    # +1 because Keras word indices start at 1 (index 0 is the padding value).
    tf.keras.layers.Embedding(len(Tokenizer.word_index)+1,32,input_length=Trainpad.shape[1]),
    tf.keras.layers.Bidirectional(tf.keras.layers.LSTM(100,dropout=0.2,return_sequences=True)),
    tf.keras.layers.Bidirectional(tf.keras.layers.LSTM(100,dropout=0.2,return_sequences=False)),
    # The previous layer already emits a flat (batch, 200) tensor, so this layer
    # does not change the shape; kept so the architecture stays as it was.
    tf.keras.layers.Flatten(),
    tf.keras.layers.Dense(50,activation='relu'),
    tf.keras.layers.Dropout(0.01),
    tf.keras.layers.Dense(20, activation = "relu"),
    # One softmax unit per intent tag, derived from the data instead of the
    # hard-coded 31 so the model follows changes to intents.json.
    tf.keras.layers.Dense(len(all_tags),activation='softmax')

])


In [59]:
# Print the layer-by-layer output shapes and parameter counts.
deepModel.summary()

Model: "sequential_6"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 embedding_4 (Embedding)     (None, 6, 32)             3776      
                                                                 
 bidirectional_8 (Bidirectio  (None, 6, 200)           106400    
 nal)                                                            
                                                                 
 bidirectional_9 (Bidirectio  (None, 200)              240800    
 nal)                                                            
                                                                 
 flatten_4 (Flatten)         (None, 200)               0         
                                                                 
 dense_19 (Dense)            (None, 50)                10050     
                                                                 
 dropout_7 (Dropout)         (None, 50)               

In [60]:
# The targets are integer class ids (from the label encoder), hence the *sparse* categorical cross-entropy.
deepModel.compile(optimizer='adam',loss='sparse_categorical_crossentropy',metrics=['accuracy'])

In [61]:
# Encoded labels as a numpy array, used as the training target below.
encodedNY=np.array(encodedY)

In [62]:
from sklearn.model_selection import train_test_split
# 70/30 split with a fixed seed.
# NOTE(review): X_train/X_test/Y_train/Y_test are not referenced by the fit call in the
# following cell, which trains on the full Trainpad/encodedNY — confirm whether a
# held-out evaluation was intended.
X_train,X_test,Y_train,Y_test=train_test_split(Trainpad,encodedNY,test_size=0.3,random_state=0)

In [63]:
# Train for 60 epochs on the complete padded data set (Trainpad/encodedNY), not on a train split.
history=deepModel.fit(Trainpad,encodedNY,epochs=60)

Epoch 1/60
Epoch 2/60
Epoch 3/60
Epoch 4/60
Epoch 5/60
Epoch 6/60
Epoch 7/60
Epoch 8/60
Epoch 9/60
Epoch 10/60
Epoch 11/60
Epoch 12/60
Epoch 13/60
Epoch 14/60
Epoch 15/60
Epoch 16/60
Epoch 17/60
Epoch 18/60
Epoch 19/60
Epoch 20/60
Epoch 21/60
Epoch 22/60
Epoch 23/60
Epoch 24/60
Epoch 25/60
Epoch 26/60
Epoch 27/60
Epoch 28/60
Epoch 29/60
Epoch 30/60
Epoch 31/60
Epoch 32/60
Epoch 33/60
Epoch 34/60
Epoch 35/60
Epoch 36/60
Epoch 37/60
Epoch 38/60
Epoch 39/60
Epoch 40/60
Epoch 41/60
Epoch 42/60
Epoch 43/60
Epoch 44/60
Epoch 45/60
Epoch 46/60
Epoch 47/60
Epoch 48/60
Epoch 49/60
Epoch 50/60
Epoch 51/60
Epoch 52/60
Epoch 53/60
Epoch 54/60
Epoch 55/60
Epoch 56/60
Epoch 57/60
Epoch 58/60
Epoch 59/60
Epoch 60/60


In [64]:
# Persist the trained network as 'deepModel.h5' (path is relative to the working directory).
deepModel.save('deepModel.h5')

#### Testing the model

In [74]:
import random

In [87]:
# End-to-end smoke test: clean a query, encode it like the training data,
# predict the intent tag and pick one of that tag's canned responses at random.
query='good morning'
query=cleanText(query)
seq=Tokenizer.texts_to_sequences([query])
# Pad to the width the network was trained on rather than a hard-coded 6, so this
# cell keeps working when the longest training sentence changes.
padSeq=tf.keras.preprocessing.sequence.pad_sequences(seq,padding='post',maxlen=Trainpad.shape[1])
output=deepModel.predict(padSeq)
# argmax over the softmax output gives the encoded class id; map it back to the tag name.
response=Encoder.inverse_transform([output.argmax()])
random.choice(dictionary_tag_response[response[0]])




'Good to know!'