In [1]:
import numpy as np
import nltk
import pandas as pd
import json
import re

In [2]:
from nltk.stem import WordNetLemmatizer

### Load and preprocess the data

In [3]:
# Load the intents definition. The context manager closes the handle even if
# parsing fails (the previous bare open() left the file object open), and the
# explicit encoding avoids depending on the platform's default codec.
with open('../intents.json', encoding='utf-8') as file:
    json_file=json.load(file)

In [4]:
# tag -> list of patterns collected for that intent
dictionary_tag_pattern = dict()
# tag -> list of responses collected for that intent
dictionary_tag_response = dict()

In [5]:
# Normalised (lower-cased, stripped), de-duplicated tag names in first-seen
# order. dict.fromkeys keeps insertion order, whereas list(set(...)) ordering
# changes between kernel restarts because str hashing is randomised, which
# made every structure derived from all_tags non-reproducible.
all_tags=list(dict.fromkeys(dic['tag'].lower().strip() for dic in json_file['intents']))

In [6]:
# Give every known tag an empty pattern list and an empty response list.
for tag in all_tags:
    dictionary_tag_pattern[tag], dictionary_tag_response[tag] = [], []

In [7]:
# Distribute each intent's patterns and responses into the per-tag lists,
# keyed by the same lower-cased/stripped tag used to build the dictionaries.
for dic in json_file['intents']:
    tag = dic['tag'].lower().strip()
    dictionary_tag_pattern[tag].extend(dic['patterns'])
    dictionary_tag_response[tag].extend(dic['responses'])
    

In [8]:
# Empty frame with the two columns that the following cell fills:
# one row per (pattern, tag) pair.
DataF=pd.DataFrame(columns=['pattern','tag'])

In [9]:
# Build the (pattern, tag) table in a single constructor call. Appending via
# DataF.loc[len(DataF.index)] grew the frame one row at a time and duplicated
# every row whenever the cell was re-run; rebuilding from records is
# idempotent and linear in the number of patterns.
DataF = pd.DataFrame(
    [
        (pattern, tag)
        for tag, patterns in dictionary_tag_pattern.items()
        for pattern in patterns
    ],
    columns=['pattern', 'tag'],
)

In [10]:
# Preview the first ten (pattern, tag) rows as a sanity check.
DataF.head(10)

Unnamed: 0,pattern,tag
0,it was nice talking to you,nicetty
1,good talk,nicetty
2,haha,haha
3,lol,haha
4,rofl,haha
5,lmao,haha
6,thats funny,haha
7,Bye,goodbye
8,See you later,goodbye
9,Goodbye,goodbye


In [50]:
# Features are the pattern column, labels are the tag column.
X, Y = DataF['pattern'].values, DataF['tag'].values

In [14]:
def cleanText(text:str, lemmatizer=None)->str:
    """Normalise a sentence: lower-case, keep letters only, lemmatize as verbs.

    The text is lower-cased and stripped, every character that is not an
    ASCII letter or whitespace is deleted, and each remaining word is
    lemmatized with ``pos='v'``. Words are re-joined with single spaces.

    Parameters
    ----------
    text : str
        Raw input sentence.
    lemmatizer : object, optional
        Any object exposing ``lemmatize(word, pos=...)``. When omitted a new
        ``WordNetLemmatizer`` is created; pass a shared instance to avoid
        constructing one on every call.

    Returns
    -------
    str
        The cleaned sentence, or an empty string when no letters remain.
    """
    if lemmatizer is None:
        lemmatizer = WordNetLemmatizer()
    # Non-letters are removed outright rather than replaced by a space, so
    # "what's" becomes "whats".
    letters_only = re.sub(r'[^A-Za-z\s]', '', text.lower().strip())
    # split() with no argument already collapses runs of whitespace, so no
    # separate whitespace-normalising substitution is needed.
    words = letters_only.split()
    return ' '.join(lemmatizer.lemmatize(word, pos='v') for word in words)

In [16]:
# Run cleanText over every pattern, keeping the original order.
cleandedX = [cleanText(sentence) for sentence in X]

In [17]:
# Display the cleaned patterns for inspection.
cleandedX

['it be nice talk to you',
 'good talk',
 'haha',
 'lol',
 'rofl',
 'lmao',
 'thats funny',
 'bye',
 'see you later',
 'goodbye',
 'get lose',
 'till next time',
 'bbye',
 'whats up',
 'wazzup',
 'how be you',
 'sup',
 'how you do',
 'hi there',
 'how be you',
 'be anyone there',
 'hey',
 'hola',
 'hello',
 'good day',
 'namaste',
 'yo',
 'thank',
 'thank you',
 'thats helpful',
 'awesome thank',
 'thank for help me',
 'ask me a riddle',
 'ask me a question',
 'riddle',
 'covid',
 'who be you',
 'what be you',
 'no',
 'nope',
 'top songs',
 'best songs',
 'hot songs',
 'top songs',
 'top ten songs',
 'who inspire you',
 'who be your inspiration',
 'who motivate you',
 'you be useless',
 'useless',
 'suggest',
 'suggestions',
 'you be bad',
 'what be the time',
 'what be the date',
 'date',
 'time',
 'tell me the date',
 'day',
 'what day be be today',
 'awesome',
 'great',
 'i know',
 'ok',
 'yeah',
 'contact developer',
 'contact karan',
 'contact programmer',
 'contact creator',
 'ho

#### Encode the target variable and create a vector space for the training data

In [40]:
from sklearn.preprocessing import LabelEncoder
from sklearn.feature_extraction.text import TfidfVectorizer

In [51]:
# Copy features and labels into NumPy arrays before encoding/vectorising.
X, Y = np.array(X), np.array(Y)

In [45]:
# Display the label array (one tag name per pattern).
Y

array(['nicetty', 'nicetty', 'haha', 'haha', 'haha', 'haha', 'haha',
       'goodbye', 'goodbye', 'goodbye', 'goodbye', 'goodbye', 'goodbye',
       'whatsup', 'whatsup', 'whatsup', 'whatsup', 'whatsup', 'greeting',
       'greeting', 'greeting', 'greeting', 'greeting', 'greeting',
       'greeting', 'greeting', 'greeting', 'thanks', 'thanks', 'thanks',
       'thanks', 'thanks', 'riddle', 'riddle', 'riddle', 'covid19',
       'identity', 'identity', 'no', 'no', 'song', 'song', 'song', 'song',
       'song', 'inspire', 'inspire', 'inspire', 'suggest', 'suggest',
       'suggest', 'suggest', 'suggest', 'datetime', 'datetime',
       'datetime', 'datetime', 'datetime', 'datetime', 'datetime',
       'exclaim', 'exclaim', 'exclaim', 'exclaim', 'exclaim', 'contact',
       'contact', 'contact', 'contact', 'options', 'options', 'options',
       'options', 'options', 'karan', 'karan', 'karan', 'karan', 'insult',
       'insult', 'insult', 'cricket', 'cricket', 'google', 'google',
       'go

In [52]:
# Fit the label encoder on the tag names, then convert them to integer ids.
Encoder = LabelEncoder().fit(Y)
encodedY = Encoder.transform(Y)

In [53]:
# Fit a TF-IDF vectorizer (default settings) on the pattern strings.
# NOTE(review): this fits on X, the raw patterns, not on cleandedX produced by
# cleanText earlier in the notebook, so the lemmatised text is not used in
# this step — confirm whether that is intended.
tfIdf=TfidfVectorizer()
tfIdf.fit(X)

In [54]:
# Transform the same patterns the vectorizer was fitted on into TF-IDF features.
vectorizedX=tfIdf.transform(X)

In [None]:
# get_feature_names() was deprecated in scikit-learn 1.0 and removed in 1.2;
# use its replacement when available and fall back only on older installs.
(tfIdf.get_feature_names_out()
 if hasattr(tfIdf, 'get_feature_names_out')
 else tfIdf.get_feature_names())

#### Machine learning model creation and parameter tuning

In [71]:
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import cross_val_score,GridSearchCV

In [69]:
# Baseline: a default random forest scored with 4-fold cross-validation.
# random_state pins the forest's internal randomness so that these scores (and
# the grid search that reuses this estimator) are reproducible between runs;
# without it every execution produced different numbers.
randomForestModel=RandomForestClassifier(random_state=42)
score=cross_val_score(randomForestModel,vectorizedX,encodedY,cv=4)



In [70]:
score

array([0.44827586, 0.42857143, 0.35714286, 0.28571429])

In [81]:
# Hyper-parameter grid for the random forest.
# 'sqrt' replaces 'auto': for classifiers 'auto' was an alias of 'sqrt', was
# deprecated in scikit-learn 1.1 and is rejected from 1.3 onwards, while
# 'sqrt' is accepted by old and new releases alike.
param_grid = {
    'bootstrap': [True, False],
    'max_depth': [5, 10, 15],
    'max_features': ['sqrt', 'log2'],
    'n_estimators': [5, 6, 7, 8, 9, 10, 11, 12, 13, 15, 20, 25, 40],
}

# Exhaustive search over param_grid with GridSearchCV's default CV settings.
gridSearch=GridSearchCV(randomForestModel,param_grid)

In [None]:
# Run the grid search on the TF-IDF features and encoded labels.
gridSearch.fit(vectorizedX,encodedY)

In [83]:
# Cross-validated score achieved by the best parameter combination.
gridSearch.best_score_

0.432806324110672

In [86]:
# Show the estimator selected by the grid search.
gridSearch.best_estimator_

#### Deep Learning Model