In [1]:
import json 
import re
import numpy as np 
import tensorflow as tf
from tensorflow import keras
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense, Embedding, GlobalAveragePooling1D
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
from sklearn.preprocessing import LabelEncoder


def pad_sequences(sequences,maxlen,padding = 'pre',value=0,truncating = 'pre'):
  """Pad (and, if needed, truncate) every sequence to exactly ``maxlen`` items.

  Note: this definition shadows the ``pad_sequences`` imported from
  ``tensorflow.keras.preprocessing.sequence`` earlier in this cell.

  Args:
    sequences: iterable of integer sequences (lists of token ids).
    maxlen: target length of every returned row.
    padding: 'pre' inserts ``value`` at the front; any other value appends
      it at the end.
    value: filler item used for padding.
    truncating: 'pre' drops items from the front of an over-long sequence;
      any other value drops them from the end.

  Returns:
    ``np.ndarray`` built from the padded rows. The input sequences are left
    untouched (the rows are copies).
  """
  padded_rows = []
  for seq in sequences:
    row = list(seq)  # copy so the caller's lists are never mutated
    overflow = len(row) - maxlen
    if overflow > 0:
      # Slice by explicit offsets so that maxlen == 0 yields an empty row.
      row = row[overflow:] if truncating == 'pre' else row[:maxlen]
    filler = [value] * (maxlen - len(row))
    padded_rows.append(filler + row if padding == 'pre' else row + filler)
  return np.array(padded_rows)





def texts_to_sequences(training_sentences,word_index,oov_token='<OOV>'):
  """Convert sentences into lists of integer ids using ``word_index``.

  Each sentence has the characters ``, ? / _ @ # *`` removed, is split on
  whitespace, and every word is lower-cased before the lookup.

  Args:
    training_sentences: iterable of sentence strings.
    word_index: mapping from lower-cased word to integer id.
    oov_token: key in ``word_index`` whose id stands in for unknown words.
      When that key is absent, unknown words are skipped instead of raising
      ``KeyError``.

  Returns:
    A list with one list of ids per input sentence.
  """
  strip_table = str.maketrans('', '', ',?/_@#*')
  oov_index = word_index.get(oov_token)
  seq = []
  for sentence in training_sentences:
    ids = []
    for word in sentence.translate(strip_table).split():
      idx = word_index.get(word.lower(), oov_index)
      if idx is not None:
        ids.append(idx)
    seq.append(ids)
  return seq



class label_encoder:
  """Minimal label encoder: maps each distinct label to an integer id.

  Ids follow the sorted order of the distinct labels, so the id of a label
  is its position in ``class_array``.
  """

  def __init__(self,training_labels):
    self.training_labels = training_labels
    self.class_array = []   # sorted distinct labels (filled by find_class_array)
    self.class_labels = []  # integer ids 0..n-1, parallel to class_array


  def find_class_array(self):
    """Populate ``class_array`` (sorted distinct labels) and ``class_labels``."""
    self.class_array = sorted(set(self.training_labels))
    self.class_labels = list(range(len(self.class_array)))


  def Label_Encoder(self):
    """Return the training labels as a numpy array of integer ids."""
    if not self.class_array:
      # Allow calling this without an explicit find_class_array() first.
      self.find_class_array()
    label_to_id = dict(zip(self.class_array, self.class_labels))
    return np.array([label_to_id[label] for label in self.training_labels])


  def inverse_transform(self,similarity_vector):
    """Return the class name at the position of the largest score.

    Args:
      similarity_vector: non-empty sequence of scores, one per class in
        ``class_array`` order.

    Raises:
      ValueError: if ``similarity_vector`` is empty.
    """
    # The original signature was (similarity_vector, self), which only worked
    # when called on the class as label_encoder.inverse_transform(vec, enc).
    # Keep that legacy call working by swapping the arguments back.
    if isinstance(similarity_vector, label_encoder) and not isinstance(self, label_encoder):
      self, similarity_vector = similarity_vector, self
    if len(similarity_vector) == 0:
      raise ValueError("similarity_vector must not be empty")
    if not self.class_array:
      self.find_class_array()
    # argmax returns the first maximal position and also handles vectors whose
    # entries are all <= 0.
    best_index = int(np.argmax(similarity_vector))
    return self.class_array[best_index]
    

In [2]:
import nltk

# Fetch the corpora used below (the 'stem' resource download stays disabled).
for resource in ('stopwords', 'wordnet'):
  nltk.download(resource, quiet=True)

from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer

# Shared lemmatizer instance; remove_stopwords() reads this module-level name.
lemmatizer = WordNetLemmatizer()

# Inspect the English stop-word list shipped with NLTK.
stop_words = [*stopwords.words('english')]
print(stop_words)
print(lemmatizer.lemmatize("how"))
if "how" in stop_words:
  print('True')

# Hard-coded stop-word list; this assignment replaces the list loaded above.
stop_words = [
  'i', 'me', 'my', 'myself', 'we', 'our', 'ours', 'ourselves',
  'you', "you're", "you've", "you'll", "you'd", 'your', 'yours',
  'yourself', 'yourselves',
  'he', 'him', 'his', 'himself', 'she', "she's", 'her', 'hers', 'herself',
  'it', "it's", 'its', 'itself', 'they', 'them', 'their', 'theirs',
  'themselves',
  'what', 'which', 'who', 'whom', 'this', 'that', "that'll", 'these',
  'those',
  'am', 'is', 'are', 'was', 'were', 'be', 'been', 'being',
  'have', 'has', 'had', 'having', 'do', 'does', 'did', 'doing',
  'a', 'an', 'the', 'and', 'but', 'if', 'or', 'because', 'as', 'until',
  'while',
  'of', 'at', 'by', 'for', 'with', 'about', 'against', 'between', 'into',
  'through', 'during', 'before', 'after', 'above', 'below', 'to', 'from',
  'up', 'down', 'in', 'out', 'on', 'off', 'over', 'under',
  'again', 'further', 'then', 'once', 'here', 'there', 'when', 'where',
  'why', 'how',
  'all', 'any', 'both', 'each', 'few', 'more', 'most', 'other', 'some',
  'such', 'no', 'nor', 'not', 'only', 'own', 'same', 'so', 'than', 'too',
  'very',
  's', 't', 'can', 'will', 'just', 'don', "don't", 'should', "should've",
  'now', 'd', 'll', 'm', 'o', 're', 've', 'y',
  'ain', 'aren', "aren't", 'couldn', "couldn't", 'didn', "didn't",
  'doesn', "doesn't", 'hadn', "hadn't", 'hasn', "hasn't", 'haven',
  "haven't", 'isn', "isn't", 'ma', 'mightn', "mightn't", 'mustn',
  "mustn't", 'needn', "needn't", 'shan', "shan't", 'shouldn', "shouldn't",
  'wasn', "wasn't", 'weren', "weren't", 'won', "won't", 'wouldn',
  "wouldn't",
]

['i', 'me', 'my', 'myself', 'we', 'our', 'ours', 'ourselves', 'you', "you're", "you've", "you'll", "you'd", 'your', 'yours', 'yourself', 'yourselves', 'he', 'him', 'his', 'himself', 'she', "she's", 'her', 'hers', 'herself', 'it', "it's", 'its', 'itself', 'they', 'them', 'their', 'theirs', 'themselves', 'what', 'which', 'who', 'whom', 'this', 'that', "that'll", 'these', 'those', 'am', 'is', 'are', 'was', 'were', 'be', 'been', 'being', 'have', 'has', 'had', 'having', 'do', 'does', 'did', 'doing', 'a', 'an', 'the', 'and', 'but', 'if', 'or', 'because', 'as', 'until', 'while', 'of', 'at', 'by', 'for', 'with', 'about', 'against', 'between', 'into', 'through', 'during', 'before', 'after', 'above', 'below', 'to', 'from', 'up', 'down', 'in', 'out', 'on', 'off', 'over', 'under', 'again', 'further', 'then', 'once', 'here', 'there', 'when', 'where', 'why', 'how', 'all', 'any', 'both', 'each', 'few', 'more', 'most', 'other', 'some', 'such', 'no', 'nor', 'not', 'only', 'own', 'same', 'so', 'than', '

In [3]:
def remove_stopwords(stop_words,training_sentences):
  """Drop stop words from each sentence and lemmatize the remaining words.

  Matching against ``stop_words`` is case-sensitive (words are not
  lower-cased first). Kept words are passed through the module-level
  ``lemmatizer`` and re-joined with single spaces.

  Args:
    stop_words: container of words to remove.
    training_sentences: iterable of sentence strings.

  Returns:
    A list with one cleaned sentence string per input sentence.
  """
  def _clean(sentence):
    kept = [
      lemmatizer.lemmatize(token)
      for token in sentence.split()
      if token not in stop_words
    ]
    return ' '.join(kept).strip()

  return [_clean(sentence) for sentence in training_sentences]



In [9]:
# Load the intent definitions. JSON text is UTF-8, so the encoding is given
# explicitly instead of relying on the platform default.
with open('intents.json', encoding='utf-8') as file:
    data = json.load(file)


training_sentences = []  # one entry per pattern
training_labels = []     # intent tag of the pattern at the same position
labels = []              # distinct intent tags, in file order
responses = []           # response list of each intent, in file order


for intent in data['intents']:
    for pattern in intent['patterns']:
        training_sentences.append(pattern)
        training_labels.append(intent['tag'])
    responses.append(intent['responses'])

    if intent['tag'] not in labels:
        labels.append(intent['tag'])


#removing stopwords
training_sentences = remove_stopwords(stop_words,training_sentences)

# Flat list of (word, intent tag) pairs over all cleaned sentences.
document = []
for sentence, tag in zip(training_sentences, training_labels):
    for word in sentence.split():
        document.append((word, tag))

print(document,end='\n\n\n')


print("before encoding : ",training_labels,end = '\n\n')

print("after removing stopwords:")
print("new sentences : ",training_sentences,end = '\n\n')

#encoding training labels
# NOTE: training_labels is rebound here from a list of tag strings to the
# numpy array of integer ids returned by the encoder.
lbl_encoder = label_encoder(training_labels)
lbl_encoder.find_class_array()
training_labels = lbl_encoder.Label_Encoder()


print("after encoding : ",training_labels)


# Tokenizer / model hyper-parameters.
vocab_size = 1000
embedding_dim = 16
max_len = 20
oov_token = "<OOV>"

tokenizer = Tokenizer(num_words=vocab_size, oov_token=oov_token)
tokenizer.fit_on_texts(training_sentences)
word_index = tokenizer.word_index
# These two calls use the notebook's own helpers defined in the first cell.
sequences = texts_to_sequences(training_sentences,word_index)
padded_sequences = pad_sequences(sequences,max_len)

epochs = 550
# Training call left disabled; no `model` is defined in the cells shown here.
#history = model.fit(padded_sequences, np.array(training_labels), epochs=epochs)

[('Hi', 'greeting'), ('How', 'greeting'), ('Is', 'greeting'), ('anyone', 'greeting'), ('there?', 'greeting'), ('Hey', 'greeting'), ('Hola', 'greeting'), ('Hello', 'greeting'), ('Good', 'greeting'), ('day', 'greeting'), ('Bye', 'goodbye'), ('See', 'goodbye'), ('later', 'goodbye'), ('Goodbye', 'goodbye'), ('Nice', 'goodbye'), ('chatting', 'goodbye'), ('you,', 'goodbye'), ('bye', 'goodbye'), ('Till', 'goodbye'), ('next', 'goodbye'), ('time', 'goodbye'), ('Thanks', 'thanks'), ('Thank', 'thanks'), ("That's", 'thanks'), ('helpful', 'thanks'), ('Awesome,', 'thanks'), ('thanks', 'thanks'), ('Thanks', 'thanks'), ('helping', 'thanks'), ('How', 'options'), ('could', 'options'), ('help', 'options'), ('me?', 'options'), ('What', 'options'), ('do?', 'options'), ('What', 'options'), ('help', 'options'), ('provide?', 'options'), ('How', 'options'), ('helpful?', 'options'), ('What', 'options'), ('support', 'options'), ('offered', 'options'), ('How', 'adverse_drug'), ('check', 'adverse_drug'), ('Adverse

In [5]:
# Re-encode the cleaned sentences with the notebook's helper and show the ids.
seq = texts_to_sequences(
  training_sentences=training_sentences,
  word_index=word_index,
)
print(seq)

[[26], [10], [27, 28, 29], [30], [31], [32], [33, 34], [20], [35, 36], [37], [38, 39, 40, 20], [41, 42, 43], [11], [44], [45, 21], [46, 11], [11, 47], [10, 48, 22, 49], [12, 50], [12, 22, 51], [10, 21], [12, 52, 53], [10, 54, 5, 6, 13], [23, 5, 6, 24], [55, 14, 6, 56, 5, 57], [14, 6, 58, 4, 5, 13], [59, 6, 60, 5, 13], [23, 2, 3, 24], [61, 62, 2, 3], [2, 3, 15, 63], [16, 17, 64, 2, 3, 7], [2, 3, 15, 65], [16, 17, 18, 2, 3, 7, 66], [2, 3, 4], [67, 4, 2, 3, 7], [68, 2, 3, 7, 4], [19, 2, 3, 7, 69], [19, 8], [19, 8], [14, 8, 70], [71, 8], [18, 8], [25, 9], [72, 9, 73, 4], [16, 17, 18, 9, 15], [9, 25, 4], [74, 9, 75]]


In [6]:
# Show the padded id matrix built in the data-preparation cell.
print(padded_sequences)
# Disabled scratch checks kept from development:
#padded_sequences2[0,:].shape
# if seq == sequences:
#   print('true')

[[ 0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0 26]
 [ 0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0 10]
 [ 0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0 27 28 29]
 [ 0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0 30]
 [ 0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0 31]
 [ 0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0 32]
 [ 0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0 33 34]
 [ 0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0 20]
 [ 0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0 35 36]
 [ 0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0 37]
 [ 0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0 38 39 40 20]
 [ 0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0 41 42 43]
 [ 0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0 11]
 [ 0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0 44]
 [ 0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0 45 21]
 [ 0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0