## This file includes the preprocessing steps

In [1]:

import numpy as np
from spacy.lang.en import English
import en_core_web_sm
# Loaded spaCy pipeline from the en_core_web_sm package; the functions in this
# file call it on single lower-cased words to read each word's lemma.
sp = en_core_web_sm.load()

In [3]:
# get_elements collects the documents, the intent classes and the vocabulary words from the intents dataset

# We use the spaCy library for preprocessing, on English text.
# The functions in this file call this object to split a string into tokens.
nlp = English()


def get_elements(intents):
    """Extract the training documents, intent classes and vocabulary.

    Args:
        intents: Mapping with an ``'intents'`` entry. Each item of that entry
            provides ``'patterns'`` (an iterable of example sentences) and a
            ``'tag'``.

    Returns:
        A ``(documents, classes, words)`` tuple:

        * ``documents`` -- list of ``(tokens, tag)`` pairs, one per pattern,
          where ``tokens`` is the list of token texts as produced by ``nlp``.
        * ``classes`` -- sorted list of the unique tags that own at least one
          pattern.
        * ``words`` -- sorted list of the unique lemmas of the lower-cased
          tokens, excluding the tokens listed in ``ignore_words``.
    """
    ignore_words = {'?'}
    documents = []
    classes = set()
    lowered_tokens = set()

    # Loop over every intent object and every query a user can have for it.
    for intent in intents['intents']:
        for pattern in intent['patterns']:
            # Tokenise the pattern with the spaCy language object.
            tokens = [token.text for token in nlp(pattern)]

            # Keep the untouched tokens together with the intent tag.
            documents.append((tokens, intent['tag']))
            classes.add(intent['tag'])

            # Collect vocabulary candidates; the ignore check is done on the
            # original token text, before lower-casing.
            lowered_tokens.update(
                t.lower() for t in tokens if t not in ignore_words
            )

    # Lemmatise each distinct lower-cased token only once: running the full
    # pipeline is the expensive step and duplicates would be discarded by the
    # de-duplication below anyway.
    words = sorted({sp(t)[0].lemma_ for t in lowered_tokens})

    return documents, sorted(classes), words
    

In [5]:
#cleaning the sentence for the prediction process
def clean_up_sentence(sentence):
    """Tokenise *sentence* and return the lemma of each lower-cased token.

    The sentence is split into tokens with ``nlp``; every token text is
    lower-cased and passed on its own through ``sp`` to read its lemma.
    The returned list has one entry per token, in sentence order.
    """
    lemmas = []
    for tok in nlp(sentence):
        lowered = tok.text.lower()
        # Each word is lemmatised in isolation: only the first token of the
        # resulting document is used.
        lemmas.append(sp(lowered)[0].lemma_)
    return lemmas


In [6]:
# return bag of words array: 0 or 1 for each word in the bag that exists in the sentence
def bow(sentence, words, show_details=True):
    """Return a binary bag-of-words array for *sentence*.

    Args:
        sentence: Raw text; it is normalised with ``clean_up_sentence``.
        words: Vocabulary; position ``i`` of the result corresponds to the
            ``i``-th vocabulary entry.
        show_details: When true, print one line for every match between a
            sentence word and a vocabulary entry.

    Returns:
        A NumPy array with ``len(words)`` entries, holding 1 at every
        position whose vocabulary entry occurs in the sentence and 0
        everywhere else.
    """
    sentence_words = clean_up_sentence(sentence)

    # Index the vocabulary once so each sentence word is resolved with a
    # single lookup instead of a scan over the whole vocabulary. Every
    # position is kept (in ascending order) so duplicated vocabulary entries
    # are all marked and reported, as a full scan would do.
    positions = {}
    for i, w in enumerate(words):
        positions.setdefault(w, []).append((i, w))

    bag = [0] * len(words)
    for s in sentence_words:
        for i, w in positions.get(s, ()):
            bag[i] = 1
            if show_details:
                print("found in bag: %s" % w)

    return np.array(bag)
