In [1]:
# !pip install pandas
# !pip install nltk
# !pip install numpy==1.22.4
# !pip install requests==2.27.1
# !pip install setuptools==62.3.2
# !pip install scikit-learn
# !pip install openpyxl
# !pip install tensorflow
# !pip install --ignore-installed --upgrade --user tensorflow-gpu
# !pip install tflearn

### Check Tensorflow is running in GPU mode

In [2]:
import tensorflow as tf
tf.__version__
len(tf.config.list_physical_devices('GPU'))>0

True

In [3]:
import pandas as pd
import nltk
import numpy as np
import re
import random
from nltk.stem import wordnet # Lemmitization
from nltk import pos_tag # Part Of Speech
from nltk import word_tokenize # Tokenize text
from sklearn.feature_extraction.text import CountVectorizer # Bow
from sklearn.feature_extraction.text import TfidfVectorizer # TFIDF
from sklearn.metrics import pairwise_distances # Cosine similarity

# NN Dependencies
import tensorflow as tf 
from tensorflow.keras import Sequential 
from tensorflow.keras.layers import Dense, Dropout

### Load our data in a pandas instance

In [59]:
df = pd.read_excel('data\\dialog_talk_agent.xlsx')
df.head(35)

Unnamed: 0,Context,Text Response
0,Tell me about your personality,Just think of me as the ace up your sleeve.
1,I want to know you better,I can help you work smarter instead of harder
2,Define yourself,
3,Describe yourself,
4,tell me about yourself,
5,all about you,
6,tell me some stuff about you,
7,talk some stuff about you,
8,talk about yourself,
9,about yourself,


### Fills null values with previous values

In [60]:
df.ffill(axis=0, inplace=True)
df.head(35)

Unnamed: 0,Context,Text Response
0,Tell me about your personality,Just think of me as the ace up your sleeve.
1,I want to know you better,I can help you work smarter instead of harder
2,Define yourself,I can help you work smarter instead of harder
3,Describe yourself,I can help you work smarter instead of harder
4,tell me about yourself,I can help you work smarter instead of harder
5,all about you,I can help you work smarter instead of harder
6,tell me some stuff about you,I can help you work smarter instead of harder
7,talk some stuff about you,I can help you work smarter instead of harder
8,talk about yourself,I can help you work smarter instead of harder
9,about yourself,I can help you work smarter instead of harder


### Define a function to clean our text for vectorizing

In [6]:
def text_normalization(text):
    text = str(text).lower() # Convert text to lowercase
    special_char_rm = re.sub(r'[^ a-z]', '', text) # Remove special characters
    tokens = nltk.word_tokenize(special_char_rm) # Tokenize text
    lema = wordnet.WordNetLemmatizer() # Initialize Lemmatization
    tag_list = pos_tag(tokens, tagset=None) # POS tagging
    lema_words = [] # Initialize empty list
    for token, pos_token in tag_list: # I don't know why I've done this
        if pos_token.startswith('V'): # Verb
            pos_val = 'v'
        elif pos_token.startswith('J'): # Adjective
            pos_val = 'a'
        elif pos_token.startswith('R'): # Adverb
            pos_val = 'r'
        else:
            pos_val = 'n' # Noun
        lema_words.append(lema.lemmatize(token, pos_val)) # Perform Lemmatization and append to list
    
    return " ".join(lema_words) # Returns lemmatized tokens as a sentence

### Apply the text_normalization function to the 'Context' column, and append the returned data to a new column

In [7]:
df['Lemmatized Context'] = df['Context'].apply(text_normalization)
df.head()

Unnamed: 0,Context,Text Response,Lemmatized Context
0,Tell me about your personality,Just think of me as the ace up your sleeve.,tell me about your personality
1,I want to know you better,I can help you work smarter instead of harder,i want to know you good
2,Define yourself,I can help you work smarter instead of harder,define yourself
3,Describe yourself,I can help you work smarter instead of harder,describe yourself
4,tell me about yourself,I can help you work smarter instead of harder,tell me about yourself


### Initialize Tf-idf Vectorizer

In [8]:
tfidf = TfidfVectorizer()

### Initialize stopwords var

In [9]:
stopwords = nltk.corpus.stopwords.words("english")

### Fit our 'Lemmatized Context' data to the tf-idf vectorizer, and then fit it to a pandas DF

In [10]:
lemma_tfidf = tfidf.fit_transform(df['Lemmatized Context']).toarray()
df_tfidf = pd.DataFrame(lemma_tfidf, columns=tfidf.get_feature_names_out())
df_tfidf

Unnamed: 0,abort,about,absolutely,abysmal,actually,adore,advice,advise,affirmative,afraid,...,yeh,yep,yes,yet,you,your,youre,yours,yourself,yup
0,0.0,0.407572,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.000000,0.330555,0.0,0.0,0.000000,0.0
1,0.0,0.000000,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.218768,0.000000,0.0,0.0,0.000000,0.0
2,0.0,0.000000,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.000000,0.000000,0.0,0.0,0.641790,0.0
3,0.0,0.000000,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.000000,0.000000,0.0,0.0,0.641790,0.0
4,0.0,0.453790,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.000000,0.000000,0.0,0.0,0.608937,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1587,0.0,0.000000,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.000000,0.000000,0.0,0.0,0.000000,0.0
1588,0.0,0.000000,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.000000,0.000000,0.0,0.0,0.000000,0.0
1589,0.0,0.000000,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.000000,0.000000,0.0,0.0,0.000000,0.0
1590,0.0,0.000000,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.131753,0.000000,0.0,0.0,0.000000,0.0


### List of questions to pass to our functions

In [11]:
questions = [
    'Will you help me, and tell me about yourself?',
    'How are you doing?',
    'I love you',
    'Thanks for the support!',
    'Will you reply accurately?',
    'Will you marry me?',
    'You are amazing, I hope to see you soon!',
    'What is the meaning of life?'
]

### Function to process the quesiton using TF-idf and return a response from our DF using cosine similarity

In [12]:
def chat_tfidf(text):
    question_clean = []
    for word in text.split():
        word = re.sub(r"[^a-zA-Z0-9]+", '', word) # Removes special characters
        if word in stopwords: # Remove stopwords from our text
            pass
        else:
            question_clean.append(word)
    lemma = text_normalization(" ".join(question_clean)) # Join and normalize the text
    print(f"Question: {text}\nLemma: {lemma}")
    lemma_tfidf = tfidf.transform([lemma]).toarray() # apply tf-idf
    cos = 1-pairwise_distances(df_tfidf, lemma_tfidf, metric='cosine') # cosine similarity
    similarity_index = cos.argmax() # Get index value of highest similarity
    response = df['Text Response'].loc[similarity_index]
    return print(f"Answer: {response}")

In [13]:
for question in questions:
    chat_tfidf(question)
    print("")

Question: Will you help me, and tell me about yourself?
Lemma: will help tell
Answer: I'm glad to help. What can I do for you?

Question: How are you doing?
Lemma: how
Answer: Lovely, thanks.

Question: I love you
Lemma: i love
Answer: That's great to hear.

Question: Thanks for the support!
Lemma: thanks support
Answer: It's my pleasure to help.

Question: Will you reply accurately?
Lemma: will reply accurately
Answer: Oh, don't give up on me!

Question: Will you marry me?
Lemma: will marry
Answer: In the virtual sense that I can, sure.

Question: You are amazing, I hope to see you soon!
Lemma: you amaze i hope see soon
Answer: Bye.

Question: What is the meaning of life?
Lemma: what mean life
Answer: Sorry. I think I may have been a little confused by what you said.



### Import the Lancaster Stemmer to prevent duplicates in our texts, this will be used on the training data and on the testing/input data too

In [14]:
from nltk.stem.lancaster import LancasterStemmer
stemmer = LancasterStemmer()

### We load a small dataset containing query patterns tagged by intent. The data is processed and is appended to 6 different lists.

In [15]:
intents =  {
    "intents": [
        {"tag": "greeting",
         "patterns": ["Hi there", "How are you", "Is anyone there?", "Hey", "Hola", "Hello", "Good day"]
        },
        {"tag": "goodbye",
         "patterns": ["Bye", "See you later", "Goodbye", "Nice chatting to you, bye", "Till next time"]
        },
        {"tag": "thanks",
         "patterns": ["Thanks", "Thank you", "That's helpful", "Awesome, thanks", "Thanks for helping me"]
        },
        {"tag": "query",
         "patterns": ["What time is it?", "What's the date today?", "Is it raining outside?", "Tell me about that", "Will you give me more information?", "I have a question"]
        },
        {"tag": "angry",
         "patterns": ["That's not what I asked!", "You're so annoying", "You're an idiot", "You are useless!"]
        },
        {"tag": "compliment",
         "patterns": ["You're the best!", "I love you", "You look beautiful", "You're incredible", "That's amazing"]
        } 
    ]
 }


words = [] # List of unique words in our database: [word1, word2, word3]
labels = [] # List of intent tags: [tag1, tag2, tag3]
x_docs = [] # List of lists containing tokenized queries: [[query1], [query2], [query3]]
x_docs_join = [] # List of lemmatized queries: [query1, query2, query3]
x_docs_tagged = [] # List of tuples: [(query1, tag1), (query2, tag2), (query3, tag3)]
y_docs = [] # List of tags corresponding to x_docs in order: y_docs[1] == x_docs[1]

for intent in intents.values():
    for tags in intent:
        for pattern in tags['patterns']:
            pattern = text_normalization(pattern)
            lemma = nltk.word_tokenize(pattern) # Tokenize the query
            words.extend(lemma) # Add words to our words list
            lemma = [stemmer.stem(w) for w in lemma] # Stemming
            x_docs.append(lemma)
            x_docs_join.append(" ".join(lemma))
            y_docs.append(tags['tag'])
            if tags['tag'] not in labels: # Only append intent tag if not already in the list
                labels.append(tags['tag'])
            x_docs_tagged.append((" ".join(lemma), tags['tag']))
words = [stemmer.stem(w.lower()) for w in words if w.isalpha()]
words = sorted(list(set(words)))
labels = sorted(labels)
print(x_docs_tagged)

[('hi ther', 'greeting'), ('how be you', 'greeting'), ('be anyon ther', 'greeting'), ('hey', 'greeting'), ('hol', 'greeting'), ('hello', 'greeting'), ('good day', 'greeting'), ('bye', 'goodbye'), ('see you lat', 'goodbye'), ('goodby', 'goodbye'), ('nic chat to you bye', 'goodbye'), ('til next tim', 'goodbye'), ('thank', 'thanks'), ('thank you', 'thanks'), ('that help', 'thanks'), ('awesom thank', 'thanks'), ('thank for help me', 'thanks'), ('what tim be it', 'query'), ('what the dat today', 'query'), ('be it rain outsid', 'query'), ('tel me about that', 'query'), ('wil you giv me mor inform', 'query'), ('i hav a quest', 'query'), ('that not what i ask', 'angry'), ('yo so annoy', 'angry'), ('yo an idiot', 'angry'), ('you be useless', 'angry'), ('yo the best', 'compliment'), ('i lov you', 'compliment'), ('you look beauty', 'compliment'), ('yo incred', 'compliment'), ('that amaz', 'compliment')]


### Create a Pandas dataframe and then us Tf-idf to convert it into an array.

In [16]:
doc_dict = {
    'Query': x_docs_join,
    'Tag': y_docs
}
doc_df = pd.DataFrame(doc_dict)
doc_lemma_tfidf = tfidf.fit_transform(doc_df['Query']).toarray()
doc_tfidf = pd.DataFrame(doc_lemma_tfidf, columns=tfidf.get_feature_names_out())
doc_tfidf

Unnamed: 0,about,amaz,an,annoy,anyon,ask,awesom,be,beauty,best,...,ther,til,tim,to,today,useless,what,wil,yo,you
0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.666238,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.519876,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.441942
2,0.0,0.0,0.0,0.0,0.658825,0.0,0.0,0.468521,0.0,0.0,...,0.588589,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
5,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
6,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
7,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
8,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.393067
9,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


### A function to compare our questions against the dataset above using cosine similarity to predict the intent of the question

In [17]:
def chat_tag(text):
    question_clean = []
    for word in text.split():
        word = re.sub(r"[^a-zA-Z0-9]+", '', word) # Removes special characters
        if word in stopwords: # Remove stopwords from our text
            pass
        else:
            question_clean.append(word)
    lemma = text_normalization(" ".join(question_clean)) # Join and normalize the text
    print(f"Question: {text}\nLemma: {lemma}")
    lemma_tfidf = tfidf.transform([lemma]).toarray() # apply tf-idf
    cos = 1-pairwise_distances(doc_lemma_tfidf, lemma_tfidf, metric='cosine') # cosine similarity
    similarity_index = cos.argmax() # Get index value of highest similarity
    response = doc_df['Tag'].loc[similarity_index]
    return print(f"Intent: {response}")

In [18]:
for question in questions:
    chat_tag(question)
    print("")

Question: Will you help me, and tell me about yourself?
Lemma: will help tell
Intent: thanks

Question: How are you doing?
Lemma: how
Intent: greeting

Question: I love you
Lemma: i love
Intent: greeting

Question: Thanks for the support!
Lemma: thanks support
Intent: greeting

Question: Will you reply accurately?
Lemma: will reply accurately
Intent: greeting

Question: Will you marry me?
Lemma: will marry
Intent: greeting

Question: You are amazing, I hope to see you soon!
Lemma: you amaze i hope see soon
Intent: goodbye

Question: What is the meaning of life?
Lemma: what mean life
Intent: query



### The text has to be converted to a numerical form for the NN to process, we'll be using a BOW approach for this

In [19]:
training = []
output = []
out_empty = [0 for _ in range(len(labels))]

# One hot encoding, Converting the words to numerals
for x, doc in enumerate(x_docs):
    bag = []
    for w in words:
        if w in doc:
            bag.append(1)
        else:
            bag.append(0)

    output_row = out_empty[:]
    output_row[labels.index(y_docs[x])] = 1

    training.append(bag)
    output.append(output_row)

training = np.array(training)
output = np.array(output)

### The first layer will be our input layer, our training data defined above will be our single parameter

### There are three hidden layers responsible for processing the input data, then there is of course the output layer

### The model is saved locally using the tflearn module

### Load the model we trained

In [20]:
import tflearn

tf.compat.v1.reset_default_graph()

net = tflearn.input_data(shape=[None, len(training[0])])
net = tflearn.fully_connected(net, 10)
net = tflearn.fully_connected(net, 10)
net = tflearn.fully_connected(net, 10)
net = tflearn.fully_connected(net, len(output[0]), activation='softmax')

model = tflearn.DNN(net)
model.load('model.tflearn')

Instructions for updating:
non-resource variables are not supported in the long term
curses is not supported on this machine (please install/reinstall curses for an optimal experience)
Instructions for updating:
Call initializer instance with the dtype argument instead of passing it to the constructor
INFO:tensorflow:Restoring parameters from C:\Users\aaron\Jupyter\model.tflearn


### Since our model was trained with a BOW approach, our input must also be converted using BOW for it to work correctly

In [21]:
def bag_of_words(inp, words):
    bag = [0 for _ in range(len(words))]
    # inp_words = nltk.word_tokenize(inp)
    # inp_words = [stemmer.stem(word.lower()) for word in inp_words]

    for inp_word in inp:
        for index, word in enumerate(words):
            if word == inp_word:
                bag[index] = 1

    return np.array(bag)

### Function to predict the tag using our model using our NN

In [22]:
def chat_tag_nn(question):
    question_norm = text_normalization(question)
    question_lemma = nltk.word_tokenize(question_norm)
    question_lemma = [stemmer.stem(word.lower()) for word in question_lemma]
    
    results = model.predict([bag_of_words(question_lemma, words)])
    
    results_index = np.argmax(results)

    tag = labels[results_index]

    for value in intents.values():
        for tg in value:
            if tg['tag'] == tag:
                print(f"Question: {question}\nQuestion Stemmed: {question_lemma}\nTag: {tag}\nResponse: Null")

In [23]:
for question in questions:
    chat_tag_nn(question)
    print("")

Question: Will you help me, and tell me about yourself?
Question Stemmed: ['wil', 'you', 'help', 'me', 'and', 'tel', 'me', 'about', 'yourself']
Tag: query
Response: Null

Question: How are you doing?
Question Stemmed: ['how', 'be', 'you', 'do']
Tag: greeting
Response: Null

Question: I love you
Question Stemmed: ['i', 'lov', 'you']
Tag: compliment
Response: Null

Question: Thanks for the support!
Question Stemmed: ['thank', 'for', 'the', 'support']
Tag: thanks
Response: Null

Question: Will you reply accurately?
Question Stemmed: ['wil', 'you', 'reply', 'acc']
Tag: compliment
Response: Null

Question: Will you marry me?
Question Stemmed: ['wil', 'you', 'marry', 'me']
Tag: greeting
Response: Null

Question: You are amazing, I hope to see you soon!
Question Stemmed: ['you', 'be', 'amaz', 'i', 'hop', 'to', 'see', 'you', 'soon']
Tag: compliment
Response: Null

Question: What is the meaning of life?
Question Stemmed: ['what', 'be', 'the', 'mean', 'of', 'lif']
Tag: greeting
Response: Null



In [24]:
chat_tag_nn("tell me about big bang")

Question: tell me about big bang
Question Stemmed: ['tel', 'me', 'about', 'big', 'bang']
Tag: query
Response: Null


### Using cosine similarity we try to predict the intent of the queries in our original large dataset

In [25]:
df_tagged_cos = []
def df_tag_cos(text):
    question_clean = []
    for word in text.split():
        word = re.sub(r"[^a-zA-Z0-9]+", '', word) # Removes special characters
        if word in stopwords: # Remove stopwords from our text
            pass
        else:
            question_clean.append(word)
    lemma = " ".join(question_clean) # Join the text
    lemma_tfidf = tfidf.transform([lemma]).toarray() # apply tf-idf
    cos = 1-pairwise_distances(doc_lemma_tfidf, lemma_tfidf, metric='cosine') # cosine similarity
    similarity_index = cos.argmax() # Get index value of highest similarity
    tagged = x_docs_tagged[similarity_index][1]
    df_tagged_cos.append((text, tagged))

### If we print every 5th result, we see most come up as 'greeting', only one comes up as something different.

In [26]:
for data in df['Lemmatized Context']:
    df_tag_cos(data)
for query in df_tagged_cos[0:100:5]:
    print(query)

('tell me about your personality', 'greeting')
('all about you', 'greeting')
('who be you', 'greeting')
('say about you', 'greeting')
('what be your age', 'greeting')
('id like to know your age', 'greeting')
('youre so annoy', 'angry')
('you be irritate', 'greeting')
('answer me', 'greeting')
('can you answer a question for me', 'greeting')
('just answer the question', 'greeting')
('answer the question', 'greeting')
('give me the answer', 'greeting')
('you be horrible', 'greeting')
('you be no good', 'greeting')
('youre a bad', 'greeting')
('youre not very good', 'greeting')
('you be bad', 'greeting')
('be smarter', 'greeting')
('be clever', 'greeting')


### Now we'll try do the same prediction with our NN

In [27]:
df_tagged_nn = []

def df_tag_nn(text):
    text_norm = text_normalization(text)
    text_lemma = nltk.word_tokenize(text_norm)
    text_lemma = [stemmer.stem(word.lower()) for word in text_lemma]
    
    results = model.predict([bag_of_words(text_lemma, words)])
    
    results_index = np.argmax(results)

    tag = labels[results_index]
    for value in intents.values():
        for tg in value:
            if tg['tag'] == tag:
                df_tagged_nn.append((text, tag))

In [28]:
for data in df['Lemmatized Context']:
    df_tag_nn(data)

### We get more varied results with our NN, though not very accurate.

In [29]:
for query in df_tagged_nn[0:100:5]:
    print(query)

('tell me about your personality', 'query')
('all about you', 'greeting')
('who be you', 'compliment')
('say about you', 'greeting')
('what be your age', 'greeting')
('id like to know your age', 'angry')
('youre so annoy', 'angry')
('you be irritate', 'compliment')
('answer me', 'greeting')
('can you answer a question for me', 'greeting')
('just answer the question', 'greeting')
('answer the question', 'greeting')
('give me the answer', 'query')
('you be horrible', 'compliment')
('you be no good', 'greeting')
('youre a bad', 'greeting')
('youre not very good', 'angry')
('you be bad', 'compliment')
('be smarter', 'greeting')
('be clever', 'greeting')


### We'll put it back into a Pandas DataFrame

In [30]:
df_tagged_nn_query = []
df_tagged_nn_tags = []
for query, tag in df_tagged_nn:
    df_tagged_nn_query.append(query)
    df_tagged_nn_tags.append(tag)

In [31]:
df_tagged_nn_data = {
    'Query': df_tagged_nn_query,
    'Response': df['Text Response'],
    'Tag': df_tagged_nn_tags
}
df_tagged_nn_df = pd.DataFrame(df_tagged_nn_data)
df_tagged_nn_df.head(10)

Unnamed: 0,Query,Response,Tag
0,tell me about your personality,Just think of me as the ace up your sleeve.,query
1,i want to know you good,I can help you work smarter instead of harder,compliment
2,define yourself,I can help you work smarter instead of harder,angry
3,describe yourself,I can help you work smarter instead of harder,angry
4,tell me about yourself,I can help you work smarter instead of harder,query
5,all about you,I can help you work smarter instead of harder,greeting
6,tell me some stuff about you,I can help you work smarter instead of harder,query
7,talk some stuff about you,I can help you work smarter instead of harder,greeting
8,talk about yourself,I can help you work smarter instead of harder,greeting
9,about yourself,I can help you work smarter instead of harder,greeting


### We need to make our data more consistent. I decided since a lot of queries have the same response, these should be grouped together, and then can find which intent occures most often, and assign that intent to all queries in that group

#### Note: this method is flawed as we earlier used the df.ffill() method to quickly fill in the blank spaces in our data in the response column. This filled in blanks with the last response available. The problem with this is each "group" of queries has 1-2 responses, meaning there is always a group with just one response.

#### Looking at the raw data, not every query group has multiple responses, meaning it'll be difficult and time consuming to write a function to correct this, I might do this at some point, but for now I'm just going to leave it.

#

### This fun function processes our data by grouping the data by response, and picking the intent tag that occures most often and assigns it to that group

### The function returns three things, a list of tuples, and two dictionaries

In [32]:
# the labels var is where all our tags are stored
keys = [] # Query, Response, Tag

# Assigns the keys [Query, Response, Tag] to our list defined above
for key in df_tagged_nn_data.keys():
    keys.append(key)

def df_tag_norm(df):
    df_tagged_tuples = [] # List of tuples containing: (query, response, predicted tag)
    df_responses_numerical_dict = {} # Dictionary assigning each response an int: {response1: 0, response2: 1}
    df_query_by_response_dict = {} # Categorizes each query by the response given in the DataFrame: {response1: (query1, tag1), (query2, tag2)... response2: etc.}

    # Converts our responses to numerical values in the df_responses_numerical_dict dictionary
    for value in range(len(df[keys[0]])):
        df_tagged_tuples.append((df[keys[0]][value], df[keys[1]][value], df[keys[2]][value]))
        if df[keys[1]][value] not in df_responses_numerical_dict:
            df_responses_numerical_dict[df[keys[1]][value]] = len(df_responses_numerical_dict)

    # Assigns data to the df_query_by_response_dict dictionary, and counts the occurences of different intents and adds them together
    # with this we'll be able to identify which tags occures most often, and then we'll assign this tag to all of the queries in
    # the groups we created
    for df_tuple in df_tagged_tuples:
        index = df_responses_numerical_dict[df_tuple[1]]
        tags = []
        if df_responses_numerical_dict[df_tuple[1]] not in df_query_by_response_dict.keys():
            df_query_by_response_dict[index] = [[df_tuple[0]]]
        else:
            df_query_by_response_dict[index][0].append(df_tuple[0])

        # Creates a list to append to our dictionary if it doesn't already exist in the following format: [0, 0, 0, 0, 0, 0]
        for tag in labels:
            if df_tuple[2] == tag:
                tags.append(1)
            else:
                tags.append(0)

        # If it does already exists, we take the values already there, and add them together with the new ones, assigning this new value
        # in place of the old one
        if len(df_query_by_response_dict[index]) < 2:
            df_query_by_response_dict[index].append(tags)
        else:
            new_tags = []
            for tag_index, value in enumerate(df_query_by_response_dict[index][1]):
                new_value = tags[tag_index] + value 
                new_tags.append(new_value)
            df_query_by_response_dict[index][1] = new_tags

    return df_tagged_tuples, df_responses_numerical_dict, df_query_by_response_dict
   
df_tagged_tuples, df_responses_numerical_dict, df_query_by_response_dict = df_tag_norm(df_tagged_nn_data)


for index in range(10):
    print(df_query_by_response_dict[index])

[['tell me about your personality'], [0, 0, 0, 0, 1, 0]]
[['i want to know you good', 'define yourself', 'describe yourself', 'tell me about yourself', 'all about you', 'tell me some stuff about you', 'talk some stuff about you', 'talk about yourself', 'about yourself', 'who be you', 'introduce yourself', 'i want to know more about you', 'what be you', 'what be your personality', 'say about you', 'tell me about you', 'why be you here', 'why be you here'], [3, 4, 0, 8, 3, 0]]
[['be you year old', 'what be your age', 'how old be you', 'age of yours', 'how old be your platform', 'tell me your age', 'id like to know your age', 'id like to know your age'], [3, 1, 0, 3, 1, 0]]
[['you be annoy'], [0, 1, 0, 0, 0, 0]]
[['i find you annoy', 'youre incredibly annoy', 'youre so annoy', 'youre too annoy', 'you be annoy me so much', 'you annoy me', 'you be such annoy', 'you be irritate', 'you be annoy me', 'you be very annoy', 'how annoying you be', 'how annoying you be'], [3, 4, 0, 5, 0, 0]]
[['ans

### Now using the df_query_by_response_dict dictionary created above we'll identify which intent tag occures most often in a given group, and assign that tag to all queries in that group

In [33]:
def tag_occurence(df_dict):
    df_queries_tagged = df_dict.copy()
    for key, value in df_dict.items():
        highest_val = (0, 0)
        for tag in enumerate(value[1]):
            if tag[1] > highest_val[1]:
                highest_val = tag
        df_queries_tagged[key][1] = labels[highest_val[0]]
    return df_queries_tagged

df_queries_tagged = tag_occurence(df_query_by_response_dict)

for index in range(10):
    print(df_queries_tagged[index])

[['tell me about your personality'], 'query']
[['i want to know you good', 'define yourself', 'describe yourself', 'tell me about yourself', 'all about you', 'tell me some stuff about you', 'talk some stuff about you', 'talk about yourself', 'about yourself', 'who be you', 'introduce yourself', 'i want to know more about you', 'what be you', 'what be your personality', 'say about you', 'tell me about you', 'why be you here', 'why be you here'], 'greeting']
[['be you year old', 'what be your age', 'how old be you', 'age of yours', 'how old be your platform', 'tell me your age', 'id like to know your age', 'id like to know your age'], 'angry']
[['you be annoy'], 'compliment']
[['i find you annoy', 'youre incredibly annoy', 'youre so annoy', 'youre too annoy', 'you be annoy me so much', 'you annoy me', 'you be such annoy', 'you be irritate', 'you be annoy me', 'you be very annoy', 'how annoying you be', 'how annoying you be'], 'greeting']
[['answer me'], 'greeting']
[['i want the answer n

### Now we've processed our data, we need to convert it back into a Pandas DataFrame

In [34]:
def df_create(df):
    proc_query = []
    proc_response = []
    proc_tag = []

    for key, value in df.items():
        for query in value[0]:
            proc_query.append(query)
            proc_response.append(list(df_responses_numerical_dict.keys())[key])
            proc_tag.append(value[1])

    data_proc = {
        'Query': proc_query,
        'Response': proc_response,
        'Tag': proc_tag
    }
    df_proc = pd.DataFrame(data_proc)
    return df_proc
    
df_proc = df_create(df_queries_tagged)
df_proc.head(30)

Unnamed: 0,Query,Response,Tag
0,tell me about your personality,Just think of me as the ace up your sleeve.,query
1,i want to know you good,I can help you work smarter instead of harder,greeting
2,define yourself,I can help you work smarter instead of harder,greeting
3,describe yourself,I can help you work smarter instead of harder,greeting
4,tell me about yourself,I can help you work smarter instead of harder,greeting
5,all about you,I can help you work smarter instead of harder,greeting
6,tell me some stuff about you,I can help you work smarter instead of harder,greeting
7,talk some stuff about you,I can help you work smarter instead of harder,greeting
8,talk about yourself,I can help you work smarter instead of harder,greeting
9,about yourself,I can help you work smarter instead of harder,greeting


### Now we've got our database (somewhat) uniformally tagged with predicted intent, we can perform cosine similarity on the users input to try estimate the queries intent with hopefully a greater deal of accuracy than we've previously had, as well as providing a response from our DataFrame.

### First we transform and fit our data using Tf-idf, and then run the same function we used earlier for cosine similarity. I've removed the stopwords removal function here as I found it increased accuracy.

In [35]:
df_proc_transform = tfidf.fit_transform(df_proc['Query']).toarray()
df_proc_tfidf = pd.DataFrame(df_proc_transform, columns=tfidf.get_feature_names_out())

In [36]:
def chat_tag_optimised(text):
    question_clean = []
    for word in text.split():
        word = re.sub(r"[^a-zA-Z0-9]+", '', word) # Removes special characters
        # if word in stopwords: # Remove stopwords from our text
        #     pass
        # else:
        question_clean.append(word)
    lemma = text_normalization(" ".join(question_clean)) # Join and normalize the text
    print(f"Question: {text}\nLemma: {lemma}")
    lemma_tfidf = tfidf.transform([lemma]).toarray() # apply tf-idf
    cos = 1-pairwise_distances(df_proc_tfidf, lemma_tfidf, metric='cosine') # cosine similarity
    similarity_index = cos.argmax() # Get index value of highest similarity
    response = df_proc['Tag'].loc[similarity_index]
    print(f"Similar query: {df_proc['Query'].loc[similarity_index]}")
    return print(f"Intent: {response}")

In [37]:
for question in questions:
    chat_tag_optimised(question)
    print("")

Question: Will you help me, and tell me about yourself?
Lemma: will you help me and tell me about yourself
Similar query: tell me about yourself
Intent: greeting

Question: How are you doing?
Lemma: how be you do
Similar query: how be you do
Intent: greeting

Question: I love you
Lemma: i love you
Similar query: love you
Intent: angry

Question: Thanks for the support!
Lemma: thanks for the support
Similar query: thanks for your help
Intent: thanks

Question: Will you reply accurately?
Lemma: will you reply accurately
Similar query: i will fire you
Intent: angry

Question: Will you marry me?
Lemma: will you marry me
Similar query: marry me
Intent: greeting

Question: You are amazing, I hope to see you soon!
Lemma: you be amaze i hope to see you soon
Similar query: see you soon
Intent: goodbye

Question: What is the meaning of life?
Lemma: what be the meaning of life
Similar query: how be your life
Intent: greeting



### Considering how small our intent database is, this is a pretty good result, there's very obvious errors, but if I were to take some time to expand the database, perhaps change some of the parameters on the NN training, we could probably get some decent results out of this.





#





### We're going to try boost the performance of our very small dataset by using EDA_NLP (Easy Data Augmentation NLP): https://github.com/jasonwei20/eda_nlp

### We need to format our data to work with EDA_NLP

In [38]:
intent_tags = []
intent_tags_numerical = []
queries = []

for intent in intents.values():
    for tags in intent:
        for pattern in tags['patterns']:
            queries.append(pattern)
            intent_tags.append(tags['tag'])
  
for intent_tag in intent_tags:
    for index, label in enumerate(labels):
        if label == intent_tag:
            intent_tags_numerical.append(index)
            
print(intent_tags)
print("Converted into:")
print(intent_tags_numerical)

['greeting', 'greeting', 'greeting', 'greeting', 'greeting', 'greeting', 'greeting', 'goodbye', 'goodbye', 'goodbye', 'goodbye', 'goodbye', 'thanks', 'thanks', 'thanks', 'thanks', 'thanks', 'query', 'query', 'query', 'query', 'query', 'query', 'angry', 'angry', 'angry', 'angry', 'compliment', 'compliment', 'compliment', 'compliment', 'compliment']
Converted into:
[3, 3, 3, 3, 3, 3, 3, 2, 2, 2, 2, 2, 5, 5, 5, 5, 5, 4, 4, 4, 4, 4, 4, 0, 0, 0, 0, 1, 1, 1, 1, 1]


In [39]:
queries_dict = {
    'Tag': intent_tags_numerical,
    'Query': queries    
}
queries_df = pd.DataFrame(queries_dict)
queries_df.head(10)

Unnamed: 0,Tag,Query
0,3,Hi there
1,3,How are you
2,3,Is anyone there?
3,3,Hey
4,3,Hola
5,3,Hello
6,3,Good day
7,2,Bye
8,2,See you later
9,2,Goodbye


### Save our data as a .txt file to be processed by EDA_NLP

### Had to run eda_nlp outside of Jupyter

### Since our dataset contains a lot of short sentences, I found that the boosting algorithm I used resulted in a lot of duplicate queries, we'll not add duplicate queries to our lists

In [40]:
eda_words = [] # Unique words
eda_queries = [] # x axis
eda_queries_join = [] # x axis joined
eda_intents = [] # y axis

with open('eda_queries.txt', 'r') as file:
    for line in file:
        line_sep = re.search(r"(\d*)\t(.*)", line) # Group 1 is the tag, group 2 is the query
        lemma = nltk.word_tokenize(line_sep.group(2))
        if lemma not in eda_queries:
            eda_queries.append(lemma)
            eda_words.extend(lemma)
            eda_queries_join.append(" ".join(lemma)) # Appends query to our list if it doesn't already exist
            eda_intents.append(line_sep.group(1)) # Appends intent tag


eda_words = sorted(list(set(eda_words)))
            
print(f"Original queries len: {len(queries)}\nNew queries len: {len(eda_queries_join)}")
print(eda_queries[:50])
print("")
print(f"Original words len: {len(words)}\neda_words len: {len(eda_words)}")
print(eda_words[:50])

Original queries len: 32
New queries len: 186
[['there', 'hi'], ['hi', 'in', 'that', 'respect', 'there'], ['aloha', 'state', 'there'], ['hi', 'there'], ['in', 'that', 'respect', 'hi', 'there'], ['howdy', 'there'], ['how', 'are', 'you'], ['how', 'represent', 'are', 'you'], ['you', 'are', 'how'], ['are', 'how', 'you'], ['is', 'anyone', 'there'], ['there', 'anyone', 'is'], ['anyone', 'is', 'there'], ['is', 'anyone', 'in', 'that', 'respect', 'there'], ['in', 'that', 'respect', 'is', 'anyone', 'there'], ['is', 'in', 'that', 'respect', 'anyone', 'there'], ['hey'], ['hola'], ['hello'], ['howdy'], ['hi'], ['how', 'do', 'you', 'do'], ['hi', 'hello'], ['salutary', 'day'], ['solar', 'day', 'good', 'day'], ['day', 'good'], ['good', 'day'], ['adept', 'good', 'day'], ['good', 'adept', 'day'], ['good', 'clarence', 'day'], ['au', 'revoir'], ['bye'], ['good', 'bye', 'bye'], ['bye', 'bye'], ['figure', 'you', 'later'], ['you', 'see', 'later'], ['see', 'you', 'get', 'word', 'later'], ['see', 'get', 'word'

### We generated 320 queries from our 32 query database, once we remove the duplicates we're left with 186

### The amount of unique words in our database is also about triple what it was

In [41]:
eda_intents_alpha = []
for intent in eda_intents:
    alpha_intent = labels[int(intent)]
    eda_intents_alpha.append(alpha_intent)

In [42]:
eda_queries_dict = {
    'Query': eda_queries_join, 
    'Tag': eda_intents_alpha
}
eda_queries_df = pd.DataFrame(eda_queries_dict)
eda_queries_df.head(10)

Unnamed: 0,Query,Tag
0,there hi,greeting
1,hi in that respect there,greeting
2,aloha state there,greeting
3,hi there,greeting
4,in that respect hi there,greeting
5,howdy there,greeting
6,how are you,greeting
7,how represent are you,greeting
8,you are how,greeting
9,are how you,greeting


### Convert our data to an array for cosine similarity checks

In [43]:
eda_lemma_tfidf = tfidf.fit_transform(eda_queries_df['Query']).toarray()
eda_tfidf = pd.DataFrame(eda_lemma_tfidf, columns=tfidf.get_feature_names_out())
eda_tfidf

Unnamed: 0,about,adept,adieu,adjacent,aloha,amazing,ampere,an,annoying,anyone,...,unbelievable,useless,what,whats,will,wit,with,word,you,youre
0,0.0,0.0,0.0,0.0,0.00000,0.000000,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,0.0,0.0,0.0,0.0,0.00000,0.000000,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,0.0,0.0,0.0,0.0,0.67365,0.000000,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,0.0,0.0,0.0,0.0,0.00000,0.000000,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,0.0,0.0,0.0,0.0,0.00000,0.000000,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
181,0.0,0.0,0.0,0.0,0.00000,0.000000,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
182,0.0,0.0,0.0,0.0,0.00000,0.777391,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
183,0.0,0.0,0.0,0.0,0.00000,0.558068,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
184,0.0,0.0,0.0,0.0,0.00000,0.777391,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


### We'll again use BOW to convert our data to a numerical form

In [44]:
eda_training = []
eda_output = []
eda_out_empty = [0 for _ in range(len(labels))]

# One hot encoding, Converting the words to numerals
for x, doc in enumerate(eda_queries):
    bag = []
    for w in eda_words:
        if w in doc:
            bag.append(1)
        else:
            bag.append(0)

    output_row = out_empty[:]
    output_row[labels.index(eda_intents_alpha[x])] = 1

    eda_training.append(bag)
    eda_output.append(output_row)


eda_training = np.array(eda_training)
eda_output = np.array(eda_output)

### I trained the model initially using the same code as I used earlier, but I found not only did I get better results by removing 'batch_size=8', but it also processed significantly quicker. I suspect that the better results come from the model no longer overfitting the data.

### Load the model we trained

In [45]:
tf.compat.v1.reset_default_graph() # Without this, you cannot train a NN twice in one notebook

eda_net = tflearn.input_data(shape=[None, len(eda_training[0])])
eda_net = tflearn.fully_connected(eda_net, 10)
eda_net = tflearn.fully_connected(eda_net, 10)
eda_net = tflearn.fully_connected(eda_net, 10)
eda_net = tflearn.fully_connected(eda_net, len(eda_output[0]), activation='softmax')

model_eda = tflearn.DNN(eda_net)
model_eda.load('model_eda.tflearn')

INFO:tensorflow:Restoring parameters from C:\Users\aaron\Jupyter\model_eda.tflearn


### Now we've trained our new model, we'll test it in the same way we tested our first one earlier

In [46]:
def chat_tag_nn_eda(question):
    question_norm = text_normalization(question)
    question_lemma = nltk.word_tokenize(question_norm)
    # question_lemma = [stemmer.stem(word.lower()) for word in question_lemma]
    
    results = model_eda.predict([bag_of_words(question_lemma, eda_words)])
    
    results_index = np.argmax(results)

    tag = labels[results_index]

    for value in intents.values():
        for tg in value:
            if tg['tag'] == tag:
                print(f"Question: {question}\nQuestion Stemmed: {question_lemma}\nTag: {tag}\nResponse: Null")

In [47]:
for question in questions:
    chat_tag_nn_eda(question)
    print("")

Question: Will you help me, and tell me about yourself?
Question Stemmed: ['will', 'you', 'help', 'me', 'and', 'tell', 'me', 'about', 'yourself']
Tag: query
Response: Null

Question: How are you doing?
Question Stemmed: ['how', 'be', 'you', 'do']
Tag: greeting
Response: Null

Question: I love you
Question Stemmed: ['i', 'love', 'you']
Tag: compliment
Response: Null

Question: Thanks for the support!
Question Stemmed: ['thanks', 'for', 'the', 'support']
Tag: thanks
Response: Null

Question: Will you reply accurately?
Question Stemmed: ['will', 'you', 'reply', 'accurately']
Tag: query
Response: Null

Question: Will you marry me?
Question Stemmed: ['will', 'you', 'marry', 'me']
Tag: query
Response: Null

Question: You are amazing, I hope to see you soon!
Question Stemmed: ['you', 'be', 'amaze', 'i', 'hope', 'to', 'see', 'you', 'soon']
Tag: goodbye
Response: Null

Question: What is the meaning of life?
Question Stemmed: ['what', 'be', 'the', 'meaning', 'of', 'life']
Tag: query
Response: Nu

### We'll try predict the intents of our original DataFrame once more with our new model

In [48]:
df_tagged_eda_nn = []

def df_tag_eda_nn(text):
    text_norm = text_normalization(text)
    text_lemma = nltk.word_tokenize(text_norm)
    # text_lemma = [stemmer.stem(word.lower()) for word in text_lemma]
    
    results = model_eda.predict([bag_of_words(text_lemma, eda_words)])
    
    results_index = np.argmax(results)

    tag = labels[results_index]
    for value in intents.values():
        for tg in value:
            if tg['tag'] == tag:
                df_tagged_eda_nn.append((text, tag))

In [49]:
for data in df['Lemmatized Context']:
    df_tag_eda_nn(data)

In [50]:
for query in df_tagged_eda_nn[0:100:5]:
    print(query)

('tell me about your personality', 'query')
('all about you', 'greeting')
('who be you', 'thanks')
('say about you', 'query')
('what be your age', 'query')
('id like to know your age', 'thanks')
('youre so annoy', 'angry')
('you be irritate', 'thanks')
('answer me', 'query')
('can you answer a question for me', 'query')
('just answer the question', 'query')
('answer the question', 'query')
('give me the answer', 'thanks')
('you be horrible', 'thanks')
('you be no good', 'greeting')
('youre a bad', 'greeting')
('youre not very good', 'angry')
('you be bad', 'thanks')
('be smarter', 'thanks')
('be clever', 'thanks')


In [51]:
df_tagged_eda_nn_query = []
df_tagged_eda_nn_tags = []
for query, tag in df_tagged_eda_nn:
    df_tagged_eda_nn_query.append(query)
    df_tagged_eda_nn_tags.append(tag)

In [52]:
df_tagged_eda_nn_data = {
    'Query': df_tagged_eda_nn_query,
    'Response': df['Text Response'],
    'Tag': df_tagged_eda_nn_tags
}
df_tagged_eda_nn_df = pd.DataFrame(df_tagged_nn_data)

In [53]:
df_tagged_tuples_eda, df_responses_numerical_dict_eda, df_query_by_response_dict_eda = df_tag_norm(df_tagged_eda_nn_df)
df_queries_eda_tagged = tag_occurence(df_query_by_response_dict_eda)
df_eda_proc = df_create(df_queries_eda_tagged)


### We'll write our final chat function, to use our newest NN model to check for intent, obtain a response using our original cosine similarity method, and also obtain the predicted intent from our response we obtained via cosine similarity

In [54]:
lemma_tfidf = tfidf.fit_transform(df['Lemmatized Context']).toarray()
df_tfidf = pd.DataFrame(lemma_tfidf, columns=tfidf.get_feature_names_out())

def chat_nn_eda(question):
    question_norm = text_normalization(question)
    question_lemma = nltk.word_tokenize(question_norm)
    # Predict intent using our NN model
    results = model_eda.predict([bag_of_words(question_lemma, eda_words)])
    results_index = np.argmax(results)
    tag = labels[results_index]

    # Obtain response
    question_clean = []
    for word in question.split():
        word = re.sub(r"[^a-zA-Z0-9]+", '', word) # Removes special characters
        if word in stopwords: # Remove stopwords from our text
            pass
        else:
            question_clean.append(word)
    lemma = text_normalization(" ".join(question_clean)) # Join and normalize the text
    response = df['Text Response'].loc[results_index]
    lemma_tfidf = tfidf.transform([lemma]).toarray() # apply tf-idf
    cos = 1-pairwise_distances(df_tfidf, lemma_tfidf, metric='cosine') # cosine similarity
    similarity_index = cos.argmax() # Get index value of highest similarity
    response = df['Text Response'].loc[similarity_index]
    response_tag = df_eda_proc['Tag'].loc[similarity_index]
    
    if response_tag == tag:
        match = True
    else:
        match = False


    for value in intents.values():
        for tg in value:
            if tg['tag'] == tag:
                print(f"Question: {question}\nIntent: {tag}\nResponse: {response}\ndf_intent: {response_tag}\nTag match: {match}")

In [55]:
for question in questions:
    chat_nn_eda(question)
    print("")

Question: Will you help me, and tell me about yourself?
Intent: query
Response: I'm glad to help. What can I do for you?
df_intent: thanks
Tag match: False

Question: How are you doing?
Intent: greeting
Response: Lovely, thanks.
df_intent: greeting
Tag match: True

Question: I love you
Intent: compliment
Response: That's great to hear.
df_intent: angry
Tag match: False

Question: Thanks for the support!
Intent: thanks
Response: It's my pleasure to help.
df_intent: thanks
Tag match: True

Question: Will you reply accurately?
Intent: query
Response: Oh, don't give up on me!
df_intent: angry
Tag match: False

Question: Will you marry me?
Intent: query
Response: In the virtual sense that I can, sure.
df_intent: greeting
Tag match: False

Question: You are amazing, I hope to see you soon!
Intent: goodbye
Response: Bye.
df_intent: goodbye
Tag match: True

Question: What is the meaning of life?
Intent: query
Response: Sorry. I think I may have been a little confused by what you said.
df_inten

### Intent prediction on my questions appears decently accurate, however it is of course a tiny test set to use against my NN. The intent prediction for the original dataframe starts to show the short comings of my NN.

### However, considering I initially trained this NN on a such a small database, I'm really happy with the results I've achieved. I'm confident that if I were to start expanding the database, adding in more tagged queries, I'd see an increase in accuracy.

### I also wonder if I could improve the NN by labelling its predictions as True or False, allowing it to learn from its mistakes. I've read about this being possible with a NN, but I've no idea how I'd implement something like this, so it's something I'll need to research further.