# Appendix ____________________ - Python Code To Create Chatbots about HR Policy

## 1) Chatbot 1 - Sentence-Based Transformer Chatbot That Leverages Cosine Similarity

In [None]:
!pip install pydantic==1.10.2

In [1]:
import pandas as pd
import numpy as np
import re
import string
import seaborn as sns
import matplotlib.pyplot as plt
import nltk
from collections import Counter
from dataclasses import dataclass
from timeit import default_timer as timer
import random
from nltk.corpus import stopwords
from nltk.stem.wordnet import WordNetLemmatizer
from nltk.tokenize import word_tokenize 

import gensim
from gensim.models import Word2Vec

#import spacy
#from spacy import displacy

#from spacy.matcher import Matcher 
#from spacy.tokens import Span 

import networkx as nx
import matplotlib.pyplot as plt
from tqdm import tqdm
!pip install sentence-transformers
from sentence_transformers import SentenceTransformer, util
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.metrics.pairwise import cosine_similarity


from IPython.display import display, HTML

from typing import List, Callable, Dict, Tuple, Set

pd.set_option('max_colwidth', 600)
pd.set_option('display.max_rows', 500)




[notice] A new release of pip is available: 23.2.1 -> 23.3.2
[notice] To update, run: python.exe -m pip install --upgrade pip


In [2]:
#Load Sentence Transformer model optimized for  sentence cosine similarity calculations

model = SentenceTransformer('multi-qa-MiniLM-L6-cos-v1')

In [3]:
# Only run this once, they will be downloaded.
nltk.download('stopwords',quiet=True)
nltk.download('wordnet',quiet=True)
nltk.download('punkt',quiet=True)
nltk.download('omw-1.4',quiet=True)

True

In [4]:
#read in data
CORPUS_PATH = 'C:/Users/steve/OneDrive/Desktop/Github/HR_Policy_Chatbot_Capstone_Project/corpus.txt'
f=open(CORPUS_PATH,'r',errors = 'ignore')
raw=f.read()
raw=raw.lower()# converts to lowercase

#create list of sentences and words
sent_tokens = nltk.sent_tokenize(raw)# converts to list of sentences 
word_tokens = nltk.word_tokenize(raw)# converts to list of words

In [5]:
#create greetings and greetings function

GREETING_INPUTS = ("hello", "hi", "greetings", "sup", "what's up","hey",)
GREETING_RESPONSES = ["Hello"]


# Checking for greetings
def greeting(sentence):
    """If user's input is a greeting, return a greeting response"""
    for word in sentence.split():
        if word.lower() in GREETING_INPUTS:
            return random.choice(GREETING_RESPONSES)

In [6]:

# Generating response function 
def response(user_response):
    chatbot_response=''
    sentence_encodings=model.encode(sent_tokens, convert_to_tensor=True)# generate sentence transformer embeddings
    sentence_encodings=sentence_encodings.cpu()
    vals = cosine_similarity(sentence_encodings[-1].reshape(1, -1), sentence_encodings) #the chatbot conversation code 
    #in the next cell adds the question as the last sentence of the sentence tokens, before calling this response function.
    #The code takes the last sentence (which is the question) and gets cosine similarities vs all the sentences in the corpus,
    #including itself
    idx=vals.argsort()[0][-2] #gets the index of the second highest similarity (the first highest would be the question itself)
    flat = vals.flatten()#reduces dimension of cosine similarity array to be able to sort
    flat.sort() #sort the cosine similarity values
    second_cos_sim_val = flat[-2] #get the second highest cosine similarity value.
    if(second_cos_sim_val==0): #check the second highest cosine similarity value. If it's zero return the no match response,
        #else return highest cosine similarity sentence.
        chatbot_response=chatbot_response+"Sorry, I do not have an answer to your question in my database"
        return chatbot_response
    else:
        chatbot_response = chatbot_response+sent_tokens[idx] #use index of highest cosine similarity to get original sentence
        return chatbot_response

In [7]:
#Chatbot interaction code

flag=True
print("Welcome to the Informational Chatbot About Human Resources policy at the Department of Health and Human Services.  Please note that this chatbot is just one graduate student's experimental final project and in no way actually speaks for the U.S. Department of Health and Human Resources. To end this session, please type exit.")
print("\n")

while(flag==True):
    user_response = input()
    user_response=user_response.lower()
    if user_response!='exit':
        if(user_response=='thanks' or user_response=='thank you' ):
            flag=False
            print("Answer: You are welcome!")
        else:
            if(greeting(user_response)!=None):
                print("Answer: "+greeting(user_response))
            else:
                sent_tokens.append(user_response)
                word_tokens=word_tokens+nltk.word_tokenize(user_response)
                final_words=list(set(word_tokens))
                print("Answer: ",end="")
                print(response(user_response))
                print("\n")
                sent_tokens.remove(user_response)
    else:
        flag=False
        print("Thank you for using this chatbot service. Goodbye.")    


Welcome to the Informational Chatbot About Human Resources policy at the Department of Health and Human Services.  Please note that this chatbot is just one graduate student's experimental final project and in no way actually speaks for the U.S. Department of Health and Human Resources. To end this session, please type exit.


How is compensation determined when appointing special consultants?
Answer: â§ 22.3(a) appointments of special consultants, provides:

when the public health service requires the services of consultants who cannot be obtained when needed through regular civil service appointment or under the compensation provisions of the classification act of 1949, special consultants to assist and advise in the operations of the service may be appointed, subject to the provisions of the following paragraphs and in accordance with such instructions as may be issued from time to time by the secretary of health and human services.


thank you
Answer: You are welcome!


In [None]:






 ######################## UPDATE THIS WHEN ASSESSING MODEL PERFORMANCE #####################################################


    
    
    
    
    
chatbot_models = []

for i in range(20):
    chatbot_models.append(1)
    
model_1_performance_list = ["Incorrect",
                          "Correct",
                          "Correct",
                          "Incorrect",
                          "Incorrect",
                          "Incorrect",
                          "Correct",
                          "Incorrect",
                          "Incorrect",
                          "Incorrect",
                          "Partially Correct",
                          "Correct",
                          "Incorrect",
                          "Incorrect",
                          "Incorrect",
                          "Incorrect",
                          "Correct",
                          "Incorrect",
                           "Correct",
                           "Correct"]

## 2) Chatbot 2 - Fine-Tune GPT2 Model

In [None]:
!pip install transformers

In [None]:
pip install --upgrade protobuf

In [None]:




############################## UPDATE THIS TO USE FINE TUNING QUESTIONS SPECIFIC TO THE CORPUS ###############################










import torch
from transformers import GPT2LMHeadModel, GPT2Tokenizer, AdamW

# Load the GPT-2 model and tokenizer
model_name = "gpt2"
model = GPT2LMHeadModel.from_pretrained(model_name)
tokenizer = GPT2Tokenizer.from_pretrained(model_name)
# Define multiple question and answer pairs
qa_pairs = [
    ("Which species are at the top of the food chain in coral reefs?", "sharks and giant moray. STOP"),
    ("What fish eat coral?", "parrotfish and butterflyfish. STOP"),
    ("Which coral reef fish has the shortest lifespan?", "seven-figure pygmy goby. STOP"),
    ("What species in coral reefs can inflate themselves?", "puffers, striated frogfish, and porcupinefish. STOP"),
    ("How do sea anemones protect themselves?", "tentacles that bristle tiny harpoons primed with toxins. STOP"),
    ("What species commonly serves as a cleaner fish?", "bluestreak cleaner wrasse. STOP"),
    ("What species in coral reefs eat sponges?", "emperor angelfish. STOP"),
    ("What species in coral reefs eat stingrays?", "caribbean reef shark and great hammerheads. STOP"),
    ("Which species in coral reefs are hermaphrodites", "grouper. STOP"),
    ("What species in coral reefs is known to eat birds?", "blacktip reef shark. STOP"),
    ("What percent of the ocean is covered by coral reefs?", "less than 1%. STOP"),
    ("When is the colouration of clown triggerfish vivid?", "when they are not threatened. STOP"),
    ("What are reef lizardfish coated in?", "mucus. STOP"),
    ("Name some coral reef fish that feed on zooplankton?", "damselfish, surgeonfish, and cardinalfish. STOP"),
    ("Parrotfish often school with what other species?", "spinefoot rabbitfish. STOP"),
    ("What species eat sea anemones?", "saddle butterflyfish. STOP"),
    ("How many venomous spines does the reef stonefish have?", "13. STOP"),
    ("What predators eat reef stonefish?", "bottom feeding rays, sharks, and Stokes' seasnake. STOP"),
    ("What toxin is sometimes found coral reef carnivores?", "ciguatera toxin. STOP"),
    ("How long are whitetip reef sharks?", "usually less than 1.6 meters. STOP"),
    ("How long are whitetip reef sharks?", "less than 1.6 meters. STOP"),
    ("What predators eat whitetip reef sharks?", "tiger sharks and Galapagos sharks. STOP"),
    ("What predators eat blacktip reef sharks?", "groupers, grey reef sharks, tiger sharks, and members of their own species. STOP"),
    ("What sharks are known for having small home ranges?", "blacktip reef sharks. STOP"),
    ("Are grey reef sharks social or territorial?", "social. STOP"),
    ("Name a coral reef shark that likes drop-offs?", "whitetip reef shark, blacktip reef shark, or grey reef shark. STOP"),
    ("What do benthic algae grow on?", "dead coral and other inert surfaces. STOP"),
    ("What do goatfish use to search for food?", "chemosensory barbels (whiskers). STOP"),   
    ("Goatfish commonly change their colouration to match that of which fish?", "snapper. STOP"),   
    ("What does the tassled scorpionfish camouflage itself to look like?", "coral encrusted sea floor. STOP"),  
    ("What are some threats to the survival of coral reef fish?", "pollution, overfishing, and habitat loss and degradation. STOP"),  
    ("What percent of marine fish species live in coral reefs?", "25 percent. STOP"), 
    ("Where do most coral reef fish have spines?", "on their fins. STOP"),  
    ("Why do toadfish sing?", "to attract mates. STOP"), 
    ("What do Synchiropus splendidus eat?", "small crustaceans and other invertebrates. STOP"), 
    ("What color is the mouth of the clown triggerfish?", "bright yellow. STOP"),   
    ("How many species of parasites would go extinct if one coral reef fish species of average size went extinct?", "at least 10. STOP"),      
    ("What types of sharks can enter brackish or freshwater environments?", "blacktip reef sharks. STOP"),
    ("What sound frequency do whitetip reef sharks recognize as coming from struggling fish?", "25 - 100 Hz. STOP"),
    ("Can whitetip reef sharks rest on the sea floor or do they have to keep moving?", "They can rest on the sea floor. STOP"),    
    ("Are whitetip reef sharks better at hunting in tight crevices or in open water?", "tight crevices. STOP"),
    ("How long are Caribbean reef sharks?", "up to 3 meters. STOP"), 
    ("Titan triggerfish use jets of water to expose what species buried in the sand?", "sand dollars. STOP"), 
    ("Are coral reef fish bodies often optimized for straight-line speed or for manoeuvrability?", "manoeuvrability. STOP"),
    ("Where does the foureye butterflyfish get its name from?", "the large dark spots each side of their bodies. STOP"), 
    ("How long are striated frogfish?", "about 10 centimeters. STOP"),
    ("Why do some coral reef fish engage in schooling?", "defense against predators through better predator detection. STOP"),
    ("What are lateral lines in coral reef fish?", "pressure sensors that allow schooling fish to feel each others' movements and stay synchronized. STOP"),
    ("What is a primary producer?", "a plant that synthesizes food from solar energy. STOP"), 
    ("What is the name for the stinging cells in fire coral?", " nematocysts. STOP")
]

# Concatenate the question and answer pairs with appropriate formatting
formatted_pairs = [f"Q: {q}\nA: {a}\n" for q, a in qa_pairs]
qa_text = "\n".join(formatted_pairs)

# Fine-tune the GPT-2 model with the Q&A pairs
inputs = tokenizer.encode(qa_text, return_tensors="pt", max_length = 1000)
model.train()

# Define the optimizer
optimizer = AdamW(model.parameters(), lr=1e-5)

# Run the fine-tuning loop (example: 1 epoch)
for j in range(120):
    print (j)
    outputs = model(inputs, labels=inputs)
    loss = outputs.loss
    loss.backward()
    optimizer.step()
    optimizer.zero_grad()

# Save the fine-tuned model
model.save_pretrained("fine-tuned-gpt2")

# Load the fine-tuned model
fine_tuned_model = GPT2LMHeadModel.from_pretrained("fine-tuned-gpt2")



In [None]:






########################### UPDATE THIS SECTTION WEHN ASSESSING MODEL PERFORMANCE ########################################







# Test the fine-tuned model with a question
test_question = "Q: How many species of fish live in coral reefs?"

# Generate a completion for the test question
input_ids = tokenizer.encode(test_question, return_tensors="pt")
output = fine_tuned_model.generate(input_ids, max_length=100)
completion = tokenizer.decode(output[0], skip_special_tokens=True)

print("Generated completion for the test question:")
print(completion.split("STOP")[0].splitlines()[0:2])


In [None]:
# Test the fine-tuned model with a question
test_question = "Q: What is the most venomous fish?"

# Generate a completion for the test question
input_ids = tokenizer.encode(test_question, return_tensors="pt")
output = fine_tuned_model.generate(input_ids, max_length=100)
completion = tokenizer.decode(output[0], skip_special_tokens=True)

print("Generated completion for the test question:")
print(completion.split("STOP")[0].splitlines()[0:2])


In [None]:
# Test the fine-tuned model with a question
test_question = "Q: What sharks live in coral reefs?"

# Generate a completion for the test question
input_ids = tokenizer.encode(test_question, return_tensors="pt")
output = fine_tuned_model.generate(input_ids, max_length=100)
completion = tokenizer.decode(output[0], skip_special_tokens=True)

print("Generated completion for the test question:")
print(completion.split("STOP")[0].splitlines()[0:2])

In [None]:
# Test the fine-tuned model with a question
test_question = "Q: What fish are poisonous?"

# Generate a completion for the test question
input_ids = tokenizer.encode(test_question, return_tensors="pt")
output = fine_tuned_model.generate(input_ids, max_length=100)
completion = tokenizer.decode(output[0], skip_special_tokens=True)

print("Generated completion for the test question:")
print(completion.split("STOP")[0].splitlines()[0:2])

In [None]:
# Test the fine-tuned model with a question
test_question = "Q: What fish can electrocute you?"

# Generate a completion for the test question
input_ids = tokenizer.encode(test_question, return_tensors="pt")
output = fine_tuned_model.generate(input_ids, max_length=100)
completion = tokenizer.decode(output[0], skip_special_tokens=True)

print("Generated completion for the test question:")
print(completion.split("STOP")[0].splitlines()[0:2])

In [None]:
# Test the fine-tuned model with a question
test_question = "Q: What species can sting you?"

# Generate a completion for the test question
input_ids = tokenizer.encode(test_question, return_tensors="pt")
output = fine_tuned_model.generate(input_ids, max_length=100)
completion = tokenizer.decode(output[0], skip_special_tokens=True)

print("Generated completion for the test question:")
print(completion.split("STOP")[0].splitlines()[0:2])

In [None]:
# Test the fine-tuned model with a question
test_question = "Q: What species are parasitic?"

# Generate a completion for the test question
input_ids = tokenizer.encode(test_question, return_tensors="pt")
output = fine_tuned_model.generate(input_ids, max_length=100)
completion = tokenizer.decode(output[0], skip_special_tokens=True)

print("Generated completion for the test question:")
print(completion.split("STOP")[0].splitlines()[0:2])

In [None]:
# Test the fine-tuned model with a question
test_question = "Q: What species are venomous?"

# Generate a completion for the test question
input_ids = tokenizer.encode(test_question, return_tensors="pt")
output = fine_tuned_model.generate(input_ids, max_length=100)
completion = tokenizer.decode(output[0], skip_special_tokens=True)

print("Generated completion for the test question:")
print(completion.split("STOP")[0].splitlines()[0:2])

In [None]:
# Test the fine-tuned model with a question
test_question = "Q: What species are known for attacking scuba divers?"

# Generate a completion for the test question
input_ids = tokenizer.encode(test_question, return_tensors="pt")
output = fine_tuned_model.generate(input_ids, max_length=100)
completion = tokenizer.decode(output[0], skip_special_tokens=True)

print("Generated completion for the test question:")
print(completion.split("STOP")[0].splitlines()[0:2])

In [None]:
# Test the fine-tuned model with a question
test_question = "Q: What are common herbivorous fish?"

# Generate a completion for the test question
input_ids = tokenizer.encode(test_question, return_tensors="pt")
output = fine_tuned_model.generate(input_ids, max_length=100)
completion = tokenizer.decode(output[0], skip_special_tokens=True)

print("Generated completion for the test question:")
print(completion.split("STOP")[0].splitlines()[0:2])

In [None]:
# Test the fine-tuned model with a question
test_question = "Q: Why do fish camouflage themselves?"

# Generate a completion for the test question
input_ids = tokenizer.encode(test_question, return_tensors="pt")
output = fine_tuned_model.generate(input_ids, max_length=100)
completion = tokenizer.decode(output[0], skip_special_tokens=True)

print("Generated completion for the test question:")
print(completion.split("STOP")[0].splitlines()[0:2])

In [None]:
# Test the fine-tuned model with a question
test_question = "Q: How do fish get rid of their parasites?"

# Generate a completion for the test question
input_ids = tokenizer.encode(test_question, return_tensors="pt")
output = fine_tuned_model.generate(input_ids, max_length=100)
completion = tokenizer.decode(output[0], skip_special_tokens=True)

print("Generated completion for the test question:")
print(completion.split("STOP")[0].splitlines()[0:2])

In [None]:
# Test the fine-tuned model with a question
test_question = "Q: What species have a mutualistic relationship?"

# Generate a completion for the test question
input_ids = tokenizer.encode(test_question, return_tensors="pt")
output = fine_tuned_model.generate(input_ids, max_length=100)
completion = tokenizer.decode(output[0], skip_special_tokens=True)

print("Generated completion for the test question:")
print(completion.split("STOP")[0].splitlines()[0:2])

In [None]:
# Test the fine-tuned model with a question
test_question = "Q: What species have a commensalistic relationship?"

# Generate a completion for the test question
input_ids = tokenizer.encode(test_question, return_tensors="pt")
output = fine_tuned_model.generate(input_ids, max_length=100)
completion = tokenizer.decode(output[0], skip_special_tokens=True)

print("Generated completion for the test question:")
print(completion.split("STOP")[0].splitlines()[0:2])

In [None]:
# Test the fine-tuned model with a question
test_question = "Q: Where in the world are coral reefs found?"

# Generate a completion for the test question
input_ids = tokenizer.encode(test_question, return_tensors="pt")
output = fine_tuned_model.generate(input_ids, max_length=100)
completion = tokenizer.decode(output[0], skip_special_tokens=True)

print("Generated completion for the test question:")
print(completion.split("STOP")[0].splitlines()[0:2])

In [None]:
# Test the fine-tuned model with a question
test_question = "Q: What species engage in schooling?"

# Generate a completion for the test question
input_ids = tokenizer.encode(test_question, return_tensors="pt")
output = fine_tuned_model.generate(input_ids, max_length=100)
completion = tokenizer.decode(output[0], skip_special_tokens=True)

print("Generated completion for the test question:")
print(completion.split("STOP")[0].splitlines()[0:2])

In [None]:
# Test the fine-tuned model with a question
test_question = "Q: What species are ambush predators?"

# Generate a completion for the test question
input_ids = tokenizer.encode(test_question, return_tensors="pt")
output = fine_tuned_model.generate(input_ids, max_length=100)
completion = tokenizer.decode(output[0], skip_special_tokens=True)

print("Generated completion for the test question:")
print(completion.split("STOP")[0].splitlines()[0:2])

In [None]:
# Test the fine-tuned model with a question
test_question = "Q: Which species are ambush predators?"

# Generate a completion for the test question
input_ids = tokenizer.encode(test_question, return_tensors="pt")
output = fine_tuned_model.generate(input_ids, max_length=100)
completion = tokenizer.decode(output[0], skip_special_tokens=True)

print("Generated completion for the test question:")
print(completion.split("STOP")[0].splitlines()[0:2])

In [None]:
# Test the fine-tuned model with a question
test_question = "Q: Why are some coral reef fish colorful?"

# Generate a completion for the test question
input_ids = tokenizer.encode(test_question, return_tensors="pt")
output = fine_tuned_model.generate(input_ids, max_length=100)
completion = tokenizer.decode(output[0], skip_special_tokens=True)

print("Generated completion for the test question:")
print(completion.split("STOP")[0].splitlines()[0:2])

In [None]:
# Test the fine-tuned model with a question
test_question = "Q: Which species is blue?"

# Generate a completion for the test question
input_ids = tokenizer.encode(test_question, return_tensors="pt")
output = fine_tuned_model.generate(input_ids, max_length=100)
completion = tokenizer.decode(output[0], skip_special_tokens=True)

print("Generated completion for the test question:")
print(completion.split("STOP")[0].splitlines()[0:2])

In [None]:
# Test the fine-tuned model with a question
test_question = "Q: What is the slowest species that lives in coral reefs?"

# Generate a completion for the test question
input_ids = tokenizer.encode(test_question, return_tensors="pt")
output = fine_tuned_model.generate(input_ids, max_length=100)
completion = tokenizer.decode(output[0], skip_special_tokens=True)

print("Generated completion for the test question:")
print(completion.split("STOP")[0].splitlines()[0:2])

In [None]:



########################################## UPDATE THIS AFTER ASSESSING MODEL PERFORMANCE ###################################





for i in range(20):
    chatbot_models.append(2)
    
    
model_2_performance_list = ["Partially Correct",
                          "Incorrect",
                          "Incorrect",
                          "Incorrect",
                          "Incorrect", 
                          "Partially Correct",
                          "Incorrect",
                          "Correct",
                          "Partially Correct",
                          "Incorrect",
                          "Partially Correct",
                          "Incorrect",
                          "Incorrect",
                          "Incorrect",
                          "Correct",
                          "Incorrect",
                          "Correct",
                          "Incorrect",
                          "Incorrect",
                          "Incorrect"]


## 3) Chatbot 3 - Chatbot Emphasizing Cosine Similarity of TF-IDF Representations of Sententces

In [8]:
import io
import random
import string # to process standard python strings
import warnings
import numpy as np
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity
import warnings
warnings.filterwarnings('ignore')

In [9]:
import nltk
from nltk.stem import WordNetLemmatizer
nltk.download('popular', quiet=True) # for downloading packages
#nltk.download('punkt') # first-time use only
#nltk.download('wordnet') # first-time use only

True

In [11]:
CORPUS_PATH = 'C:/Users/steve/OneDrive/Desktop/Github/HR_Policy_Chatbot_Capstone_Project/corpus.txt'
f=open(CORPUS_PATH,'r',errors = 'ignore')
raw=f.read()
raw = raw.lower()# converts to lowercase

In [12]:
sent_tokens = nltk.sent_tokenize(raw)# converts to list of sentences 
word_tokens = nltk.word_tokenize(raw)# converts to list of words

In [13]:
lemmer = nltk.stem.WordNetLemmatizer()
#WordNet is a semantically-oriented dictionary of English included in NLTK.
def LemTokens(tokens):
    return [lemmer.lemmatize(token) for token in tokens]
remove_punct_dict = dict((ord(punct), None) for punct in string.punctuation)

def LemNormalize(text):
    return LemTokens(nltk.word_tokenize(text.lower().translate(remove_punct_dict)))

In [14]:
GREETING_INPUTS = ("hello", "hi", "greetings", "sup", "what's up","hey",)
GREETING_RESPONSES = ["Hi", "Hey", "*nods*", "Hi there", "Hello", "I am glad! You are talking to me"]
def greeting(sentence):
 
    for word in sentence.split():
        if word.lower() in GREETING_INPUTS:
            return random.choice(GREETING_RESPONSES)

In [15]:
def response(user_response):
    robo_response=''
    sent_tokens.append(user_response)
    TfidfVec = TfidfVectorizer(tokenizer=LemNormalize, stop_words='english')
    tfidf = TfidfVec.fit_transform(sent_tokens)
    vals = cosine_similarity(tfidf[-1], tfidf)
    idx=vals.argsort()[0][-2]
    flat = vals.flatten()
    flat.sort()
    req_tfidf = flat[-2]
    if(req_tfidf==0):
        robo_response=robo_response+"I am sorry! I don't understand you"
        return robo_response
    else:
        robo_response = robo_response+sent_tokens[idx]
        return robo_response

In [16]:
flag=True
print("Hello, I am the Unofficial Department of Health and Human Services (HHS) Human Resources (HR) Policy Chatbot! I will answer your queries related to HR policy at HHS. Please note that this resource is one curious graduate student's experimental capstone project and does not actually speak for the U.S. Department of Health and Human Services. If you want to exit, type bye.")
while(flag==True):
    user_response = input()
    user_response=user_response.lower()
    if(user_response!='bye'):
        if(user_response=='thanks' or user_response=='thank you' ):
            flag=False
            print("You are welcome.")
        else:
            if(greeting(user_response)!=None):
                print(greeting(user_response))
            else:
                print("",end="")
                print(response(user_response))
                sent_tokens.remove(user_response)
    else:
        flag=False
        print("Bye! take care..")

Hello, I am the Unofficial Department of Health and Human Services (HHS) Human Resources (HR) Policy Chatbot! I will answer your queries related to HR policy at HHS. Please note that this resource is one curious graduate student's experimental capstone project and does not actually speak for the U.S. Department of Health and Human Services. If you want to exit, type bye.
what are the educational requirements for appointed special consultants?
â§ 209(f) special consultants
42 c.f.r.
how is compensation determined for appointed special consultants?
â§ 209(f) special consultants
42 c.f.r.
thank you
You are welcome.


In [None]:




################################## UPDATE THIS SECTION AFTER ASSESSING MODEL PERFORMANCE ##############################






for i in range(20):
    chatbot_models.append(3)

model_3_performance_list = ["Incorrect",
                          "Incorrect", 
                          "Incorrect",
                          "Incorrect",
                          "Incorrect",
                          "Correct",
                          "Correct",
                          "Correct",
                          "Correct",
                          "Incorrect",
                          "Partially Correct",
                          "Incorrect",
                          "Correct",
                          "Incorrect",
                          "Partially Correct",
                          "Correct",
                          "Correct",
                          "Incorrect",
                           "Correct",
                           "Incorrect"]


## 4) Chatbot 4 - Distillbert Chatbot

Source: https://huggingface.co/distilbert-base-cased-distilled-squad

In [17]:
#read in data
CORPUS_PATH = 'C:/Users/steve/OneDrive/Desktop/Github/HR_Policy_Chatbot_Capstone_Project/corpus.txt'
f=open(CORPUS_PATH,'r',errors = 'ignore')
raw=f.read()
raw=raw.lower()# converts to lowercase

In [18]:
from transformers import pipeline
question_answerer = pipeline("question-answering", model='distilbert-base-cased-distilled-squad')

#context = r"""
#Extractive Question Answering is the task of extracting an answer from a text given a question. An example     of a
#question answering dataset is the SQuAD dataset, which is entirely based on that task. If you would like to fine-tune
#a model on a SQuAD task, you may leverage the examples/pytorch/question-answering/run_squad.py script.
#"""

result = question_answerer(question="What is a good example of a question answering dataset?",     context=raw)
print(f"Answer: '{result['answer']}', score: {round(result['score'], 4)}, start: {result['start']}, end: {result['end']}")

#Answer: 'SQuAD dataset', score: 0.5152, start: 147, end: 160

Answer: 'official transcript', score: 0.4861, start: 19914, end: 19933


In [19]:


######################### UPDATE THIS SECTION TO ASK QUESTIONS ABOUT HHS HR POLICY #########################################





result = question_answerer(question="What are the experience requirements for appointed special consultants?",     context=raw)
print(f"Answer: '{result['answer']}', score: {round(result['score'], 4)}, start: {result['start']}, end: {result['end']}")

Answer: 'professional experience and stature', score: 0.3316, start: 5987, end: 6022


In [None]:
result = question_answerer(question="What is the most venomous fish?", context=raw)
print(f"Answer: '{result['answer']}', score: {round(result['score'], 4)}, start: {result['start']}, end: {result['end']}")

In [None]:
result = question_answerer(question="What sharks live in coral reefs?",     context=raw)
print(f"Answer: '{result['answer']}', score: {round(result['score'], 4)}, start: {result['start']}, end: {result['end']}")

In [None]:
result = question_answerer(question="What fish are poisonous?",     context=raw)
print(f"Answer: '{result['answer']}', score: {round(result['score'], 4)}, start: {result['start']}, end: {result['end']}")

In [None]:
result = question_answerer(question="What fish can electrocute you?",     context=raw)
print(f"Answer: '{result['answer']}', score: {round(result['score'], 4)}, start: {result['start']}, end: {result['end']}")

In [None]:
result = question_answerer(question="What species can sting you?",     context=raw)
print(f"Answer: '{result['answer']}', score: {round(result['score'], 4)}, start: {result['start']}, end: {result['end']}")

In [None]:
result = question_answerer(question="What species are parasitic?", context=raw)
print(f"Answer: '{result['answer']}', score: {round(result['score'], 4)}, start: {result['start']}, end: {result['end']}")

In [None]:
result = question_answerer(question="What species are venomous?", context=raw)
print(f"Answer: '{result['answer']}', score: {round(result['score'], 4)}, start: {result['start']}, end: {result['end']}")

In [None]:
result = question_answerer(question="What species are known for attacking scuba divers?", context=raw)
print(f"Answer: '{result['answer']}', score: {round(result['score'], 4)}, start: {result['start']}, end: {result['end']}")

In [None]:
result = question_answerer(question="What are common herbivorous fish?", context=raw)
print(f"Answer: '{result['answer']}', score: {round(result['score'], 4)}, start: {result['start']}, end: {result['end']}")

In [None]:
result = question_answerer(question="Why do fish camouflage themselves?", context=raw)
print(f"Answer: '{result['answer']}', score: {round(result['score'], 4)}, start: {result['start']}, end: {result['end']}")

In [None]:
result = question_answerer(question="How do fish get rid of their parasites?", context=raw)
print(f"Answer: '{result['answer']}', score: {round(result['score'], 4)}, start: {result['start']}, end: {result['end']}")

In [None]:
result = question_answerer(question="What species have mutualistic relationships?", context=raw)
print(f"Answer: '{result['answer']}', score: {round(result['score'], 4)}, start: {result['start']}, end: {result['end']}")

In [None]:
result = question_answerer(question="What species have commensalistic relationships?", context=raw)
print(f"Answer: '{result['answer']}', score: {round(result['score'], 4)}, start: {result['start']}, end: {result['end']}")

In [None]:
result = question_answerer(question="Where in the world are coral reefs found?", context=raw)
print(f"Answer: '{result['answer']}', score: {round(result['score'], 4)}, start: {result['start']}, end: {result['end']}")

In [None]:
result = question_answerer(question="What species engage in schooling?", context=raw)
print(f"Answer: '{result['answer']}', score: {round(result['score'], 4)}, start: {result['start']}, end: {result['end']}")

In [None]:
result = question_answerer(question="Which species are ambush predators?", context=raw)
print(f"Answer: '{result['answer']}', score: {round(result['score'], 4)}, start: {result['start']}, end: {result['end']}")

In [None]:
result = question_answerer(question="Why are some coral reef fish colorful?", context=raw)
print(f"Answer: '{result['answer']}', score: {round(result['score'], 4)}, start: {result['start']}, end: {result['end']}")

In [None]:
result = question_answerer(question="Which species is blue?", context=raw)
print(f"Answer: '{result['answer']}', score: {round(result['score'], 4)}, start: {result['start']}, end: {result['end']}")

In [None]:
result = question_answerer(question="What is the slowest species that lives in coral reefs?", context=raw)
print(f"Answer: '{result['answer']}', score: {round(result['score'], 4)}, start: {result['start']}, end: {result['end']}")

In [None]:


######################################## UPDATE THIS TO REFLECT MODEL PERFORMANCE #############################################



for i in range(20):
    chatbot_models.append(4)

model_4_performance_list = ["Incorrect",
                          "Correct",
                           "Correct",
                           "Incorrect",
                           "Incorrect",
                           "Correct",
                           "Incorrect",
                           "Correct",
                           "Correct",
                           "Correct",
                           "Incorrect",
                           "Incorrect",
                           "Correct",
                           "Incorrect",
                           "Incorrect",
                           "Correct",
                           "Correct",
                           "Incorrect",
                           "Partially Correct",
                           "Correct"]

## 5) Chatbot 5 - Roberta Chatbot

https://huggingface.co/deepset/roberta-base-squad2

In [20]:
from transformers import AutoModelForQuestionAnswering, AutoTokenizer, pipeline, RobertaTokenizer, RobertaModel, TFRobertaModel
from transformers import optimization

batch_size = 96
n_epochs = 2
base_LM_model = "roberta-base"
max_seq_len = 386
learning_rate = 3e-5
#lr_schedule = optimization.LinearWarmup
warmup_proportion = 0.2
doc_stride=128
max_query_length=64

In [None]:
#!pip install farm-haystack[inference]

In [21]:








############################# UPDATE THIS SECTION TO REFLECT HHS HR POLICY QUESTIONS ######################################














#from haystack.reader.farm import FARMReader

#reader = TransformersReader(model_name_or_path="deepset/roberta-base-squad2",tokenizer="deepset/roberta-base-squad2")

model_name = "deepset/roberta-base-squad2"

# a) Get predictions
nlp = pipeline('question-answering', model=model_name, tokenizer=model_name)
QA_input = {
    'question': 'What are the experience requirements for appointed special consultants?',
    'context': raw
}
res = nlp(QA_input)

# b) Load model & tokenizer
model = AutoModelForQuestionAnswering.from_pretrained(model_name)
tokenizer = AutoTokenizer.from_pretrained(model_name)

res

Downloading config.json:   0%|          | 0.00/571 [00:00<?, ?B/s]

Downloading model.safetensors:   0%|          | 0.00/496M [00:00<?, ?B/s]

Downloading tokenizer_config.json:   0%|          | 0.00/79.0 [00:00<?, ?B/s]

Downloading vocab.json:   0%|          | 0.00/899k [00:00<?, ?B/s]

Downloading merges.txt:   0%|          | 0.00/456k [00:00<?, ?B/s]

Downloading (…)cial_tokens_map.json:   0%|          | 0.00/772 [00:00<?, ?B/s]

{'score': 0.18069009482860565,
 'start': 5987,
 'end': 6049,
 'answer': 'professional experience and stature in their area of expertise'}

In [22]:
# a) Get predictions
QA_input = {
    'question': 'How is compensation determined when appointing special consultants?',
    'context': raw
}
res = nlp(QA_input)

res

{'score': 0.0006357975071296096,
 'start': 5987,
 'end': 6010,
 'answer': 'professional experience'}

In [23]:
# a) Get predictions
QA_input = {
    'question': 'What are the educational requirements for appointed special consultants?',
    'context': raw
}
res = nlp(QA_input)

res

{'score': 0.01072785072028637,
 'start': 5987,
 'end': 6022,
 'answer': 'professional experience and stature'}

In [None]:
# a) Get predictions
QA_input = {
    'question': 'What fish are poisonous?',
    'context': raw
}
res = nlp(QA_input)

res

In [None]:
# a) Get predictions
QA_input = {
    'question': 'What fish can electrocute you?',
    'context': raw
}
res = nlp(QA_input)

res

In [None]:
# a) Get predictions
QA_input = {
    'question': 'What species can sting you?',
    'context': raw
}
res = nlp(QA_input)

res

In [None]:
# a) Get predictions
QA_input = {
    'question': 'What species are parasitic?',
    'context': raw
}
res = nlp(QA_input)

res

In [None]:
# a) Get predictions
QA_input = {
    'question': 'What species are venomous?',
    'context': raw
}
res = nlp(QA_input)

res

In [None]:
# a) Get predictions
QA_input = {
    'question': 'What species are known for attacking scuba divers?',
    'context': raw
}
res = nlp(QA_input)

res

In [None]:
# a) Get predictions
QA_input = {
    'question': 'What are common herbivorous fish?',
    'context': raw
}
res = nlp(QA_input)

res

In [None]:
# a) Get predictions
QA_input = {
    'question': 'Why do fish camouflage themselves?',
    'context': raw
}
res = nlp(QA_input)

res

In [None]:
# a) Get predictions
QA_input = {
    'question': 'How do fish get rid of their parasites?',
    'context': raw
}
res = nlp(QA_input)

res

In [None]:
# a) Get predictions
QA_input = {
    'question': 'What species have mutualistic relationships?',
    'context': raw
}
res = nlp(QA_input)

res

In [None]:
# a) Get predictions
QA_input = {
    'question': 'What species have commensalistic relationships?',
    'context': raw
}
res = nlp(QA_input)

res

In [None]:
# a) Get predictions
QA_input = {
    'question': 'Where in the world are coral reefs found?',
    'context': raw
}
res = nlp(QA_input)

res

In [None]:
# a) Get predictions
QA_input = {
    'question': 'What species engage in schooling?',
    'context': raw
}
res = nlp(QA_input)

res

In [None]:
# a) Get predictions
QA_input = {
    'question': 'What species are ambush predators?',
    'context': raw
}
res = nlp(QA_input)

res

In [None]:
# a) Get predictions
QA_input = {
    'question': 'Why are some coral reef fish colorful?',
    'context': raw
}
res = nlp(QA_input)

res

In [None]:
# a) Get predictions
QA_input = {
    'question': 'Which species is blue?',
    'context': raw
}
res = nlp(QA_input)

res

In [None]:
# a) Get predictions
QA_input = {
    'question': 'What is the slowest species that lives in coral reefs?',
    'context': raw
}
res = nlp(QA_input)

res

In [None]:








######################### UPDATE THIS TO REFLECT PERFORMANCE ASSESSMENT OF MODEL ###############################################











for i in range(20):
    chatbot_models.append(5)

model_5_performance_list = ["Correct",
                           "Correct",
                           "Correct",
                           "Partially Correct",
                           "Incorrect",
                           "Partially Correct",
                           "Partially Correct",
                           "Correct",
                           "Correct",
                           "Correct",
                           "Correct",
                           "Correct",
                           "Correct",
                           "Incorrect",
                           "Correct",
                           "Incorrect",
                           "Correct",
                           "Incorrect",
                           "Partially Correct",
                           "Correct"]

## 6) Evaluation of Chatbot Performance Results

In [None]:

import pandas as pd


chatbot_performance_df = pd.DataFrame({'Correct': [100 * model_1_performance_list.count("Correct")/len(model_1_performance_list),
                               100 * model_2_performance_list.count("Correct")/len(model_2_performance_list),
                               100 * model_3_performance_list.count("Correct")/len(model_3_performance_list),
                               100 * model_4_performance_list.count("Correct")/len(model_4_performance_list),
                               100 * model_5_performance_list.count("Correct")/len(model_5_performance_list)],
                    'Partially Correct': [100 * model_1_performance_list.count("Partially Correct")/len(model_1_performance_list),
                               100 * model_2_performance_list.count("Partially Correct")/len(model_2_performance_list),
                               100 * model_3_performance_list.count("Partially Correct")/len(model_3_performance_list),
                               100 * model_4_performance_list.count("Partially Correct")/len(model_4_performance_list),
                               100 * model_5_performance_list.count("Partially Correct")/len(model_5_performance_list)],
                   'Incorrect': [100 * model_1_performance_list.count("Incorrect")/len(model_1_performance_list),
                               100 * model_2_performance_list.count("Incorrect")/len(model_2_performance_list),
                               100 * model_3_performance_list.count("Incorrect")/len(model_3_performance_list),
                               100 * model_4_performance_list.count("Incorrect")/len(model_4_performance_list),
                               100 * model_5_performance_list.count("Incorrect")/len(model_5_performance_list)]},
                  index=['1', '2', '3', '4', '5'])



In [None]:
chatbot_performance_df

In [None]:
import matplotlib.pyplot as plt
import seaborn as sns
from matplotlib import rcParams

plt.style.use('seaborn')

ax = chatbot_performance_df.plot(kind = 'bar', stacked = True, figsize = (12, 12), fontsize = 13, color = ["forestgreen", "goldenrod", 'firebrick'])

ax.set_xticklabels(labels = ["Sentence-Based Transformer", 
                             "Fine-Tuned GPT2",
                             "TF-IDF Cosine Similarity",
                             "DistilBERT",
                             "Roberta"], rotation = 45)
ax.set_xlabel("Chatbot Model", fontsize=18)
ax.set_ylabel("Percent of Responses That Are Correct, Partially Correct, and Incorrect", fontsize=17)
ax.set_title('Stacked Barplot Summarizing Performance Of Chatbots About HHS HR Policy', fontsize = 20)

plt.legend(bbox_to_anchor=(1.02, 0.55), loc='upper left', borderaxespad=0, fontsize = 13)

plt.show();
