# Assignment 4 - Chatbots

#### Steve Desilets

#### August 26, 2023

## 1) Chatbot 1 - Sentence-Based Transformer Chatbot That Leverages Cosine Similarity

In [1]:
import pandas as pd
import numpy as np
import re
import string
import seaborn as sns
import matplotlib.pyplot as plt
import nltk
from collections import Counter
from dataclasses import dataclass
from timeit import default_timer as timer
import random
from nltk.corpus import stopwords
from nltk.stem.wordnet import WordNetLemmatizer
from nltk.tokenize import word_tokenize 

import gensim
from gensim.models import Word2Vec

import spacy
from spacy import displacy

from spacy.matcher import Matcher 
from spacy.tokens import Span 

import networkx as nx
import matplotlib.pyplot as plt
from tqdm import tqdm
!pip install sentence-transformers
from sentence_transformers import SentenceTransformer, util
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.metrics.pairwise import cosine_similarity


from IPython.display import display, HTML

from typing import List, Callable, Dict, Tuple, Set

pd.set_option('max_colwidth', 600)
pd.set_option('display.max_rows', 500)





In [2]:
#Load Sentence Transformer model optimized for  sentence cosine similarity calculations

#The models below fully downloaded in Google Colab. This is the version of the google colab notebook but 
#it was open in anoconda to be saved as pdf and the download graphics did not transfer properly so 
#it seems like it didn't download. However, it did in the orignal google colab notebook, where all the
#analysis was run.

model = SentenceTransformer('multi-qa-MiniLM-L6-cos-v1')

In [3]:
#from google.colab import drive
#drive.mount('/content/gdrive')

In [4]:
# Only run this once, they will be downloaded.
nltk.download('stopwords',quiet=True)
nltk.download('wordnet',quiet=True)
nltk.download('punkt',quiet=True)
nltk.download('omw-1.4',quiet=True)

True

In [5]:
#read in data
CORPUS_PATH = 'C:/Users/steve/OneDrive/Desktop/Github/Natural_Language_Processing/Chatbot Assignment/coral_reef_fish.txt'
f=open(CORPUS_PATH,'r',errors = 'ignore')
raw=f.read()
raw=raw.lower()# converts to lowercase

#create list of sentences and words
sent_tokens = nltk.sent_tokenize(raw)# converts to list of sentences 
word_tokens = nltk.word_tokenize(raw)# converts to list of words

In [6]:
#create greetings and greetings function

GREETING_INPUTS = ("hello", "hi", "greetings", "sup", "what's up","hey",)
GREETING_RESPONSES = ["Hello"]


# Checking for greetings
def greeting(sentence):
    """If user's input is a greeting, return a greeting response"""
    for word in sentence.split():
        if word.lower() in GREETING_INPUTS:
            return random.choice(GREETING_RESPONSES)

In [7]:

# Generating response function 
def response(user_response):
    chatbot_response=''
    sentence_encodings=model.encode(sent_tokens, convert_to_tensor=True)# generate sentence transformer embeddings
    sentence_encodings=sentence_encodings.cpu()
    vals = cosine_similarity(sentence_encodings[-1].reshape(1, -1), sentence_encodings) #the chatbot conversation code 
    #in the next cell adds the question as the last sentence of the sentence tokens, before calling this response function.
    #The code takes the last sentence (which is the question) and gets cosine similarities vs all the sentences in the corpus,
    #including itself
    idx=vals.argsort()[0][-2] #gets the index of the second highest similarity (the first highest would be the question itself)
    flat = vals.flatten()#reduces dimension of cosine similarity array to be able to sort
    flat.sort() #sort the cosine similarity values
    second_cos_sim_val = flat[-2] #get the second highest cosine similarity value.
    if(second_cos_sim_val==0): #check the second highest cosine similarity value. If it's zero return the no match response,
        #else return highest cosine similarity sentence.
        chatbot_response=chatbot_response+"Sorry, I do not have an answer to your question in my database"
        return chatbot_response
    else:
        chatbot_response = chatbot_response+sent_tokens[idx] #use index of highest cosine similarity to get original sentence
        return chatbot_response

In [8]:
#Chatbot interaction code

flag=True
print("Welcome to the Informational Chatbot About Coral Reef Fish. To end session please type exit.")
print("\n")

while(flag==True):
    user_response = input()
    user_response=user_response.lower()
    if user_response!='exit':
        if(user_response=='thanks' or user_response=='thank you' ):
            flag=False
            print("Answer: You are welcome!")
        else:
            if(greeting(user_response)!=None):
                print("Answer: "+greeting(user_response))
            else:
                sent_tokens.append(user_response)
                word_tokens=word_tokens+nltk.word_tokenize(user_response)
                final_words=list(set(word_tokens))
                print("Answer: ",end="")
                print(response(user_response))
                print("\n")
                sent_tokens.remove(user_response)
    else:
        flag=False
        print("Thank you for using the Informational Chatbot About Coral Reef Fish. Goodbye.")    


Welcome to the Informational Chatbot About Coral Reef Fish. To end session please type exit.


How many species of fish live in coral reefs?
Answer: the fish that inhabit coral reefs are numerous and diverse.


What is the most venomous fish?
Answer: the most venomous known fish is the reef stonefish.


What sharks live in coral reefs?
Answer: the whitetip reef shark almost exclusively inhabits coral reefs.


What fish are poisonous?
Answer: there is a distinction between poisonous fish and venomous fish.


What fish can electrocute you?
Answer: some unmistakable contrasting patterns are used to warn predators that the fish has venomous spines or poisonous flesh.


What species can sting you?
Answer: the most venomous known fish is the reef stonefish.


What species are parasitic?
Answer: monogenean parasites of the genus pseudorhabdosynochus (arrows) on the gill filament of a grouper.


What species are venomous?
Answer: few of these venoms have been studied.


What species are known 

In [45]:
chatbot_models = []

for i in range(20):
    chatbot_models.append(1)
    
chatbot_performance = ["Incorrect",
                          "Correct",
                          "Correct",
                          "Incorrect",
                          "Incorrect",
                          "Incorrect",
                          "Correct",
                          "Incorrect",
                          "Incorrect",
                          "Incorrect",
                          "Partially Correct",
                          "Correct",
                          "Incorrect",
                          "Incorrect",
                          "Incorrect",
                          "Incorrect",
                          "Correct",
                          "Incorrect"]

######################### NEED TO ADD LAST 2 QUESTIONS ##################################3

## 2) Chatbot 2 - Fine-Tune GPT2 Model

In [9]:
!pip install transformers







In [11]:
import torch
from transformers import GPT2LMHeadModel, GPT2Tokenizer, AdamW

# Load the GPT-2 model and tokenizer
model_name = "gpt2"
model = GPT2LMHeadModel.from_pretrained(model_name)
tokenizer = GPT2Tokenizer.from_pretrained(model_name)
# Define multiple question and answer pairs
qa_pairs = [
    ("Which species are at the top of the food chain?", "sharks and giant moray. STOP"),
    ("What fish eat coral?", "parrotfish and butterflyfish. STOP"),
    ("Which coral reef fish has the shortest lifespan?", "seven-figure pygmy goby. STOP"),
    ("What species can inflate themselves?", "puffers, striated frogfish, and porcupinefish. STOP"),
    ("How do sea anemones protect themselves?", "the tentacles of sea anemones bristle with tiny harpoons (nematocysts) primed with toxins, and are an effective deterrent against most predators. STOP"),
    ("What species commonly serves as a cleaner fish?", "bluestreak cleaner wrasse. STOP"),
    ("What species eat sponges?", "emperor angelfish. STOP"),
    ("What species eat stingrays?", "caribbean reef shark and great hammerheads. STOP"),
    ("Which species are hermaphrodites", "grouper. STOP"),
    ("What species is known to eat birds?", "blacktip reef shark. STOP")
]

# Concatenate the question and answer pairs with appropriate formatting
formatted_pairs = [f"Q: {q}\nA: {a}\n" for q, a in qa_pairs]
qa_text = "\n".join(formatted_pairs)

# Fine-tune the GPT-2 model with the Q&A pairs
inputs = tokenizer.encode(qa_text, return_tensors="pt")
model.train()

# Define the optimizer
optimizer = AdamW(model.parameters(), lr=1e-5)

# Run the fine-tuning loop (example: 1 epoch)
for j in range(120):
    print (j)
    outputs = model(inputs, labels=inputs)
    loss = outputs.loss
    loss.backward()
    optimizer.step()
    optimizer.zero_grad()

# Save the fine-tuned model
model.save_pretrained("fine-tuned-gpt2")

# Load the fine-tuned model
fine_tuned_model = GPT2LMHeadModel.from_pretrained("fine-tuned-gpt2")





0
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119


In [14]:
# Test the fine-tuned model with a question
test_question = "Q: How many species of fish live in coral reefs?"

# Generate a completion for the test question
input_ids = tokenizer.encode(test_question, return_tensors="pt")
output = fine_tuned_model.generate(input_ids, max_length=100)
completion = tokenizer.decode(output[0], skip_special_tokens=True)

print("Generated completion for the test question:")
print(completion.split("STOP")[0].splitlines()[0:2])


The attention mask and the pad token id were not set. As a consequence, you may observe unexpected behavior. Please pass your input's `attention_mask` to obtain reliable results.
Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.


Generated completion for the test question:
['Q: How many species of fish live in coral reefs?', 'A: About 1,000 species of coral reef fish live in the United States. These fish are an important food source for many species of birds and other marine life. However, coral reef fish are an invasive species and are an invasive species throughout the United States. What species are at the top of the food chain?']


In [15]:
# Test the fine-tuned model with a question
test_question = "Q: What is the most venomous fish?"

# Generate a completion for the test question
input_ids = tokenizer.encode(test_question, return_tensors="pt")
output = fine_tuned_model.generate(input_ids, max_length=100)
completion = tokenizer.decode(output[0], skip_special_tokens=True)

print("Generated completion for the test question:")
print(completion.split("STOP")[0].splitlines()[0:2])


The attention mask and the pad token id were not set. As a consequence, you may observe unexpected behavior. Please pass your input's `attention_mask` to obtain reliable results.
Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.


Generated completion for the test question:
['Q: What is the most venomous fish?', 'A: parrotfish and butterflyfish. Parrotfish are the most venomous fish in the world, and are the most venomous fish in the world. Butterflyfish are the most venomous fish in the world, and are the most venomous fish in the world.']


In [16]:
# Test the fine-tuned model with a question
test_question = "Q: What sharks live in coral reefs?"

# Generate a completion for the test question
input_ids = tokenizer.encode(test_question, return_tensors="pt")
output = fine_tuned_model.generate(input_ids, max_length=100)
completion = tokenizer.decode(output[0], skip_special_tokens=True)

print("Generated completion for the test question:")
print(completion.split("STOP")[0].splitlines()[0:2])

The attention mask and the pad token id were not set. As a consequence, you may observe unexpected behavior. Please pass your input's `attention_mask` to obtain reliable results.
Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.


Generated completion for the test question:
['Q: What sharks live in coral reefs?', 'A: sharks and giant moray. ']


In [17]:
# Test the fine-tuned model with a question
test_question = "Q: What fish are poisonous?"

# Generate a completion for the test question
input_ids = tokenizer.encode(test_question, return_tensors="pt")
output = fine_tuned_model.generate(input_ids, max_length=100)
completion = tokenizer.decode(output[0], skip_special_tokens=True)

print("Generated completion for the test question:")
print(completion.split("STOP")[0].splitlines()[0:2])

The attention mask and the pad token id were not set. As a consequence, you may observe unexpected behavior. Please pass your input's `attention_mask` to obtain reliable results.
Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.


Generated completion for the test question:
['Q: What fish are poisonous?', 'A: sharks and giant moray. ']


In [18]:
# Test the fine-tuned model with a question
test_question = "Q: What fish can electrocute you?"

# Generate a completion for the test question
input_ids = tokenizer.encode(test_question, return_tensors="pt")
output = fine_tuned_model.generate(input_ids, max_length=100)
completion = tokenizer.decode(output[0], skip_special_tokens=True)

print("Generated completion for the test question:")
print(completion.split("STOP")[0].splitlines()[0:2])

The attention mask and the pad token id were not set. As a consequence, you may observe unexpected behavior. Please pass your input's `attention_mask` to obtain reliable results.
Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.


Generated completion for the test question:
['Q: What fish can electrocute you?', 'A: sharks and giant moray. ']


In [19]:
# Test the fine-tuned model with a question
test_question = "Q: What species can sting you?"

# Generate a completion for the test question
input_ids = tokenizer.encode(test_question, return_tensors="pt")
output = fine_tuned_model.generate(input_ids, max_length=100)
completion = tokenizer.decode(output[0], skip_special_tokens=True)

print("Generated completion for the test question:")
print(completion.split("STOP")[0].splitlines()[0:2])

The attention mask and the pad token id were not set. As a consequence, you may observe unexpected behavior. Please pass your input's `attention_mask` to obtain reliable results.
Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.


Generated completion for the test question:
['Q: What species can sting you?', 'A: sharks and giant moray. ']


In [20]:
# Test the fine-tuned model with a question
test_question = "Q: What species are parasitic?"

# Generate a completion for the test question
input_ids = tokenizer.encode(test_question, return_tensors="pt")
output = fine_tuned_model.generate(input_ids, max_length=100)
completion = tokenizer.decode(output[0], skip_special_tokens=True)

print("Generated completion for the test question:")
print(completion.split("STOP")[0].splitlines()[0:2])

The attention mask and the pad token id were not set. As a consequence, you may observe unexpected behavior. Please pass your input's `attention_mask` to obtain reliable results.
Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.


Generated completion for the test question:
['Q: What species are parasitic?', 'A: sharks and giant moray. ']


In [21]:
# Test the fine-tuned model with a question
test_question = "Q: What species are venomous?"

# Generate a completion for the test question
input_ids = tokenizer.encode(test_question, return_tensors="pt")
output = fine_tuned_model.generate(input_ids, max_length=100)
completion = tokenizer.decode(output[0], skip_special_tokens=True)

print("Generated completion for the test question:")
print(completion.split("STOP")[0].splitlines()[0:2])

The attention mask and the pad token id were not set. As a consequence, you may observe unexpected behavior. Please pass your input's `attention_mask` to obtain reliable results.
Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.


Generated completion for the test question:
['Q: What species are venomous?', 'A: sharks and giant moray. ']


In [22]:
# Test the fine-tuned model with a question
test_question = "Q: What species are known for attacking scuba divers?"

# Generate a completion for the test question
input_ids = tokenizer.encode(test_question, return_tensors="pt")
output = fine_tuned_model.generate(input_ids, max_length=100)
completion = tokenizer.decode(output[0], skip_special_tokens=True)

print("Generated completion for the test question:")
print(completion.split("STOP")[0].splitlines()[0:2])

The attention mask and the pad token id were not set. As a consequence, you may observe unexpected behavior. Please pass your input's `attention_mask` to obtain reliable results.
Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.


Generated completion for the test question:
['Q: What species are known for attacking scuba divers?', 'A: sharks and giant moray. ']


In [23]:
# Test the fine-tuned model with a question
test_question = "Q: What are common herbivorous fish?"

# Generate a completion for the test question
input_ids = tokenizer.encode(test_question, return_tensors="pt")
output = fine_tuned_model.generate(input_ids, max_length=100)
completion = tokenizer.decode(output[0], skip_special_tokens=True)

print("Generated completion for the test question:")
print(completion.split("STOP")[0].splitlines()[0:2])

The attention mask and the pad token id were not set. As a consequence, you may observe unexpected behavior. Please pass your input's `attention_mask` to obtain reliable results.
Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.


Generated completion for the test question:
['Q: What are common herbivorous fish?', 'A: puffers, striated frogfish, and porcupinefish. They are great at eating invertebrates, and are an excellent source of vitamin A. They are an excellent source of calcium, and are an excellent source of iron.']


In [24]:
# Test the fine-tuned model with a question
test_question = "Q: Why do fish camouflage themselves?"

# Generate a completion for the test question
input_ids = tokenizer.encode(test_question, return_tensors="pt")
output = fine_tuned_model.generate(input_ids, max_length=100)
completion = tokenizer.decode(output[0], skip_special_tokens=True)

print("Generated completion for the test question:")
print(completion.split("STOP")[0].splitlines()[0:2])

The attention mask and the pad token id were not set. As a consequence, you may observe unexpected behavior. Please pass your input's `attention_mask` to obtain reliable results.
Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.


Generated completion for the test question:
['Q: Why do fish camouflage themselves?', 'A: Because fish camouflage themselves. They are the most effective predators on land. They are the most effective predators on fish. They are the most effective predators on birds. They are the most effective predators on fish. They are the most effective predators on fish. They are the most effective predators on fish. They are the most effective predators on fish. They are the most effective predators on fish. They are the most effective predators on fish. They are the']


In [25]:
# Test the fine-tuned model with a question
test_question = "Q: How do fish get rid of their parasites?"

# Generate a completion for the test question
input_ids = tokenizer.encode(test_question, return_tensors="pt")
output = fine_tuned_model.generate(input_ids, max_length=100)
completion = tokenizer.decode(output[0], skip_special_tokens=True)

print("Generated completion for the test question:")
print(completion.split("STOP")[0].splitlines()[0:2])

The attention mask and the pad token id were not set. As a consequence, you may observe unexpected behavior. Please pass your input's `attention_mask` to obtain reliable results.
Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.


Generated completion for the test question:
['Q: How do fish get rid of their parasites?', 'A: Fish eat parasites, and fish eat fish. Fish eat parasites because fish eat parasites. Fish eat fish because fish eat parasites. Fish eat fish because fish eat parasites. Fish eat fish because fish eat parasites. Fish eat fish because fish eat parasites. Fish eat fish because fish eat parasites. Fish eat fish because fish eat parasites. Fish eat fish because fish eat parasites. Fish eat fish because fish eat parasites. Fish eat fish because fish']


In [26]:
# Test the fine-tuned model with a question
test_question = "Q: What species have a mutualistic relationship?"

# Generate a completion for the test question
input_ids = tokenizer.encode(test_question, return_tensors="pt")
output = fine_tuned_model.generate(input_ids, max_length=100)
completion = tokenizer.decode(output[0], skip_special_tokens=True)

print("Generated completion for the test question:")
print(completion.split("STOP")[0].splitlines()[0:2])

The attention mask and the pad token id were not set. As a consequence, you may observe unexpected behavior. Please pass your input's `attention_mask` to obtain reliable results.
Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.


Generated completion for the test question:
['Q: What species have a mutualistic relationship?', 'A: sharks and giant moray. ']


In [27]:
# Test the fine-tuned model with a question
test_question = "Q: What species have a commensalistic relationship?"

# Generate a completion for the test question
input_ids = tokenizer.encode(test_question, return_tensors="pt")
output = fine_tuned_model.generate(input_ids, max_length=100)
completion = tokenizer.decode(output[0], skip_special_tokens=True)

print("Generated completion for the test question:")
print(completion.split("STOP")[0].splitlines()[0:2])

The attention mask and the pad token id were not set. As a consequence, you may observe unexpected behavior. Please pass your input's `attention_mask` to obtain reliable results.
Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.


Generated completion for the test question:
['Q: What species have a commensalistic relationship?', 'A: sharks and giant moray. ']


In [28]:
# Test the fine-tuned model with a question
test_question = "Q: Where in the world are coral reefs found?"

# Generate a completion for the test question
input_ids = tokenizer.encode(test_question, return_tensors="pt")
output = fine_tuned_model.generate(input_ids, max_length=100)
completion = tokenizer.decode(output[0], skip_special_tokens=True)

print("Generated completion for the test question:")
print(completion.split("STOP")[0].splitlines()[0:2])

The attention mask and the pad token id were not set. As a consequence, you may observe unexpected behavior. Please pass your input's `attention_mask` to obtain reliable results.
Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.


Generated completion for the test question:
['Q: Where in the world are coral reefs found?', 'A: Coral reefs are the largest coral reef in the world. They are the largest coral reef in the world because of their size and because of their ability to withstand the elements. They are the most abundant coral reef fish in the world. They are an important food source for birds, and are an important food source for sea anemones. They are an excellent predator for fish and are an excellent finisher for reef fish.']


In [29]:
# Test the fine-tuned model with a question
test_question = "Q: What species engage in schooling?"

# Generate a completion for the test question
input_ids = tokenizer.encode(test_question, return_tensors="pt")
output = fine_tuned_model.generate(input_ids, max_length=100)
completion = tokenizer.decode(output[0], skip_special_tokens=True)

print("Generated completion for the test question:")
print(completion.split("STOP")[0].splitlines()[0:2])

The attention mask and the pad token id were not set. As a consequence, you may observe unexpected behavior. Please pass your input's `attention_mask` to obtain reliable results.
Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.


Generated completion for the test question:
['Q: What species engage in schooling?', 'A: sharks and giant moray. ']


In [30]:
# Test the fine-tuned model with a question
test_question = "Q: What species are ambush predators?"

# Generate a completion for the test question
input_ids = tokenizer.encode(test_question, return_tensors="pt")
output = fine_tuned_model.generate(input_ids, max_length=100)
completion = tokenizer.decode(output[0], skip_special_tokens=True)

print("Generated completion for the test question:")
print(completion.split("STOP")[0].splitlines()[0:2])

The attention mask and the pad token id were not set. As a consequence, you may observe unexpected behavior. Please pass your input's `attention_mask` to obtain reliable results.
Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.


Generated completion for the test question:
['Q: What species are ambush predators?', 'A: sharks and giant moray. ']


In [32]:
# Test the fine-tuned model with a question
test_question = "Q: Which species are ambush predators?"

# Generate a completion for the test question
input_ids = tokenizer.encode(test_question, return_tensors="pt")
output = fine_tuned_model.generate(input_ids, max_length=100)
completion = tokenizer.decode(output[0], skip_special_tokens=True)

print("Generated completion for the test question:")
print(completion.split("STOP")[0].splitlines()[0:2])

The attention mask and the pad token id were not set. As a consequence, you may observe unexpected behavior. Please pass your input's `attention_mask` to obtain reliable results.
Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.


Generated completion for the test question:
['Q: Which species are ambush predators?', 'A: sharks and giant moray. ']


In [31]:
# Test the fine-tuned model with a question
test_question = "Q: Why are some coral reef fish colorful?"

# Generate a completion for the test question
input_ids = tokenizer.encode(test_question, return_tensors="pt")
output = fine_tuned_model.generate(input_ids, max_length=100)
completion = tokenizer.decode(output[0], skip_special_tokens=True)

print("Generated completion for the test question:")
print(completion.split("STOP")[0].splitlines()[0:2])

The attention mask and the pad token id were not set. As a consequence, you may observe unexpected behavior. Please pass your input's `attention_mask` to obtain reliable results.
Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.


Generated completion for the test question:
['Q: Why are some coral reef fish colorful?', 'A: Coral reef fish are known to eat fish, and are an effective deterrent against most predators. However, coral reef fish are also known to eat stingrays, which are an effective deterrent against most predators.']


In [33]:
# Test the fine-tuned model with a question
test_question = "Q: Which species is blue?"

# Generate a completion for the test question
input_ids = tokenizer.encode(test_question, return_tensors="pt")
output = fine_tuned_model.generate(input_ids, max_length=100)
completion = tokenizer.decode(output[0], skip_special_tokens=True)

print("Generated completion for the test question:")
print(completion.split("STOP")[0].splitlines()[0:2])

The attention mask and the pad token id were not set. As a consequence, you may observe unexpected behavior. Please pass your input's `attention_mask` to obtain reliable results.
Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.


Generated completion for the test question:
['Q: Which species is blue?', 'A: sharks and giant moray. ']


In [34]:
# Test the fine-tuned model with a question
test_question = "Q: What is the slowest species that lives in coral reefs?"

# Generate a completion for the test question
input_ids = tokenizer.encode(test_question, return_tensors="pt")
output = fine_tuned_model.generate(input_ids, max_length=100)
completion = tokenizer.decode(output[0], skip_special_tokens=True)

print("Generated completion for the test question:")
print(completion.split("STOP")[0].splitlines()[0:2])

The attention mask and the pad token id were not set. As a consequence, you may observe unexpected behavior. Please pass your input's `attention_mask` to obtain reliable results.
Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.


Generated completion for the test question:
['Q: What is the slowest species that lives in coral reefs?', 'A: sharks and giant moray. ']


In [46]:
for i in range(20):
    chatbot_models.append(2)
    
    
model_2_performance_list = ["Partially Correct",
                          "Incorrect",
                          "Incorrect",
                          "Partially Correct",
                          "Incorrect", 
                          "Incorrect",
                          "Incorrect",
                          "Incorrect",
                          "Partially Correct",
                          "Incorrect",
                          "Incorrect",
                          "Partially Correct",
                          "Incorrect",
                          "Incorrect",
                          "Incorrect",
                          "Incorrect",
                          "Incorrect",
                          "Incorrect"]

for i in model_2_performance_list:
    chatbot_performance.append(i)
    

######################### NEED TO ADD LAST 2 QUESTIONS ##################################3

## 3) Chatbot 3 - Chatbot Emphasizing Cosine Similarity of TF-IDF Representations of Sententces

In [35]:
import io
import random
import string # to process standard python strings
import warnings
import numpy as np
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity
import warnings
warnings.filterwarnings('ignore')

In [36]:
import nltk
from nltk.stem import WordNetLemmatizer
nltk.download('popular', quiet=True) # for downloading packages
#nltk.download('punkt') # first-time use only
#nltk.download('wordnet') # first-time use only

True

In [37]:
CORPUS_PATH = 'C:/Users/steve/OneDrive/Desktop/Github/Natural_Language_Processing/Chatbot Assignment/coral_reef_fish.txt'
f=open(CORPUS_PATH,'r',errors = 'ignore')
raw=f.read()
raw = raw.lower()# converts to lowercase

In [38]:
sent_tokens = nltk.sent_tokenize(raw)# converts to list of sentences 
word_tokens = nltk.word_tokenize(raw)# converts to list of words

In [39]:
lemmer = nltk.stem.WordNetLemmatizer()
#WordNet is a semantically-oriented dictionary of English included in NLTK.
def LemTokens(tokens):
    return [lemmer.lemmatize(token) for token in tokens]
remove_punct_dict = dict((ord(punct), None) for punct in string.punctuation)

def LemNormalize(text):
    return LemTokens(nltk.word_tokenize(text.lower().translate(remove_punct_dict)))

In [40]:
GREETING_INPUTS = ("hello", "hi", "greetings", "sup", "what's up","hey",)
GREETING_RESPONSES = ["Hi", "Hey", "*nods*", "Hi there", "Hello", "I am glad! You are talking to me"]
def greeting(sentence):
 
    for word in sentence.split():
        if word.lower() in GREETING_INPUTS:
            return random.choice(GREETING_RESPONSES)

In [41]:
def response(user_response):
    robo_response=''
    sent_tokens.append(user_response)
    TfidfVec = TfidfVectorizer(tokenizer=LemNormalize, stop_words='english')
    tfidf = TfidfVec.fit_transform(sent_tokens)
    vals = cosine_similarity(tfidf[-1], tfidf)
    idx=vals.argsort()[0][-2]
    flat = vals.flatten()
    flat.sort()
    req_tfidf = flat[-2]
    if(req_tfidf==0):
        robo_response=robo_response+"I am sorry! I don't understand you"
        return robo_response
    else:
        robo_response = robo_response+sent_tokens[idx]
        return robo_response

In [42]:
flag=True
print("Hello, I am the Third Coral Reef Chatbot! I will answer your queries related to coral reef fish. If you want to exit, type Bye!")
while(flag==True):
    user_response = input()
    user_response=user_response.lower()
    if(user_response!='bye'):
        if(user_response=='thanks' or user_response=='thank you' ):
            flag=False
            print("You are welcome.")
        else:
            if(greeting(user_response)!=None):
                print(greeting(user_response))
            else:
                print("",end="")
                print(response(user_response))
                sent_tokens.remove(user_response)
    else:
        flag=False
        print("Bye! take care..")

Hello, I am the Third Coral Reef Chatbot! I will answer your queries related to coral reef fish. If you want to exit, type Bye!
How many species of fish live in coral reefs?
 coral reef fish are fish which live amongst or in close relation to coral reefs.
What is the most venomous fish?
 there is a distinction between poisonous fish and venomous fish.
What sharks live in coral reefs?
 coral reef fish are fish which live amongst or in close relation to coral reefs.
What fish are poisonous?
 there is a distinction between poisonous fish and venomous fish.
What fish can electrocute you?
 as with all fish, coral reef fish harbour parasites.
What species can sting you?
 in return, the anemones provide the clownfish protection from their predators, who are not immune to anemone stings.
What species are venomous?
 the most venomous known fish is the reef stonefish.
What species are parasitic?
 parasites of coral reef fish include nematodes, platyhelminthes (cestodes, digeneans, and monogenean

In [47]:

for i in range(20):
    chatbot_models.append(3)

model_3_performance_list = ["Incorrect",
                          "Incorrect", 
                          "Incorrect",
                          "Incorrect",
                          "Incorrect",
                          "Correct",
                          "Correct",
                          "Correct",
                          "Correct",
                          "Incorrect",
                          "Partially Correct",
                          "Incorrect",
                          "Correct",
                          "Incorrect",
                          "Partially Correct",
                          "Correct",
                          "Correct",
                          "Incorrect"]

for i in model_3_performance_list:
    chatbot_performance.append(i)

######################### NEED TO ADD LAST 2 QUESTIONS ##################################3

## 4) Chatbot 4

## 5) Evaluation of Results

In [48]:
chatbot_performance_df = pd.DataFrame(list(zip(chatbot_models, chatbot_performance)), 
                                      columns=['Chatbot Model Number', 'Performance'])

In [50]:
plt.style.use('seaborn')
chatbot_performance_df.plot(x = 'Chatbot Model Number', 
                kind = 'bar', 
                stacked = True,
                title = 'Stacked Barplot of Summarizing Chatbot Performance',
                ylabel = 'Percent of Responses')

ax.legend
plt.show();

TypeError: no numeric data to plot