<a href="https://colab.research.google.com/github/SeanGMONeill/nlpworkshop_instructor/blob/main/Lesson4_Checkpoint.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [4]:
# This command installs two packages: chatterbot-corpus, a library which contains a corpus of conversations in YAML format,
# and levenshtein, which we use to measure how similar two strings are.
# You can view the raw corpus files in the chatterbot-corpus GitHub repo: https://github.com/gunthercox/chatterbot-corpus/tree/master/chatterbot_corpus/data/english
!pip install chatterbot-corpus levenshtein

import chatterbot_corpus
from yaml import load
import inspect
import os
import random
import math
import Levenshtein

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Collecting levenshtein
  Downloading Levenshtein-0.20.9-cp38-cp38-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (174 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m174.0/174.0 KB[0m [31m5.1 MB/s[0m eta [36m0:00:00[0m
Collecting rapidfuzz<3.0.0,>=2.3.0
  Downloading rapidfuzz-2.13.7-cp38-cp38-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (2.2 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m2.2/2.2 MB[0m [31m35.9 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: rapidfuzz, levenshtein
Successfully installed levenshtein-0.20.9 rapidfuzz-2.13.7


In [9]:
# Loop-control flag for the chat loop: it stays True until the user types 'exit'
run = True

def tokenize(msg):
  """Lowercase msg and split it into tokens on single space characters.

  The split is on ' ' exactly, so consecutive spaces (or an empty message)
  produce empty-string tokens in the returned list.
  """
  return msg.lower().split(' ')


def remove_punctuation(msg):
  """Return msg with every '?', '-', ',', ':', ';' and '!' character deleted.

  All other characters (including apostrophes and full stops) are left alone,
  and the case of the message is not changed.
  """
  # A translation table that maps each listed symbol to "delete"
  strip_table = str.maketrans('', '', '?-,:;!')
  return msg.translate(strip_table)

# Atomic numbers keyed by lowercase element name.
# The chat loop checks each token of the user's message against these keys.
elements = {
    'hydrogen': 1,
    'oxygen': 8,
    'carbon': 6,  # was 3, which is lithium's atomic number
    'plutonium': 94,
    'helium': 2,
    'lithium': 3
}


def normalize_text(msg):
  """Return a lowercase copy of msg with punctuation symbols removed.

  The symbols stripped are '?', '-', ',', ':', ';' and '!'. This is the same
  set that remove_punctuation strips from user input, so a corpus key that
  ended in '!' can now match the cleaned text typed by the user (previously
  '!' was kept here but removed from the user's message).

  The function is idempotent: normalizing an already-normalized string
  returns it unchanged.
  """
  msg = msg.lower()
  for symbol in ('?', '-', ',', ':', ';', '!'):
    msg = msg.replace(symbol, '')
  return msg


def choose_response(msg):
  """Pick a random reply for msg from the module-level lookup table.

  msg is passed through normalize_text before being used as the key.
  Returns one of the stored responses, chosen with random.choice, or None
  when a KeyError is raised (i.e. the normalized message is not a key).
  """
  try:
    # Look up the candidate replies and pick one at random in a single step
    return random.choice(lookup[normalize_text(msg)])
  except KeyError:
    # Unknown message: report "no answer" instead of raising
    return None

# Find distance between two strings
# Using this to abstract away the library calls, so we can quickly swap out Levenshtein and Jaro Winkler in one place
def find_distance(string1, string2):
  """Return Levenshtein.distance(string1, string2).

  This is the only place the string-distance library is called, so a
  different metric can be substituted here without touching the callers.
  """
  return Levenshtein.distance(string1, string2)

# Pick the entry of options that is closest to input
# input is a cleaned input string
# options is a collection of cleaned strings
def find_shortest_distance(input, options):
  """Return the option with the smallest find_distance to input.

  When several options tie for the smallest distance, the first one
  encountered wins. An empty options collection yields ''.
  """
  return min(
      options,
      key=lambda option: find_distance(input, option),
      default='',
  )


# Create a dict of msg->[responses] from the files in the corpus
def load_conversations_from_corpus():
  """Build the lookup table of normalized message -> list of responses.

  Reads every YAML file in the 'data/english' directory that ships inside the
  installed chatterbot_corpus package. Each file is expected to hold a
  'conversations' list; in every conversation the first entry is treated as
  the incoming message and the remaining entries as possible responses.

  Returns:
    dict mapping normalize_text(message) to a list of response strings.
    Responses keep their original case and punctuation. When several
    conversations normalize to the same key their responses are merged
    rather than the later one replacing the earlier one.

  Raises:
    KeyError: if a corpus file has no 'conversations' entry.
    OSError: if the corpus directory or a file in it cannot be read.
  """
  # Imported here because the notebook's import cell only brings in yaml.load.
  # safe_load builds plain Python types only and needs no Loader argument,
  # whereas load() without a Loader is unsafe and rejected by newer PyYAML.
  from yaml import safe_load

  # 1) Get the location of the corpus YAML files installed with the chatterbot corpus package
  package_dir = os.path.dirname(inspect.getfile(chatterbot_corpus))
  data_path = os.path.join(package_dir, 'data', 'english')

  # 2) Build a list of conversations from every corpus file.
  #    Sorted so the load order does not depend on the file system.
  conversations = []
  for file_name in sorted(os.listdir(data_path)):
    if not file_name.endswith(('.yml', '.yaml')):
      continue  # ignore anything in the directory that is not a YAML file
    # The with-block closes the file handle as soon as it has been parsed
    with open(os.path.join(data_path, file_name), 'r', encoding='utf-8') as corpus_file:
      convos = safe_load(corpus_file)
    conversations.extend(convos['conversations'])

  # 3) Build a dictionary of all the msg->[response] pairs in every conversation.
  #    The key is normalized; the responses are kept exactly as written.
  lookup = {}
  for convo in conversations:
    if not convo:
      continue  # an empty conversation has no message to use as a key
    key = normalize_text(convo[0])
    # Slice instead of pop(0) so the parsed data is not mutated, and extend
    # so responses from conversations sharing a key are all kept.
    lookup.setdefault(key, []).extend(convo[1:])
  return lookup

# Build the message -> responses table once, before the chat loop starts
lookup = load_conversations_from_corpus()


# Keep chatting for as long as the run flag is True
while run:
  # Read one line from the user, lowercase it and strip punctuation
  msg = remove_punctuation(input().lower())
  tokens = tokenize(msg)

  # Hard-coded replies are tried first, in order
  if msg == 'exit':
    print('Goodbye!')
    # Clearing the flag stops the loop after this pass,
    # so we don't get trapped in an infinite loop
    run = False
  elif msg == 'hello':
    print('Hi!')
  elif msg == 'how are you':
    print("I'm pretty good, thanks!")
  elif 'rain' in tokens:
    print('I love rain!')
  elif 'atomic number' in msg:
    # Report every known element mentioned in the message
    found_element = False
    for token in tokens:
      if token not in elements:
        continue
      print(f'The atomic number for {token} is {elements[token]}')
      found_element = True
    if not found_element:
      print("You asked about an atomic number, but I don't recognise an element name in your message")
  else:
    # Nothing hard-coded matched: fall back to the corpus message closest to the input
    closest_input = find_shortest_distance(msg, lookup.keys())
    print(f'Closest input: {closest_input}')
    print(choose_response(closest_input))

Hello!
Hi!
How are you?
I'm pretty good, thanks!
What is a computer?
Closest input: what is a computer
A device which maps one set of numbers onto another set of numbers.
What's a number?
Closest input: what is your number
23 skiddoo!
What's a computer?
Closest input: what is a computer
An electronic device capable of performing calculations at very high speed and with very high accuracy.
exit
Goodbye!
