In [1]:
#pip install wikipedia

In [2]:
import wikipedia
from sklearn.feature_extraction.stop_words import ENGLISH_STOP_WORDS

def remove_words(wordstr, words_to_remove):
  """Return `wordstr` without the words listed in `words_to_remove`.

  The text is split on whitespace and a word is dropped when its lower-cased
  form is contained in `words_to_remove`; entries therefore have to be
  lower-case to ever match. Words that survive keep their original casing
  and are joined back together with single spaces.

  Parameters:
    wordstr: the text to filter.
    words_to_remove: container of lower-case words to drop.

  Returns:
    The filtered text as a single string.
  """
  # One membership test is made per word of the text, so a list/tuple of
  # removal words is turned into a set once instead of being scanned each time.
  if isinstance(words_to_remove, (list, tuple)):
    words_to_remove = set(words_to_remove)

  return ' '.join(word for word in wordstr.split() if word.lower() not in words_to_remove)

In [3]:
# Animals whose Wikipedia pages make up the library a description is tested against.
# Add more names here; NOTE that not every animal name maps one-to-one onto a Wikipedia page.
animals = ['Wolf', 'Giant panda', 'Kangaroo', 'Elephant', 'Penguin']

library_map = {}

print('Loading data from wikipedia...')
for animal in animals:
  page = wikipedia.page(animal)
  # echo which page was actually resolved for this name
  print(page.url)
  print(page.title)
  # keep the page text with the English stop words already filtered out
  library_map[animal] = remove_words(page.content, ENGLISH_STOP_WORDS)


Loading data from wikipedia...
https://en.wikipedia.org/wiki/Wolf
Wolf
https://en.wikipedia.org/wiki/Giant_panda
Giant panda
https://en.wikipedia.org/wiki/Kangaroo
Kangaroo
https://en.wikipedia.org/wiki/Elephant
Elephant
https://en.wikipedia.org/wiki/Penguin
Penguin


In [4]:
from sklearn.feature_extraction.text import TfidfVectorizer

# Create the TF-IDF transformation and build its vocabulary from the whole
# Wikipedia library: every word of the selected pages (except the removed
# stop words) gets a place in the vector representation.
vectorizer = TfidfVectorizer().fit(library_map.values())

# Summarize: print the learned vocabulary (term -> index).
print(vectorizer.vocabulary_)

{'wolf': 6278, 'canis': 1039, 'lupus': 3569, 'known': 3329, 'gray': 2727, 'grey': 2748, 'large': 3361, 'canine': 1036, 'native': 3948, 'eurasia': 2178, 'north': 4023, 'america': 492, 'largest': 3364, 'extant': 2272, 'member': 3739, 'canidae': 1034, 'males': 3617, 'averaging': 694, '40': 219, 'kg': 3299, '88': 306, 'lb': 3392, 'females': 2388, '37': 211, '82': 296, 'average': 692, 'wolves': 6287, 'measure': 3712, '105': 11, '160': 41, 'cm': 1258, '41': 224, '63': 266, 'in': 3039, 'length': 3421, '80': 291, '85': 298, '31': 195, '33': 202, 'shoulder': 5210, 'height': 2847, 'distinguished': 1865, 'species': 5383, 'pointed': 4422, 'ears': 1973, 'muzzle': 3923, 'shorter': 5208, 'torso': 5835, 'longer': 3533, 'tail': 5653, 'nonetheless': 4016, 'related': 4799, 'closely': 1251, 'smaller': 5305, 'coyote': 1528, 'golden': 2693, 'jackal': 3227, 'produce': 4558, 'fertile': 2393, 'hybrids': 2979, 'them': 5749, 'fur': 2580, 'usually': 6052, 'mottled': 3878, 'white': 6238, 'brown': 958, 'black': 853

In [5]:
from sklearn.feature_extraction.stop_words import ENGLISH_STOP_WORDS

In [6]:
def remove_words(wordstr, words_to_remove):
  """Return `wordstr` without the words listed in `words_to_remove`.

  The text is split on whitespace and a word is dropped when its lower-cased
  form is contained in `words_to_remove`; entries therefore have to be
  lower-case to ever match. Words that survive keep their original casing
  and are joined back together with single spaces.

  Parameters:
    wordstr: the text to filter.
    words_to_remove: container of lower-case words to drop.

  Returns:
    The filtered text as a single string.
  """
  # One membership test is made per word of the text, so a list/tuple of
  # removal words is turned into a set once instead of being scanned each time.
  if isinstance(words_to_remove, (list, tuple)):
    words_to_remove = set(words_to_remove)

  return ' '.join(word for word in wordstr.split() if word.lower() not in words_to_remove)

In [7]:
# Above, one vocabulary was built from all selected Wikipedia pages. Now each
# subject gets its own vector, so that a new description of a specific animal
# can be compared with the different wiki pages.
vector_dictionary = {}

for item, page_text in library_map.items():
  print("Print filtered wiki page of: " + item)
  # cut the lower-cased animal name out of the text, then drop commas and full stops
  temp_obj = page_text.replace(item.lower(), "").replace(",", "").replace(".", "")
  # Remove the stop words and the words of the animal name from the text.
  # remove_words compares lower-cased single words, so only lower-case,
  # single-word entries can match; a set keeps each lookup cheap.
  #TODO: add plural form to this (best to do this in def remove_words? (& add taboo words to this list also :)
  words_to_remove = set(ENGLISH_STOP_WORDS) | set(item.lower().split())
  temp_obj = remove_words(temp_obj, words_to_remove)
  vector_dictionary[item] = vectorizer.transform([temp_obj])
  print("   " + temp_obj)
  print("")

Print filtered wiki page of: Wolf
   (Canis lupus) known gray grey large canine native Eurasia North America largest extant member Canidae males averaging 40 kg (88 lb) females 37 kg (82 lb) average wolves measure 105–160 cm (41–63 in) length 80–85 cm (31–33 in) shoulder height distinguished Canis species pointed ears muzzle shorter torso longer tail nonetheless related closely smaller Canis species coyote golden jackal produce fertile hybrids fur usually mottled white brown gray black banded 38 subspecies recognized including domestic dog members genus Canis specialized cooperative game hunting demonstrated physical adaptations tackling large prey social nature highly advanced expressive behaviour travels nuclear families consisting mated pair accompanied offspring Offspring leave form packs onset sexual maturity response competition pack food Wolves territorial fights territory principal causes mortality mainly carnivore feeds primarily large wild hooved mammals eats smaller animals 

In [8]:
from sklearn.metrics.pairwise import cosine_similarity as cossim

In [9]:
def find_similarity(description, vector_dictionary):
  """Print, for every item, its share of the total cosine similarity.

  Parameters:
    description: vectorized description; must offer `.toarray()`.
    vector_dictionary: maps an item name to its vector (also offering
      `.toarray()`).

  For each item the cosine similarity between its vector and `description`
  is computed; each similarity is then printed as a truncated integer
  percentage of the sum of all similarities. When that sum is zero (the
  description has nothing in common with any item) every item is reported
  as 0% instead of dividing by zero.
  """
  similarities = {}
  total = 0.0
  for item, vector in vector_dictionary.items():
    # cossim returns a 1x1 matrix for two single-row inputs; keep the scalar only
    score = float(cossim(vector.toarray(), description.toarray())[0][0])
    similarities[item] = score
    total += score

  for item, score in similarities.items():
    # guard against 0/0 -> NaN, which int() cannot convert
    share = int(score / total * 100) if total != 0 else 0
    print(str(share) + '% chance it is a ' + item)

In [10]:
testtext = "australia large foot jumping pouch"
find_similarity(vectorizer.transform([testtext]), vector_dictionary)

5% chance it is a Wolf
3% chance it is a Giant panda
76% chance it is a Kangaroo
8% chance it is a Elephant
5% chance it is a Penguin


In [11]:
testtext = "black white bear bamboo"
find_similarity(vectorizer.transform([testtext]), vector_dictionary)

2% chance it is a Wolf
87% chance it is a Giant panda
0% chance it is a Kangaroo
0% chance it is a Elephant
8% chance it is a Penguin


In [12]:
testtext = "lives in a pack howling mowgli Jungle Book gray"
find_similarity(vectorizer.transform([testtext]), vector_dictionary)

46% chance it is a Wolf
11% chance it is a Giant panda
4% chance it is a Kangaroo
21% chance it is a Elephant
16% chance it is a Penguin


In [13]:
#pip install SpeechRecognition

In [14]:
import speech_recognition as sr

In [15]:
def recognize_speech_from_mic(recognizer, microphone):
    """Record from `microphone` and transcribe the recording.

    Returns a dictionary with three keys:
    "success": `False` when the transcription request raised
               `sr.RequestError`, `True` otherwise
    "error":   `None` if nothing went wrong, otherwise a string saying that
               the API was unavailable or that the speech could not be
               recognized
    "transcription": the transcribed text, or `None` if no text could be
               obtained

    Raises `TypeError` when `recognizer` is not an `sr.Recognizer` or
    `microphone` is not an `sr.Microphone`.
    """
    # validate the argument types before touching the microphone
    if not isinstance(recognizer, sr.Recognizer):
        raise TypeError("`recognizer` must be `Recognizer` instance")
    if not isinstance(microphone, sr.Microphone):
        raise TypeError("`microphone` must be `Microphone` instance")

    # calibrate for ambient noise, then capture audio from the microphone
    with microphone as source:
        recognizer.adjust_for_ambient_noise(source)
        recording = recognizer.listen(source)

    # transcribe the recording; each failure mode maps to its own response
    try:
        text = recognizer.recognize_google(recording)
    except sr.RequestError:
        # API was unreachable or unresponsive
        return {"success": False, "error": "API unavailable", "transcription": None}
    except sr.UnknownValueError:
        # speech was unintelligible
        return {"success": True, "error": "Unable to recognize speech", "transcription": None}

    return {"success": True, "error": None, "transcription": text}

In [16]:
# Recognizer and microphone instances; the same pair is handed to every
# recognize_speech_from_mic call below.
recognizer, microphone = sr.Recognizer(), sr.Microphone()

In [17]:
import operator
def other_other_find_similarity(description, vector_dictionary):
  """Rank the items by their share of the total cosine similarity.

  Parameters:
    description: vectorized description; must offer `.toarray()`.
    vector_dictionary: maps an item name to its vector (also offering
      `.toarray()`).

  Returns:
    A dict mapping each item to a truncated integer percentage of the summed
    similarities, ordered from the highest percentage to the lowest. When the
    sum is zero (the description has nothing in common with any item) every
    percentage is 0 and the original item order is kept.
  """
  similarities = {}
  total = 0.0
  # Compute cosine similarities
  for item, vector in vector_dictionary.items():
    # cossim returns a 1x1 matrix for two single-row inputs; keep the scalar only
    score = float(cossim(vector.toarray(), description.toarray())[0][0])
    similarities[item] = score
    total += score

  # Compute similarities as percentage; guard against 0/0 -> NaN, which
  # int() cannot convert
  perc = {
    item: int(score / total * 100) if total != 0 else 0
    for item, score in similarities.items()
  }

  # Sort by highest similarity percentage first (ties keep insertion order)
  sorted_perc = sorted(perc.items(), key=operator.itemgetter(1), reverse=True)
  return dict(sorted_perc)

In [20]:
NUM_GUESSES = 3    # how many animals the agent may guess
PROMPT_LIMIT = 5   # how often the user is re-prompted when speech is not understood


def _listen_with_retries(prompt):
    """Ask for spoken input until something is transcribed or the API fails.

    Prints `prompt` before every attempt and retries, up to PROMPT_LIMIT
    attempts in total, while the speech could not be transcribed. Returns the
    response dictionary of the last attempt (see `recognize_speech_from_mic`).
    """
    for _ in range(PROMPT_LIMIT):
        print(prompt)
        response = recognize_speech_from_mic(recognizer, microphone)
        # stop as soon as there is text, or when the API itself failed
        if response["transcription"] or not response["success"]:
            break
        print("I didn't catch that. What did you say?\n")
    return response


description = _listen_with_retries('Describe!')

if not description["transcription"]:
    # nothing usable was heard: report why and do not start guessing
    print("ERROR: {}".format(description["error"]))
else:
    # show the user the transcription
    print("You said: {}".format(description["transcription"]))

    # The description does not change between guesses, so rank the animals
    # once (most similar first) instead of on every iteration.
    guess = other_other_find_similarity(vectorizer.transform([description["transcription"]]), vector_dictionary)
    list_guess = list(guess.keys())

    # never attempt more guesses than there are animals to choose from
    max_guesses = min(NUM_GUESSES, len(list_guess))

    for i in range(max_guesses):
        print('Is it a {}?'.format(list_guess[i]))

        answer = _listen_with_retries('Waiting for your answer: ')

        # if no answer could be obtained, stop the game
        if not answer["transcription"]:
            print("ERROR: {}".format(answer["error"]))
            break

        # show the user the transcription
        print("{}".format(answer["transcription"]))

        # determine if guess is correct and if any attempts remain
        guess_is_correct = answer["transcription"].lower() == 'yes'
        agent_has_more_attempts = i < max_guesses - 1

        # a confirmed guess wins the game; otherwise guess again while
        # attempts remain, and lose once they are used up
        if guess_is_correct:
            print("Correct! I won!")
            break
        elif agent_has_more_attempts:
            print("Incorrect. Ok I'll try again.\n")
        else:
            print("Oh I lost.")
            break

Describe!
You said: four legs
Is it a Elephant?
Waiting for your answer: 
no
Incorrect. Ok I'll try again.

Is it a Kangaroo?
Waiting for your answer: 
no
Incorrect. Ok I'll try again.

Is it a Wolf?
Waiting for your answer: 
yes
Correct! I won!
