In [16]:
from pprint import pprint
import numpy as np

import speech_recognition as sr

import nltk
from nltk import word_tokenize, pos_tag
from nltk.corpus import stopwords

import spacy
# Load the spaCy "en_core_web_sm" model once at import time; the resulting `nlp`
# callable is reused by the Named Entity Recognition cell further down.
nlp = spacy.load("en_core_web_sm")



***

## Speech to Text

In [17]:
def transcribe():
    """Record one utterance from the microphone and return it as lowercase text.

    Audio is captured with ``sr.Recognizer.listen`` from ``sr.Microphone()``
    (constructed with default arguments) and then passed to
    ``recognize_google``.

    Returns:
        The recognized text, lower-cased (it is also printed), or ``None``
        when the audio could not be understood or the recognition request
        failed. In both failure cases a message is printed instead of the
        exception being raised.
    """
    rec = sr.Recognizer()

    # Only the capture needs the microphone; it is released before recognition.
    with sr.Microphone() as mic:
        captured = rec.listen(mic)

    try:
        text = rec.recognize_google(captured).lower()
    except sr.UnknownValueError:
        print("Could not understand audio")
        return None
    except sr.RequestError as err:
        print("Could not request results from the service; {0}".format(err))
        return None

    print(text)
    return text


In [18]:
# Hard-coded, all-lowercase sample query used by every cell below.
# NOTE(review): presumably a stand-in for a transcribe() result (which is also
# lower-cased) so the notebook runs without a microphone -- confirm.
search_string = "i would like to go to barcelona for a week from tomorrow and i would like my hotel to be near a forest"

***

## Tokenization

In [19]:
# Split the sample query into word tokens with NLTK; `tokens` is reused by the
# part-of-speech tagging cell.
tokens = word_tokenize(search_string)

In [20]:
# Display the raw token list (stopwords still included).
tokens

['i',
 'would',
 'like',
 'to',
 'go',
 'to',
 'barcelona',
 'for',
 'a',
 'week',
 'from',
 'tomorrow',
 'and',
 'i',
 'would',
 'like',
 'my',
 'hotel',
 'to',
 'be',
 'near',
 'a',
 'forest']

***

## Noise removal / data cleaning

In [21]:
def clean(a):
    """Tokenize ``a`` and drop every token that is an NLTK English stopword.

    Args:
        a: The text to clean.

    Returns:
        A list of the remaining tokens in their original order. Duplicates
        are kept. Matching is exact string equality, so a token is removed
        only if it appears in the stopword list with the same casing.
    """
    # stopwords.words() returns a list; turning it into a set once makes each
    # membership test constant-time instead of a linear scan per token.
    noise = set(stopwords.words("english"))
    return [token for token in word_tokenize(a) if token not in noise]

In [22]:
# Stopword-filtered tokens of the sample query (clean() tokenizes internally,
# so it takes the raw string rather than `tokens`).
clean_tokens = clean(search_string)

In [23]:
# Display the tokens that survived stopword removal.
clean_tokens

['would',
 'like',
 'go',
 'barcelona',
 'week',
 'tomorrow',
 'would',
 'like',
 'hotel',
 'near',
 'forest']

***

## Part of Speech tagging

In [24]:
# Tag the unfiltered tokens (stopwords included) with part-of-speech labels.
# NOTE(review): in the output below 'i' is tagged NN and 'barcelona' VB --
# presumably because the input is all lowercase; confirm before relying on
# these tags.
pos = pos_tag(tokens)

In [25]:
# Display the (token, tag) pairs.
pos

[('i', 'NN'),
 ('would', 'MD'),
 ('like', 'VB'),
 ('to', 'TO'),
 ('go', 'VB'),
 ('to', 'TO'),
 ('barcelona', 'VB'),
 ('for', 'IN'),
 ('a', 'DT'),
 ('week', 'NN'),
 ('from', 'IN'),
 ('tomorrow', 'NN'),
 ('and', 'CC'),
 ('i', 'NN'),
 ('would', 'MD'),
 ('like', 'VB'),
 ('my', 'PRP$'),
 ('hotel', 'NN'),
 ('to', 'TO'),
 ('be', 'VB'),
 ('near', 'IN'),
 ('a', 'DT'),
 ('forest', 'NN')]

***

## Named Entity Recognition

### Preprocessing

In [26]:
def ner_preprocessing(a):
    """Capitalize the word that follows "to", "visit" or "in".

    The text is tokenized with ``word_tokenize``; for each trigger word, the
    first character of the token right after every occurrence is upper-cased.
    Trigger words are processed one after another, so a token capitalized for
    an earlier trigger no longer matches a later one.

    Args:
        a: The text to preprocess.

    Returns:
        The tokens joined back together with single spaces.
    """
    words = word_tokenize(a)
    trigger_words = ["to", "visit", "in"]

    for trigger in trigger_words:
        # Snapshot the matching positions before any token is modified.
        hits = [pos for pos, word in enumerate(words) if word == trigger]
        for pos in hits:
            following = pos + 1
            if following >= len(words):
                # The trigger is the last token; there is nothing to capitalize.
                continue
            target = words[following]
            words[following] = target[0].upper() + target[1:]

    return ' '.join(words)

In [27]:
# The query is all lowercase, so first capitalize the words following
# "to"/"visit"/"in" (ner_preprocessing), then run the spaCy model on the result.
doc = nlp(ner_preprocessing(search_string))

In [14]:
# Pretty-print every detected entity as a (text, label) pair.
pprint([(x.text, x.label_) for x in doc.ents])

[('Barcelona', 'GPE'), ('a week from tomorrow', 'DATE')]


***

## Search Tags

In [14]:
def extract_tags(data):
    """Collect the direct children of ``data`` that have children of their own.

    A tree position longer than one element can only exist below a nested
    child, so the first element of every such position identifies a nested
    child of ``data``.

    Args:
        data: An object exposing ``treepositions()`` and integer indexing
            (here, the tree returned by ``RegexpParser.parse``).

    Returns:
        One list per nested child, in ascending index order, each holding
        that child's items.
    """
    chunk_indices = sorted(
        {position[0] for position in data.treepositions() if len(position) > 1}
    )
    return [list(data[index]) for index in chunk_indices]

def get_search_tags(a):
    """Extract "search tag" chunks from the text ``a``.

    The text is tokenized and POS-tagged, then chunked with a one-rule
    grammar: a preposition (IN), an optional determiner (DT) and a singular
    noun (NN), labelled STAG.

    Args:
        a: The text to analyze.

    Returns:
        A list of chunks, each a list of (token, tag) pairs, as produced by
        ``extract_tags``.
    """
    chunker = nltk.RegexpParser("STAG: {<IN><DT>?<NN>}")
    tagged_words = pos_tag(word_tokenize(a))
    chunk_tree = chunker.parse(tagged_words)
    return extract_tags(chunk_tree)

In [15]:
# Show the IN [DT] NN chunks found in the sample query.
get_search_tags(search_string)

[[('for', 'IN'), ('a', 'DT'), ('week', 'NN')],
 [('from', 'IN'), ('tomorrow', 'NN')],
 [('near', 'IN'), ('a', 'DT'), ('forest', 'NN')]]