In [None]:
# regular expressions for street addresses
import re
# process US street address

# the address to match
text = "223 5th Street NW, Plymouth, PA 19001"

print(text)

# first define components of an address

# at the beginning of a string, match at least one digit
street_number_re = "^\d{1,}"

# match street names containing upper and lower case letters and digits, including spaces,
# followed by an optional comma
street_name_re = "[a-zA-Z0-9\s]+,?"

# match city names containing letters, but not spaces, followed by a comma
# note that two word city names (like "New York") won't get matched
# try to modify the regular expression to include two word city names
city_name_re = " [a-zA-Z]+(\,)?"

# to match US state abbreviations, match any two upper case alphabetic characters
# notice that this overgenerates and accepts state names that don't exist
# because it doesn't check for a valid state name
state_abbrev_re = " [A-Z]{2}" 

# match US postal codes consisting of exactly 5 digits. 9 digit codes exist, but this
# expression doesn't match them
postal_code_re = " [0-9]{5}$"
 
# put the components together -- define the overall pattern
address_pattern_re = street_number_re + street_name_re + city_name_re + state_abbrev_re + postal_code_re

# is this an address? 

is_match = re.match(address_pattern_re,text)
if is_match is not None:
    print("matches address_pattern")
else:
    print("doesn't match")


In [None]:
# collapse the entire matched address into a single class tag -- "ADDRESS"
address_class = re.sub(pattern=address_pattern_re, repl="ADDRESS", string=text)
print(address_class)
    
# sometimes the matched text should be kept and tagged rather than replaced;
# this callback tags whatever was matched as an address
def add_address_label(address_obj):
    """Return the text matched by address_obj wrapped in an "address" label.

    address_obj is handed straight to add_label, which reads the matched
    text from it and produces the labeled string.
    """
    return add_label("address", address_obj)

# builds the labeled output format: {label:'matched text'}
def add_label(label, match_obj):
    """Return the whole match of match_obj formatted as {label:'matched text'}.

    label is the tag name to put before the colon; match_obj must provide
    group(), whose result is placed between the single quotes.
    """
    pieces = ("{", label, ":", "'", match_obj.group(), "'", "}")
    return "".join(pieces)
  
# label the address in the string: re.sub calls add_address_label with each
# match and substitutes the string it returns
address_label_result = re.sub(pattern=address_pattern_re, repl=add_address_label, string=text)
print(address_label_result)


In [None]:
# look up the WordNet synsets (senses) of the word "vegetable" -- the first step toward a list of all vegetables

import nltk
from nltk.corpus import wordnet as wn
# the bare expression is the last line of the cell, so the notebook displays its result
wn.synsets("vegetable")

In [None]:
# print the definition of each of the first two noun senses of "vegetable"
for sense_name in ('vegetable.n.01', 'vegetable.n.02'):
    print(wn.synset(sense_name).definition())

In [None]:
# the hyponyms of the first noun sense of "vegetable" are the more specific
# kinds of vegetable that WordNet lists under it
word_list = wn.synset('vegetable.n.01').hyponyms()
# a synset can have several lemma names; keep the first one as its simple name
# (iterate over the synsets directly instead of indexing with range(len(...)))
simple_names = [synset.lemma_names()[0] for synset in word_list]
print(simple_names)

In [None]:
# generate some sample data by appending each vegetable name to a fixed request

text_frame = "can you give me some good recipes for "
for vegetable in simple_names:
    # WordNet joins the words of a multi-word lemma with underscores
    # (e.g. "root_vegetable"); turn them back into spaces so the generated
    # request reads like natural text
    print(text_frame + vegetable.replace("_", " "))

In [None]:
# a small context-free grammar for NLTK, written as one rule per line:
# the left-hand side of "->" rewrites to any of the "|"-separated alternatives,
# and quoted items are the words themselves
grammar_rules = """
S -> NP VP
PP -> P NP
NP -> Det N | Det N N |Det N PP | Pro
Pro -> 'I' |'you'|'we'
VP -> V NP | VP PP
Det -> 'an' | 'my' | 'the'
N -> 'elephant' | 'pajamas' | 'movie' |'family' | 'room' |'children'
V -> 'saw'|'watched'
P -> 'in'
"""
grammar = nltk.CFG.fromstring(grammar_rules)

In [None]:
# parse and visualize a sentence
# we will need this to tokenize the input
import nltk
from nltk import word_tokenize
# a package for visualizing parse trees
import svgling
# svgling takes over tree display only after NLTK's own PNG rendering is disabled
svgling.disable_nltk_png()
# tokenize an example sentence that the grammar defined above can parse
sent = word_tokenize("the children watched the movie in the family room")
# build a chart parser from that grammar
parser = nltk.ChartParser(grammar)
# collect every parse the parser produces for the sentence
trees = [tree for tree in parser.parse(sent)]
# show the first parse as a text-formatted tree
first_tree = trees[0]
print(first_tree)
# leaving the tree as the final expression of the cell displays it as SVG
first_tree

In [None]:
import spacy
from spacy.lang.en import English

# start from a blank English pipeline and add a rule-based entity recognizer
nlp = English()

ruler = nlp.add_pipe("entity_ruler")

# each pattern maps a literal phrase to the entity label it should receive
cuisine_patterns = [
    {"label": "CUISINE", "pattern": "italian"},
    {"label": "CUISINE", "pattern": "german"},
    {"label": "CUISINE", "pattern": "chinese"}]
price_range_patterns = [
    {"label": "PRICE_RANGE", "pattern": "inexpensive"},
    {"label": "PRICE_RANGE", "pattern": "reasonably priced"},
    {"label": "PRICE_RANGE", "pattern": "good value"}]
atmosphere_patterns = [
    {"label": "ATMOSPHERE", "pattern": "casual"},
    {"label": "ATMOSPHERE", "pattern": "nice"},
    {"label": "ATMOSPHERE", "pattern": "cozy"}]
location_patterns = [
    {"label": "LOCATION", "pattern": "near here"},
    {"label": "LOCATION", "pattern": "walking distance"},
    # the key must be the string "id"; the bare name id is Python's builtin
    # function, which the entity ruler does not recognize as the id key
    {"label": "LOCATION", "pattern": "close by", "id": "nearby"},
    {"label": "LOCATION", "pattern": "a short drive"}]

ruler.add_patterns(cuisine_patterns)
ruler.add_patterns(price_range_patterns)
ruler.add_patterns(atmosphere_patterns)
ruler.add_patterns(location_patterns)

# run the pipeline on a sample request and list each entity with its label
doc = nlp("can you recommend a casual italian within walking distance")
print([(ent.text, ent.label_) for ent in doc.ents])


In [None]:
from spacy import displacy
# background color used to highlight each entity label
colors = dict(
    CUISINE="#ea7e7e",
    PRICE_RANGE="#baffc9",
    ATMOSPHERE="#abcdef",
    LOCATION="#ffffba",
)
# highlight exactly the labels that have a color, in the order they are listed above
options = {"ents": list(colors), "colors": colors}
# draw the entities of doc inline in the notebook
displacy.render(doc, style="ent", options=options, jupyter=True)

In [None]:
# using the spacy id attribute
import spacy
from spacy.lang.en import English

# fresh blank English pipeline with its own entity ruler
nlp = English()

ruler = nlp.add_pipe("entity_ruler")

# patterns without an id: one literal phrase per entry, grouped by label
cuisine_patterns = [
    {"label": "CUISINE", "pattern": phrase}
    for phrase in ("italian", "german", "chinese")]
price_range_patterns = [
    {"label": "PRICE_RANGE", "pattern": phrase}
    for phrase in ("inexpensive", "reasonably priced", "good value")]
atmosphere_patterns = [
    {"label": "ATMOSPHERE", "pattern": phrase}
    for phrase in ("casual", "nice", "cozy")]
# location phrases carry an id; different wordings that share an id are
# reported with that same id on the matched entity
location_patterns = [
    {"label": "LOCATION", "pattern": phrase, "id": location_id}
    for phrase, location_id in (
        ("near here", "nearby"),
        ("close by", "nearby"),
        ("near me", "nearby"),
        ("walking distance", "short_walk"),
        ("short walk", "short_walk"),
        ("a short drive", "short_drive"))]

# register the groups with the ruler one at a time, in the same order as before
for pattern_group in (cuisine_patterns, price_range_patterns,
                      atmosphere_patterns, location_patterns):
    ruler.add_patterns(pattern_group)

# list each entity found in a sample request with its label and pattern id
doc = nlp("can you recommend a casual italian restaurant close by")
print([(ent.text, ent.label_,ent.ent_id_) for ent in doc.ents])
