### Solution 1: Straight Match

Build a phrase matcher in spaCy, count occurrences of each restaurant name across the title, description, and article body, and pick the most common match.

### Issues:

- Doesn't handle multiple valid mentions within a doc.
- Doesn't match single-name restaurants.

In [1]:
from collections import Counter
from tqdm import tqdm_notebook
import time
from spacy.matcher import PhraseMatcher
import spacy

from db.dbclient import MongoClient

def read_gazetteer(tokenizer, restaurant_iter):
    """Yield tokenized restaurant names suitable as phrase-matcher patterns.

    Args:
        tokenizer: Callable that turns a string into a sequence of tokens
            (each exposing ``.text``) and that carries a ``vocab`` supporting
            lookup by token text.
        restaurant_iter: Iterable of restaurant records supporting
            ``.get('name')``.

    Yields:
        The tokenized name of every record whose name has at least 2 and
        fewer than 10 tokens. Records without a name are skipped.
    """
    for record in restaurant_iter:
        name = record.get('name')
        # A missing/None name would make the tokenizer raise; an empty name
        # can never reach the two-token minimum, so skip both up front.
        if not name:
            continue
        phrase = tokenizer(name)
        for token in phrase:
            # The lookup result is discarded; presumably this is done so the
            # token text is registered in the vocab before matching.
            _ = tokenizer.vocab[token.text]
        if 2 <= len(phrase) < 10:
            yield phrase
            
def get_matches(tokenizer, phrases, texts, max_length=10):
    """Yield every gazetteer phrase occurrence found in ``texts``.

    Args:
        tokenizer: Tokenizer whose ``vocab`` is shared with the matcher.
        phrases: Tokenized phrases to register as patterns.
        texts: Iterable of raw strings to search.
        max_length: Forwarded to ``PhraseMatcher`` as ``max_length``.

    Yields:
        ``(match_id, matched_text)`` tuples, one per match.
    """
    phrase_matcher = PhraseMatcher(tokenizer.vocab, max_length=max_length)
    phrase_matcher.add('Phrase', None, *phrases)
    for raw_text in texts:
        tokens = tokenizer(raw_text)
        for token in tokens:
            # Lookup result is discarded; the access itself is what matters.
            _ = tokens.vocab[token.text]
        for match_id, first, last in phrase_matcher(tokens):
            span = tokens[first:last]
            yield (match_id, span.text)
            
def get_phrases():
    """Return a generator of tokenized restaurant names read from MongoDB.

    Queries every document in the ``restaurant`` client, builds a blank
    English pipeline with its lexical attribute getters cleared, and hands
    both to ``read_gazetteer``. Nothing is tokenized until the returned
    generator is consumed.
    """
    all_restaurants = MongoClient('restaurant').collection.find({})
    blank_nlp = spacy.blank('en')
    blank_nlp.vocab.lex_attr_getters = {}
    return read_gazetteer(blank_nlp.tokenizer, all_restaurants)
            
def main():
    """Match restaurant names against review-like articles.

    Loads every restaurant name into a ``PhraseMatcher``, then scans the
    content, title and meta description of each article whose
    ``is_review_score`` exceeds 0.5, counting how often each phrase matches.

    Returns:
        dict keyed by article ``_id``. Each value holds:
          * ``'text'``: the three scanned strings (content, title,
            meta description), with ``" "`` substituted for missing values,
          * ``'matches'``: a ``Counter`` of matched text -> occurrences,
          * ``'is_review_score'``: the article's score,
          * ``'best_match'``: the most frequent match if it occurred more
            than once, otherwise ``None``.
    """
    restaurant_iter = MongoClient('restaurant').collection.find({})
    article_iter = MongoClient('articles').collection.find({'is_review_score': {"$gt": .5}})
    nlp = spacy.blank('en')
    nlp.vocab.lex_attr_getters = {}
    phrases = read_gazetteer(nlp.tokenizer, restaurant_iter)
    matcher = PhraseMatcher(nlp.tokenizer.vocab, max_length=10)
    matcher.add('Phrase', None, *phrases)

    results = {}
    for article in tqdm_notebook(article_iter):
        # dict.get's default only applies when the key is absent; a stored
        # null would still come back as None and break the tokenizer, so
        # normalise None explicitly.
        texts = [
            " " if value is None else value
            for value in (
                article.get(field)
                for field in ('content', 'title', 'meta_description')
            )
        ]

        counts = Counter()
        for text in texts:
            doc = nlp.tokenizer(text)
            for w in doc:
                # Lookup result is discarded; the access itself is what matters.
                _ = doc.vocab[w.text]
            for _ent_id, start, end in matcher(doc):
                counts[doc[start:end].text] += 1

        # A single mention is treated as too weak to count as a pick.
        top = counts.most_common(1)
        best_match = top[0][0] if top and top[0][1] > 1 else None

        results[article['_id']] = {
            'text': texts,
            'matches': counts,
            'is_review_score': article['is_review_score'],
            'best_match': best_match,
        }
    return results

In [25]:
"""
The models.py module contains abstractions for grabbing information from db
to present within a view.

A view consumes a list of dictionaries; each of which either contains:

* lat/lon coordinates
* title
* text/images for each info window (styling will be handled elsewhere)

To avoid relying on places API whenever we want to get the location of a restaurant, lat/lon
should be stored in db.

We have to do this type of annotation as an ETL process.

See documentation for details on data models.
"""


from db.dbclient import MongoClient

class Restaurants:
    """Query helpers over the ``restaurant`` collection."""

    def __init__(self, dao):
        """Bind to the ``restaurant`` collection exposed by ``dao.db``."""
        self.dao = dao.db.get_collection('restaurant')

    def filter_by_coords(self, west=None, east=None, north=None, south=None):
        """Return the result of a bounding-box query on ``coords``.

        ``coords.lat`` is constrained to lie strictly between ``west`` and
        ``east``; ``coords.lon`` strictly between ``south`` and ``north``.

        NOTE(review): by geographic convention west/east would bound the
        longitude, not the latitude. The pairing here looks swapped but
        callers may rely on it -- confirm before changing.

        Raises:
            ValueError: if any of the four bounds is ``None``.
        """
        bounds = (west, east, south, north)
        if any(bound is None for bound in bounds):
            raise ValueError("Must pass in values for all coordinates")

        lat_range = {'$lt': east, '$gt': west}
        lon_range = {'$gt': south, '$lt': north}
        return self.dao.find({'coords.lat': lat_range, 'coords.lon': lon_range})

    def get_all(self):
        """Return a lazy filter over all documents that pass both predicates.

        Documents are first screened with ``is_valid`` and the survivors
        with ``is_5_star``.
        """
        everything = self.dao.find({})
        with_coords = filter(self.is_valid, everything)
        return filter(self.is_5_star, with_coords)

    def is_valid(self, blob):
        """Return True when ``blob['coords']`` has float ``lat`` and ``lon``."""
        coords = blob.get('coords')
        if coords is None:
            return False
        lat = coords.get('lat')
        lon = coords.get('lon')
        # None is not a float, so the type check also covers missing values.
        return isinstance(lat, float) and isinstance(lon, float)

    def is_5_star(self, review):
        """Return True when ``review['yelp']['rating']`` is present and not below 4.

        Despite the name, the threshold applied is 4, not 5.
        """
        yelp = review.get('yelp')
        rating = None if yelp is None else yelp.get('rating')
        if rating is None:
            return False
        return not (rating < 4)

class Model:
    """Bundle the database client with the collection helpers built on it."""

    def __init__(self):
        """Create the client and the ``Restaurants`` helper that shares it."""
        dao = MongoClient()
        self.DataAccessObject = dao
        self.restaurants = Restaurants(dao)


# Shared instance created as soon as this cell/module runs.
db_model = Model()


In [19]:
import pymongo
# `client` is the collection object exposed by the project's MongoClient
# wrapper for 'restaurant'.
client = MongoClient('restaurant').collection
# Five restaurants, ordered by Yelp rating from highest to lowest.
cursor = client.find({}).sort('yelp.rating', pymongo.DESCENDING).limit(5)
# A single document, kept around for inspecting the record layout.
example = client.find_one({})

In [39]:
# Look up the Yelp rating stored on the sample restaurant document.
# NOTE(review): the output recorded below shows a whole dict rather than a
# single rating, so it was presumably captured from an earlier version of
# this cell -- confirm before relying on it.
example['yelp']['rating']

{'alias': 'sunset-international-grill-and-deli-los-angeles',
 'categories': [{'alias': 'delis', 'title': 'Delis'},
  {'alias': 'breakfast_brunch', 'title': 'Breakfast & Brunch'},
  {'alias': 'burgers', 'title': 'Burgers'}],
 'is_closed': False,
 'price': '$',
 'rating': 3.5,
 'review_count': 6}

In [37]:

# Sample request parameters as (field, raw string value) pairs.
v = [('west', '34.01661809956266'), ('east', '34.01966587308045'), ('south', '-118.43972860094902'), ('north', '-118.43601739905091'), ('max_results', '20')]
form = dict(v)
# Only the four bounds are numeric query arguments; 'max_results' is left out.
sub_form = {field: float(raw) for field, raw in v if field != 'max_results'}
restaurant_iter = db_model.restaurants.filter_by_coords(**sub_form)
list(restaurant_iter)
#restaurant_iter = restaurant_iter.sort('yelp.rating', pymongo.DESCENDING).limit(max_results)


{'east': 34.01966587308045,
 'north': -118.43601739905091,
 'south': -118.43972860094902,
 'west': 34.01661809956266}

In [57]:
# Run the phrase-matching pipeline in main() and keep its per-article results.
results = main()




In [79]:
#ids = (i for i in results.keys())
# Show the result entry for the next article id.
# NOTE(review): `ids` is only defined by the commented-out line above, so this
# cell relies on it having been created in an earlier run -- confirm.
results[next(ids)]

{'best_match': 'Food',
 'is_review_score': 0.9990043640136719,
 'matches': Counter({'3': 4,
          'America': 1,
          'California': 1,
          'Food': 4,
          'Mori': 1,
          'San': 1,
          'Shiki Beverly Hills': 1}),
 'text': ["\nDONATE\n \nAll Things Considered\nNational Public Radio's weekday afternoon newsmagazine.\n \nEclectic 24\nKCRW's all-music channel Eclectic24, blending the collected talents and tastes of KCRW's DJs into a single voice. Free music streaming 24/7 at KCRW.com.\n \nAll Things Considered\nNational Public Radio's weekday afternoon newsmagazine.\n \nLost Notes\nThe greatest music stories never told. Explore the amazing stories of how 60s rock hit “Louie, Louie” triggered an FBI investigation, the outlaw Brooklyn radio station WBAD that tracked the rise of 90s hip hop, and the man who went from Folsom Prison inmate to Johnny Cash’s bandmate.\n \nUnFictional\nUnbelievably true stories of chance encounters that changed the world. A pair of ma

In [4]:
from tqdm import tqdm_notebook
import spacy
nlp = spacy.load('en')

# Keep only articles that actually carry non-empty content.
article_docs = [
    content
    for content in (record.get('content') for record in articles.collection.find({}))
    if content is not None and content != ''
]

# Collect every distinct token text; all pipeline components are disabled,
# so nlp.pipe only tokenizes.
words = set()
for doc in tqdm_notebook(nlp.pipe(article_docs, disable=nlp.pipe_names), total=len(article_docs)):
    words.update(token.text for token in doc)

words = list(words)




### Make Labels + Noise

In [6]:
import numpy as np
import random
from spacy.gold import GoldParse

# Build synthetic NER training examples: each restaurant name is wrapped in
# 1-3 random vocabulary words on either side, and the name's span is labelled
# RESTAURANT.
names = restaurants.collection.find({}, projection=['name'])
names = (record.get('name') for record in names)
# Records without a usable name cannot be concatenated into a sentence below,
# and surrounding whitespace would misalign the entity's character offsets.
names = [name.strip() for name in names if isinstance(name, str) and name.strip()]

# Convert once: np.random.choice would otherwise re-convert the full word list
# to an array on every call.
word_pool = np.asarray(words)

golds = []
docs = []
for name in tqdm_notebook(names, total=len(names)):
    start_n, end_n = np.random.choice([1, 2, 3]), np.random.choice([1, 2, 3])

    start_string = " ".join(np.random.choice(word_pool, size=start_n).tolist())
    end_string = " ".join(np.random.choice(word_pool, size=end_n).tolist())
    new_name = start_string + " " + name + " " + end_string

    doc = nlp(new_name, disable=nlp.pipe_names)

    # Locate the name by character offsets rather than by counting tokens:
    # a noise word is not guaranteed to come back as exactly one token once it
    # is re-tokenized in context, and any such drift would shift every label.
    ent_start = len(start_string) + 1
    ent_end = ent_start + len(name)
    # GoldParse accepts entities as (start_char, end_char, label) tuples and
    # derives the per-token BILUO tags itself (spaCy 2.x API).
    entities = [(ent_start, ent_end, 'RESTAURANT')]

    golds.append(GoldParse(doc, entities=entities))
    docs.append(doc)
    
    

In [71]:
# Query the articles collection for documents whose `is_review` field is True.
review_iter = articles.collection.find({'is_review': True})

### Train

In [59]:

import random
# Probe sentence whose predicted entities are printed periodically so that
# training progress can be eyeballed.
txt = 'Is Los Angeles a fried chicken town? The hours-long lines at Howlin’ Ray’s at the Far East Plaza in Chinatown seem to suggest so.'
n_iter = 1
nlp = spacy.load('en')
nlp.entity.add_label('RESTAURANT')

train_data = list(zip(docs, golds))
# NOTE(review): begin_training() is called on an already-trained pipeline and
# no other components are disabled during the updates -- check the spaCy
# guidance for adding an entity type to an existing model.
optimizer = nlp.begin_training()
try:
    for epoch in tqdm_notebook(range(n_iter), total=n_iter):
        random.shuffle(train_data)
        progress = tqdm_notebook(train_data, total=len(train_data))
        for step, (doc, gold) in enumerate(progress):
            # One example per update.
            nlp.update([doc], [gold], drop=0.5, sgd=optimizer)
            if step % 20:
                continue
            # Every 20th step: show what the model currently finds in txt.
            for ent in nlp(txt).ents:
                print(ent, ent.label_)
            print()
except KeyboardInterrupt:
    # Allow stopping a long run by hand without losing the notebook state.
    pass



Los Angeles GPE
hours TIME
Ray PERSON
the Far East Plaza FAC

Los Angeles GPE
hours TIME
Ray PERSON
the Far East Plaza LOC

Los Angeles GPE
hours TIME
Ray PERSON
the Far East Plaza LOC

Los Angeles GPE
Ray PERSON
the Far East Plaza LOC

the Far East Plaza LOC

the Far East Plaza LOC

the Far East Plaza LOC

Los Angeles GPE
Ray PERSON
the Far East Plaza LOC

the Far East Plaza LOC

Los Angeles GPE
the Far East Plaza LOC

Los Angeles GPE
Ray’s at RESTAURANT
the Far East Plaza LOC

Los Angeles GPE
Ray’s RESTAURANT
the Far East Plaza LOC

Los Angeles GPE
the Far East Plaza LOC

Los Angeles GPE
Ray’s RESTAURANT
the Far East Plaza LOC

Los Angeles GPE
Ray’s at RESTAURANT
the Far East Plaza LOC

Los Angeles GPE
Ray’s RESTAURANT
the Far East Plaza LOC

Los Angeles GPE
Ray’s at RESTAURANT
the Far East Plaza LOC

Los Angeles GPE
Ray’s at RESTAURANT
the Far East Plaza LOC

Los Angeles GPE
Ray’s at RESTAURANT
the Far East Plaza LOC

Los Angeles GPE
Ray’s RESTAURANT

Los Angeles GPE
Ray’s RESTAURAN

Los Angeles a fried chicken RESTAURANT
The hours-long lines at RESTAURANT
Ray’s at RESTAURANT

Los Angeles a fried chicken RESTAURANT
The hours-long lines at RESTAURANT
Ray’s at RESTAURANT

Los Angeles a fried chicken town? RESTAURANT
The hours-long lines at Howlin’ Ray’s at the Far East Plaza RESTAURANT

Los Angeles a fried chicken town? RESTAURANT
The hours-long lines at Howlin’ Ray’s at the Far East Plaza in Chinatown RESTAURANT

The hours-long lines at Howlin’ Ray’s at the Far East Plaza in Chinatown RESTAURANT

Los Angeles RESTAURANT
The hours-long RESTAURANT
Ray’s at RESTAURANT

Los Angeles RESTAURANT
The hours-long RESTAURANT
Ray’s at RESTAURANT

Los Angeles RESTAURANT
The hours-long RESTAURANT
Ray’s at RESTAURANT

Los Angeles RESTAURANT
The hours-long RESTAURANT
Ray’s at RESTAURANT

