### Solution 1: Straight Match

Build a phrase matcher in spaCy, count the occurrences of each restaurant name across the article's title, description, and body, and pick the most common match.

In [90]:
from collections import Counter
from tqdm import tqdm_notebook
import time
from spacy.matcher import PhraseMatcher
import spacy

from db.dbclient import MongoClient

def read_gazetteer(tokenizer, restaurant_iter, min_length=2, max_length=10):
    """Yield tokenized restaurant names to be used as phrase-matcher patterns.

    Parameters
    ----------
    tokenizer : callable
        Maps a string to a sized, iterable sequence of tokens that expose
        ``.text``. It must also expose a ``vocab`` attribute that supports
        lookup by string (``tokenizer.vocab[text]``).
    restaurant_iter : iterable of mapping
        Records whose ``'name'`` entry holds the restaurant name. Records
        with a missing, ``None``, non-string or empty name are skipped
        instead of being handed to the tokenizer.
    min_length : int, optional
        Smallest number of tokens (inclusive) a name may have to be yielded.
    max_length : int, optional
        Upper bound (exclusive) on the number of tokens. The defaults
        reproduce the previously hard-coded ``2 <= len(phrase) < 10``.
        NOTE(review): the exclusive bound presumably mirrors the
        ``max_length`` given to ``PhraseMatcher`` elsewhere — confirm.

    Yields
    ------
    The tokenizer's output for every name whose token count is in range.
    """
    for record in restaurant_iter:
        name = record.get('name')
        if not isinstance(name, str) or not name:
            # Nothing to tokenize; an empty name could never satisfy the
            # length filter anyway.
            continue
        phrase = tokenizer(name)
        # Look every token up in the vocab, including tokens of names that
        # are filtered out below (the lookup result itself is not needed).
        for token in phrase:
            tokenizer.vocab[token.text]
        if min_length <= len(phrase) < max_length:
            yield phrase
            
def get_matches(tokenizer, phrases, texts, max_length=10):
    """Lazily yield every phrase match found in ``texts``.

    A single ``PhraseMatcher`` is built over ``tokenizer.vocab`` with all of
    ``phrases`` registered under the key ``'Phrase'``. Each text is then
    tokenized and scanned in turn.

    Yields
    ------
    tuple
        ``(match_id, matched_text)`` for each match, where ``match_id`` is
        the first element of the triple returned by the matcher and
        ``matched_text`` is the text of the matched token span.
    """
    phrase_matcher = PhraseMatcher(tokenizer.vocab, max_length=max_length)
    phrase_matcher.add('Phrase', None, *phrases)
    for raw_text in texts:
        tokens = tokenizer(raw_text)
        # Look each token up in the document's vocab before matching
        # (the lookup result is discarded).
        for token in tokens:
            tokens.vocab[token.text]
        for match_id, first, last in phrase_matcher(tokens):
            span = tokens[first:last]
            yield (match_id, span.text)
            
def get_phrases():
    """Return a lazy stream of tokenized restaurant names.

    Every record of the ``'restaurant'`` collection is read and passed
    through ``read_gazetteer`` using the tokenizer of a fresh blank English
    pipeline. The result is a generator, so nothing is fetched or tokenized
    until it is iterated.
    """
    cursor = MongoClient('restaurant').collection.find({})
    blank_en = spacy.blank('en')
    # Drop the lexical-attribute getters on this vocab.
    # NOTE(review): presumably done to make lexeme creation cheaper — confirm.
    blank_en.vocab.lex_attr_getters = {}
    return read_gazetteer(blank_en.tokenizer, cursor)
            
def main():
    """Count restaurant-name matches in every likely-review article.

    Builds a ``PhraseMatcher`` from all restaurant names, then scans the
    content, title and meta description of each article whose
    ``is_review_score`` exceeds 0.5.

    Returns
    -------
    dict
        Maps each article ``_id`` to a dict with the keys:

        * ``'text'``: the three scanned strings (content, title, description)
        * ``'matches'``: ``Counter`` of matched phrase text -> occurrences
        * ``'is_review'``: the article's ``is_review`` field
        * ``'best_match'``: the most frequent matched phrase, or ``None``
          when nothing matched

    Raises
    ------
    KeyError
        If an article has no ``is_review`` field (unchanged behaviour).
    """
    restaurant_iter = MongoClient('restaurant').collection.find({})
    article_iter = MongoClient('articles').collection.find({'is_review_score': {"$gt":.5}})
    nlp = spacy.blank('en')
    nlp.vocab.lex_attr_getters = {}
    tokenizer = nlp.tokenizer
    phrases = read_gazetteer(tokenizer, restaurant_iter)
    matcher = PhraseMatcher(tokenizer.vocab, max_length=10)
    matcher.add('Phrase', None, *phrases)

    results = {}
    for article in tqdm_notebook(article_iter):
        # A field can be absent *or* stored as None; `.get(key, " ")` only
        # covers the first case, so normalise both to a single space before
        # tokenizing.
        texts = []
        for field in ('content', 'title', 'meta_description'):
            value = article.get(field)
            texts.append(" " if value is None else value)

        match_counts = Counter()
        for text in texts:
            doc = tokenizer(text)
            # Look each token up in the vocab before matching.
            for token in doc:
                doc.vocab[token.text]
            for _, start, end in matcher(doc):
                match_counts[doc[start:end].text] += 1

        top = match_counts.most_common(1)
        results[article['_id']] = {
            'text': texts,
            'matches': match_counts,
            'is_review': article['is_review'],
            'best_match': top[0][0] if top else None,
        }
    return results

In [79]:
# Run the straight-match pipeline over all likely-review articles.
# NOTE(review): this cell's execution count (79) is lower than that of the
# cell defining main() (90), so the stored output may come from an earlier
# version of main() — re-run with Restart & Run All to confirm.
results = main()




In [89]:
# Spot-check a single article's result. Index 52 is an arbitrary pick and
# raises IndexError when fewer than 53 articles were processed.
ids = list(results)
results[ids[52]]

{'best_match': 'Chinatown After Dark',
 'is_review': True,
 'matches': Counter({'Cafe Pinot': 1,
          'Chinatown After Dark': 3,
          'Orsa & Winston': 1,
          'Patina Group': 1}),
 'text': ['The sun set on a recent evening in L.A.\'s Chinatown neighborhood, the light from the lanterns strung across the second floor of the Far East Plaza replacing that of the sky. Families gathered around tables in the outdoor courtyard of the \'70s-era shopping mall as lines moved along the booths set up outside the shops and restaurants. At a table near the stairway, pastry chef Isa Fabro set out her desserts — thick wedges of pie, individual Valrhona-dusted confections — like a jeweler unloading the contents of a display case. How Fabro got here, to a tiny table at Chinatown After Dark, the neighborhood monthly micro food festival, is the story of one highly talented chef, but it\'s also a story of the contemporary Los Angeles dining scene. Over the course of the last decade, many of 

In [4]:
from tqdm import tqdm_notebook
import spacy
# Load the English pipeline; every pipe is disabled below, so only its
# tokenizer is actually used in this cell.
nlp = spacy.load('en')

# NOTE(review): `articles` is not defined in any visible cell — it must
# already exist in the kernel for this cell to run.
# Keep only articles that have non-empty content.
article_docs = [
    content
    for content in (record.get('content') for record in articles.collection.find({}))
    if content is not None and content != ''
]

# Collect every distinct token text seen in the article bodies.
vocabulary = set()
for doc in tqdm_notebook(nlp.pipe(article_docs, disable=nlp.pipe_names), total=len(article_docs)):
    vocabulary.update(token.text for token in doc)

# Sort so the list order is stable between kernel runs: the iteration order
# of a set of strings varies with hash randomization, which would make any
# later index-based random sampling from `words` irreproducible.
words = sorted(vocabulary)




### Make Labels + Noise

In [6]:
import numpy as np
import random
from spacy.gold import GoldParse

# Build synthetic NER training examples: each restaurant name is wrapped in
# 1-3 random filler words on either side, and the tokens belonging to the
# name are tagged with BILUO RESTAURANT labels.
# NOTE(review): `restaurants`, `nlp` and `words` are not defined in this
# cell; they must already exist in the kernel.
# NOTE(review): no random seed is set, so the generated examples differ on
# every run.
names = [
    record.get('name')
    for record in restaurants.collection.find({}, projection=['name'])
]

# Convert the filler vocabulary once; handing the plain list to
# np.random.choice would rebuild this array on every call.
word_pool = np.asarray(words)

golds = []
docs = []
for name in tqdm_notebook(names, total=len(names)):
    if not isinstance(name, str) or not name.strip():
        # Records without a usable name cannot produce a labelled span.
        continue

    start_n, end_n = np.random.choice([1, 2, 3]), np.random.choice([1, 2, 3])
    start_string = " ".join(np.random.choice(word_pool, size=start_n).tolist())
    end_string = " ".join(np.random.choice(word_pool, size=end_n).tolist())
    new_name = start_string + " " + name + " " + end_string

    doc = nlp(new_name, disable=nlp.pipe_names)

    # Locate the name by character offsets rather than by counting filler
    # words: a filler word is not guaranteed to come back as exactly one
    # token once it is re-tokenized in context, which would shift
    # count-based labels onto the wrong tokens.
    name_start = len(start_string) + 1
    name_end = name_start + len(name)
    inside = [i for i, token in enumerate(doc) if name_start <= token.idx < name_end]

    entities = ['O'] * len(doc)
    if len(inside) == 1:
        entities[inside[0]] = 'U-RESTAURANT'
    elif inside:
        entities[inside[0]] = 'B-RESTAURANT'
        for i in inside[1:-1]:
            entities[i] = 'I-RESTAURANT'
        entities[inside[-1]] = 'L-RESTAURANT'

    golds.append(GoldParse(doc, entities=entities))
    docs.append(doc)
    
    

In [71]:
# Cursor over the articles whose `is_review` field is True.
# NOTE(review): `articles` is not defined in any visible cell, and
# `review_iter` is not consumed by any visible cell — confirm it is still
# needed.
review_iter = articles.collection.find({'is_review': True})

### Train

In [59]:

import random
# Probe sentence: its predicted entities are printed every 20 updates to
# watch how the model's output changes during training.
txt = 'Is Los Angeles a fried chicken town? The hours-long lines at Howlin’ Ray’s at the Far East Plaza in Chinatown seem to suggest so.'
n_iter = 1

# Start from the pretrained English pipeline and register the new label on
# its entity recognizer.
# NOTE(review): `docs`/`golds` were built with the `nlp` object from an
# earlier cell, while this cell loads a fresh pipeline — confirm that mixing
# the two is intended.
nlp = spacy.load('en')
nlp.entity.add_label('RESTAURANT')

train_data = [pair for pair in zip(docs, golds)]
optimizer = nlp.begin_training()
# NOTE(review): the recorded output shows the pretrained labels (GPE, LOC,
# ...) progressively giving way to RESTAURANT; training only on RESTAURANT
# examples is a likely cause — consider mixing in other entity types.
try:
    for itn in tqdm_notebook(range(n_iter), total=n_iter):
        random.shuffle(train_data)
        progress = tqdm_notebook(train_data, total=len(train_data))
        for j, (doc, gold) in enumerate(progress):
            # One example per update.
            nlp.update([doc], [gold], drop=0.5, sgd=optimizer)
            if j % 20 != 0:
                continue
            for e in nlp(txt).ents:
                print(e, e.label_)
            print()
except KeyboardInterrupt:
    # Allow stopping the training loop by hand without a traceback.
    pass



Los Angeles GPE
hours TIME
Ray PERSON
the Far East Plaza FAC

Los Angeles GPE
hours TIME
Ray PERSON
the Far East Plaza LOC

Los Angeles GPE
hours TIME
Ray PERSON
the Far East Plaza LOC

Los Angeles GPE
Ray PERSON
the Far East Plaza LOC

the Far East Plaza LOC

the Far East Plaza LOC

the Far East Plaza LOC

Los Angeles GPE
Ray PERSON
the Far East Plaza LOC

the Far East Plaza LOC

Los Angeles GPE
the Far East Plaza LOC

Los Angeles GPE
Ray’s at RESTAURANT
the Far East Plaza LOC

Los Angeles GPE
Ray’s RESTAURANT
the Far East Plaza LOC

Los Angeles GPE
the Far East Plaza LOC

Los Angeles GPE
Ray’s RESTAURANT
the Far East Plaza LOC

Los Angeles GPE
Ray’s at RESTAURANT
the Far East Plaza LOC

Los Angeles GPE
Ray’s RESTAURANT
the Far East Plaza LOC

Los Angeles GPE
Ray’s at RESTAURANT
the Far East Plaza LOC

Los Angeles GPE
Ray’s at RESTAURANT
the Far East Plaza LOC

Los Angeles GPE
Ray’s at RESTAURANT
the Far East Plaza LOC

Los Angeles GPE
Ray’s RESTAURANT

Los Angeles GPE
Ray’s RESTAURAN

Los Angeles a fried chicken RESTAURANT
The hours-long lines at RESTAURANT
Ray’s at RESTAURANT

Los Angeles a fried chicken RESTAURANT
The hours-long lines at RESTAURANT
Ray’s at RESTAURANT

Los Angeles a fried chicken town? RESTAURANT
The hours-long lines at Howlin’ Ray’s at the Far East Plaza RESTAURANT

Los Angeles a fried chicken town? RESTAURANT
The hours-long lines at Howlin’ Ray’s at the Far East Plaza in Chinatown RESTAURANT

The hours-long lines at Howlin’ Ray’s at the Far East Plaza in Chinatown RESTAURANT

Los Angeles RESTAURANT
The hours-long RESTAURANT
Ray’s at RESTAURANT

Los Angeles RESTAURANT
The hours-long RESTAURANT
Ray’s at RESTAURANT

Los Angeles RESTAURANT
The hours-long RESTAURANT
Ray’s at RESTAURANT

Los Angeles RESTAURANT
The hours-long RESTAURANT
Ray’s at RESTAURANT

