In [98]:
import re

from tqdm import tqdm
from anytree import Node, RenderTree, search
from collections import defaultdict

from litecoder.db import City, session

In [3]:
# Sanity check: count the rows returned by the unfiltered City query.
City.query.count()

344249

In [4]:
def keyify(text, lower=True):
    """Normalize a text string into a list of index-key tokens.

    Periods are removed, commas are treated as separators, and the result
    is split on whitespace.

    Args:
        text (str): Raw text, e.g. ``'St. Louis, MO'``.
        lower (bool): If true (the default), lowercase the text before
            splitting.

    Returns:
        list of str: The tokens in order, e.g. ``['st', 'louis', 'mo']``.
        Empty, whitespace-only or punctuation-only input gives ``[]``.
    """
    # Literal replacements instead of regexes: the patterns were single
    # fixed characters, and the old non-raw pattern strings contained
    # invalid escape sequences that newer Pythons warn about.
    text = text.replace('.', '').replace(',', ' ')

    if lower:
        text = text.lower()

    # split() with no argument already trims both ends and collapses runs
    # of whitespace, so no separate strip / squeeze step is needed.
    return text.split()

In [107]:
def keys_iter(row, min_bare_population=500000):
    """Generate the raw (un-normalized) index keys for one city row.

    Yields a ``"<name> <state>"`` key for each state label present on the
    row (``name_a1`` first, then ``us_state_abbr``), followed by the bare
    city name when the population exceeds ``min_bare_population``.

    Args:
        row: Object exposing ``name``, ``name_a1``, ``us_state_abbr`` and
            ``population`` attributes.
        min_bare_population (int): Population a city must exceed before
            it is also indexed by its name alone.

    Yields:
        str: Raw key strings.
    """
    for state in (row.name_a1, row.us_state_abbr):
        # A missing state label would otherwise be formatted as the literal
        # text "None" and end up indexed as a bogus "<name> none" key.
        if state:
            yield '%s %s' % (row.name, state)

    # population may be missing; only sufficiently large cities get a
    # state-less key.
    if row.population and row.population > min_bare_population:
        yield row.name

In [134]:
# Token-chain index: first token of a key -> root nodes of every chain
# that starts with that token.
idx = defaultdict(list)

cities = City.query.filter(City.country_iso=='US')

for city in tqdm(cities):
    for raw_key in keys_iter(city):

        key_tokens = keyify(raw_key)
        first = key_tokens[0]

        # Each key gets its own linear chain of nodes (chains are not
        # merged), registered under the key's first token.
        node = Node(first)
        idx[first].append(node)

        # Hang the remaining tokens below the root, one per level, so the
        # last token of the key ends up as the chain's leaf.
        for tok in key_tokens[1:]:
            node = Node(tok, parent=node)

54727it [00:08, 6667.31it/s]


In [172]:
def find_locs(text):
    """Find indexed keys that occur as consecutive token runs in ``text``.

    The text is tokenized with ``keyify`` and scanned left to right. A match
    starts at any token that is a key in the module-level ``idx`` mapping
    and is advanced one node per token; a match is complete once it reaches
    a leaf node.

    Args:
        text (str): Free text to scan.

    Returns:
        list: The leaf nodes of every completed match, in the order the
        matches were closed. Identical keys indexed more than once are
        returned once per indexed chain.
    """
    active = []
    closed = []
    for token in keyify(text):

        # Advance every partial match by exactly one token: only a direct
        # child may consume it. (A subtree search with a depth-based
        # maxlevel reaches past the children for nodes below the root and
        # lets a match skip tokens of the key.) Leaves have no children,
        # so they drop out here on their own.
        extensions = [
            child
            for an in active
            for child in an.children
            if child.name == token
        ]

        if not extensions:
            # Nothing could be extended: keep the matches that were already
            # complete, then try to start fresh matches at this token.
            closed += [n for n in active if n.is_leaf]
            # .get() so that looking up an unknown token does not insert an
            # empty entry into the defaultdict index.
            extensions = idx.get(token, [])

        active = extensions

    # Matches that were complete when the text ran out.
    closed += [n for n in active if n.is_leaf]

    return closed

In [190]:
# Smoke test: time a single lookup against the index built above.
%time find_locs('midway fl')

CPU times: user 1.32 ms, sys: 909 µs, total: 2.23 ms
Wall time: 2.24 ms


[Node('/midway/fl'), Node('/midway/fl'), Node('/midway/fl')]