In [1]:
import re

from tqdm import tqdm
from collections import defaultdict, Counter, UserDict
from itertools import product
from cached_property import cached_property

from litecoder.models import session, City
from litecoder import logger

In [2]:
def keyify(text):
    """Normalize free text into a canonical index / lookup key.

    The text is lower-cased, periods are deleted, commas and hyphens become
    spaces, and finally every run of whitespace is collapsed to one space
    with leading / trailing whitespace removed.

    Trimming is done *after* punctuation has been replaced, so a leading or
    trailing comma / hyphen cannot leave a stray space in the key. As a
    result the function is idempotent: ``keyify(keyify(s)) == keyify(s)``.

    Args:
        text (str)

    Returns: str
    """
    text = text.lower()

    # "D.C." -> "dc"
    text = text.replace('.', '')

    # "Winston-Salem" -> "winston salem"
    text = re.sub(r'[,-]', ' ', text)

    # 1+ whitespace -> 1 space, and trim both ends.
    return ' '.join(text.split())

In [3]:
class NameCounts(Counter):
    """Counter mapping a normalized city name to how many ``City.name``
    rows carry it.

    The counts are loaded from the database when the object is created.
    Subscript lookups normalize their argument with ``keyify`` first.
    """

    def __init__(self):
        logger.info('Indexing name -> counts.')

        # Each result row is indexed at position 0 to get the name column.
        rows = session.query(City.name)
        super().__init__(keyify(row[0]) for row in rows)

    def __getitem__(self, text):
        """Return the count stored under the normalized form of ``text``."""
        normalized = keyify(text)
        return super().__getitem__(normalized)

In [4]:
class NamePopulations(defaultdict):
    """Mapping of normalized city name -> list of populations.

    Every name in ``city.names`` contributes one entry per city. A city whose
    ``population`` is falsy contributes ``City.median_population()`` instead.

    Because this is a ``defaultdict(list)``, subscripting with a name that
    has not been seen stores and returns a new empty list.
    """

    def __init__(self):
        """Index name -> [pops].
        """
        super().__init__(list)

        logger.info('Indexing name -> populations.')

        fallback_pop = City.median_population()

        # Flatten (city, alias) pairs into (key, population) pairs.
        entries = (
            (keyify(alias), record.population or fallback_pop)
            for record in tqdm(City.query)
            for alias in record.names
        )

        for key, pop in entries:
            self[key].append(pop)

    def __getitem__(self, text):
        """Return the population list stored under the normalized ``text``."""
        normalized = keyify(text)
        return super().__getitem__(normalized)

In [5]:
class AllowBareName:
    """Callable rule: may a city be indexed under a name with no qualifier?

    A name is allowed when fewer than two populations are recorded for it,
    or when the city's population is more than ``min_p2_ratio`` times the
    second-largest population recorded for that name.
    """

    def __init__(self, min_p2_ratio=10):
        """
        Args:
            min_p2_ratio (number): Ratio of the city's population to the
                second-largest population that must be exceeded.
        """
        self.name_pops = NamePopulations()
        self.min_p2_ratio = min_p2_ratio

    def __call__(self, city, name):
        """
        Args:
            city: Object with a ``population`` attribute (falsy counts as 0).
            name (str): Candidate name for ``city``.

        Returns: bool
        """
        ranked = sorted(self.name_pops[name], reverse=True)

        # Zero or one population on record -> nothing to compete with.
        if len(ranked) < 2:
            return True

        runner_up = ranked[1]
        own_pop = city.population or 0

        return own_pop / runner_up > self.min_p2_ratio

In [6]:
# Country spellings appended to a city name (or city + state) when
# generating index keys.
USA_NAMES = (
    'USA',
    'United States',
    'United States of America',
    'US',
    'America',
)

class USCityKeyIter:
    """Callable that yields every normalized index key for a US city.

    Constructor arguments are forwarded unchanged to ``AllowBareName``, which
    decides which of the city's names may appear without a state.
    """

    def __init__(self, *args, **kwargs):
        self.allow_bare = AllowBareName(*args, **kwargs)

    def _iter_keys(self, city):
        """Enumerate index keys for a city.

        Keys are produced in this order: bare name, name + country,
        name + state, name + state + country. Bare forms use only the names
        accepted by ``self.allow_bare``.

        Args:
            city (db.City)

        Yields: str
        """
        bare_names = [n for n in city.names if self.allow_bare(city, n)]

        # Skip missing state values. Joining ``None`` raises TypeError, which
        # would abort key generation for the whole city rather than only the
        # keys that need the missing value.
        states = [s for s in (city.name_a1, city.us_state_abbr) if s]

        for name in bare_names:
            yield name

        for name, usa in product(bare_names, USA_NAMES):
            yield ' '.join((name, usa))

        for name, state in product(city.names, states):
            yield ' '.join((name, state))

        for name, state, usa in product(city.names, states, USA_NAMES):
            yield ' '.join((name, state, usa))

    def __call__(self, city):
        """Yield each key from ``_iter_keys`` after passing it through
        ``keyify``.

        Args:
            city (db.City)

        Yields: str
        """
        for text in self._iter_keys(city):
            yield keyify(text)

In [7]:
class USCityIndex:
    """In-memory index from normalized key -> set of city ``wof_id`` values."""

    def __init__(self):
        self._idx = defaultdict(set)

    def __getitem__(self, text):
        """Return the id set stored under the normalized form of ``text``.

        ``_idx`` is a ``defaultdict``, so subscripting with an unseen key
        stores and returns a new empty set. ``build`` relies on that to add
        ids; read-only lookups should go through ``query``.
        """
        return self._idx[keyify(text)]

    def query(self, text):
        """Get ids, query database records.

        Uses ``dict.get`` rather than subscripting so that a miss does not
        leave an empty entry behind in the index.

        Args:
            text (str)

        Returns: list of City rows; empty when nothing is indexed under the key.
        """
        ids = self._idx.get(keyify(text))

        if not ids:
            return []

        return City.query.filter(City.wof_id.in_(ids)).all()

    def build(self):
        """Index all US cities.

        A city is indexed only if its complete key set can be generated.
        Cities whose key generation raises are skipped; the number skipped is
        logged once the pass is finished.
        """
        iter_keys = USCityKeyIter()

        cities = City.query.filter(City.country_iso=='US')

        logger.info('Indexing US cities.')

        skipped = 0

        for city in tqdm(cities):

            # Generate the full key set up front, so a failure part-way
            # through cannot leave the city partially indexed.
            try:
                keys = list(iter_keys(city))

            except Exception:
                skipped += 1
                # Assumes `logger` follows the stdlib logging.Logger API.
                logger.debug('Could not generate keys for a city.', exc_info=True)
                continue

            # Index complete key set.
            for key in keys:
                self[key].add(city.wof_id)

        if skipped:
            logger.warning(
                'Skipped %d US cities whose keys could not be generated.',
                skipped,
            )

In [8]:
# Create an empty index, then populate it from the US rows of the City table.
ci = USCityIndex()
ci.build()

2018-07-19 15:12:55,550 | INFO : Indexing name -> populations.


344249it [00:13, 26266.64it/s]

2018-07-19 15:13:18,149 | INFO : Indexing US cities.



54727it [00:14, 3650.05it/s]


In [9]:
# Number of distinct keys currently stored in the index.
len(ci._idx)

793038

In [19]:
# Time one lookup: a key normalization, an index read, and a DB fetch by id.
%time ci.query('hollywood')

CPU times: user 1.78 ms, sys: 401 µs, total: 2.19 ms
Wall time: 1.88 ms


[City<85923517, Los Angeles, California, United States>]

In [16]:
# Look up a short alternate name; the recorded output resolves it to San Francisco.
ci.query('sf')

[City<85922583, San Francisco, California, United States>]