In [1]:
import re

from tqdm import tqdm
from collections import defaultdict, Counter, UserDict
from itertools import product
from cached_property import cached_property

from litecoder.models import session, City
from litecoder import logger

In [2]:
def keyify(text):
    """Normalize a place-name string into an index / lookup key.

    Lowercases the text, removes periods, turns commas and hyphens into
    spaces, collapses every run of whitespace into one space, and trims
    leading / trailing whitespace.

    Args:
        text (str)

    Returns: str
    """
    text = text.lower()

    text = text.replace('.', '')
    text = re.sub(r'[,-]', ' ', text)

    # Any whitespace run (including a lone tab / newline) -> 1 space.
    text = re.sub(r'\s+', ' ', text)

    # Trim last: replacing punctuation can leave spaces at the edges
    # (e.g. 'Tuscaloosa,' -> 'tuscaloosa '), and trimming here keeps the
    # function idempotent.
    return text.strip()

In [3]:
keyify('la-la land')

'la la land'

In [4]:
keyify('Tuscaloosa, AL')

'tuscaloosa al'

In [5]:
keyify('Washington,DC')

'washington dc'

In [6]:
class NameCounts(Counter):
    """Counter of keyified city names; lookups are keyified as well."""

    def __init__(self):
        """Tally how many City rows share each keyified `City.name`."""
        logger.info('Indexing name -> counts.')

        # Single-column query, so each row holds just the name.
        keyed_names = []
        for row in session.query(City.name):
            keyed_names.append(keyify(row[0]))

        super().__init__(keyed_names)

    def __getitem__(self, text):
        """Return the count stored under the keyified form of `text`."""
        key = keyify(text)
        return super().__getitem__(key)

In [7]:
class NamePopulations(defaultdict):
    """Map each keyified city name to the populations of cities using it."""

    def __init__(self):
        """Index name -> [pops].
        """
        super().__init__(list)

        logger.info('Indexing name -> populations.')

        # Stand-in value for cities whose population is missing (or 0).
        fallback_pop = City.median_population()

        for city in tqdm(City.query):
            for raw_name in city.names:
                key = keyify(raw_name)
                pop = city.population or fallback_pop
                self[key].append(pop)

    def __getitem__(self, text):
        """Return the population list stored under the keyified `text`."""
        key = keyify(text)
        return super().__getitem__(key)

In [8]:
class AllowBareName:
    """Decide whether a city may be indexed under a bare name (no state).

    A name is allowed for a city when at most one city uses it, or when
    the city's population exceeds `min_p2_ratio` times the second-largest
    population recorded for that name.
    """

    def __init__(self, min_p2_ratio=10):
        """Build the name -> populations index and store the threshold.

        Args:
            min_p2_ratio (number): Required ratio between the city's
                population and the second-largest population for the name.
        """
        self.name_pops = NamePopulations()
        self.min_p2_ratio = min_p2_ratio

    def __call__(self, city, name):
        """Check whether `city` may be keyed by `name` alone.

        Args:
            city: Object with a `population` attribute (None counts as 0).
            name (str)

        Returns: bool
        """
        all_pops = sorted(self.name_pops[name], reverse=True)

        # Zero or one city uses this name -> nothing to disambiguate.
        if len(all_pops) < 2:
            return True

        city_pop = city.population or 0
        runner_up_pop = all_pops[1]

        # Cross-multiplied form of `city_pop / runner_up_pop > min_p2_ratio`.
        # Avoids ZeroDivisionError when the runner-up population is 0.
        return city_pop > self.min_p2_ratio * runner_up_pop

In [9]:
# Country suffixes appended to city keys.
USA_NAMES = (
    'USA',
    'United States',
    'United States of America',
    'US',
    'America',
)

class USCityKeyIter:
    """Generate the keyified strings under which a US city is indexed."""

    def __init__(self, *args, **kwargs):
        """Forward all arguments to AllowBareName."""
        self.allow_bare = AllowBareName(*args, **kwargs)

    def _iter_keys(self, city):
        """Enumerate index keys for a city.

        Yields, in order: bare names, "name country", "name state" and
        "name state country" combinations. Bare forms are limited to the
        names accepted by `self.allow_bare`.

        Args:
            city (db.City)

        Yields: str
        """
        names = list(city.names)

        bare_names = [n for n in names if self.allow_bare(city, n)]

        # Skip a missing state name / abbreviation; joining None would
        # raise TypeError and lose every key for the city.
        states = [s for s in (city.name_a1, city.us_state_abbr) if s]

        for name in bare_names:
            yield name

        for name, usa in product(bare_names, USA_NAMES):
            yield ' '.join((name, usa))

        for name, state in product(names, states):
            yield ' '.join((name, state))

        for name, state, usa in product(names, states, USA_NAMES):
            yield ' '.join((name, state, usa))

    def __call__(self, city):
        """Yield every key from `_iter_keys`, passed through keyify.

        Args:
            city (db.City)

        Yields: str
        """
        for text in self._iter_keys(city):
            yield keyify(text)

In [10]:
# Key generator with default settings. Construction instantiates
# AllowBareName, which builds the name -> populations index.
city_key_iter = USCityKeyIter()

2018-07-19 14:43:54,932 | INFO : Indexing name -> populations.


344249it [00:13, 25053.08it/s]


In [11]:
# Two sample US cities used to spot-check the generated keys.
la = (
    City.query
    .filter(City.country_iso=='US')
    .filter(City.name=='Los Angeles')
    .first()
)
tt = (
    City.query
    .filter(City.country_iso=='US')
    .filter(City.name=='Tuscaloosa')
    .first()
)

In [12]:
list(city_key_iter(la))

['los angeles',
 'la',
 'la la land',
 'los angeles usa',
 'los angeles united states',
 'los angeles united states of america',
 'los angeles us',
 'los angeles america',
 'la usa',
 'la united states',
 'la united states of america',
 'la us',
 'la america',
 'la la land usa',
 'la la land united states',
 'la la land united states of america',
 'la la land us',
 'la la land america',
 'los angeles california',
 'los angeles ca',
 'la california',
 'la ca',
 'la la land california',
 'la la land ca',
 'los angeles california usa',
 'los angeles california united states',
 'los angeles california united states of america',
 'los angeles california us',
 'los angeles california america',
 'los angeles ca usa',
 'los angeles ca united states',
 'los angeles ca united states of america',
 'los angeles ca us',
 'los angeles ca america',
 'la california usa',
 'la california united states',
 'la california united states of america',
 'la california us',
 'la california america',
 'la ca us

In [13]:
list(city_key_iter(tt))

['tuscaloosa',
 'tuscaloosa usa',
 'tuscaloosa united states',
 'tuscaloosa united states of america',
 'tuscaloosa us',
 'tuscaloosa america',
 'tuscaloosa alabama',
 'tuscaloosa al',
 'tuscaloosa alabama usa',
 'tuscaloosa alabama united states',
 'tuscaloosa alabama united states of america',
 'tuscaloosa alabama us',
 'tuscaloosa alabama america',
 'tuscaloosa al usa',
 'tuscaloosa al united states',
 'tuscaloosa al united states of america',
 'tuscaloosa al us',
 'tuscaloosa al america']

In [19]:
class USCityIndex:
    """In-memory index from keyified strings to sets of `City.wof_id`."""

    def __init__(self):
        # key -> set of wof_ids
        self._idx = defaultdict(set)

    def __getitem__(self, text):
        """Return the set of wof_ids stored under the keyified `text`.

        The backing defaultdict creates (and stores) an empty set for a
        key that has not been seen; `build` relies on that to add ids.

        Args:
            text (str)

        Returns: set
        """
        return self._idx[keyify(text)]

    def query(self, text):
        """Load the City rows indexed under `text`.

        Args:
            text (str)

        Returns: list of City; empty when nothing is indexed under the key.
        """
        # Use .get() so a miss neither grows the index with an empty
        # entry nor issues an `IN ()` query with no values.
        wof_ids = self._idx.get(keyify(text))

        if not wof_ids:
            return []

        return City.query.filter(City.wof_id.in_(wof_ids)).all()

    def build(self):
        """Index all US cities.

        A city whose key generation raises is skipped as a whole; the
        number of skipped cities is logged once indexing finishes.
        """
        iter_keys = USCityKeyIter()

        cities = City.query.filter(City.country_iso=='US')

        logger.info('Indexing US cities.')

        skipped = 0

        for city in tqdm(cities):

            try:
                # Generate the complete key set up front, so a failure
                # part-way through never leaves a city half-indexed.
                keys = list(iter_keys(city))

            except Exception:
                # Best-effort: keep indexing, but count the failure
                # instead of silently discarding it.
                skipped += 1
                logger.debug(
                    'Key generation failed for city %s.',
                    city.wof_id, exc_info=True,
                )
                continue

            for key in keys:
                self[key].add(city.wof_id)

        if skipped:
            logger.warning(
                'Skipped %d US cities whose keys could not be generated.',
                skipped,
            )

In [20]:
# Create an empty index, then populate it with keys for every US city.
ci = USCityIndex()
ci.build()

2018-07-19 14:45:27,558 | INFO : Indexing name -> populations.


344249it [00:13, 25455.01it/s]

2018-07-19 14:45:50,469 | INFO : Indexing US cities.



54727it [00:14, 3729.63it/s]


In [21]:
len(ci._idx)

793008

In [26]:
%time ci.query('pdx')

CPU times: user 1.69 ms, sys: 389 µs, total: 2.08 ms
Wall time: 1.81 ms


[City<101715829, Portland, Oregon, United States>]

In [36]:
ny = ci.query('new york')[0]

In [40]:
ny.__dict__

{'_sa_instance_state': <sqlalchemy.orm.state.InstanceState at 0x115b51e80>,
 'area_m2': 783424470.067211,
 'country_iso': 'US',
 'dbpedia_id': 'Manhattan',
 'elevation': 10,
 'factual_id': '08d08428-8f76-11e1-848f-cfd5bf3ef515',
 'fips_code': 3651000,
 'freebase_id': 'en.new_york_county',
 'geonames_id': 5128581,
 'geoplanet_id': 2459115,
 'latitude': 40.694457,
 'library_of_congress_id': 'n79007751',
 'longitude': 40.694457,
 'name': 'New York',
 'name_a0': 'United States',
 'name_a1': 'New York',
 'new_york_times_id': 'N63718991197345770861',
 'population': 8175133,
 'population_rank': 13,
 'quattroshapes_id': 826077,
 'wikidata_id': 'Q60',
 'wikipedia_page': 'New York City',
 'wikipedia_wordcount': 27531,
 'wof_id': 85977539}

In [33]:
list(City.__table__.columns)

[Column('wof_id', Integer(), table=<city>, primary_key=True, nullable=False),
 Column('dbpedia_id', String(), table=<city>),
 Column('freebase_id', String(), table=<city>),
 Column('factual_id', String(), table=<city>),
 Column('fips_code', Integer(), table=<city>),
 Column('geonames_id', Integer(), table=<city>),
 Column('geoplanet_id', Integer(), table=<city>),
 Column('library_of_congress_id', String(), table=<city>),
 Column('new_york_times_id', String(), table=<city>),
 Column('quattroshapes_id', Integer(), table=<city>),
 Column('wikidata_id', String(), table=<city>),
 Column('wikipedia_page', String(), table=<city>),
 Column('name', String(), table=<city>, nullable=False),
 Column('country_iso', String(), table=<city>, nullable=False),
 Column('name_a0', String(), table=<city>),
 Column('name_a1', String(), table=<city>),
 Column('latitude', Float(), table=<city>),
 Column('longitude', Float(), table=<city>),
 Column('population', Integer(), table=<city>),
 Column('population_ra