## Trie

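A trie is a tree-like data structure whose nodes store the letters of an alphabet. Each key is spelled out along a path from the root, so all keys sharing a prefix sit under the same branch, which makes prefix lookups (such as autocomplete) efficient.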

In [14]:
import re
from pytrie import StringTrie

### Autocomplete

In [15]:
def suggest(input_str, key_trie, top_n=10):
    """Return autocomplete suggestions for ``input_str``, best score first.

    The input is lower-cased and used as a prefix query against ``key_trie``.
    Matching keys are ranked by their stored value in descending order (ties
    keep the order in which the trie yields them) and at most ``top_n`` keys
    are returned.

    Parameters
    ----------
    input_str : str
        Text typed so far; matching is done on its lower-cased form.
    key_trie : object
        Trie exposing ``items(prefix=...)`` that yields ``(key, score)`` pairs.
    top_n : int, optional
        Maximum number of suggestions to return (default 10).

    Returns
    -------
    list
        The matching keys, highest score first.
    """
    def _score(pair):
        # Each item is a (key, score) pair; rank on the score.
        return pair[1]

    ranked = list(key_trie.items(prefix=input_str.lower()))
    ranked.sort(key=_score, reverse=True)
    return [key for key, _ in ranked[:top_n]]

In [16]:
# Demo vocabulary: keyword -> score (higher scores are suggested first).
key_dict = {
    "hello": 10,
    "dog": 3,
    "hell": 20,
    "cat": 3,
    "h": 4,
    "hel": 15,
    "help": 33,
    "helps": 47,
    "helping": 40,
}
key_trie = StringTrie(key_dict)

In [17]:
suggest("hel", key_trie)

['helps', 'helping', 'help', 'hell', 'hel', 'hello']

A practical example - location autocomplete

In [18]:
import pandas as pd
# World-cities spreadsheet; later cells read its city, admin_name, country
# and population columns.
input_file = "data/worldcities.xlsx"
# The path is relative, so it resolves against the notebook's working directory.
input_df = pd.read_excel(input_file)

In [24]:
def make_trie(input_df):
    """Build a prefix trie mapping lower-cased location names to population.

    Each key has the form ``"city, admin_name, country"`` (all lower-cased)
    and each value is that row's population, with missing populations stored
    as 0. When several rows produce the same key, the last row wins.

    ``input_df`` is not modified, so the cell can be re-run safely and the
    raw frame stays as loaded.

    Parameters
    ----------
    input_df : pandas.DataFrame
        Must contain the columns ``city``, ``admin_name``, ``country`` and
        ``population``.

    Returns
    -------
    StringTrie
        Trie keyed by full location name, suitable for ``suggest``.

    Raises
    ------
    KeyError
        If one of the required columns is missing.
    """
    # Lower-case each name part column-wise rather than joining row by row.
    city, admin_name, country = (
        input_df[col].astype(str).str.lower()
        for col in ("city", "admin_name", "country")
    )
    fullname = city.str.cat([admin_name, country], sep=", ")

    # fillna returns a new Series; the caller's population column is untouched.
    population = input_df["population"].fillna(0)

    return StringTrie(dict(zip(fullname, population)))

In [25]:
key_trie = make_trie(input_df)

In [26]:
suggest("par", key_trie, top_n=5)

['paris, île-de-france, france',
 'parbhani, mahārāshtra, india',
 'paraná, entre ríos, argentina',
 'paramaribo, paramaribo, suriname',
 'paradise, nevada, united states']

In [27]:
suggest("sha", key_trie, top_n=5)

['shanghai, shanghai, china',
 'shangqiu, henan, china',
 'shantou, guangdong, china',
 'shangrao, jiangxi, china',
 'sharjah, ash shāriqah, united arab emirates']

In [28]:
suggest("ber", key_trie, top_n=5)

['berlin, berlin, germany',
 'bern, bern, switzerland',
 'bertoua, est, cameroon',
 'bergen, hordaland, norway',
 'bergamo, lombardy, italy']

### Ray's search

Another practical example - searching a text for any of a large set of keywords

In [29]:
from Trie import Trie
import re

In [30]:
def make_regex(input_list):
    """Compile one case-insensitive regex that matches any of the given keys.

    Every entry of ``input_list`` is added to a ``Trie``; the pattern the trie
    produces is then wrapped in ``\\b`` word boundaries so that only whole
    words match.

    Parameters
    ----------
    input_list : iterable
        Keywords to search for.

    Returns
    -------
    re.Pattern
        Regex compiled with ``re.IGNORECASE``.
    """
    trie = Trie()
    for keyword in input_list:
        trie.add(keyword)

    # Word boundaries on both sides keep keys from matching inside longer words.
    bounded_pattern = r"\b" + trie.pattern() + r"\b"
    return re.compile(bounded_pattern, re.IGNORECASE)

In [42]:
small_keys = ["munich", "paris", "berlin", "brussels"]
regex_small = make_regex(small_keys)

In [43]:
text = """Amsterdam - Van Gogh Museum
Paris –  Eiffel tower, Louvre Museum
Munich – Oktoberfest
Berlin – Zoo etc."""

In [44]:
%%time
regex_small.findall(text)

CPU times: user 14 µs, sys: 1 µs, total: 15 µs
Wall time: 18.1 µs


['Paris', 'Munich', 'Berlin']

In [45]:
large_key = input_df.city.values
len(large_key)

15493

In [46]:
# Reuse the city array captured in `large_key` rather than re-reading the column.
regex_large = make_regex(large_key)

In [48]:
%%time
regex_large.findall(text)

CPU times: user 67 µs, sys: 0 ns, total: 67 µs
Wall time: 70.1 µs


['Amsterdam', 'Van', 'Paris', 'Munich', 'Berlin']

Ref: 

Intro to Trie: https://medium.com/basecs/trying-to-understand-tries-3ec6bede0014
pytrie: https://pytrie.readthedocs.io/en/latest/
Trie script: https://gist.github.com/EricDuminil/8faabc2f3de82b24e5a371b6dc0fd1e0
City data: https://simplemaps.com/data/world-cities