## Trie

A trie (prefix tree) is a tree-like data structure in which each node represents one character, so every path from the root spells out a prefix shared by the keys stored below it.

In [139]:
import re
from pytrie import StringTrie

### Autocomplete

In [117]:
def suggest(input_str, key_trie, top_n=10):
    """Autocomplete ``input_str`` against the keys of ``key_trie``.

    The input is lower-cased and used as a prefix for ``key_trie.items``.
    The matching (key, value) pairs are ranked by value, highest first, and
    the keys of the first ``top_n`` pairs are returned as a list.
    """
    prefix = input_str.lower()
    ranked = list(key_trie.items(prefix=prefix))
    # Stable descending sort: pairs with equal values keep the trie's order.
    ranked.sort(key=lambda pair: pair[1], reverse=True)
    return [key for key, _value in ranked[:top_n]]

In [142]:
# Toy vocabulary: each key maps to the score used to rank suggestions.
key_dict = {
    "hello": 10,
    "dog": 3,
    "hell": 20,
    "cat": 3,
    "h": 4,
    "hel": 15,
    "help": 33,
    "helps": 47,
    "helping": 40,
}
key_trie = StringTrie(key_dict)

In [143]:
suggest("hel", key_trie)

['helps', 'helping', 'help', 'hell', 'hel', 'hello']

A practical example: location autocomplete

In [102]:
import pandas as pd
# Read the city spreadsheet into a DataFrame; the path is relative, so it is
# resolved against the notebook's working directory.
input_file = "data/worldcities.xlsx"
input_df = pd.read_excel(input_file)

In [118]:
def make_dict(input_df):
    """Build a StringTrie that maps full location names to population.

    Parameters
    ----------
    input_df : pandas.DataFrame
        Must provide the columns ``city``, ``admin_name``, ``country`` and
        ``population``. The frame is left unmodified.

    Returns
    -------
    StringTrie
        Keys are the three name columns cast to ``str``, joined with ", " and
        lower-cased. Values are the populations, with missing entries replaced
        by 0. When several rows produce the same key, the last row wins.
    """
    # Work on derived Series instead of writing new columns into the caller's frame.
    name_parts = input_df[["city", "admin_name", "country"]].astype(str)
    fullname = (
        name_parts["city"] + ", " + name_parts["admin_name"] + ", " + name_parts["country"]
    ).str.lower()
    population = input_df["population"].fillna(0)
    out_dict = dict(zip(fullname, population))
    # The trie must be built from the dictionary derived from input_df,
    # not from any module-level dictionary.
    return StringTrie(out_dict)

In [119]:
# Note: this reuses the name key_trie, which the toy autocomplete example also assigns.
key_trie = make_dict(input_df)

In [124]:
suggest("par", key_trie, top_n=5)

['paris, île-de-france, france',
 'parbhani, mahārāshtra, india',
 'paraná, entre ríos, argentina',
 'paramaribo, paramaribo, suriname',
 'paradise, nevada, united states']

In [126]:
suggest("sha", key_trie, top_n=5)

['shanghai, shanghai, china',
 'shangqiu, henan, china',
 'shantou, guangdong, china',
 'shangrao, jiangxi, china',
 'sharjah, ash shāriqah, united arab emirates']

In [127]:
suggest("ber", key_trie, top_n=5)

['berlin, berlin, germany',
 'bern, bern, switzerland',
 'bertoua, est, cameroon',
 'bergen, hordaland, norway',
 'bergamo, lombardy, italy']

### Ray's search

Another practical example - large key search

In [128]:
from Trie import Trie

In [129]:
def make_trie(input_list):
    """Compile a single regex that matches any word in ``input_list``.

    Each word is added to a ``Trie``; the pattern the trie produces is wrapped
    in word boundaries and compiled case-insensitively. Despite the name, the
    return value is the compiled regex, not the trie.
    """
    trie = Trie()
    for word in input_list:
        trie.add(word)
    alternation = trie.pattern()
    bounded = r"\b" + alternation + r"\b"
    return re.compile(bounded, flags=re.IGNORECASE)

In [132]:
# Small keyword list; make_trie turns it into one case-insensitive compiled regex.
keys = ["munich", "paris", "berlin", "brussels"]
regex = make_trie(keys)

In [131]:
# Sample text that the compiled regexes are run against.
text = """Amsterdam - Van Gogh Museum
Paris –  Eiffel tower, Louvre Museum
Munich – Oktoberfest
Berlin – Zoo etc."""

In [134]:
%%time
regex.findall(text)

CPU times: user 0 ns, sys: 0 ns, total: 0 ns
Wall time: 42.2 µs


['Paris', 'Munich', 'Berlin']

In [145]:
len(input_df.city.values)

15493

In [146]:
# Build one regex from every value in the city column of the loaded data.
# NOTE(review): make_trie passes each value to Trie.add; presumably all values are strings — confirm.
regex_large = make_trie(input_df.city.values)

In [137]:
%%time
regex_large.findall(text)

CPU times: user 0 ns, sys: 0 ns, total: 0 ns
Wall time: 151 µs


['Amsterdam', 'Paris', 'Munich', 'Berlin']

References:

- Intro to tries: https://medium.com/basecs/trying-to-understand-tries-3ec6bede0014
- pytrie: https://pytrie.readthedocs.io/en/latest/
- Trie script: https://gist.github.com/EricDuminil/8faabc2f3de82b24e5a371b6dc0fd1e0
- City data: https://simplemaps.com/data/world-cities