In [4]:
%matplotlib inline
#%load_ext autoreload
#%autoreload 2
%reload_ext autoreload
import numpy as np
import matplotlib.pyplot as plt
import math, sys, os
from numpy.random import randn

# Locate the project checkout so the shared helpers in <PROJECT_HOME>/util
# can be imported below.
# When PROJECT_HOME is unset or empty, fall back to the current working
# directory instead of failing with an opaque "NoneType + str" TypeError;
# if that guess is wrong, the following import fails with a clear ImportError.
PROJECT_HOME = os.environ.get('PROJECT_HOME') or os.getcwd()
UTIL_DIR = os.path.join(PROJECT_HOME, "util")
# Guard the insert so re-running this cell does not stack duplicate entries.
if UTIL_DIR not in sys.path:
    sys.path.insert(0, UTIL_DIR)
from loaders import get_english_dictionary

/Users/rcoleman/rob/code/python/data_science_ipython_notebooks/data


In [2]:
class Trie_dict:
    """Character trie built from nested dicts, with membership and prefix suggestions.

    Every node is a dict mapping the next letter to its child node. A node at
    which a stored word ends additionally holds the sentinel key ``self._end``.
    """

    def __init__(self):
        # Sentinel key meaning "a word ends at this node". It is longer than one
        # character, so it cannot clash with an edge as long as words are
        # iterated one character at a time.
        self._end = '_end_'
        self._root = dict()

    def insert(self, word):
        """Add `word` to the trie, creating any missing nodes along its path."""
        node = self._root
        for letter in word:
            node = node.setdefault(letter, {})
        node[self._end] = self._end

    def insert_batch(self, words):
        """Insert every word from the iterable `words`."""
        for word in words:
            self.insert(word)

    def view(self):
        """Print the raw nested-dict structure (debugging aid)."""
        # Parenthesised single-argument print behaves the same on Python 2 and 3.
        print(self._root)

    def view_root_keys(self):
        """Print the keys of the root node, i.e. the first letters seen so far."""
        # list(...) keeps the output a plain list on Python 3 as well.
        print(list(self._root.keys()))

    def contains(self, word):
        """Return True if `word` was inserted as a complete word, else False.

        A mere prefix of a stored word does not count: the node reached by
        `word` must carry the end-of-word sentinel.
        """
        node = self._find_partial(word)
        return node is not None and self._end in node

    def suggest(self, partial, limit=5):
        """Return up to `limit` stored words that start with `partial`.

        The trie keeps no word frequencies, so results are simply the first
        `limit` matches in lexicographic (character-code) order; a word always
        comes before its own extensions. Returns an empty list when nothing
        starts with `partial` or when `limit` is not positive.
        """
        suggestions = []

        def suggest_dfs(node, prefix):
            if len(suggestions) >= limit:
                return
            # Emit the word ending here before descending. Relying on the
            # sentinel's position among the sorted keys is only correct for
            # lowercase letters ('_' sorts after 'A'-'Z' and '0'-'9').
            if self._end in node:
                suggestions.append(prefix)
            # Children could be pre-sorted by frequency instead, for better
            # speed and smarter recommendations.
            for ch in sorted(key for key in node if key != self._end):
                if len(suggestions) >= limit:
                    break
                suggest_dfs(node[ch], prefix + ch)

        start = self._find_partial(partial)
        if start is not None:
            suggest_dfs(start, partial)

        return suggestions

    def _find_partial(self, partial):
        """Return the node reached by following `partial`, or None if no such path exists."""
        node = self._root
        for char in partial:
            if char not in node:
                # there are no words starting with this sequence
                return None
            node = node[char]
        return node

    # Backward-compatible alias for the original, misspelt helper name.
    _find_patial = _find_partial

        

In [3]:
# Build the trie used by the suggestion demo below.
# get_english_dictionary() is imported from the project's `loaders` module;
# presumably it returns an iterable of words — TODO confirm against loaders.
trie = Trie_dict()
trie.insert_batch(get_english_dictionary())

IOError: [Errno 21] Is a directory: '/Users/rcoleman/rob/code/python/data_science_ipython_notebooks/nlp_nlu'

In [15]:
# Show the first few completions for a handful of sample prefixes.
# One loop replaces four copy-pasted stanzas; the parenthesised single-argument
# print produces the same output on Python 2 and also runs on Python 3.
print("suggestions")
print("")
for prefix in ("reac", "poo", "whal", "dan"):
    print("'%s': " % prefix)
    print(trie.suggest(prefix))
    print("")

suggestions

'reac': 
['reacceptance', 'reaccess', 'reaccession', 'reacclimatization', 'reacclimatize']

'poo': 
['pooa', 'pooch', 'pooder', 'poodle', 'poodledom']

'whal': 
['whale', 'whaleback', 'whalebacker', 'whalebird', 'whaleboat']

'dan': 
['dan', 'danaid', 'danaide', 'danaine', 'danaite']



# Tries with some statistical flavor

## Trie with frequency distribution
Create a Trie where we keep track of how many times we've gone down each branch of the tree.  We can use this distribution over suggestions to rank our suggestions.

This probability can be written as P( next_word = word_i | incomplete ), where `incomplete` is the partial word typed so far.

## Trie with simple Markov-Transition Distribution
We can also use sentence context to make suggestions.  We can build a transition matrix from word X to word Y (represented sparsely, because the number of words is likely huge) to get the probability of the next word given the last word, or

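P( next_word = word_i | incomplete, last_word = word_j ) ∝ P( next_word = word_i | incomplete ) * P( next_word = word_i | last_word = word_j )

(The product is an unnormalized ranking score that treats the two sources of evidence as independent; renormalize over the candidate words to obtain a probability.)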

## HMMs
We can extend the Markov toolkit even further by modeling the word sequence as a Hidden Markov Model.  The Hidden Markov Model creates a tractable way of computing not just P( next_word = word_i | last_word = word_j ) but P( next_word = word_i | last_word = word_j, last_last_word = word_k, ..., all the way to first_word = word_x ).

HMMs are a whole different beast, but once you've got one, you can update your ranking of the next word with the following:

P( next_word = word_i | incomplete, all_previous_words ) ∝ P( next_word = word_i | incomplete ) * P( next_word = word_i | all_previous_words )   (an unnormalized ranking score; renormalize over the candidate words)