### Predefined positive/negative word lists
- process the raw words into text files
- read the word lists back from file
- convert the word lists into a trie

#### Process raw words

In [None]:
# Load the raw comma-separated word list, then normalise every entry
# (surrounding whitespace removed, lower-cased) and show the result.
with open(r'..\storage\raw_text.txt', 'r') as reader:
    splitted_text = reader.read().split(',')
proc_text = [entry.strip().lower() for entry in splitted_text]
print(proc_text)

In [None]:
# Sort a copy of the processed words and show it.
sorted_word = list(proc_text)
sorted_word.sort()
print(sorted_word)
# Disabled: write the sorted words to the positive word file, one per line.
# with open(r'..\storage\pos_words.txt', 'w') as writer:
#     for word in sorted_word:
#         writer.write(word+'\n')

In [None]:
# Load the negative word list (one entry per line), sort it and show it.
with open(r'..\storage\neg_words.txt', 'r') as reader:
    splitted_text = reader.read().split('\n')
proc_text = list(splitted_text)
proc_text.sort()
print(proc_text)

# Disabled: write the sorted list back to the same file, one word per line.
# with open(r'..\storage\neg_words.txt', 'w') as writer:
#     for word in proc_text:
#         writer.write(word+'\n')

### Convert to tries

In [28]:
import json
import copy

In [29]:
# Reserved key under which a trie node stores the sentiment value of the
# word that ends at that node (see build_tries / rec_search / search).
SENT_VALUE_KEY = '___'
# JSON file the trie is dumped to. It is written once after building the
# uncompressed trie and again after compressing, so the compressed trie is
# what remains on disk.
TRIE_JSON_FILE = r'..\storage\compressed_trie.json'

In [30]:
# Read the positive word list from file (one word or phrase per line).
# Blank entries are dropped: a file that ends with a newline would otherwise
# contribute an empty "word" that inflates the count and can never be found
# in the trie.
pos_words = []
with open(r'..\storage\pos_words.txt', 'r') as reader:
    pos_words = sorted(word for word in reader.read().split('\n') if word)
print(len(pos_words))

# Read the negative word list from file, with the same handling.
neg_words = []
with open(r'..\storage\neg_words.txt', 'r') as reader:
    neg_words = sorted(word for word in reader.read().split('\n') if word)
print(len(neg_words))

1419
4548


In [31]:
# Create the input variable for build_tries(): (word, sentiment value) pairs,
# with 1 for every positive word and -1 for every negative word.
pos_values = [1 for _ in pos_words]
pos_words_values = [(word, score) for word, score in zip(pos_words, pos_values)]
print(pos_words_values[0])

neg_values = [-1 for _ in neg_words]
neg_words_values = [(word, score) for word, score in zip(neg_words, neg_values)]
print(neg_words_values[0])

# Positive pairs come first, negative pairs after them; a word present in
# both lists therefore appears twice, with its negative pair last.
words_values = [*pos_words_values, *neg_words_values]

('a reason for being', 1)
('abnormal', -1)


In [40]:
# build uncompressed trie
def build_tries(words_values: list) -> dict:
    """Create a basic uncompressed trie from (word, sentiment value) pairs.

    The trie is a nested dict with one level per character. The node reached
    by the last character of a word additionally maps SENT_VALUE_KEY to that
    word's value.

    Args:
        words_values: iterable of (word, value) pairs.

    Returns:
        The root dict of the trie. A word that occurs more than once keeps
        the value of its last occurrence; an empty word adds nothing.
    """
    root = {}
    for word, value in words_values:
        node = root
        # Walk down the trie, creating a child for every missing character.
        for char in word:
            node = node.setdefault(char, {})
        # Mark the end of a complete word on the node of its last character.
        if len(word) > 0:
            node[SENT_VALUE_KEY] = value
    return root

# Build the uncompressed trie and preview the first 30 lines of its
# pretty-printed JSON form.
newTrie = build_tries(words_values)
pretty_json = json.dumps(newTrie, indent=4)
print('\n'.join(pretty_json.split('\n')[:30]))

# Persist the trie as JSON.
with open(TRIE_JSON_FILE, 'w') as outfile:
    json.dump(newTrie, outfile)

{
    "a": {
        " ": {
            "r": {
                "e": {
                    "a": {
                        "s": {
                            "o": {
                                "n": {
                                    " ": {
                                        "f": {
                                            "o": {
                                                "r": {
                                                    " ": {
                                                        "b": {
                                                            "e": {
                                                                "i": {
                                                                    "n": {
                                                                        "g": {
                                                                            "___": 1
                                                                        }
                             

In [43]:
# compress the tries
def rec_search(key: str, trie: dict) -> tuple:
    """Recursively compress a character trie by merging single-child chains.

    Args:
        key: the characters accumulated so far for the edge leading to
            ``trie``; pass ``''`` for the root.
        trie: the node to compress. When the recursion follows the
            SENT_VALUE_KEY entry of a node this is the stored sentiment
            value rather than a dict.

    Returns:
        A ``(key, node)`` pair: the (possibly lengthened) edge label and the
        newly built compressed node. The input dicts are not modified.
    """
    # Base case: we followed the SENT_VALUE_KEY entry and reached the stored
    # value. Any non-dict counts as a value, so float or bool scores work as
    # well as ints. Strip the sentinel that was appended to the key and wrap
    # the value in a fresh leaf node.
    if not isinstance(trie, dict):
        return key[:-len(SENT_VALUE_KEY)], {SENT_VALUE_KEY: trie}

    if len(trie) == 1:
        # Exactly one child: fold it into the current edge label.
        (childKey, childTrie), = trie.items()
        newKey, newChild = rec_search(key + childKey, childTrie)

        # Special case: the root itself has only one child. The root must
        # stay a node, so hang the merged edge below it.
        if key == '':
            return '', {newKey: newChild}

        return newKey, newChild

    # Branching (or empty) node: keep it and compress every child separately,
    # each starting a fresh edge label.
    newChildren = {}
    for childKey, childTrie in trie.items():
        if childKey == SENT_VALUE_KEY:
            # The word ending here keeps its value on this node.
            newChildren[childKey] = childTrie
        else:
            newKey, newChild = rec_search(childKey, childTrie)
            newChildren[newKey] = newChild

    return key, newChildren

def compress_trie(trie: dict) -> dict:
    """Return a compressed version of ``trie``.

    The compression works on a deep copy, so the trie passed in is left
    untouched. Only the node returned by rec_search is handed back; the
    edge label it returns for the root is discarded.
    """
    working_copy = copy.deepcopy(trie)
    compressed = rec_search('', working_copy)[1]
    return compressed

# Compress the trie and preview the first 30 lines of its pretty-printed
# JSON form.
cTrie = compress_trie(newTrie)
compressed_json = json.dumps(cTrie, indent=4)
print('\n'.join(compressed_json.split('\n')[:30]))

# Persist the compressed trie as JSON.
with open(TRIE_JSON_FILE, 'w') as outfile:
    json.dump(cTrie, outfile)

{
    "a": {
        " reason for being": {
            "___": 1
        },
        "b": {
            "le": {
                "___": 1
            },
            "o": {
                "und": {
                    "___": 1,
                    "ing": {
                        "___": 1
                    },
                    "s": {
                        "___": 1
                    }
                },
                "lish": {
                    "___": -1
                },
                "mina": {
                    "bl": {
                        "e": {
                            "___": -1
                        },
                        "y": {
                            "___": -1
                        }


In [47]:
# test the compressed trie
def search(trie: dict, word: str) -> int:
    """Look up the sentiment value of ``word`` in a compressed trie.

    The word is consumed left to right: the unmatched remainder is grown one
    character at a time until it equals a child key of the current node, at
    which point the search descends into that child.

    Args:
        trie: node to start from, normally the root returned by
            compress_trie.
        word: the word or phrase to look up.

    Returns:
        The value stored under SENT_VALUE_KEY for ``word``, or 0 when the
        word is empty, is not stored in the trie, or is only a prefix of
        stored words.
    """
    word_length = len(word)
    offset = 0  # number of leading characters of `word` matched so far

    for end in range(1, word_length + 1):
        child = trie.get(word[offset:end])

        # Only descend into real child nodes. The SENT_VALUE_KEY entry maps
        # to a plain value, so a word that happens to contain the sentinel
        # text must not be followed into it.
        if isinstance(child, dict):
            trie = child
            offset = end

    # The word is present only if every character was consumed; tracking the
    # consumed length avoids treating an unmatched one-character word as
    # fully matched.
    if word_length == 0 or offset != word_length:
        return 0

    result = trie.get(SENT_VALUE_KEY)
    if result is None:
        return 0
    return result

# Verify the compressed trie: look every input word up again and report
# those whose stored value differs from the value in the input list.
for word, value in words_values:
    sent_value = search(cTrie, word)
    if sent_value == value:
        continue
    print(word, ':', sent_value)


audacity : -1
blinding : -1
dope : -1
emphatic : -1
giddy : -1
incomparable : -1
joke : -1
jumpy : -1
unbelievable : -1
vulnerable : -1
zealous : -1
