In [61]:
vocabulary_hash_size = 30000000


class Vocabulary(object):
    """
    store vocabulary in huffman tree
    """

    def __init__(self, word):
        self.counter = 0  # vocabulary disappear counter
        self.path = list()  # path to huffman tree node
        self.word = word  # word
        self.huffman_code = list  # huffman code
        self.codeLen = 0  # huffman code length

    def __hash__(self):
        return hash(self.word) % vocabulary_hash_size

    def __str__(self):
        return "%s(%d)" % (self.word, self.counter)

In [62]:
voc = Vocabulary("中国")
print(hash(voc))
print(hash("中国") % vocabulary_hash_size)


28128819
28128819


In [63]:
import math

EXP_TABLE_SIZE = 1000
MAX_EXP = 6
EXP_TABLE = [math.exp((i / EXP_TABLE_SIZE * 2 - 1) * MAX_EXP) for i in range(1000)]
EXP_TABLE = [x / (x + 1) for x in EXP_TABLE]

In [64]:
vocabularies = []
vocabulary_hash = [-1 for i in range(vocabulary_hash_size)]
vocabularies_size = 0


def add_word_to_hash(word):
    global vocabularies_size

    word_voc = Vocabulary(word)
    vocabularies.append(word_voc)
    hash_idx = hash(word_voc)
    while vocabulary_hash[hash_idx] != -1:
        hash_idx = (hash_idx + 1) % vocabulary_hash_size
    vocabulary_hash[hash_idx] = vocabularies_size
    vocabularies_size = vocabularies_size + 1

    return vocabularies_size


add_word_to_hash("NL")

1

In [65]:
print(len(vocabularies))
print(vocabularies[0].word)
print(vocabulary_hash[hash(vocabularies[0])])
print(vocabulary_hash[hash("NL") % vocabulary_hash_size])

1
NL
0
0


In [66]:
def search_word(word):
    hash_idx = hash(word) % vocabulary_hash_size
    while True:
        if vocabulary_hash[hash_idx] == -1:
            return -1
        if vocabularies[vocabulary_hash[hash_idx]].word == word:
            return vocabulary_hash[hash_idx]
        hash_idx = (hash_idx + 1) % vocabulary_hash_size

In [23]:
print("search %s: %d" % ("NL", search_word("NL")))
print("search %s: %d" % ("中国", search_word("中国")))

search NL: 0
search 中国: -1


In [67]:
def reduceVocabulary(min_reduce=5):
    global vocabularies, vocabulary_hash, vocabularies_size
    new_idx = 0
    for (idx, v) in enumerate(vocabularies):
        if v.counter > min_reduce:
            vocabularies[new_idx] = v
            new_idx += 1
    vocabularies = vocabularies[:new_idx]
    vocabularies_size = new_idx

    vocabulary_hash = [-1 for _ in range(vocabulary_hash_size)]
    for (idx, v) in enumerate(vocabularies):
        hash_idx = hash(v)
        while vocabulary_hash[hash_idx] != -1:
            hash_idx = hash_idx + 1
        vocabulary_hash[hash_idx] = idx

In [25]:
 # test reduce
add_word_to_hash("中国")
vocabularies[0].counter = 9
add_word_to_hash("浙江")
vocabularies[2].counter = 7
reduceVocabulary()
print(vocabulary_hash[hash('NL') % vocabulary_hash_size])
print(vocabulary_hash[hash('浙江') % vocabulary_hash_size])
print(vocabulary_hash[hash('中国') % vocabulary_hash_size])
reduceVocabulary(999999)
add_word_to_hash("NL")


0
1
-1


1

In [68]:
trained_words_count = 0
reduceVocabulary(999999)
add_word_to_hash("NL")

with open(file="./data/lines.csv", mode="r", encoding="utf-8") as f:
    line = f.readline()
    while line:
        line = f.readline()
        words = line.strip().split(" ")
        for new_word in words:
            word_idx = search_word(new_word)
            if word_idx == -1:
                add_word_to_hash(new_word)
                vocabularies[-1].counter = 1
            else:
                vocabularies[word_idx].counter += 1

            if vocabularies_size >= vocabulary_hash_size * 0.7:
                reduceVocabulary()
            trained_words_count += 1
            if trained_words_count % 10000 == 0:
                print("trained words count: %d" % trained_words_count)
        vocabularies[0].counter += 1

trained words count: 10000
trained words count: 20000
trained words count: 30000
trained words count: 40000
trained words count: 50000
trained words count: 60000
trained words count: 70000
trained words count: 80000
trained words count: 90000
trained words count: 100000
trained words count: 110000
trained words count: 120000
trained words count: 130000
trained words count: 140000
trained words count: 150000
trained words count: 160000
trained words count: 170000


In [69]:
print("trained words count: %d" % trained_words_count)
print("vocabularies count: %d" % len(vocabularies))
print("vocabularies[500]： %s" % vocabularies[500])

trained words count: 177352
vocabularies count: 33042
vocabularies[500]： 关怀(12)


In [70]:
#buble sort descending vocabularies, exclude first element(NL)
#use quicksort if not recursive maybe quicker
def bubble(arr):
    arr_len = len(arr)
    while arr_len > 0:
        for i in range(arr_len - 1):
            if arr[i].counter < arr[i + 1].counter:
                arr[i], arr[i + 1] = arr[i + 1], arr[i]
        arr_len -= 1
    return arr


sorted_vocabularies = bubble(vocabularies[1:])


In [29]:
sorted_vocabularies.insert(0, vocabularies[0])
print(sorted_vocabularies[1])

日(1429)


In [30]:
# remove vocabularies that disappeared less than 5 times.
min_reduce = 5
vocabulary_hash = [-1 for _ in range(vocabulary_hash_size)]
vocabularies_size = 0
for (idx, voc) in enumerate(sorted_vocabularies):
    if voc.counter >= min_reduce:
        vocabulary_hash[hash(voc)] = idx
    else:
        vocabularies_size = idx
        break
sorted_vocabularies = sorted_vocabularies[:vocabularies_size]

In [31]:
len(sorted_vocabularies)

5720

In [58]:
arr_size = len(sorted_vocabularies)
arr = [v.counter for v in sorted_vocabularies]

count_arr = arr
count_arr.extend([1e15 for i in range(arr_size)])

[1000000000000000.0, 1000000000000000.0, 1000000000000000.0, 1000000000000000.0, 1000000000000000.0, 1000000000000000.0, 1000000000000000.0, 1000000000000000.0, 1000000000000000.0, 1000000000000000.0]


In [59]:
# log(N) algorithm for generating huffman tree in sorted array
pos1 = arr_size - 1
pos2 = arr_size
min1_idx, min2_idx = 0, 0
parent_node = [0 for i in range(arr_size * 2 + 1)]
binary = [0 for i in range(arr_size * 2 + 1)]
for a in range(arr_size):
    if pos1 >= 0:
        if count_arr[pos1] < count_arr[pos2]:
            min1_idx = pos1
            pos1 -= 1
        else:
            min1_idx = pos2
            pos2 += 1
    else:
        min1_idx = pos2
        pos2 += 1

    if pos1 >= 0:
        if count_arr[pos1] < count_arr[pos2]:
            min2_idx = pos1
            pos1 -= 1
        else:
            min2_idx = pos2
            pos2 += 1
    else:
        min2_idx = pos2
        pos2 += 1
    count_arr[arr_size + a] = count_arr[min1_idx] + count_arr[min2_idx]
    parent_node[min1_idx] = arr_size + a
    parent_node[min2_idx] = arr_size + a
    binary[min2_idx] = 1

In [75]:
count_arr[-10:]
for a in range(arr_size):
    b = a
    code_len = 0
    code = list()
    path_point = list()
    while True:
        code.append(binary[b])
        path_point.append(b)
        code_len += 1
        b = parent_node[b]
        if b == (arr_size * 2 - 2):
            break

    sorted_vocabularies[a].codeLen = code_len
    sorted_vocabularies[a].path = [0] * (code_len + 1)
    sorted_vocabularies[a].huffman_code = [0] * code_len
    sorted_vocabularies[a].path[0] = arr_size - 2
    for b in range(code_len):
        sorted_vocabularies[a].huffman_code[code_len - b - 1] = code[b]
        sorted_vocabularies[a].path[code_len - b] = path_point[b] - arr_size

In [78]:
sorted_vocabularies[0].path

[5718, 5717, 5715, 5711, 5702, -5720]