In [1]:
import numpy as np
import os

In [2]:
###############################################
# Set parameters
###############################################
D = 10000   # Hypervector dimensionality
M = 100     # Number of ones in each random hypervector
N = 3       # Sliding window (n-gram) size
SEED = 42   # Seed for NumPy's global random number generator

# Random hypervectors are drawn with np.random.permutation, so seeding the
# global generator here makes a "Restart & Run All" reproduce the same vectors.
np.random.seed(SEED)

print("Density: " + str(M/D))

Density: 0.01


In [3]:
def u_gen_rand_hv(D, num_ones=None):
    """Generate a random sparse binary hypervector.

    Parameters
    ----------
    D : int
        Length of the hypervector. Must be even.
    num_ones : int, optional
        Number of components set to 1. When omitted, the notebook-level
        constant ``M`` is used, which is what the original code always did.

    Returns
    -------
    numpy.ndarray or int
        Integer array of length ``D`` holding ``num_ones`` ones at random
        positions (all ones if ``num_ones >= D``). If ``D`` is odd, an error
        message is printed and the integer ``0`` is returned instead; that
        legacy behaviour is kept so existing callers are unaffected.
    """
    # Sanity checker
    if D % 2:
        print("Error - D can't be an odd number")
        return 0

    if num_ones is None:
        num_ones = M

    hv = np.zeros(D, dtype=int)
    # Each index 0..D-1 appears exactly once in the permutation, so exactly
    # `num_ones` positions satisfy the comparison below.
    indices = np.random.permutation(D)
    hv[indices < num_ones] = 1

    return hv

In [4]:
def cos_angle(u, v):
    """Return the cosine similarity between vectors ``u`` and ``v``.

    Parameters
    ----------
    u, v : array_like
        One-dimensional vectors of equal length.

    Returns
    -------
    float
        ``dot(u, v) / (|u| * |v|)``. If either vector has zero norm the
        similarity is undefined; ``0.0`` is returned in that case instead of
        NaN, so that comparisons against the result stay meaningful.
    """
    denom = np.linalg.norm(u) * np.linalg.norm(v)
    if denom == 0:
        # Avoid 0/0 -> NaN (and the accompanying RuntimeWarning).
        return 0.0
    return np.dot(u, v) / denom

# Encoding

In [5]:
def lookup_item_mem(item_memory, key, D):
    """Fetch the hypervector stored for ``key``, creating one on first use.

    Parameters
    ----------
    item_memory : dict
        Mapping from item to hypervector. It is updated in place when
        ``key`` has no (non-None) entry yet.
    key : hashable
        Item to look up.
    D : int
        Dimensionality passed to ``u_gen_rand_hv`` when a new hypervector
        has to be generated.

    Returns
    -------
    tuple
        ``(item_memory, value)``: the same dictionary object that was passed
        in, and the hypervector associated with ``key``.
    """
    value = item_memory.get(key)
    if value is None:
        # First time this item is seen: draw a fresh vector and remember it.
        value = u_gen_rand_hv(D)
        item_memory[key] = value
    return item_memory, value

In [6]:
def binarize_hv(v, final_density, verbose=False):
    """Threshold ``v`` in place to a 0/1 vector of roughly ``final_density``.

    The threshold is the value found at rank ``round(len(v) * final_density)``
    of ``v`` sorted in descending order. Every element strictly greater than
    that threshold becomes 1 and every other element becomes 0. Because the
    comparison is strict, ties at the threshold are zeroed, so the resulting
    density can be lower than the one requested.

    Parameters
    ----------
    v : numpy.ndarray (or list) of numbers
        One-dimensional vector to binarize. It is overwritten in place.
    final_density : float
        Target fraction of ones. Values that would put the rank outside the
        vector (<= 0 or >= 1) are clamped to the first/last rank instead of
        raising ``IndexError``.
    verbose : bool, optional
        When true, print the selected cutoff rank. Off by default because the
        function is called once per n-gram during encoding.

    Returns
    -------
    The same object ``v``, now holding only zeros and ones.
    """
    values = np.asarray(v)
    n = len(values)
    if n == 0:
        # Nothing to threshold; an empty vector has no rank to look up.
        return v

    # Clamp the rank into [0, n - 1] so extreme densities stay in range.
    cutoff = min(max(int(round(n * final_density)), 0), n - 1)
    threshold = np.sort(values)[::-1][cutoff]
    if verbose:
        print("Cutoff: " + str(cutoff))

    above = values > threshold  # computed in full before v is overwritten
    if isinstance(v, np.ndarray):
        v[:] = above
    else:
        # Keep supporting plain Python sequences, as the element-wise loop did.
        v[:] = above.astype(int).tolist()
    return v

In [7]:
def window_sum(buffer, letters_im, N, D, ngram_density=0.5, sum_density=0.2):
    """Encode ``buffer`` as one binary hypervector built from its N-grams.

    A block of ``N`` hypervectors is kept for the most recent items. For each
    new item the block is rolled by one position along both axes and the
    item's hypervector is written to row 0. Once ``N`` items have been seen,
    the rows are added up, binarized with ``ngram_density`` and accumulated.
    The accumulated sum is finally binarized with ``sum_density``.

    Parameters
    ----------
    buffer : sequence
        Items (e.g. the characters of a text) to encode.
    letters_im : dict
        Item memory; updated in place through ``lookup_item_mem`` whenever an
        item is encountered for the first time.
    N : int
        Sliding window size.
    D : int
        Hypervector dimensionality.
    ngram_density : float, optional
        Density passed to ``binarize_hv`` for each N-gram (previously the
        hard-coded 0.5).
    sum_density : float, optional
        Density passed to ``binarize_hv`` for the final sum (previously the
        hard-coded ``THR = 0.2``).

    Returns
    -------
    tuple
        ``(letters_im, sum_hv)``: the item memory and the binarized text
        hypervector. The resulting density is also printed.
    """
    block = np.zeros((N, D), dtype=int)
    sum_hv = np.zeros(D, dtype=int)

    for i, item in enumerate(buffer):
        # Shift every stored vector by one position along axis 1 and move the
        # rows down by one along axis 0; row 0 is then overwritten below.
        block = np.roll(block, (1, 1), axis=(1, 0))
        letters_im, block[0] = lookup_item_mem(letters_im, item, D)

        if i >= (N - 1):
            # block.sum allocates a new array, so the in-place write done by
            # binarize_hv can never touch a row of `block` (the old
            # `n_grams = block[0]` was a view when N == 1).
            n_grams = binarize_hv(block.sum(axis=0), ngram_density)
            sum_hv = sum_hv + n_grams

    sum_hv = binarize_hv(sum_hv, sum_density)
    print('Density after sum: ' + str(np.sum(sum_hv)/len(sum_hv)))
    return letters_im, sum_hv

In [8]:
def binarize_language_hv(lang_am, buffer_len):
    """Binarize every known language hypervector stored in ``lang_am``.

    Parameters
    ----------
    lang_am : dict
        Maps language label to hypervector; entries are replaced in place.
        Labels from the built-in list that are missing from ``lang_am`` are
        skipped rather than raising ``KeyError``, matching the membership
        check used in ``test``.
    buffer_len : number
        Forwarded unchanged as the second argument of ``binarize_hv``.
        NOTE(review): ``binarize_hv`` interprets that argument as a target
        density, while the name here suggests a text length - confirm which
        value callers are expected to pass.

    Returns
    -------
    dict
        The same ``lang_am`` object.
    """
    lang_labels = ['afr', 'bul', 'ces', 'dan', 'nld', 'deu', 'eng', 'est', 'fin', 'fra', 'ell', 'hun', 'ita', 'lav', 'lit', 'pol', 'por', 'ron', 'slk', 'slv', 'spa', 'swe']
    for label in lang_labels:
        if label in lang_am:
            lang_am[label] = binarize_hv(lang_am[label], buffer_len)
    return lang_am

# Training

In [9]:
def build_memories(N, D, lang_labels=None, text_dir='./training_texts/', encoding=None):
    """Build the item memory and the per-language hypervectors.

    For every language label the file ``<text_dir>/<label>.txt`` is read and
    encoded with ``window_sum``; all languages share one item memory.

    Parameters
    ----------
    N : int
        Sliding window size passed to ``window_sum``.
    D : int
        Hypervector dimensionality.
    lang_labels : iterable of str, optional
        Labels to train on. Defaults to the full built-in list of 22
        languages; pass a subset (e.g. ``['afr', 'bul']``) for a quick run.
    text_dir : str, optional
        Directory holding the training text files.
    encoding : str, optional
        Text encoding handed to ``open``. ``None`` keeps the platform
        default, which is what the code used before.

    Returns
    -------
    tuple
        ``(item_mem, lang_am)``: the item memory dict and a dict mapping each
        label to its language hypervector.
    """
    if lang_labels is None:
        lang_labels = ['afr', 'bul', 'ces', 'dan', 'nld', 'deu', 'eng', 'est', 'fin', 'fra', 'ell', 'hun', 'ita', 'lav', 'lit', 'pol', 'por', 'ron', 'slk', 'slv', 'spa', 'swe']

    item_mem = dict()
    lang_am = dict()

    for label in lang_labels:
        print('Reading file: ' + label)
        file_address = os.path.join(text_dir, label + '.txt')
        with open(file_address, encoding=encoding) as f:
            buffer = f.read()

        item_mem, lang_hv = window_sum(buffer, item_mem, N, D)
        lang_am[label] = lang_hv

    return item_mem, lang_am

In [10]:
def test(item_mem, lang_am, N, D):
    total = 0
    correct = 0
    lang_labels = ['afr', 'bul', 'ces', 'dan', 'nld', 'deu', 'eng', 'est', 'fin', 'fra', 'ell', 'hun', 'ita', 'lav', 'lit', 'pol', 'por', 'ron', 'slk', 'slv', 'spa', 'swe']
    lang_map = dict()
    lang_map['af'] = 'afr'
    lang_map['bg'] = 'bul'
    lang_map['cs'] = 'ces'
    lang_map['da'] = 'dan'
    lang_map['nl'] = 'nld'
    lang_map['de'] = 'deu'
    lang_map['en'] = 'eng'
    lang_map['et'] = 'est'
    lang_map['fi'] = 'fin'
    lang_map['fr'] = 'fra'
    lang_map['el'] = 'ell'
    lang_map['hu'] = 'hun'
    lang_map['it'] = 'ita'
    lang_map['lv'] = 'lav'
    lang_map['lt'] = 'lit'
    lang_map['pl'] = 'pol'
    lang_map['pt'] = 'por'
    lang_map['ro'] = 'ron'
    lang_map['sk'] = 'slk'
    lang_map['sl'] = 'slv'
    lang_map['es'] = 'spa'
    lang_map['sv'] = 'swe'

    for file in os.listdir('./testing_texts/'):
        if file.endswith('.txt'):
            actual_label = file[0:2]
            predict_lang = ''

            file_address = './testing_texts/' + file
            with open(file_address) as f:
                buffer = f.read()
            print('Loaded testing text file: ' + str(file_address))

            item_mem_n, text_hv = window_sum(buffer, item_mem, N, D)
            # text_hv = binarize_hv(text_hv, len(buffer))
            if (item_mem != item_mem_n):
                print('\n>>>> NEW UNSEEN ITEM IN TEST FILE <<<<\n')
                exit()
            else:
                max_angle = -1
                # max_angle = 10000
                for j in range(len(lang_labels)):
                    if (lang_labels[j] in lang_am.keys()):
                        angle = cos_angle(lang_am[lang_labels[j]], text_hv)
                        # angle = np.sum(np.logical_xor(lang_am[lang_labels[j]], text_hv))
                        if (angle > max_angle):
                        # if (angle < max_angle):
                            max_angle = angle
                            predict_lang = lang_labels[j]
                
                if (predict_lang == lang_map[actual_label]):
                    correct += 1
                else:
                    print(str(lang_map[actual_label]) + '-->' + predict_lang)
                total += 1
    return correct / total

In [11]:
# Train: build the shared item memory and one hypervector per language.
item_mem, lang_am = build_memories(N,D)

Reading file: afr


KeyboardInterrupt: 

In [None]:
# Save both dictionaries to disk. np.save stores a dict as a pickled object
# array, so loading it back needs allow_pickle enabled plus .item() to unwrap
# the dict, as in the commented example below.
np.save('item_mem1.npy', item_mem)
np.save('lang_am1.npy', lang_am)
# item_mem2 = np.load('item_mem1.npy', allow_pickle='TRUE').item()

In [None]:
# Evaluate on the test texts; the cell value is the fraction classified correctly.
test(item_mem, lang_am, N, D)

Loaded testing text file: ./testing_texts/bg_0_p.txt
density after cdt: 0.3251
bul-->afr
Loaded testing text file: ./testing_texts/bg_100_p.txt
density after cdt: 0.352
bul-->afr
Loaded testing text file: ./testing_texts/bg_101_p.txt
density after cdt: 0.3805
bul-->afr
Loaded testing text file: ./testing_texts/bg_102_p.txt
density after cdt: 0.3501
bul-->afr
Loaded testing text file: ./testing_texts/bg_103_p.txt
density after cdt: 0.3634
bul-->afr
Loaded testing text file: ./testing_texts/bg_104_p.txt
density after cdt: 0.3805
bul-->afr
Loaded testing text file: ./testing_texts/bg_105_p.txt
density after cdt: 0.3374
bul-->afr
Loaded testing text file: ./testing_texts/bg_106_p.txt
density after cdt: 0.3664
bul-->afr
Loaded testing text file: ./testing_texts/bg_107_p.txt
density after cdt: 0.3805
bul-->afr
Loaded testing text file: ./testing_texts/bg_108_p.txt
density after cdt: 0.336
bul-->afr
Loaded testing text file: ./testing_texts/bg_109_p.txt
density after cdt: 0.3664
bul-->afr
Loa

KeyboardInterrupt: 

In [None]:
# Idea: another CDT variant - superimpose part of the buffer, apply CDT, then superimpose the thinned results.
# item_mem2 = np.load('item_mem1.npy', allow_pickle='TRUE').item()
# lang_am2 = np.load('lang_am1.npy', allow_pickle='TRUE').item()


In [None]:
# Scratch check of the descending-sort idiom: sorting a reversed view in
# place leaves the underlying array in descending order, while the source
# array it was copied from is untouched.
v = np.array([3324, 234234, 24234])
d = v.copy()
d[::-1].sort()
print(d, v, sep="\n")

[234234  24234   3324]
[  3324 234234  24234]
