In [18]:
import numpy as np
import os
# import copy
import random

In [19]:
# ---------------------------------------------
# Global hyper-parameters
# ---------------------------------------------
D = 2000  # hypervector length (number of components)
M = 40    # number of components set to 1 in a random item hypervector
N = 5     # sliding window size, in characters
m = 5     # signature size: gen_signatures(m) permutes the values 0 .. 2**m - 1
print(f"Density: {M / D}")

Density: 0.02


In [20]:
def u_gen_rand_hv(D):
    """Draw a random sparse binary hypervector of length D.

    A random permutation of 0 .. D-1 is generated and the positions holding a
    value below the module-level constant M are set to 1, so the result has
    exactly M ones whenever M <= D.

    Parameters:
        D: length of the hypervector; must be even.

    Returns:
        An integer NumPy array of zeros and ones, or the integer 0 (after
        printing an error message) when D is odd.
    """
    # Sanity checker: odd lengths are rejected.
    if D % 2 != 0:
        print("Error - D can't be an odd number")
        return 0

    shuffled = np.random.permutation(D)
    # Boolean mask -> 0/1 integers; exactly the entries with value < M become 1.
    return (shuffled < M).astype(int)

In [21]:
def u_gen_rand_hv2(D):
    """Draw a random binary hypervector whose components are independent.

    Each of the D components is set to 1 with probability M / D (M is the
    module-level number-of-ones constant), so the number of ones is M only on
    average, unlike u_gen_rand_hv() which produces exactly M ones.

    Parameters:
        D: length of the hypervector; must be even.

    Returns:
        A list of D integers (0 or 1), or the integer 0 (after printing an
        error message) when D is odd.
    """
    # Sanity checker
    if (D % 2):
        print("Error - D can't be an odd number")
        return 0

    # Compare against the density, not the raw count M: random.random() lies
    # in [0, 1), so a comparison with M (e.g. 40) would always succeed and
    # yield an all-ones vector.
    density = M / D
    return [1 if random.random() < density else 0 for _ in range(D)]

In [22]:
def gen_signatures(m):
    """Return the values 0 .. 2**m - 1 in random order.

    Parameters:
        m: signature size (number of bits); values below 5 are not supported.

    Returns:
        A NumPy array holding a random permutation of range(2**m), or the
        integer 0 (after printing a message) when m < 5.
    """
    too_small = m < 5
    if not too_small:
        return np.random.permutation(2 ** m)

    print('Small m not implemented yet')
    return 0

In [23]:
def cos_angle(u, v):
    """Return the cosine of the angle between vectors u and v.

    Computed as dot(u, v) / (||u|| * ||v||) with Euclidean norms.
    """
    inner = np.dot(u, v)
    scale = np.linalg.norm(u) * np.linalg.norm(v)
    return inner / scale

# Encoding

In [24]:
def lookup_item_mem(item_memory, key, D):
    """Fetch the hypervector stored for `key`, creating it on first use.

    On a miss a new random hypervector is generated with u_gen_rand_hv(D) and
    stored under `key`; in addition the last entry of
    item_memory['signatures'] is removed from that pool and stored under
    'S' + key as the signature of the new item.

    Parameters:
        item_memory: dict holding the hypervectors, the per-item signatures
            ('S' + key) and the pool of unused signatures ('signatures').
            It is modified in place.
        key: the item to look up (callers in this file pass one character).
        D: hypervector length used when a new vector has to be generated.

    Returns:
        (item_memory, value): the same dict and the hypervector for `key`.
    """
    cached = item_memory.get(key)
    if cached is not None:
        return item_memory, cached

    fresh = u_gen_rand_hv(D)
    item_memory[key] = fresh

    # Create signature: take the last unused one and shrink the pool.
    pool = item_memory['signatures']
    item_memory['S' + key] = pool[-1]
    item_memory['signatures'] = pool[:-1]

    return item_memory, fresh

In [25]:
def binarize_hv_sort(v, final_density):
    """Binarize `v` in place so that roughly `final_density` of it is 1.

    The threshold is the value of the cutoff-th largest component, where
    cutoff = round(len(v) * final_density). Every component greater than or
    equal to that threshold becomes 1 and the rest become 0, so ties at the
    threshold can push the resulting density above `final_density`.

    Parameters:
        v: 1-D vector of counts; it is overwritten with zeros and ones.
        final_density: target fraction of ones.

    Returns:
        `v` itself, after binarization.

    Edge cases:
        * cutoff <= 0 (density of zero, tiny density, or empty `v`) yields an
          all-zero vector. Indexing position cutoff - 1 == -1 would pick the
          minimum as threshold and turn every component into 1.
        * cutoff larger than len(v) is clamped, yielding an all-ones vector
          instead of an IndexError.
    """
    n = len(v)
    cutoff = round(n * final_density)

    if cutoff <= 0:
        # Nothing may be selected; never fall through to index -1.
        v[:] = np.zeros(n, dtype=int)
        return v
    cutoff = min(cutoff, n)

    descending = np.sort(v)[::-1]
    threshold = descending[cutoff - 1]

    # The mask is fully computed before v is overwritten.
    v[:] = (np.asarray(v) >= threshold).astype(int)
    return v

In [26]:
def binarize_hv_thr(v, threshold):
    """Binarize `v` in place against a fixed threshold.

    Components greater than or equal to `threshold` become 1, all others 0.
    The comparison is done as one vectorised NumPy operation instead of a
    per-element Python loop, because window_sum() calls this once per window.

    Parameters:
        v: vector to binarize; it is overwritten with zeros and ones.
        threshold: smallest value that maps to 1.

    Returns:
        `v` itself, after binarization.
    """
    # The mask is a new array, so overwriting v afterwards is safe.
    v[:] = (np.asarray(v) >= threshold).astype(int)
    return v

In [27]:
def window_sum(buffer, letters_im, N, D, m, thr):
    """Encode `buffer` as the sum of its thresholded N-character windows.

    The text is scanned one symbol at a time while the hypervectors and
    signatures of the N most recent symbols are kept in a small history
    (slot 0 = newest). Once the history is full, every position yields one
    window vector: slot k is rotated by k plus the XOR of the signatures of
    all *other* slots, the N rotated vectors are added, and the sum is
    binarized with binarize_hv_thr(..., 2). The binarized windows are added
    up into the returned vector, which is not thresholded here.

    Parameters:
        buffer: the text to encode; iterated symbol by symbol.
        letters_im: item memory dict; symbols not yet present are added via
            lookup_item_mem(), so it may be modified.
        N: window size.
        D: hypervector length.
        m: not used by this function.
        thr: not used by this function.

    Returns:
        (letters_im, sum_hv, window_cnt): the item memory, the integer sum of
        all binarized window vectors, and the number of windows processed.
    """
    block = np.zeros((N, D), dtype=int)    # block[k]: hypervector of the symbol k steps back
    sig_block = np.zeros(N, dtype=int)     # sig_block[k]: signature of that same symbol
    sum_hv = np.zeros(D, dtype=int)
    window_cnt = 0

    for pos, symbol in enumerate(buffer):
        # Age the history by one slot; slot 0 is overwritten right below.
        block = np.roll(block, 1, axis=0)
        sig_block = np.roll(sig_block, 1)
        letters_im, block[0] = lookup_item_mem(letters_im, symbol, D)
        sig_block[0] = letters_im['S' + symbol]

        if pos < N - 1:
            continue    # fewer than N symbols seen so far

        # XOR of all N signatures; XOR-ing a slot's own signature back out
        # leaves the XOR of the remaining N-1 signatures.
        all_sigs = np.bitwise_xor.reduce(sig_block)
        window_hv = np.zeros(D, dtype=int)
        for k in range(N):
            perm = k + np.bitwise_xor(all_sigs, sig_block[k])
            window_hv = window_hv + np.roll(block[k], perm)

        # Threshold the window sum, then bundle it into the text hypervector.
        sum_hv = sum_hv + binarize_hv_thr(window_hv, 2)
        window_cnt += 1

    return letters_im, sum_hv, window_cnt

In [28]:
def binarize_language_hv(lang_am, thr):
    """Return binarized copies of the language hypervectors.

    For each of the 21 hard-coded language labels the vector in `lang_am` is
    copied and the copy is binarized with binarize_hv_sort(copy, thr), so the
    input vectors are left untouched. Entries of `lang_am` under other keys
    are ignored; a missing label raises KeyError.

    Parameters:
        lang_am: dict mapping language label -> NumPy vector.
        thr: passed to binarize_hv_sort() as its target density.

    Returns:
        A new dict mapping each label to its binarized vector.
    """
    lang_labels = (
        'bul', 'ces', 'dan', 'nld', 'deu', 'eng', 'est',
        'fin', 'fra', 'ell', 'hun', 'ita', 'lav', 'lit',
        'pol', 'por', 'ron', 'slk', 'slv', 'spa', 'swe',
    )
    return {
        label: binarize_hv_sort(lang_am[label].copy(), thr)
        for label in lang_labels
    }

# Training

In [29]:
def build_memories(N, D, m):
    """Train the item memory and the language memories.

    For each of the 21 hard-coded language labels the file
    ./training_texts/<label>.txt is read and encoded with window_sum(). The
    raw (un-thresholded) sum is kept in one dict and a copy binarized with
    binarize_hv_sort(..., 0.40) in another. The item memory starts out with
    only the signature pool from gen_signatures(m) and is filled as new
    characters are encountered.

    Parameters:
        N: sliding window size.
        D: hypervector length.
        m: signature size, forwarded to gen_signatures() and window_sum().

    Returns:
        (item_mem, lang_am, lang_am_full): the item memory, the binarized
        language vectors and the raw language vectors, the latter two keyed
        by language label.
    """
    lang_labels = (
        'bul', 'ces', 'dan', 'nld', 'deu', 'eng', 'est',
        'fin', 'fra', 'ell', 'hun', 'ita', 'lav', 'lit',
        'pol', 'por', 'ron', 'slk', 'slv', 'spa', 'swe',
    )
    item_mem = {'signatures': gen_signatures(m)}
    lang_am = {}
    lang_am_full = {}
    THR = 310000    # forwarded as window_sum()'s `thr` argument

    for label in lang_labels:
        print('Reading file: ' + label)
        with open('./training_texts/' + label + '.txt') as f:
            buffer = f.read()

        item_mem, lang_hv, window_cnt = window_sum(buffer, item_mem, N, D, m, THR)

        # Keep the raw sums before binarize_hv_sort() overwrites lang_hv in place.
        lang_am_full[label] = lang_hv.copy()
        lang_am[label] = binarize_hv_sort(lang_hv, 0.40)
        print('Density' + label + ': ' + str(np.sum(lang_am[label])/D))

    return item_mem, lang_am, lang_am_full

In [30]:
def test(item_mem, lang_am, N, D, m):
    total = 0
    correct = 0
    # lang_labels = ['afr', 'bul', 'ces', 'dan', 'nld', 'deu', 'eng', 'est', 'fin', 'fra', 'ell', 'hun', 'ita', 'lav', 'lit', 'pol', 'por', 'ron', 'slk', 'slv', 'spa', 'swe']
    lang_labels = ['bul', 'ces', 'dan', 'nld', 'deu', 'eng', 'est', 'fin', 'fra', 'ell', 'hun', 'ita', 'lav', 'lit', 'pol', 'por', 'ron', 'slk', 'slv', 'spa', 'swe']
    lang_map = dict()
    lang_map['af'] = 'afr'
    lang_map['bg'] = 'bul'
    lang_map['cs'] = 'ces'
    lang_map['da'] = 'dan'
    lang_map['nl'] = 'nld'
    lang_map['de'] = 'deu'
    lang_map['en'] = 'eng'
    lang_map['et'] = 'est'
    lang_map['fi'] = 'fin'
    lang_map['fr'] = 'fra'
    lang_map['el'] = 'ell'
    lang_map['hu'] = 'hun'
    lang_map['it'] = 'ita'
    lang_map['lv'] = 'lav'
    lang_map['lt'] = 'lit'
    lang_map['pl'] = 'pol'
    lang_map['pt'] = 'por'
    lang_map['ro'] = 'ron'
    lang_map['sk'] = 'slk'
    lang_map['sl'] = 'slv'
    lang_map['es'] = 'spa'
    lang_map['sv'] = 'swe'

    for file in os.listdir('./testing_texts/'):
        if file.endswith('.txt'):
            actual_label = file[0:2]
            predict_lang = ''

            file_address = './testing_texts/' + file
            with open(file_address) as f:
                buffer = f.read()
            print('Loaded testing text file: ' + str(file_address))
            bufferlength = len(buffer)
            # print("Buffer length: " + str(bufferlength))
            test_thr = 0.3
            # print("THR value: " + str(test_thr))

            item_mem_n, text_hv, window_cnt = window_sum(buffer, item_mem, N, D, m, test_thr)
            # text_hv = binarize_hv_thr(text_hv, 0.022*window_cnt)
            text_hv = binarize_hv_thr(text_hv, 0.001*window_cnt)
            # text_hv = binarize_hv_sort(text_hv, 0.06)
            print('Density: '+ str(np.sum(text_hv)/D))
            if (item_mem != item_mem_n):
                print('\n>>>> NEW UNSEEN ITEM IN TEST FILE <<<<\n')
                exit()
            else:
                max_angle = -1
                # max_angle = 10000
                for j in range(len(lang_labels)):
                    if (lang_labels[j] in lang_am.keys()):
                        # angle = cos_angle(lang_am[lang_labels[j]], text_hv)     # Cosine similarity
                        angle = np.sum(np.logical_and(lang_am[lang_labels[j]], text_hv))       # Overlap
                        if (angle > max_angle):
                        # if (angle < max_angle):
                            max_angle = angle
                            predict_lang = lang_labels[j]
                
                if (predict_lang == lang_map[actual_label]):
                    correct += 1
                else:
                    print(str(lang_map[actual_label]) + '-->' + predict_lang)
                total += 1
                print('Accuracy: ' + str(correct / total))
    return correct / total

In [31]:
# Train on ./training_texts/: yields the item memory, the binarized language
# memory and the un-thresholded per-language sums.
item_mem, lang_am, lang_am_full = build_memories(N,D,m)

Reading file: bul


KeyboardInterrupt: 

In [34]:
# Optional persistence of the trained memories (disabled). The dicts are
# written with np.save and read back with np.load(...).item().
# NOTE(review): allow_pickle is given the string 'TRUE' below, which only works
# because a non-empty string is truthy; use allow_pickle=True if these lines
# are re-enabled, and only load files you trust (unpickling can run code).
# np.save('item_mem_2k_2d_5w.npy', item_mem)
# np.save('lang_am_2k_2d.npy', lang_am)
# np.save('lang_am_full_2k_2d_5w.npy', lang_am_full)
# np.save('lang_am_all_500_full.npy', lang_am_full)
# item_mem = np.load('new_data/item_mem_2k_2d_5w.npy', allow_pickle='TRUE').item()
# lang_am_full = np.load('new_data/lang_am_full_2k_2d_5w.npy', allow_pickle='TRUE').item()

# lang_am = np.copy(lang_am2)
# Re-binarize the raw language sums at a target density of 0.50; this replaces
# the lang_am returned by build_memories().
# NOTE(review): the recorded run of the training cell was interrupted, so
# item_mem / lang_am_full presumably came from the np.load lines above - confirm.
lang_am = binarize_language_hv(lang_am_full, 0.50)
# item_mem = dict()
# lang_am = dict()
# item_mem = dict()
# lang_am = dict()

In [35]:
# Evaluate on every .txt file in ./testing_texts/; as the last expression of
# the cell, the returned accuracy is displayed as the cell output.
test(item_mem, lang_am, N, D, m)

Loaded testing text file: ./testing_texts/bg_0_p.txt
Density: 0.24
Accuracy: 1.0
Loaded testing text file: ./testing_texts/bg_100_p.txt
Density: 0.3435
Accuracy: 1.0
Loaded testing text file: ./testing_texts/bg_101_p.txt
Density: 0.613
Accuracy: 1.0
Loaded testing text file: ./testing_texts/bg_102_p.txt
Density: 0.4165
Accuracy: 1.0
Loaded testing text file: ./testing_texts/bg_103_p.txt
Density: 0.3755
Accuracy: 1.0
Loaded testing text file: ./testing_texts/bg_104_p.txt
Density: 0.485
Accuracy: 1.0
Loaded testing text file: ./testing_texts/bg_105_p.txt
Density: 0.284
Accuracy: 1.0
Loaded testing text file: ./testing_texts/bg_106_p.txt
Density: 0.5025
Accuracy: 1.0
Loaded testing text file: ./testing_texts/bg_107_p.txt
Density: 0.3215
Accuracy: 1.0
Loaded testing text file: ./testing_texts/bg_108_p.txt
Density: 0.5195
Accuracy: 1.0
Loaded testing text file: ./testing_texts/bg_109_p.txt
Density: 0.4605
Accuracy: 1.0
Loaded testing text file: ./testing_texts/bg_10_p.txt
Density: 0.214
Acc

0.7774655935996952

In [None]:
# item_mem = np.load('item_mem_all_500.npy', allow_pickle='TRUE').item()
# lang_am = np.load('lang_am_all_500_full.npy', allow_pickle='TRUE').item()
# item_mem = np.load('old_data/item_mem_all.npy', allow_pickle='TRUE').item() // this one has 10% IM density for D=10000
# lang_am = np.load('old_data/lang_am_all.npy', allow_pickle='TRUE').item()

# test = lang_am_full['bul'].copy()

# lang_am['afr'] = binarize_hv_thr(lang_am_full['afr'], 0.03*949192)
# lang_am['bul'] = binarize_hv_thr(test, 0.025*1074672)
# lang_am['afr'] = binarize_hv_sort(lang_am['afr'], 0.06)
# lang_am['bul'] = binarize_hv_sort(lang_am['bul'], 0.06)
# for i in lang_am.keys():
#   lang_am[i] = binarize_hv_sort(lang_am[i], 0.3)
# print(np.sum(item_mem['a'])/D)
# test = [1,3,4,4]
# arr = np.array(test)
# print(arr)
# a = arr.copy()
# arr[0] = 3
# print(a)

In [None]:
# Report the fraction of ones in every language hypervector.
for i in lang_am.keys():
    print(i + ':' + str(np.sum(lang_am[i]) / D))
# for i in item_mem:
#     print(str(np.sum(item_mem[i])/D))

bul:0.5
ces:0.5
dan:0.5
nld:0.5
deu:0.5005
eng:0.5005
est:0.5
fin:0.5005
fra:0.5
ell:0.5
hun:0.5
ita:0.5
lav:0.5
lit:0.5
pol:0.5
por:0.5
ron:0.5
slk:0.501
slv:0.5
spa:0.5
swe:0.5
