In [1]:
import numpy as np
import os
# import copy
import random

In [2]:
###############################################
# Set parameters
# Notebook-wide configuration. D, N and m are passed explicitly to the
# functions below; M is read as a global by u_gen_rand_hv.
###############################################
D = 2048  # Hypervector length (every item/language vector has D entries)
M = 40   # Number of ones in each randomly generated hypervector
N = 3     # Sliding window size (number of most recent symbols combined per window)
m = 5     # Signature size; gen_signatures(m) permutes the 2**m possible values
# Fraction of ones in a freshly generated hypervector.
print("Density: " + str(M/D))

Density: 0.01953125


In [3]:
def u_gen_rand_hv(D, num_ones=None):
    """Generate a random sparse binary hypervector.

    Args:
        D: Length of the hypervector. Must be even.
        num_ones: How many positions are set to 1. When omitted, the
            notebook-level constant ``M`` is used, which matches the
            behaviour of the original one-argument call.

    Returns:
        A length-``D`` integer ndarray holding exactly ``num_ones`` ones
        (for ``0 <= num_ones <= D``) at uniformly random positions, or the
        integer ``0`` after printing an error message when ``D`` is odd.
    """
    # Sanity checker
    if D % 2:
        print("Error - D can't be an odd number")
        return 0

    if num_ones is None:
        num_ones = M

    hv = np.zeros(D, dtype=int)
    # A random permutation of 0..D-1 contains each value once, so exactly
    # `num_ones` of its entries are smaller than `num_ones`.
    hv[np.random.permutation(D) < num_ones] = 1
    return hv

In [4]:
def gen_signatures(m):
    """Return the 2**m signature values in random order.

    For ``m < 5`` nothing is generated: a message is printed and the integer
    ``0`` is returned instead of an array.
    """
    too_small = m < 5
    if too_small:
        print('Small m not implemented yet')
        return 0

    n_signatures = 2 ** m
    return np.random.permutation(n_signatures)

In [5]:
def cos_angle(u,v):
    """Cosine of the angle between vectors ``u`` and ``v``.

    Computed as the dot product divided by the product of the Euclidean norms.
    No guard is applied for zero-length vectors.
    """
    inner = np.dot(u, v)
    scale = np.linalg.norm(u) * np.linalg.norm(v)
    return inner / scale

# Encoding

In [6]:
def lookup_item_mem(item_memory, key, D):
    """Fetch the hypervector stored for ``key``, creating it on first use.

    On a miss a new random hypervector of length ``D`` is stored under
    ``key``, and the last entry of ``item_memory['signatures']`` is moved out
    of that pool and stored under ``'S' + key``.

    Returns:
        ``(item_memory, value)`` - the same (possibly mutated) dictionary and
        the hypervector for ``key``.
    """
    cached = item_memory.get(key)
    if cached is not None:
        return item_memory, cached

    fresh = u_gen_rand_hv(D)
    item_memory[key] = fresh

    # Create signature: pop the last unused one off the pool.
    pool = item_memory['signatures']
    item_memory['S'+key] = pool[-1]
    item_memory['signatures'] = pool[:-1]

    return item_memory, fresh

In [7]:
def binarize_hv_sort(v, final_density):
    """Binarise ``v`` in place, keeping roughly the top ``final_density`` fraction.

    The threshold is the ``round(len(v) * final_density)``-th largest value of
    ``v``; every element greater than or equal to it becomes 1 and the rest
    become 0. Ties at the threshold are all kept, so the resulting density can
    be slightly above ``final_density``.

    Args:
        v: 1-D NumPy array (or mutable sequence) of counts. Overwritten.
        final_density: Target fraction of ones, expected in ``[0, 1]``.

    Returns:
        ``v`` itself, now holding only 0/1 values.
    """
    cutoff = round(len(v) * final_density)
    values = np.asarray(v)

    if cutoff <= 0:
        # Nothing should survive. Indexing with cutoff-1 == -1 would wrap to
        # the smallest value and switch every element on, and would raise on
        # empty input.
        keep = np.zeros(len(v), dtype=bool)
    else:
        descending = np.sort(values)[::-1]
        threshold = descending[cutoff - 1]
        keep = values >= threshold

    if isinstance(v, np.ndarray):
        v[...] = keep          # vectorised in-place write
    else:
        for i, bit in enumerate(keep):
            v[i] = int(bit)
    return v

In [8]:
def binarize_hv_thr(v, threshold):
    """Binarise ``v`` in place against a fixed threshold.

    Elements greater than or equal to ``threshold`` become 1, all others 0.

    Args:
        v: 1-D NumPy array (or mutable sequence) of values. Overwritten.
        threshold: Comparison value.

    Returns:
        ``v`` itself, now holding only 0/1 values.
    """
    keep = np.asarray(v) >= threshold

    if isinstance(v, np.ndarray):
        # One vectorised write instead of a Python-level loop; this function
        # is called once per input character by window_sum.
        v[...] = keep
    else:
        for i, bit in enumerate(keep):
            v[i] = int(bit)
    return v

In [9]:
def window_sum(buffer, letters_im, N, D, m, window_write_en=False):
    """Encode ``buffer`` as a sum of thresholded sliding-window hypervectors.

    For every symbol in ``buffer`` the hypervectors and signatures of the N
    most recent symbols are kept, newest at index 0. The vector at window
    position ``k`` is rotated by ``k`` plus the XOR of the signatures at all
    other window positions; the rotated vectors are added and the sum is
    binarised with ``binarize_hv_thr(..., 2)``. The binarised windows are then
    accumulated into a single integer vector.

    A window is produced for every symbol, including the first N-1 symbols,
    where the unfilled slots still hold all-zero vectors and signature 0.

    Args:
        buffer: Sequence of symbols (the text) to encode.
        letters_im: Item memory; unseen symbols are added via lookup_item_mem.
        N: Window length.
        D: Hypervector length.
        m: Accepted for interface compatibility; not used in the body.
        window_write_en: When true, 'lang_after_thr1.txt' is truncated and
            each binarised window is appended to it as one line of digits.

    Returns:
        ``(letters_im, sum_hv, window_cnt)`` - the item memory, the
        un-thresholded accumulated vector, and the number of windows summed.
        Thresholding of the accumulated vector is left to the caller.
    """
    dump_path = 'lang_after_thr1.txt'

    hv_window = np.zeros((N, D), dtype = int)
    sig_window = np.zeros(N, dtype=int)
    text_hv = np.zeros(D, dtype = int)
    n_windows = 0

    if window_write_en:
        # Start from an empty dump file.
        open(dump_path, 'w').close()

    for symbol in buffer:
        # Shift the history by one slot and place the newest symbol at index 0.
        hv_window = np.roll(hv_window, 1, axis=0)
        sig_window = np.roll(sig_window, 1)
        letters_im, hv_window[0] = lookup_item_mem(letters_im, symbol, D)
        sig_window[0] = letters_im['S'+ symbol]

        acc = np.zeros(D, dtype = int)
        for k in range(N):
            # XOR of every signature in the window except the one at slot k.
            others = 0
            for j in range(N):
                if j == k:
                    continue
                others = np.bitwise_xor(others, sig_window[j])
            shift = k + others
            acc = acc + np.roll(hv_window[k], shift)

        # Keep positions where at least two rotated vectors overlap.
        window_bits = binarize_hv_thr(acc, 2)

        if window_write_en:
            with open(dump_path, "a") as h:
                h.write("".join(map(str, window_bits)) + "\n")

        # Bundle the windows together into a text hv.
        text_hv = text_hv + window_bits
        n_windows += 1

    return letters_im, text_hv, n_windows

In [10]:
def binarize_language_hv(lang_am, thr, lang_labels=None):
    """Binarise selected language hypervectors of ``lang_am`` in place.

    Args:
        lang_am: Dict mapping language label -> accumulated hypervector.
        thr: Target density handed to ``binarize_hv_sort``.
        lang_labels: Labels to process. Defaults to the seven labels the
            original implementation hard-coded, so existing two-argument
            calls behave as before.

    Returns:
        ``lang_am`` with the selected entries replaced by their binarised
        form.

    Raises:
        KeyError: If a requested label is missing from ``lang_am``.
    """
    if lang_labels is None:
        lang_labels = ['bul','ces','dan','deu','ell','eng','spa']
    for label in lang_labels:
        lang_am[label] = binarize_hv_sort(lang_am[label], thr)
    return lang_am

# Training

In [11]:
def build_memories(N, D, m, lang_labels, item_mem):
    """Train one language hypervector per label from './training_texts/'.

    For each label the file './training_texts/<label>.txt' is read and encoded
    with ``window_sum``. A copy of the raw accumulated vector is kept, and a
    version binarised to a target density of 0.3 is stored alongside it.

    Args:
        N, D, m: Window size, hypervector length and signature size, passed
            through to ``window_sum``.
        lang_labels: Iterable of language labels to train.
        item_mem: Item memory; updated by ``window_sum`` as symbols are seen.

    Returns:
        ``(item_mem, lang_am, lang_am_full)`` - the item memory, the binarised
        language vectors, and the un-thresholded accumulated vectors.
    """
    lang_am, lang_am_full = {}, {}

    for label in lang_labels:
        print('Reading file: ' + label)
        with open('./training_texts/' + label + '.txt') as f:
            buffer = f.read()

        item_mem, lang_hv, window_cnt = window_sum(buffer, item_mem, N, D, m, False)

        # binarize_hv_sort overwrites its argument, so snapshot the raw sums first.
        lang_am_full[label] = lang_hv.copy()
        lang_am[label] = binarize_hv_sort(lang_hv, 0.3)

        density = np.sum(lang_am[label])/D
        print('Density' + label + ': ' + str(density))

    return item_mem, lang_am, lang_am_full

In [12]:
def test(item_mem, lang_am, N, D, m):
    total = 0
    correct = 0
    lang_labels = ['afr', 'bul', 'ces', 'dan', 'nld', 'deu', 'eng', 'est', 'fin', 'fra', 'ell', 'hun', 'ita', 'lav', 'lit', 'pol', 'por', 'ron', 'slk', 'slv', 'spa', 'swe']
    lang_map = dict()
    lang_map['af'] = 'afr'
    lang_map['bg'] = 'bul'
    lang_map['cs'] = 'ces'
    lang_map['da'] = 'dan'
    lang_map['nl'] = 'nld'
    lang_map['de'] = 'deu'
    lang_map['en'] = 'eng'
    lang_map['et'] = 'est'
    lang_map['fi'] = 'fin'
    lang_map['fr'] = 'fra'
    lang_map['el'] = 'ell'
    lang_map['hu'] = 'hun'
    lang_map['it'] = 'ita'
    lang_map['lv'] = 'lav'
    lang_map['lt'] = 'lit'
    lang_map['pl'] = 'pol'
    lang_map['pt'] = 'por'
    lang_map['ro'] = 'ron'
    lang_map['sk'] = 'slk'
    lang_map['sl'] = 'slv'
    lang_map['es'] = 'spa'
    lang_map['sv'] = 'swe'

    for file in os.listdir('./testing_texts/'):
        if file.endswith('.txt'):
            actual_label = file[0:2]
            predict_lang = ''

            file_address = './testing_texts/' + file
            with open(file_address) as f:
                buffer = f.read()
            print('Loaded testing text file: ' + str(file_address))
            bufferlength = len(buffer)
            print("Buffer length: " + str(bufferlength))

            item_mem_n, text_hv, window_cnt = window_sum(buffer, item_mem, N, D, m)
            text_hv = binarize_hv_thr(text_hv, 0.005*window_cnt)
            # text_hv = binarize_hv_thr(text_hv, 0.03*window_cnt)
            # text_hv = binarize_hv_sort(text_hv, 0.06)
            print('Density: '+ str(np.sum(text_hv)/D))
            if (item_mem != item_mem_n):
                print('\n>>>> NEW UNSEEN ITEM IN TEST FILE <<<<\n')
                exit()
            else:
                max_angle = -1
                # max_angle = 10000
                for j in range(len(lang_labels)):
                    if (lang_labels[j] in lang_am.keys()):
                        # angle = cos_angle(lang_am[lang_labels[j]], text_hv)     # Cosine similarity
                        angle = np.sum(np.logical_and(lang_am[lang_labels[j]], text_hv))       # Overlap
                        if (angle > max_angle):
                        # if (angle < max_angle):
                            max_angle = angle
                            predict_lang = lang_labels[j]
                
                if (predict_lang == lang_map[actual_label]):
                    correct += 1
                else:
                    print(str(lang_map[actual_label]) + '-->' + predict_lang)
                total += 1
                print('Accuracy: ' + str(correct / total))
    return correct / total

In [13]:
def import_im(D):
    """Load the 28-symbol item memory from 'im_man_2k.txt'.

    One line is read per symbol, in the order a-z, space, newline. From each
    line the D characters that precede its final two characters are parsed as
    0/1 digits (presumably the final two are a terminator plus the newline -
    confirm against the file format).

    Args:
        D: Number of digits to take from each line.

    Returns:
        A dict holding, for every symbol ``c``, its hypervector under ``c``
        and its index (0..27) under ``'S' + c``, plus ``'signatures'`` set to
        ``array([28, 29, 30, 31])`` - the values left for symbols added later.
    """
    letters = ['a','b','c','d','e','f','g','h','i','j','k','l','m','n','o','p','q','r','s','t','u','v','w','x','y','z',' ','\n']
    item_mem = dict()

    # The context manager closes the file even if a line fails to parse.
    with open("im_man_2k.txt", "r") as f:
        for n, letter in enumerate(letters):
            line = f.readline()
            digits = line[-(D+2):-2]
            item_mem[letter] = np.array([int(x) for x in digits])
            item_mem['S' + letter] = n

    item_mem['signatures'] = np.array([28, 29, 30, 31])
    return item_mem

In [14]:
# Language labels to train on; build_memories reads
# './training_texts/<label>.txt' for each entry.
lang_labels = ['afr', 'bul', 'ces', 'dan', 'nld', 'deu', 'eng', 'est', 'fin', 'fra', 'ell', 'hun', 'ita', 'lav', 'lit', 'pol', 'por', 'ron', 'slk', 'slv', 'spa', 'swe']
# Smaller label sets used for quick experiments:
# lang_labels = ['bul','ces','dan','deu','ell','eng','spa']
# lang_labels = ['afr', 'bul']

# Alternative: start from an empty item memory with random signatures.
# item_mem = dict()
# item_mem['signatures'] = gen_signatures(m)
# Load the fixed item memory from 'im_man_2k.txt' instead of generating it.
item_mem = import_im(D)
# print(item_mem)
# print(item_mem)

In [15]:
# Train: encode every training text. lang_am holds the binarised language
# vectors, lang_am_full the raw accumulated sums.
item_mem, lang_am, lang_am_full = build_memories(N,D,m,lang_labels, item_mem)

Reading file: afr
Densityafr: 0.30029296875
Reading file: bul
Densitybul: 0.2998046875
Reading file: ces
Densityces: 0.2998046875
Reading file: dan
Densitydan: 0.2998046875
Reading file: nld
Densitynld: 0.2998046875
Reading file: deu
Densitydeu: 0.2998046875
Reading file: eng
Densityeng: 0.2998046875
Reading file: est
Densityest: 0.2998046875
Reading file: fin
Densityfin: 0.30029296875
Reading file: fra
Densityfra: 0.2998046875
Reading file: ell
Densityell: 0.2998046875
Reading file: hun
Densityhun: 0.2998046875
Reading file: ita
Densityita: 0.2998046875
Reading file: lav
Densitylav: 0.2998046875
Reading file: lit
Densitylit: 0.2998046875
Reading file: pol
Densitypol: 0.2998046875
Reading file: por
Densitypor: 0.2998046875
Reading file: ron
Densityron: 0.2998046875
Reading file: slk
Densityslk: 0.2998046875
Reading file: slv
Densityslv: 0.2998046875
Reading file: spa
Densityspa: 0.2998046875
Reading file: swe
Densityswe: 0.2998046875


In [16]:
def export_am(lang_am, D):
    """Write the language vectors and their labels to text files.

    'lang_am_2k.txt' receives one line per entry of ``lang_am`` (in dict
    order), each the vector's elements concatenated as digits, with no
    trailing newline after the last line. 'lang_am_labels.txt' receives the
    ``str()`` of the list of keys in the same order.

    Args:
        lang_am: Dict mapping language label -> vector.
        D: Accepted for interface compatibility; not used in the body.
    """
    keys = list(lang_am.keys())
    rows = ["".join(str(x) for x in lang_am[key]) for key in keys]

    with open("lang_am_2k.txt", "w") as f:
        f.write("\n".join(rows))
    with open("lang_am_labels.txt", "w") as g:
        g.write(str(keys))
    return

In [17]:
def import_am(D, lang_labels):
    """Read language vectors back from 'lang_am_2k.txt'.

    One line is read per label, in the order given, and its first ``D``
    characters are parsed as digits.

    Args:
        D: Number of digits to take from the start of each line.
        lang_labels: Labels to assign to the lines, in file order.

    Returns:
        Dict mapping each label to an ndarray of its digits.
    """
    lang_am = dict()
    # The context manager closes the file even if a line fails to parse.
    with open("lang_am_2k.txt", "r") as f:
        for label in lang_labels:
            digits = f.readline()[0:D]
            lang_am[label] = np.array([int(x) for x in digits])
    return lang_am

In [18]:
# Write the binarised language vectors to 'lang_am_2k.txt' and their labels
# to 'lang_am_labels.txt'.
export_am(lang_am, D)
# Round-trip checks, kept for reference:
# item_mem2 = import_im(D)
# print(item_mem2)
# lang_am = import_am(D, lang_labels)
# print(lang_am2)

In [19]:
# Persist the un-thresholded language sums. The saved object is a dict, so
# reading it back needs allow_pickle and .item(), as in the commented
# np.load lines below.
# np.save('item_mem.npy', item_mem)
# np.save('lang_am.npy', lang_am)
np.save('lang_am_2k_full.npy', lang_am_full)
# item_mem = np.load('item_mem.npy', allow_pickle='TRUE').item()
# lang_am = np.load('lang_am.npy', allow_pickle='TRUE').item()

# lang_am = np.copy(lang_am2)
# lang_am = binarize_language_hv(lang_am_full, 0.3)
# Show the binarised language vectors (NumPy abbreviates each array).
print(lang_am)
# item_mem = dict()
# lang_am = dict()

{'afr': array([0, 0, 0, ..., 0, 0, 0]), 'bul': array([0, 0, 0, ..., 0, 0, 0]), 'ces': array([0, 0, 0, ..., 1, 0, 0]), 'dan': array([0, 1, 0, ..., 1, 0, 0]), 'nld': array([0, 0, 0, ..., 0, 0, 0]), 'deu': array([0, 0, 0, ..., 0, 1, 0]), 'eng': array([0, 0, 0, ..., 0, 1, 0]), 'est': array([0, 0, 0, ..., 1, 0, 0]), 'fin': array([0, 0, 0, ..., 0, 0, 0]), 'fra': array([0, 0, 0, ..., 0, 0, 0]), 'ell': array([0, 0, 0, ..., 1, 0, 0]), 'hun': array([1, 0, 0, ..., 1, 0, 0]), 'ita': array([0, 0, 0, ..., 1, 0, 0]), 'lav': array([0, 0, 0, ..., 1, 0, 0]), 'lit': array([1, 0, 0, ..., 1, 1, 0]), 'pol': array([0, 0, 0, ..., 0, 0, 0]), 'por': array([1, 0, 0, ..., 0, 0, 0]), 'ron': array([1, 0, 0, ..., 0, 0, 0]), 'slk': array([0, 0, 0, ..., 1, 0, 0]), 'slv': array([0, 0, 0, ..., 0, 0, 0]), 'spa': array([1, 0, 0, ..., 0, 0, 0]), 'swe': array([0, 1, 0, ..., 0, 0, 0])}


In [20]:
# Evaluate on every file in './testing_texts/'; the cell's value is the
# overall accuracy.
test(item_mem, lang_am, N, D, m)

Loaded testing text file: ./testing_texts/bg_0_p.txt
Buffer length: 82
Density: 0.087890625
Accuracy: 1.0
Loaded testing text file: ./testing_texts/bg_100_p.txt
Buffer length: 132
Density: 0.10009765625
Accuracy: 1.0
Loaded testing text file: ./testing_texts/bg_101_p.txt
Buffer length: 300
Density: 0.0654296875
Accuracy: 1.0
Loaded testing text file: ./testing_texts/bg_102_p.txt
Buffer length: 173
Density: 0.1455078125
Accuracy: 1.0
Loaded testing text file: ./testing_texts/bg_103_p.txt
Buffer length: 133
Density: 0.1240234375
Accuracy: 1.0
Loaded testing text file: ./testing_texts/bg_104_p.txt
Buffer length: 196
Density: 0.1533203125
Accuracy: 1.0
Loaded testing text file: ./testing_texts/bg_105_p.txt
Buffer length: 100
Density: 0.09521484375
Accuracy: 1.0
Loaded testing text file: ./testing_texts/bg_106_p.txt
Buffer length: 241
Density: 0.056640625
Accuracy: 1.0
Loaded testing text file: ./testing_texts/bg_107_p.txt
Buffer length: 109
Density: 0.09716796875
Accuracy: 1.0
Loaded testi

0.9528548978522787

In [21]:
# other cdt, superimpose part of buffer then cdt then superimpose those
# item_mem = np.load('item_mem_all_500.npy', allow_pickle='TRUE').item()
# lang_am = np.load('lang_am_all_500_full.npy', allow_pickle='TRUE').item()

# test = lang_am_full['bul'].copy()

# lang_am['afr'] = binarize_hv_thr(lang_am_full['afr'], 0.03*949192)
# lang_am['bul'] = binarize_hv_thr(test, 0.025*1074672)
# lang_am['afr'] = binarize_hv_sort(lang_am['afr'], 0.06)
# lang_am['bul'] = binarize_hv_sort(lang_am['bul'], 0.06)
# for i in lang_am.keys():
#   lang_am[i] = binarize_hv_sort(lang_am[i], 0.3)
# print(np.sum(item_mem['a'])/D)
# test = [1,3,4,4]
# arr = np.array(test)
# print(arr)
# a = arr.copy()
# arr[0] = 3
# print(a)

In [22]:
# Report the fraction of ones in each binarised language vector.
for i in lang_am:
  print(i + ':' + str(np.sum(lang_am[i])/D))

afr:0.30029296875
bul:0.2998046875
ces:0.2998046875
dan:0.2998046875
nld:0.2998046875
deu:0.2998046875
eng:0.2998046875
est:0.2998046875
fin:0.30029296875
fra:0.2998046875
ell:0.2998046875
hun:0.2998046875
ita:0.2998046875
lav:0.2998046875
lit:0.2998046875
pol:0.2998046875
por:0.2998046875
ron:0.2998046875
slk:0.2998046875
slv:0.2998046875
spa:0.2998046875
swe:0.2998046875


In [23]:
def buffer_to_binary(buffer):
    """Translate text into newline-separated 6-bit symbol codes.

    Each symbol is replaced by its index in the 28-symbol alphabet (a-z,
    space, newline) written as a zero-padded 6-digit binary string. Codes are
    separated by "\\n" with no trailing separator.

    Returns:
        The encoded string, or ``None`` (after printing an error) as soon as
        a symbol outside the alphabet is met.
    """
    letters = ['a','b','c','d','e','f','g','h','i','j','k','l','m','n','o','p','q','r','s','t','u','v','w','x','y','z',' ','\n']
    codes = []
    for symbol in buffer:
        try:
            position = letters.index(symbol)
        except ValueError:
            print("Error: unknown letter encountered")
            return None
        # Six binary digits, left-padded with zeros.
        codes.append(format(position, '06b'))
    return "\n".join(codes)

In [24]:
def test_hw_small(item_mem, lang_am, N, D, m):

    predict_lang = ''

    file_address = './lang_data_text.txt'
    with open(file_address) as f:
        buffer = f.read()
    print('Loaded testing text file: ' + str(file_address))
    bufferlength = len(buffer)
    print("Buffer length: " + str(bufferlength))

    # Generate lang_data binary input for hardware
    lang_data = buffer_to_binary(buffer)
    if lang_data is None:
        print("Error")
        return
    f = open("lang_data.txt", "w")
    f.write(lang_data)
    f.close()

    window_write_en = True
    item_mem_n, text_hv, window_cnt = window_sum(buffer, item_mem, N, D, m, window_write_en)

    # Store after acc2 hv in file
    # acc2_hv = "".join(str(x) for x in text_hv)
    # g = open("lang_after_acc2.txt", "w")
    # g.write(acc2_hv)
    # g.close()

    text_hv = binarize_hv_thr(text_hv, 0.005*window_cnt)
    # text_hv = binarize_hv_thr(text_hv, 0.03*window_cnt)
    # text_hv = binarize_hv_sort(text_hv, 0.06)
    print("Window cnt: " + str(window_cnt))
    
    # Store encoded hv in file
    encoded_hv = "".join(str(x) for x in text_hv)
    g = open("lang_encoded_2k.txt", "w")
    g.write(encoded_hv)
    g.close()

    print('Density: '+ str(np.sum(text_hv)/D))
    if (item_mem != item_mem_n):
        print('\n>>>> NEW UNSEEN ITEM IN TEST FILE <<<<\n')
        exit()
    else:
        max_angle = -1
        predict_num = 0
        # max_angle = 10000
        for j in range(len(lang_labels)):
            if (lang_labels[j] in lang_am.keys()):
                # angle = cos_angle(lang_am[lang_labels[j]], text_hv)     # Cosine similarity
                angle = np.sum(np.logical_and(lang_am[lang_labels[j]], text_hv))       # Overlap
                if (angle > max_angle):
                # if (angle < max_angle):
                    max_angle = angle
                    predict_lang = lang_labels[j]
                    predict_num = j
        
        print('Prediction: ' + str(predict_lang) + ', Num: ' + str(predict_num) + ', Overlap: ' + str(max_angle))

    return

In [27]:
# Encode and classify './lang_data_text.txt', writing the intermediate dump
# files as a side effect.
test_hw_small(item_mem,lang_am,N,D,m)

Loaded testing text file: ./lang_data_text.txt
Buffer length: 26
Window cnt: 26
Density: 0.02734375
Prediction: eng, Num: 6, Overlap: 40
