In [None]:
import copy
import random

import pandas as pd
from hdlib.arithmetic import bundle, bind
from hdlib.model import MLModel
from hdlib.space import Vector, Space
from sklearn import metrics
from SmilesPE.pretokenizer import atomwise_tokenizer

In [None]:
# Item memory: the hdlib Space that later cells fill with one entry per
# SMILES token and query by token name.
memory = Space()

In [None]:
# Read tox21.csv and keep only the rows that carry an NR-ER-LBD label,
# renumbering the index so it is contiguous again.
df = (
    pd.read_csv("tox21.csv")
    .dropna(subset=["NR-ER-LBD"])
    .reset_index(drop=True)
)

In [None]:
# (label, smiles) pairs, one per labelled row.
all_entries = []
# Every token of every SMILES string, used to populate the item memory.
to_insert = []

In [None]:
# Build both collections from scratch (instead of appending to lists created
# in another cell) so that re-running this cell does not add a second copy of
# every row and token.
smiles_column = df["smiles"].tolist()

# (label, smiles) pairs in row order.
all_entries = list(zip(df["NR-ER-LBD"].tolist(), smiles_column))

# Flat list of tokens over all molecules; repeated tokens are kept, exactly as
# the row-by-row loop produced them.
to_insert = [
    token
    for smiles in smiles_column
    for token in atomwise_tokenizer(smiles)
]

In [None]:
# Register every collected token in the item memory.
# NOTE(review): `to_insert` contains the same token many times; this relies on
# bulk_insert tolerating repeated names - confirm against the hdlib docs.
memory.bulk_insert(to_insert)

In [None]:
# Fixed seed so the train/test split is the same on every Restart & Run All.
SPLIT_SEED = 42
# Share of the shuffled entries that goes into the training sample.
TRAIN_FRACTION = 0.8

# Shuffle in place with a dedicated RNG (the global `random` state is left
# untouched). `all_entries` itself must stay shuffled because `sample_80` is
# its leading slice and `sample_20` the remainder.
# Re-running only this cell shuffles the already-shuffled list again, so run
# the notebook from the top to get the canonical split.
random.Random(SPLIT_SEED).shuffle(all_entries)
split_index = int(TRAIN_FRACTION * len(all_entries))
sample_80 = all_entries[:split_index]
sample_20 = all_entries[split_index:]

In [None]:
# Training SMILES strings grouped by their NR-ER-LBD label.
zero_vecs = [smiles for label, smiles in sample_80 if label == 0]
one_vecs = [smiles for label, smiles in sample_80 if label == 1]

In [None]:
def encode_sample(sample, shared_space):
    """Build one class vector from a collection of SMILES strings.

    Every distinct SMILES string in ``sample`` is encoded with ``encode_smi``
    and the resulting molecule vectors are combined, in first-seen order, with
    ``bind``.

    Parameters
    ----------
    sample : iterable of str
        SMILES strings belonging to one class. A string that occurs several
        times contributes a single molecule vector.
    shared_space : Space
        Item memory that holds a vector for every token in ``sample``.

    Returns
    -------
    Vector
        The bound class vector. When ``sample`` holds a single distinct
        SMILES string, that molecule's vector is returned as is.

    Raises
    ------
    ValueError
        If ``sample`` is empty.

    Notes
    -----
    NOTE(review): molecule vectors are combined with ``bind``; ``bundle`` is
    imported by the notebook but never used. Confirm that binding (rather
    than bundling) is the intended way to form a class prototype.
    """
    # dict.fromkeys drops repeated SMILES strings while keeping first-seen order.
    unique_smiles = list(dict.fromkeys(sample))
    if not unique_smiles:
        raise ValueError("cannot build a class vector from an empty sample")

    class_vec = None
    for smiles in unique_smiles:
        # encode_smi lives in a later cell; it only has to exist by the time
        # this function is called. Single-token molecules are handled there,
        # so they no longer end the whole loop early.
        mol_vec = encode_smi(smiles, shared_space)
        class_vec = mol_vec if class_vec is None else bind(class_vec, mol_vec)
    return class_vec
    

In [None]:
def encode_smi(smiles, shared_space):
    """Encode one SMILES string as a single vector.

    The string is split with ``atomwise_tokenizer``; the vector of the token
    at position ``i`` is rotated by ``i`` and all rotated token vectors are
    combined left to right with ``bind``.

    Parameters
    ----------
    smiles : str
        SMILES string to encode.
    shared_space : Space
        Item memory that holds a vector for every token of ``smiles``.

    Returns
    -------
    Vector
        The encoded molecule. For a single-token string this is a copy of
        that token's vector (rotation by 0).

    Raises
    ------
    ValueError
        If tokenizing ``smiles`` yields no tokens.
    """
    tokens = atomwise_tokenizer(smiles)
    if not tokens:
        raise ValueError(f"cannot encode a SMILES string with no tokens: {smiles!r}")

    result = None
    for position, token in enumerate(tokens):
        # Permute a private copy. `permute` is used here as an in-place
        # operation (its return value is never assigned), so calling it on
        # the object handed back by the space would rotate the item-memory
        # entry itself and make every later encoding of this token drift.
        # NOTE(review): presumably Space.get returns the stored objects -
        # confirm against the hdlib docs; the copy is harmless either way.
        token_vec = copy.deepcopy(shared_space.get(names=[token])[0])
        token_vec.permute(rotate_by=position)
        result = token_vec if result is None else bind(result, token_vec)
    return result

In [None]:
# One class vector per label, built from that label's training SMILES
# (label 0 first, then label 1).
zero_cv, one_cv = [
    encode_sample(smiles_group, memory)
    for smiles_group in (zero_vecs, one_vecs)
]

We now have the class vector for label 0 (`zero_cv`), the class vector for label 1 (`one_cv`), and the item memory (`memory`) created at the top of the notebook.

In [None]:
# Classify every held-out molecule by the class vector it is closer to
# (cosine distance). Results are keyed by SMILES string, so a string that
# occurs more than once in sample_20 keeps only its last (label, prediction).
real_vs_pred = {}
for label, smiles in sample_20:
    mol_vec = encode_smi(smiles, memory)
    dist_zero = mol_vec.dist(zero_cv, method="cosine")
    dist_one = mol_vec.dist(one_cv, method="cosine")
    # Class 0 wins only on a strictly smaller distance; ties go to class 1.
    if dist_zero < dist_one:
        predicted = 0
    else:
        predicted = 1
    real_vs_pred[smiles] = (label, predicted)

In [None]:
# (true label, predicted label) pairs, in the dict's insertion order.
rvp_vals = list(real_vs_pred.values())

In [None]:
# NOTE(review): desired_index is not referenced anywhere else in the visible
# notebook - confirm whether it is still needed.
desired_index = 1
# Split the (true, predicted) pairs into two parallel lists.
real = [actual for actual, _predicted in rvp_vals]
pred = [predicted for _actual, predicted in rvp_vals]

In [None]:
# Confusion matrix of true labels against predictions
# (scikit-learn puts true labels on rows and predictions on columns).
cm = metrics.confusion_matrix(real, pred)

In [None]:
# Show the confusion matrix as plain text.
print(cm)