In [None]:
import random

import pandas as pd
from hdlib.arithmetic import bundle, bind, permute
from hdlib.space import Vector, Space
from sklearn import metrics
from sklearn.model_selection import KFold
from SmilesPE.pretokenizer import atomwise_tokenizer

In [None]:
# Item memory: the hdlib Space that holds the vectors for the SMILES tokens
# and is handed to the encoders as their lookup table.
memory = Space()

In [None]:
# Load the Tox21 table and keep only rows that carry an NR-ER-LBD label,
# renumbering the index after the drop.
df = (
    pd.read_csv("smiles_code/tox21.csv")
    .dropna(subset=["NR-ER-LBD"])
    .reset_index(drop=True)
)

In [None]:
# Accumulators for the dataset pass:
#   all_entries - (label, SMILES) pairs
#   to_insert   - tokens to register in the item memory
all_entries, to_insert = [], []

In [None]:
# Rebuild both lists from scratch so that re-running this cell never appends
# a second copy of the dataset to them.
all_entries = []
to_insert = []

# Walk the label and SMILES columns in lockstep (cheaper than iterrows, which
# materialises a Series per row).
for val, smiles in zip(df["NR-ER-LBD"], df["smiles"]):
    all_entries.append((val, smiles))
    # Every token occurrence is recorded; repeats are expected here.
    to_insert.extend(atomwise_tokenizer(smiles))

In [None]:
# to_insert repeats a token for every occurrence in the dataset. Register each
# distinct token exactly once (first-seen order) instead of relying on how the
# item memory treats repeated names.
memory.bulk_insert(list(dict.fromkeys(to_insert)))

In [None]:
# Shuffle with a dedicated, seeded generator so the order of all_entries (and
# with it the 80/20 split and the contents of every index-based fold taken
# from this list later) is the same on each Restart & Run All.
SPLIT_SEED = 40
random.Random(SPLIT_SEED).shuffle(all_entries)

# First 80% of the shuffled entries vs. the remaining 20%.
split_index = int(0.8 * len(all_entries))
sample_80 = all_entries[:split_index]
sample_20 = all_entries[split_index:]

In [None]:
# SMILES strings of each class among the first len(sample_80) entries.
zero_vecs = [smi for lbl, smi in all_entries[:len(sample_80)] if lbl == 0]
one_vecs = [smi for lbl, smi in all_entries[:len(sample_80)] if lbl == 1]

In [None]:
def encode_sample(sample, shared_space):
    """Build one class prototype vector from a collection of SMILES strings.

    Every distinct SMILES string in ``sample`` is turned into a molecule
    vector with ``encode_smi``; the molecule vectors are then bundled into a
    single prototype.

    Args:
        sample: Iterable of SMILES strings that belong to one class.
        shared_space: Item memory forwarded unchanged to ``encode_smi``.

    Returns:
        The bundle of all molecule vectors. If ``sample`` contains a single
        distinct molecule, that molecule's vector is returned as is.

    Raises:
        ValueError: If ``sample`` is empty.
    """
    # Keyed by SMILES string so a molecule listed twice contributes once.
    str_vec = dict()
    for smiles in sample:
        if smiles not in str_vec:
            # Single-token molecules are handled by encode_smi as well, so no
            # molecule can end the loop early any more.
            str_vec[smiles] = encode_smi(smiles, shared_space)

    mol_vecs = list(str_vec.values())
    if not mol_vecs:
        raise ValueError("encode_sample() needs at least one SMILES string")

    # Bundling (superposition) keeps the prototype similar to its members,
    # which is what the distance comparison against test molecules relies on;
    # binding them would give a vector dissimilar to every member.
    class_vec = mol_vecs[0]
    for mol_vec in mol_vecs[1:]:
        class_vec = bundle(class_vec, mol_vec)
    return class_vec
    

In [None]:
def encode_smi(smiles, shared_space):
    """Encode one SMILES string as a single position-aware vector.

    The string is split with ``atomwise_tokenizer``. The vector of the token
    at position ``i`` is rotated by ``i`` and all rotated vectors are bound
    together, left to right.

    Args:
        smiles: SMILES string to encode.
        shared_space: Item memory queried with
            ``shared_space.get(names=[token])`` for each token.

    Returns:
        The bound vector. For a single-token string this is the token vector
        returned by ``shared_space`` itself.

    Raises:
        ValueError: If tokenizing ``smiles`` yields no tokens.
    """
    tokens = atomwise_tokenizer(smiles)
    if not tokens:
        raise ValueError(f"no tokens found in SMILES string {smiles!r}")

    result = None
    for position, token in enumerate(tokens):
        stored = shared_space.get(names=[token])[0]
        # hdlib.arithmetic.permute returns a rotated copy. Vector.permute
        # rotates in place, which would shift the item-memory entry further on
        # every lookup and make a token that repeats inside one molecule bind
        # with itself. A rotation by 0 is the identity, so position 0 uses the
        # stored vector directly.
        positioned = permute(stored, rotate_by=position) if position else stored
        result = positioned if result is None else bind(result, positioned)

    return result

In [None]:
# Number of cross-validation folds (used as n_splits for KFold).
K = 5

In [None]:
# Shuffled K-fold splitter; the fixed random_state keeps the index assignment
# of each fold repeatable across runs.
kf = KFold(n_splits=K, shuffle=True, random_state=40)

In [None]:
# Evaluate the nearest-prototype classifier fold by fold: build one class
# vector per label from the training part, predict the held-out part by
# cosine distance, then print that fold's confusion matrix and report.
labels = [0, 1]
for fold_idx, (training_indices, testing_indices) in enumerate(kf.split(all_entries)):
    train_data = [all_entries[idx] for idx in training_indices]
    test_data = [all_entries[idx] for idx in testing_indices]

    # Partition the training SMILES by label in a single pass.
    zero_vecs, one_vecs = [], []
    for lbl, smi in train_data:
        if lbl == 0:
            zero_vecs.append(smi)
        elif lbl == 1:
            one_vecs.append(smi)

    zero_cv = encode_sample(zero_vecs, memory)
    one_cv = encode_sample(one_vecs, memory)

    real_all, pred_all = [], []
    for lbl, smiles in test_data:
        vec_rep = encode_smi(smiles, memory)
        dist0 = vec_rep.dist(zero_cv, method="cosine")
        dist1 = vec_rep.dist(one_cv, method="cosine")
        # Only a strictly smaller distance to the class-0 vector selects
        # class 0; everything else (ties included) is predicted as class 1.
        if dist0 < dist1:
            pred = 0
        else:
            pred = 1
        real_all.append(lbl)
        pred_all.append(pred)

    print(f"\t\tIteration {fold_idx + 1}")
    cm = metrics.confusion_matrix(real_all, pred_all, labels=labels)
    row_names = [f"True {label}" for label in labels]
    col_names = [f"Pred {label}" for label in labels]
    df_cm = pd.DataFrame(cm, index=row_names, columns=col_names)
    print("Confusion Matrix (cross-validated):")
    print(df_cm)
    print("\nClassification Report:")
    print(metrics.classification_report(real_all, pred_all, labels=labels, digits=4))