## NeuroAlign - Test

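Unit-testing notebook for the NeuroAlign data pipeline: FASTA loading, one-hot sequence encoding, and construction of model inputs and targets.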

In [5]:
import numpy as np
import Config as config
import Data as data


# Load the three FASTA fixtures used by every check below. All of them are
# read with the same flags (gaps enabled, lower-case letters allowed).
bb11001, bb11002, bb11003 = (
    data.Fasta(fasta_path, gaps=True, contains_lower_case=True)
    for fasta_path in (
        "./data/BB11001.fasta",
        "./data/BB11002.fasta",
        "./data/BB11003.fasta",
    )
)

def test_property(x, y):
    assert x == y, "should be " + str(y) + " but is " + str(x)
    
def test_markers(s, lengths):
    for seq, l in zip(s, lengths):
        test_property(np.argmax(seq[0]), data.GAP_MARKER)
        test_property(np.argmax(seq[1]), data.START_MARKER)
        test_property(np.argmax(seq[l+2]), data.END_MARKER)
        assert np.all(seq[(l+3):] == 0), "invalid padding"
        
# Lookup table from residue symbol to its index; the gap symbol "-" is placed
# directly after the regular alphabet.
aa_to_index = {
    symbol: index for index, symbol in enumerate(data.ALPHABET + ["-"])
}


def test_aminoacid(vec, aa):
    """Assert that ``vec`` peaks at the index belonging to residue ``aa``.

    ``aa`` is upper-cased before the lookup, so lower-case reference
    letters are accepted.
    """
    observed_index = np.argmax(vec)
    expected_index = aa_to_index[aa.upper()]
    test_property(observed_index, expected_index)
def test_sequence(seq, seq_ref):
    for s,sr in zip(seq[2:], seq_ref):
        test_aminoacid(s,sr)

# Encode every sequence of each alignment ...
full_seq_1, full_seq_2, full_seq_3 = (
    fasta.one_hot_sequences() for fasta in (bb11001, bb11002, bb11003)
)
# ... and selected subsets (presumably addressed by sequence index; the shape
# checks that follow expect one encoded sequence per listed entry).
part_seq_1, part_seq_2, part_seq_3 = (
    fasta.one_hot_sequences(subset)
    for fasta, subset in (
        (bb11001, [0, 1]),
        (bb11002, [3, 4, 5]),
        (bb11003, [1, 3]),
    )
)

# Axis 0 holds one entry per encoded sequence: every sequence of the file for
# the full encodings, and one per requested entry for the subsets.
test_property(full_seq_1.shape[0], 4)
test_property(full_seq_2.shape[0], 8)
test_property(full_seq_3.shape[0], 4)
test_property(part_seq_1.shape[0], 2)
test_property(part_seq_2.shape[0], 3)
test_property(part_seq_3.shape[0], 2)

# Axis 1 is expected to be the longest residue count plus three marker
# positions (gap, start, end - see test_markers): 91 + 3 = 94 for all four
# BB11001 sequences, 85 + 3 = 88 for its first two.
test_property(full_seq_1.shape[1], 94)
test_property(part_seq_1.shape[1], 88)

# Marker positions and padding, given the expected residue count of each
# sequence in file order.
test_markers(full_seq_1, [83, 85, 91, 86])
test_markers(full_seq_2, [58, 193, 83, 52, 83, 101, 134, 80])
test_markers(full_seq_3, [446, 516, 504, 414])

# Residue-level checks: each encoded sequence is compared position by position
# with its expected residue string (lower-case here; test_aminoacid upper-cases
# each letter before the lookup).
test_sequence(full_seq_1[0], "gkgdpkkprgkmssyaffvqtsreehkkkhpdasvnfsefskkcserwktmsakekgkfedmakadkaryeremktyippkge")
test_sequence(full_seq_1[1], "mqdrvkrpmnafivwsrdqrrkmalenprmrnseiskqlgyqwkmlteaekwpffqeaqklqamhrekypnykyrprrkakmlpk")
test_sequence(full_seq_1[2], "mkklkkhpdfpkkpltpyfrffmekrakyaklhpemsnldltkilskkykelpekkkmkyiqdfqrekqefernlarfredhpdliqnakk")
test_sequence(full_seq_1[3], "mhikkplnafmlymkemranvvaestlkesaainqilgrrwhalsreeqakyyelarkerqlhmqlypgwsardnygkkkkrkrek")
# Only sequences 0, 1, 2 and 5 of BB11002 are spot-checked.
test_sequence(full_seq_2[0], "nlfvalydfvasgdntlsitkgeklrvlgynhngewceaqtkngqgwvpsnyitpvns")
test_sequence(full_seq_2[1], "plallldsslegefdlvqriiyevddpslpndegitalhnavcaghteivkflvqfgvnvnaadsdgwtplhcaascnnvqvckflvesgaavfamtysdmqtaadkceemeegytqcsqflygvqekmgimnkgviyalwdyepqnddelpmkegdcmtiihrededeiewwwarlndkegyvprnllglyp")
test_sequence(full_seq_2[2], "aegyqyralydykkereedidlhlgdiltvnkgslvalgfsdgqearpeeigwlngynettgergdfpgtyveyigrkkispp")
test_sequence(full_seq_2[5], "adrklcadqecshpismavalqdymapdcrfltihrgqvvyvfsklkgrgrlfwggsvqgdyygdlaarlgyfpssivredqtlkpgkvdvktdkwdfycq")

# Subset encodings: part_seq_1 was requested as [0, 1] of BB11001, so it must
# reproduce the same strings as full_seq_1[0] and full_seq_1[1]; part_seq_3
# was requested as [1, 3] of BB11003.
test_sequence(part_seq_1[0], "gkgdpkkprgkmssyaffvqtsreehkkkhpdasvnfsefskkcserwktmsakekgkfedmakadkaryeremktyippkge")
test_sequence(part_seq_1[1], "mqdrvkrpmnafivwsrdqrrkmalenprmrnseiskqlgyqwkmlteaekwpffqeaqklqamhrekypnykyrprrkakmlpk")
test_sequence(part_seq_3[0], "mtvepfrnepietfqteearramrealrrvreefgrhyplyiggewvdtkermvslnpsapsevvgttakagkaeaeaaleaawkafktwkdwpqedrsrlllkaaalmrrrkreleatlvyevgknwveasadvaeaidfieyyaraalryrypavevvpypgednesfyvplgagvviapwnfpvaiftgmivgpvavgntviakpaedavvvgakvfeifheagfppgvvnflpgvgeevgaylvehprirfinftgslevglkiyeaagrlapgqtwfkrayvetggknaiivdetadfdlaaegvvvsaygfqgqkcsaasrliltqgayepvlervlkraerlsvgpaeenpdlgpvvsaeqerkvlsyieigknegqlvlggkrlegegyfiaptvftevppkariaqeeifgpvlsvirvkdfaealevandtpygltggvysrkrehlewarrefhvgnlyfnrkitgalvgvqpfggfklsgtnaktgaldylrlflemkavaerf")
test_sequence(part_seq_3[1], "dellekakkvreawdvlrnattreknkaikkiaeklderrkeileanridvekarergvkeslvdrlalndkridexikacetviglkdpvgevidswvredglriarvrvpigpigiiyesrpnvtvettilalksgntillrggsdalnsnkaivsairealketeipessvefientdrslvlexirlreylslviprggyglisfvrdnatvpvletgvgnchifvdesadlkkavpviinaktqrpgtcnaaekllvhekiakeflpviveelrkhgvevrgcektreivpdvvpateddwpteyldliiaikvvknvdeaiehikkystghsesiltenysnakkfvseidaaavyvnastrftdggqfgfgaeigistqrfhargpvglrelttykfvvlgeyhvre")


# Build model inputs and targets for three selections and check that the
# "sequences" input equals the one-hot encoding without its first position
# along axis 1 (the position test_markers checks for the gap marker).
# All four sequences of BB11001:
input1, target1 = data.get_input_target_data(bb11001, [0,1,2,3], config.base_model)
assert np.all(input1["sequences"] == full_seq_1[:,1:,:])
# All eight sequences of BB11002:
input2, target2 = data.get_input_target_data(bb11002, list(range(8)), config.base_model)
assert np.all(input2["sequences"] == full_seq_2[:,1:,:])
# Entries 3, 4 and 5 of BB11002 only (compared against part_seq_2):
input3, target3 = data.get_input_target_data(bb11002, [3,4,5], config.base_model)
assert np.all(input3["sequences"] == part_seq_2[:,1:,:])

def test_column(col, col_as_string):
    """Assert that ``col`` equals the symbol frequencies of ``col_as_string``.

    The expected vector has ``len(data.ALPHABET) + 3`` entries. Each
    character of ``col_as_string`` (upper-cased, "-" for a gap) adds one
    count at its ``aa_to_index`` position, and the counts are then divided
    by the number of characters. The comparison uses NumPy's default
    closeness tolerances.

    Raises:
        AssertionError: if ``col`` is not element-wise close to the
            expected frequencies.
        KeyError: if a character has no entry in ``aa_to_index``.
    """
    expected = np.zeros(len(data.ALPHABET) + 3)
    for symbol in col_as_string:
        expected[aa_to_index[symbol.upper()]] += 1
    expected /= len(col_as_string)
    assert np.allclose(col, expected), (
        "column " + col_as_string + " has wrong distribution " + str(col)
    )
    
    
# Column checks for input1 (all four BB11001 sequences). Column 0 must peak at
# the start marker; each later column is compared with the symbols expected in
# it, one character per sequence ("-" for a gap).
test_property(np.argmax(input1["in_columns"][0]), data.START_MARKER)
test_column(input1["in_columns"][1], "--m-")
test_column(input1["in_columns"][2], "--k-")
test_column(input1["in_columns"][3], "--k-")
test_column(input1["in_columns"][4], "g-l-")
test_column(input1["in_columns"][11], "pvpi")
test_column(input1["in_columns"][12], "rkkk")
test_column(input1["in_columns"][59], "kkkq")
test_column(input1["in_columns"][80], "yyfw")

# Same checks for input2 (all eight BB11002 sequences).
test_property(np.argmax(input2["in_columns"][0]), data.START_MARKER)
test_column(input2["in_columns"][1], "-p------")
test_column(input2["in_columns"][3], "-a------")
test_column(input2["in_columns"][155], "kelkarep")
test_column(input2["in_columns"][162], "linwifil")
test_column(input2["in_columns"][163], "ghkkpshk")
test_column(input2["in_columns"][166], "-dl--nk-")

# Same checks for input3 (entries 3, 4 and 5 of BB11002).
test_property(np.argmax(input3["in_columns"][0]), data.START_MARKER)
test_column(input3["in_columns"][1], "--a")
test_column(input3["in_columns"][2], "--d")
test_column(input3["in_columns"][3], "--r")
test_column(input3["in_columns"][4], "-tk")
test_column(input3["in_columns"][39], "avv")
test_column(input3["in_columns"][40], "kvv")
test_column(input3["in_columns"][45], "gfk")
test_column(input3["in_columns"][46], "-ql")
test_column(input3["in_columns"][48], "--g")
test_column(input3["in_columns"][49], "-p-")
test_column(input3["in_columns"][60], "dvs")
test_column(input3["in_columns"][61], "nkv")

# The target columns must be the input columns shifted by one position:
# in_columns without its first entry equals out_columns without its last.
assert np.all(input1["in_columns"][1:] == target1["out_columns"][:-1])
assert np.all(input2["in_columns"][1:] == target2["out_columns"][:-1])
assert np.all(input3["in_columns"][1:] == target3["out_columns"][:-1])

def test_attention1(seq, pos, target_column):
    """Assert that ``target1["out_attention"][seq, pos]`` peaks at ``target_column``."""
    attention_row = target1["out_attention"][seq, pos]
    test_property(np.argmax(attention_row), target_column)
    
# Arguments are (sequence index, position within the sequence, expected
# column). NOTE(review): the expected columns look like in_columns indices
# minus one - e.g. sequence 2 has its first residue in in_columns[1] ("--m-")
# and is expected at column 0, sequence 0 has its first residue in
# in_columns[4] ("g-l-") and is expected at column 3 - confirm against
# data.get_input_target_data.
test_attention1(0,0,3)
test_attention1(0,1,4)
test_attention1(0,48,51)
test_attention1(1,0,6)
test_attention1(1,42,50)
test_attention1(1,56,64)
test_attention1(2,0,0)
test_attention1(2,38,40)

def test_attention3(seq, pos, target_column):
    """Assert that ``target3["out_attention"][seq, pos]`` peaks at ``target_column``."""
    attention_row = target3["out_attention"][seq, pos]
    test_property(np.argmax(attention_row), target_column)
    
# Arguments are (sequence index within the three-sequence subset, position
# within the sequence, expected column). The first-residue expectations agree
# with the input3 column checks: the third sequence starts in in_columns[1]
# ("--a") and is expected at column 0, the second starts in in_columns[4]
# ("-tk") and is expected at column 3.
test_attention3(0,0,12)
test_attention3(1,0,3)
test_attention3(2,0,0)
test_attention3(0,14,32)
test_attention3(1,49,53)