## How to use the UniRep mLSTM "babbler". This version demonstrates the 64-unit and the 1900-unit architecture. 

We recommend getting started with the 64-unit architecture as it is easier and faster to run, but has the same interface as the 1900-unit one.

Use the 64-unit or the 1900-unit model?

In [1]:
USE_FULL_1900_DIM_MODEL = False # if True use 1900 dimensional model, else use 64 dimensional one.

## Setup

In [2]:
import tensorflow as tf
import numpy as np

# Set seeds
tf.set_random_seed(42)
np.random.seed(42)

if USE_FULL_1900_DIM_MODEL:
    # Sync relevant weight files
    !aws s3 sync --no-sign-request --quiet s3://unirep-public/1900_weights/ 1900_weights/
    
    # Import the mLSTM babbler model
    from unirep import babbler1900 as babbler
    
    # Where model weights are stored.
    MODEL_WEIGHT_PATH = "./1900_weights"
    
else:
    # Sync relevant weight files
    !aws s3 sync --no-sign-request --quiet s3://unirep-public/64_weights/ 64_weights/
    
    # Import the mLSTM babbler model
    from unirep import babbler64 as babbler
    
    # Where model weights are stored.
    MODEL_WEIGHT_PATH = "./64_weights"

## Data formatting and management

Initialize UniRep, also referred to as the "babbler" in our code. You need to provide the batch size you will use and the path to the weight directory.

In [3]:
batch_size = 12
b = babbler(batch_size=batch_size, model_path=MODEL_WEIGHT_PATH)

  from ._conv import register_converters as _register_converters


In [4]:
from scipy.spatial import distance
# PLUS BESOIN DE get_rd_prot() JE CROIS
def get_rd_prot():    # Retourne une ligne aleatoire du fichier fullProtein.list
    lines = open("fullProtein.list").read().splitlines()
    return np.random.choice(lines).split() # return ("protein name", "protein class)

def get_prot_seq(file_name):
    f = open("dataset/fastas/" + file_name + ".fasta", "r")
    next(f)
    seq = ""
    for line in f:
        tmp = line.rstrip()    # Supprimer le "\n"
        seq += tmp
    f.close
    return seq

def get_avg_vec(seq):
    avg_vec = b.get_rep(seq)[0]
    return avg_vec

def get_concat_vec(seq):
    avg_vec = b.get_rep(seq)[0]
    fnl_hid_vec = b.get_rep(seq)[1]
    fnl_cell_vec = b.get_rep(seq)[2]
    seq_vec = np.concatenate((avg_vec, fnl_hid_vec, fnl_cell_vec))
    return seq_vec

def get_classe(searched_protein):
    for classe, protein_list in classes.items():
        for protein_name, seq in protein_list.items():
            if protein_name == searched_protein:
                return classe


def dic_init(avg = True):
    classes = dict()
    f = open("partialProtein.list", "r")
    for line in f:
        infos = line.split()
        protein = infos[0]    # Protein name
        classe = infos[1]     # Protein class
        if classe not in classes:
            classes[classe] = dict()
        if avg:
            classes[classe][protein] = get_avg_vec(get_prot_seq(protein))
        else:
            classes[classe][protein] = get_concat_vec(get_prot_seq(protein))
    return classes

def get_dist_intra(protein_dict): # Fonctionne 
    dist_intra = dict()
    for classe, protein_list in protein_dict.items():
        if classe not in dist_intra:
            dist_intra[classe] = dict()
        for protein_a, vec_a in protein_list.items():
            dist_intra[classe][protein_a] = (None, np.inf)
            for protein_b, vec_b in protein_list.items():
                if protein_a == protein_b:
                    continue
                dist = distance.euclidean(vec_a, vec_b)
                if dist < dist_intra[classe][protein_a][1]:
                    dist_intra[classe][protein_a] = (protein_b, dist)
    return dist_intra

def get_dist_extra(protein_dict): # A CODER 
    dist_extra = dict()
    for classe_a, protein_list_a in protein_dict.items():
        if classe_a not in dist_extra:
            dist_extra[classe_a] = dict()
        for protein_a, vec_a in protein_list_a.items():
            dist_extra[classe_a][protein_a] = (None, np.inf)
            for classe_b, protein_list_b in protein_dict.items():
                if classe_a == classe_b:
                    continue
                for protein_b, vec_b in protein_list_b.items():
                    dist = distance.euclidean(vec_a, vec_b)
                    if dist < dist_extra[classe_a][protein_a][1]:
                        dist_extra[classe_a][protein_a] = (protein_b, dist)
    return dist_extra
                    
                

In [5]:
# EXEMPLE VECTEURS
seq = "MRKGEELFTGVVPILVELDGDVNGHKFSVRGEGEGDATNGKLTLKFICTTGKLPVPWPTLVTTLTYGVQCFARYPDHMKQHDFFKSAMPEGYVQERTISFKDDGTYKTRAEVKFEGDTLVNRIELKGIDFKEDGNILGHKLEYNFNSHNVYITADKQKNGIKANFKIRHNVEDGSVQLADHYQQNTPIGDGPVLLPDNHYLSTQSVLSKDPNEKRDHMVLLEFVTAAGITHGMDELYK"
print(get_avg_vec(seq))
print(get_concat_vec(seq))

[-0.05927763  0.13672972 -0.12012561 -0.92642754 -0.00543742 -0.23650903
  0.0897011   0.22526713  0.3273445  -0.28901672  0.13982387  0.25751212
 -0.05055477 -0.07889696 -0.05144807 -0.04764353 -0.8712801   0.0408201
  0.12066931 -0.694603   -0.08486307  0.38025245 -0.39046776 -0.26605323
  0.11696819 -0.07166346 -0.10103704  0.09316302  0.56611663 -0.4857574
  0.18355957  0.31225762  0.21201386 -0.09779096  0.63104796  0.09026467
 -0.2454242   0.03384295  0.09200428 -0.10399538  0.02719643  0.21240036
 -0.04368882  0.00127077 -0.09775491  0.14389816 -0.91479826 -0.25973573
  0.18563841  0.22764802  0.4566749   0.02696512 -0.0083634  -0.06258812
  0.11238468 -0.00564607 -0.04524819 -0.01792673 -0.03315708  0.30691475
 -0.15224716 -0.04997362  0.43892825 -0.01767274]
[-5.92776276e-02  1.36729717e-01 -1.20125614e-01 -9.26427543e-01
 -5.43741928e-03 -2.36509025e-01  8.97011012e-02  2.25267127e-01
  3.27344507e-01 -2.89016724e-01  1.39823869e-01  2.57512122e-01
 -5.05547710e-02 -7.8896962

In [6]:
classes = dic_init()
print(classes)

{'a.1.1.2': {'d3sdha_': array([-0.07054725,  0.10115661, -0.13261783, -0.940131  , -0.01698289,
       -0.09205481,  0.08504727,  0.18527837,  0.39105144, -0.14565688,
        0.19388342,  0.28555447, -0.03018061,  0.12709962, -0.09605851,
       -0.05154177, -0.8121881 ,  0.03181541,  0.10113516, -0.55182105,
       -0.09193845,  0.3945473 , -0.33150598, -0.2733507 ,  0.15068373,
       -0.09027573, -0.15639597,  0.0129333 ,  0.4741566 , -0.60439694,
        0.15609662,  0.44052792,  0.2378565 , -0.13433918,  0.62674415,
        0.11638118, -0.20006928,  0.05692971,  0.01575945, -0.14919691,
        0.01402304,  0.19160947, -0.01479574,  0.1531309 , -0.03848166,
        0.18453752, -0.89245343, -0.31758446,  0.1823717 ,  0.2568044 ,
        0.46785805,  0.02946598, -0.02667353, -0.09683505,  0.08570692,
        0.01555479, -0.05862859, -0.06577273,  0.02900021,  0.36144042,
       -0.21110065, -0.032789  ,  0.45371523, -0.00468708], dtype=float32), 'd1scta_': array([-0.06008476,  0.10

In [9]:
dist_intra = get_dist_intra(classes)
print(dist_intra)

{'a.1.1.2': {'d3sdha_': ('d1sctb_', 0.28723201155662537), 'd1scta_': ('d1sctb_', 0.21436138451099396), 'd1sctb_': ('d1scta_', 0.21436138451099396)}, 'a.1.1.1': {'d1ux8a_': ('d1ngka_', 0.23578718304634094), 'd1ngka_': ('d1ux8a_', 0.23578718304634094)}, 'a.1.1.4': {'d1kr7a_': (None, inf)}}


In [8]:
dist_extra = get_dist_extra(classes)
print(dist_extra)

{'a.1.1.2': {'d3sdha_': ('d1kr7a_', 0.33625099062919617), 'd1scta_': ('d1ux8a_', 0.3595080077648163), 'd1sctb_': ('d1ux8a_', 0.3553575575351715)}, 'a.1.1.1': {'d1ux8a_': ('d1sctb_', 0.3553575575351715), 'd1ngka_': ('d1scta_', 0.37047824263572693)}, 'a.1.1.4': {'d1kr7a_': ('d3sdha_', 0.33625099062919617)}}


In [7]:
# Testing manually if dist intra/extra are correct
for key, value in classes.items():
    print(key)
    for key2, value2, in value.items():
        print("  ", key2)
prot1 = classes["a.1.1.2"]["d1sctb_"]
prot2 = classes["a.1.1.4"]["d1kr7a_"]
print(distance.euclidean(prot1, prot2))

a.1.1.2
   d3sdha_
   d1scta_
   d1sctb_
a.1.1.1
   d1ux8a_
   d1ngka_
a.1.1.4
   d1kr7a_
0.5299457907676697


In [59]:
from scipy.spatial import distance

f = open("fullProtein.list")

line = next(f).split()
studied_prot = get_seq(line[0])

same_class = []
rd_class = []

nb_seq_class = 5


for i in range (nb_seq_class): #Recuperer les noms de fichiers
    line = next(f).split()
    same_class.append(line[0])
    rd_protein =random_line("fullProtein.list")
    rd_class.append(line[0])
    
f.close

print(same_class)
print(rd_class)

seq_list1 = [] # Recuperer les sequences
dist1 = []
for i in range(nb_seq_class):
    prot = get_seq(same_class[0])
    seq_list1.append(prot)
    #dist1.append(distance.euclidean(studied_prot[1], prot[1]))

seq_list2 = []
dist2 = []
for i in range(nb_seq_class):
    prot = get_seq(rd_class[0])
    seq_list2.append(prot)
    #dist2.append(distance.euclidean(studied_prot[1], prot[1]))

print(seq_list1)

"""rd_protein = random_line("fullProtein.list")
print(random_protein)

file_name = "dataset/fastas/" + protein[0] + ".fasta"
rd_file_name = "dataset/fastas/" + rd_protein[0] + ".fasta"

seq, seq_vector = get_seq(file_name)
rd_seq, rd_seq_vector = get_seq(rd_file_name)

print("Studied sequence :")
print(seq + "\n")
print("Sequence size : " + str(len(seq)) + "\n")
print("Vector representation of the sequence :\n" + str(seq_vector) + "\n")
print("Vector size : " + str(len(seq_vector)) + "\n")
print("Is seq a valide sequence ?\n" + str(b.is_valid_seq(seq)) + "\n")
"""

NameError: name 'get_seq' is not defined