## How to use the UniRep mLSTM "babbler". This version demonstrates the 64-unit and the 1900-unit architecture. 

We recommend getting started with the 64-unit architecture as it is easier and faster to run, but has the same interface as the 1900-unit one.

Use the 64-unit or the 1900-unit model?

In [2]:
USE_FULL_1900_DIM_MODEL = False # if True use 1900 dimensional model, else use 64 dimensional one.

## Setup

In [3]:
import tensorflow as tf
import numpy as np

# Set seeds
tf.set_random_seed(42)
np.random.seed(42)

if USE_FULL_1900_DIM_MODEL:
    # Sync relevant weight files
    !aws s3 sync --no-sign-request --quiet s3://unirep-public/1900_weights/ 1900_weights/
    
    # Import the mLSTM babbler model
    from unirep import babbler1900 as babbler
    
    # Where model weights are stored.
    MODEL_WEIGHT_PATH = "./1900_weights"
    
else:
    # Sync relevant weight files
    !aws s3 sync --no-sign-request --quiet s3://unirep-public/64_weights/ 64_weights/
    
    # Import the mLSTM babbler model
    from unirep import babbler64 as babbler
    
    # Where model weights are stored.
    MODEL_WEIGHT_PATH = "./64_weights"

## Data formatting and management

Initialize UniRep, also referred to as the "babbler" in our code. You need to provide the batch size you will use and the path to the weight directory.

In [4]:
batch_size = 12
b = babbler(batch_size=batch_size, model_path=MODEL_WEIGHT_PATH)

  from ._conv import register_converters as _register_converters


In [35]:
def get_rd_prot():    # Retourne une ligne aleatoire du fichier fullProtein.list
    lines = open("fullProtein.list").read().splitlines()
    return np.random.choice(lines).split() # return ("protein name", "protein class)

def get_prot_seq(file_name):
    f = open("dataset/fastas/" + file_name + ".fasta", "r")
    next(f)
    seq = ""
    for line in f:
        tmp = line.rstrip()    # Supprimer le "\n"
        seq += tmp
    f.close
    return seq, seq_vector  # Return ("sequence", "vecteur associe)

def get_avg_vec(seq):
    avg_vec = b.get_rep(seq)[0]
    return avg_vec

def get_concat_vec(seq):
    avg_vec = b.get_rep(seq)[0]
    fnl_hid_vec = b.get_rep(seq)[1]
    fnl_cell_vec = b.get_rep(seq)[2]
    seq_vec = np.concatenate((avg_vec, fnl_hid_vec, fnl_cell_vec))
    return seq_vec

In [36]:
seq = "MRKGEELFTGVVPILVELDGDVNGHKFSVRGEGEGDATNGKLTLKFICTTGKLPVPWPTLVTTLTYGVQCFARYPDHMKQHDFFKSAMPEGYVQERTISFKDDGTYKTRAEVKFEGDTLVNRIELKGIDFKEDGNILGHKLEYNFNSHNVYITADKQKNGIKANFKIRHNVEDGSVQLADHYQQNTPIGDGPVLLPDNHYLSTQSVLSKDPNEKRDHMVLLEFVTAAGITHGMDELYK"
print(get_avg_vec(seq))
print(get_concat_vec(seq))

[-0.05927763  0.13672972 -0.12012561 -0.92642754 -0.00543742 -0.23650903
  0.0897011   0.22526713  0.3273445  -0.28901672  0.13982387  0.25751212
 -0.05055477 -0.07889696 -0.05144807 -0.04764353 -0.8712801   0.0408201
  0.12066931 -0.694603   -0.08486307  0.38025245 -0.39046776 -0.26605323
  0.11696819 -0.07166346 -0.10103704  0.09316302  0.56611663 -0.4857574
  0.18355957  0.31225762  0.21201386 -0.09779096  0.63104796  0.09026467
 -0.2454242   0.03384295  0.09200428 -0.10399538  0.02719643  0.21240036
 -0.04368882  0.00127077 -0.09775491  0.14389816 -0.91479826 -0.25973573
  0.18563841  0.22764802  0.4566749   0.02696512 -0.0083634  -0.06258812
  0.11238468 -0.00564607 -0.04524819 -0.01792673 -0.03315708  0.30691475
 -0.15224716 -0.04997362  0.43892825 -0.01767274]
[-5.92776276e-02  1.36729717e-01 -1.20125614e-01 -9.26427543e-01
 -5.43741928e-03 -2.36509025e-01  8.97011012e-02  2.25267127e-01
  3.27344507e-01 -2.89016724e-01  1.39823869e-01  2.57512122e-01
 -5.05547710e-02 -7.8896962

In [13]:
from scipy.spatial import distance

f = open("fullProtein.list")

line = next(f).split()
studied_prot = get_seq(line[0])

same_class = []
rd_class = []

nb_seq_class = 5


for i in range (nb_seq_class): #Recuperer les noms de fichiers
    line = next(f).split()
    same_class.append(line[0])
    rd_protein =random_line("fullProtein.list")
    rd_class.append(line[0])
    
f.close

print(same_class)
print(rd_class)

seq_list1 = [] # Recuperer les sequences
dist1 = []
for i in range(nb_seq_class):
    prot = get_seq(same_class[0])
    seq_list1.append(prot)
    #dist1.append(distance.euclidean(studied_prot[1], prot[1]))

seq_list2 = []
dist2 = []
for i in range(nb_seq_class):
    prot = get_seq(rd_class[0])
    seq_list2.append(prot)
    #dist2.append(distance.euclidean(studied_prot[1], prot[1]))

print(seq_list1)
print(seq_list2)

"""rd_protein = random_line("fullProtein.list")
print(random_protein)

file_name = "dataset/fastas/" + protein[0] + ".fasta"
rd_file_name = "dataset/fastas/" + rd_protein[0] + ".fasta"

seq, seq_vector = get_seq(file_name)
rd_seq, rd_seq_vector = get_seq(rd_file_name)

print("Studied sequence :")
print(seq + "\n")
print("Sequence size : " + str(len(seq)) + "\n")
print("Vector representation of the sequence :\n" + str(seq_vector) + "\n")
print("Vector size : " + str(len(seq_vector)) + "\n")
print("Is seq a valide sequence ?\n" + str(b.is_valid_seq(seq)) + "\n")
"""

['d1dlya_', 'd1s69a_', 'd1idra_', 'd1ngka_', 'd1ux8a_']
['d1dlya_', 'd1s69a_', 'd1idra_', 'd1ngka_', 'd1ux8a_']
[('SLFAKLGGREAVEAAVDKFYNKIVADPTVSTYFSNTDMKVQRSKQFAFLAYALGGASEWKGKDMRTAHKDLVPHLSDVHFQAVARHLSDTLTELGVPPEDITDAMAVVASTRTEVLNMPQQ', array([24,  7, 21, 18, 15,  4, 21, 13, 13,  2,  6, 15, 16,  6, 15, 15, 16,
        5,  4, 18, 19,  9,  4, 17, 16, 15,  5, 14,  8, 16,  7,  8, 19, 18,
        7,  9,  8,  5,  1,  4, 16, 10,  2,  7,  4, 10, 18, 15, 18, 21, 15,
       19, 15, 21, 13, 13, 15,  7,  6, 20,  4, 13,  4,  5,  1,  2,  8, 15,
        3,  4,  5, 21, 16, 14,  3, 21,  7,  5, 16,  3, 18, 10, 15, 16, 15,
        2,  3, 21,  7,  5,  8, 21,  8,  6, 21, 13, 16, 14, 14,  6,  5, 17,
        8,  5, 15,  1, 15, 16, 16, 15,  7,  8,  2,  8,  6, 16, 21,  9,  1,
       14, 10, 10])), ('SLFAKLGGREAVEAAVDKFYNKIVADPTVSTYFSNTDMKVQRSKQFAFLAYALGGASEWKGKDMRTAHKDLVPHLSDVHFQAVARHLSDTLTELGVPPEDITDAMAVVASTRTEVLNMPQQ', array([24,  7, 21, 18, 15,  4, 21, 13, 13,  2,  6, 15, 16,  6, 15, 15, 16,
        5,  4

'rd_protein = random_line("fullProtein.list")\nprint(random_protein)\n\nfile_name = "dataset/fastas/" + protein[0] + ".fasta"\nrd_file_name = "dataset/fastas/" + rd_protein[0] + ".fasta"\n\nseq, seq_vector = get_seq(file_name)\nrd_seq, rd_seq_vector = get_seq(rd_file_name)\n\nprint("Studied sequence :")\nprint(seq + "\n")\nprint("Sequence size : " + str(len(seq)) + "\n")\nprint("Vector representation of the sequence :\n" + str(seq_vector) + "\n")\nprint("Vector size : " + str(len(seq_vector)) + "\n")\nprint("Is seq a valide sequence ?\n" + str(b.is_valid_seq(seq)) + "\n")\n'