## How to use the UniRep mLSTM "babbler". This version demonstrates the 64-unit and the 1900-unit architectures.

We recommend getting started with the 64-unit architecture as it is easier and faster to run, but has the same interface as the 1900-unit one.

Use the 64-unit or the 1900-unit model?

In [1]:
USE_FULL_1900_DIM_MODEL = False # Model selector: True loads the 1900-dimensional babbler and its weights, False the 64-dimensional one.

## Setup

In [2]:
import tensorflow as tf
import numpy as np
from scipy.spatial import distance

# Fix the TensorFlow and NumPy random seeds
tf.set_random_seed(42)
np.random.seed(42)

# Pick the babbler class and its weight directory according to the flag set in the first cell.
# Both branches bind the same two names (`babbler`, `MODEL_WEIGHT_PATH`) so the rest of the
# notebook does not need to know which model was selected.
if USE_FULL_1900_DIM_MODEL:
    # Shell command (IPython `!` syntax): sync s3://unirep-public/1900_weights/ into ./1900_weights/
    !aws s3 sync --no-sign-request --quiet s3://unirep-public/1900_weights/ 1900_weights/
    
    # Import the 1900-unit mLSTM babbler model under the common name `babbler`
    from unirep import babbler1900 as babbler
    
    # Directory where the synced model weights are stored.
    MODEL_WEIGHT_PATH = "./1900_weights"
    
else:
    # Shell command (IPython `!` syntax): sync s3://unirep-public/64_weights/ into ./64_weights/
    !aws s3 sync --no-sign-request --quiet s3://unirep-public/64_weights/ 64_weights/
    
    # Import the 64-unit mLSTM babbler model under the common name `babbler`
    from unirep import babbler64 as babbler
    
    # Directory where the synced model weights are stored.
    MODEL_WEIGHT_PATH = "./64_weights"

## Data formatting and management

Initialize UniRep, also referred to as the "babbler" in our code. You need to provide the batch size you will use and the path to the weight directory.

In [3]:
# Batch size handed to the babbler constructor below.
batch_size = 12
# Instantiate the babbler class selected in the setup cell, pointing it at the matching weight directory.
# `b` is the notebook-level model object used by get_avg_vec / get_concat_vec.
b = babbler(batch_size=batch_size, model_path=MODEL_WEIGHT_PATH)

  from ._conv import register_converters as _register_converters


In [4]:
def get_prot_seq(file_name):
    """Read a protein sequence from ``dataset/fastas/<file_name>.fasta``.

    The first line of the file (the header carrying the protein's name) is
    skipped. Every remaining line has its trailing whitespace removed and the
    pieces are joined into one string.

    Args:
        file_name: Base name of the FASTA file, without directory or extension.

    Returns:
        The sequence as a single string without line breaks.

    Raises:
        OSError: If the file cannot be opened (e.g. it does not exist).
        StopIteration: If the file is completely empty, i.e. there is no
            header line to skip.
    """
    # A context manager guarantees the handle is closed, even if reading fails.
    with open("dataset/fastas/" + file_name + ".fasta", "r") as fasta_file:
        next(fasta_file)  # Skip the header line (protein name)
        # rstrip() drops the trailing "\n" (and any other trailing whitespace)
        return "".join(line.rstrip() for line in fasta_file)

def get_avg_vec(seq):
    """Return element 0 of the babbler representation of `seq`.

    This notebook labels that element the "avg" vector. The notebook-level
    babbler instance `b` is used to compute the representation.
    """
    representation = b.get_rep(seq)
    return representation[0]

def get_concat_vec(seq):
    """Return the three babbler vectors of `seq` concatenated into one array.

    The representation returned by the notebook-level babbler `b` is indexed
    as element 0 (labelled "avg" in this notebook), element 1 ("final hidden")
    and element 2 ("final cell"); the three are concatenated in that order.

    Args:
        seq: Protein sequence string passed to ``b.get_rep``.

    Returns:
        A single NumPy array: avg + final hidden + final cell.
    """
    # Run the model once and reuse the result; calling get_rep separately for
    # each vector would repeat the same inference three times.
    representation = b.get_rep(seq)
    avg_vec = representation[0]       # Vector 1: avg
    fnl_hid_vec = representation[1]   # Vector 2: final hidden
    fnl_cell_vec = representation[2]  # Vector 3: final cell
    return np.concatenate((avg_vec, fnl_hid_vec, fnl_cell_vec))

def get_classe(searched_protein, protein_dict=None):
    """Return the category that contains `searched_protein`.

    Args:
        searched_protein: Protein name to look up (a key of one of the inner
            dictionaries).
        protein_dict: Nested mapping ``{category: {protein_name: ...}}`` to
            search. When omitted, the notebook-level ``classes`` dictionary is
            used, which keeps existing one-argument calls working.

    Returns:
        The key of the first category whose inner dictionary contains the
        protein, or ``None`` when no category contains it.
    """
    if protein_dict is None:
        # Fall back to the global built by dic_init() in a later cell.
        protein_dict = classes
    for classe, protein_list in protein_dict.items():
        # Dictionary membership test instead of scanning every key by hand.
        if searched_protein in protein_list:
            return classe
    return None


def dic_init(avg = True, list_path = "partialProtein.list"):
    """Build the nested dictionary of all proteins and their vectors.

    Each non-blank line of the listing file is split on whitespace; the first
    field is the protein name (also the FASTA base name handed to
    ``get_prot_seq``) and the second field is its category.

    Args:
        avg: If truthy, store the vector from ``get_avg_vec``; otherwise store
            the concatenated vector from ``get_concat_vec``.
        list_path: Path of the listing file. Defaults to
            ``"partialProtein.list"``, the file the notebook has always read.

    Returns:
        Nested mapping ``{category: {protein_name: vector}}``.

    Raises:
        OSError: If the listing file cannot be opened.
        IndexError: If a non-blank line has fewer than two fields.
    """
    # Choose the vectorizer once instead of re-testing `avg` on every line.
    vectorize = get_avg_vec if avg else get_concat_vec
    protein_vectors = dict()
    # Context manager so the listing file is always closed.
    with open(list_path, "r") as listing:
        for line in listing:  # One protein per line
            infos = line.split()
            if not infos:
                # Tolerate blank lines (e.g. a trailing newline at end of file).
                continue
            protein = infos[0]    # Protein name
            classe = infos[1]     # Protein category
            # setdefault creates the category's inner dictionary on first sight.
            protein_vectors.setdefault(classe, dict())[protein] = vectorize(get_prot_seq(protein))
    return protein_vectors

def get_dist_intra(protein_dict):
    """Find, for every protein, its nearest neighbour within its own category.

    Distances are Euclidean (``scipy.spatial.distance.euclidean``).

    Args:
        protein_dict: Nested mapping ``{category: {protein_name: vector}}``.

    Returns:
        Nested mapping ``{category: {protein_name: (nearest_name, distance)}}``.
        A protein with no other member in its category maps to
        ``(None, np.inf)``. On ties the first neighbour encountered wins.
    """
    nearest_by_class = {}
    for classe, members in protein_dict.items():
        nearest = nearest_by_class.setdefault(classe, {})
        for name, vec in members.items():
            best_name, best_gap = None, np.inf
            for other_name, other_vec in members.items():
                if name == other_name:
                    continue  # Never compare a protein with itself
                gap = distance.euclidean(vec, other_vec)
                if gap < best_gap:
                    best_name, best_gap = other_name, gap
            nearest[name] = (best_name, best_gap)
    return nearest_by_class

def get_dist_extra(protein_dict):
    """Find, for every protein, its nearest neighbour in any *other* category.

    Distances are Euclidean (``scipy.spatial.distance.euclidean``).

    Args:
        protein_dict: Nested mapping ``{category: {protein_name: vector}}``.

    Returns:
        Nested mapping ``{category: {protein_name: (nearest_name, distance)}}``.
        When no other category holds any protein, the entry is
        ``(None, np.inf)``. On ties the first neighbour encountered wins.
    """
    nearest_by_class = {}
    for classe_a, members_a in protein_dict.items():
        nearest = nearest_by_class.setdefault(classe_a, {})
        for name_a, vec_a in members_a.items():
            best_name, best_gap = None, np.inf
            for classe_b, members_b in protein_dict.items():
                if classe_a == classe_b:
                    continue  # Only proteins from a different category count
                for name_b, vec_b in members_b.items():
                    gap = distance.euclidean(vec_a, vec_b)
                    if gap < best_gap:
                        best_name, best_gap = name_b, gap
            nearest[name_a] = (best_name, best_gap)
    return nearest_by_class
                    
                

In [5]:
# Build the nested {category: {protein: vector}} dictionary; dic_init() defaults to avg=True,
# so the averaged vectors are stored. The whole dictionary is then printed.
classes = dic_init()
print(classes)

{'a.1.1.1': {'d1dlwa_': array([ 0.06514913,  0.12177301, -0.10926606, -0.9525883 , -0.02587943,
       -0.20463514,  0.06858744,  0.16824898,  0.40710443, -0.1904161 ,
        0.14028704,  0.27083248,  0.02257151,  0.10667225, -0.07503703,
       -0.07836503, -0.8943525 ,  0.0444199 ,  0.17088582, -0.58399343,
       -0.06072905,  0.34278175, -0.3374346 , -0.23926288,  0.11747619,
       -0.15984318, -0.14865014,  0.00924869,  0.40626946, -0.6548695 ,
        0.12585634,  0.5345347 ,  0.2873067 , -0.16253684,  0.70054954,
        0.0539537 , -0.19953674,  0.09688381, -0.03571857, -0.17473587,
        0.00367005,  0.22055848,  0.03923672,  0.15917331, -0.06309959,
        0.17931348, -0.873872  , -0.37892547,  0.14863081,  0.28562456,
        0.34736454,  0.04010273, -0.02423001, -0.08781188,  0.06318163,
        0.02796522,  0.02181917, -0.06890925,  0.03193155,  0.36216864,
       -0.21348695, -0.04554855,  0.526494  , -0.03351151], dtype=float32), 'd1dlya_': array([-0.06524004,  0.12

In [6]:
# For each protein, the closest protein of the SAME category as (name, Euclidean distance).
dist_intra = get_dist_intra(classes)
print(dist_intra)

{'a.2.2.1': {'d1r73a_': ('d2hgq11', 0.22577500343322754), 'd1vqov1': ('d2j0121', 0.40468984842300415), 'd2j0121': ('d2hgq11', 0.25743529200553894), 'd2hgq11': ('d1r73a_', 0.22577500343322754)}, 'a.1.1.1': {'d1dlwa_': ('d1dlya_', 0.36367663741111755), 'd1dlya_': ('d1dlwa_', 0.36367663741111755)}, 'a.1.1.2': {'d1b0ba_': ('d1scta_', 0.3489246070384979), 'd1sctb_': ('d1scta_', 0.21436138451099396), 'd1scta_': ('d1sctb_', 0.21436138451099396), 'd3sdha_': ('d1sctb_', 0.28723201155662537)}, 'a.2.3.1': {'d1wjza_': ('d1iura_', 0.3824501037597656), 'd1iura_': ('d1wjza_', 0.3824501037597656), 'd1fafa_': ('d1iura_', 0.41205480694770813)}}


In [7]:
# For each protein, the closest protein of a DIFFERENT category as (name, Euclidean distance).
dist_extra = get_dist_extra(classes)
print(dist_extra)

{'a.2.2.1': {'d1r73a_': ('d1fafa_', 0.5813828706741333), 'd1vqov1': ('d1fafa_', 0.43741369247436523), 'd2j0121': ('d1fafa_', 0.5186391472816467), 'd2hgq11': ('d1fafa_', 0.565814733505249)}, 'a.1.1.1': {'d1dlwa_': ('d3sdha_', 0.3733803331851959), 'd1dlya_': ('d3sdha_', 0.19076266884803772)}, 'a.1.1.2': {'d1b0ba_': ('d1dlya_', 0.3643576502799988), 'd1sctb_': ('d1dlya_', 0.29844632744789124), 'd1scta_': ('d1dlya_', 0.3501546382904053), 'd3sdha_': ('d1dlya_', 0.19076266884803772)}, 'a.2.3.1': {'d1wjza_': ('d1b0ba_', 0.416841059923172), 'd1iura_': ('d1vqov1', 0.44203561544418335), 'd1fafa_': ('d1vqov1', 0.43741369247436523)}}
