# GPU setting

In [1]:
import os

# Expose only GPU index 1 to CUDA-aware libraries used later in this kernel.
os.environ.update({'CUDA_VISIBLE_DEVICES': '1'})

# Import library

In [1]:
from Embedding import Embedder
import pandas as pd
import numpy as np
import torch

# Functions

In [2]:
def get_gpu_properties():
    """Print a summary of the CUDA devices visible to PyTorch.

    Prints, separated by dashed rules: whether CUDA is available, the number
    of visible devices followed by one ``index name`` line per device, and the
    currently selected device. Returns nothing.

    When CUDA is not available the current-device line reports that fact
    instead of calling ``torch.cuda.current_device()``, which raises on
    machines without a usable CUDA runtime.
    """
    separator = '-' * 100
    cuda_available = torch.cuda.is_available()
    device_count = torch.cuda.device_count()

    print(separator)
    print('GPU availability:', cuda_available)
    print(separator)
    print('Available GPU counts:', device_count)
    for i in range(device_count):
        print(i, torch.cuda.get_device_name(i))
    print(separator)
    if cuda_available:
        current = torch.cuda.current_device()
        print('Current device:', current, torch.cuda.get_device_name(current))
    else:
        # Querying the current device would initialise CUDA and raise here.
        print('Current device:', None, '(CUDA is not available)')
    print(separator)

# Check GPU properties

In [3]:
# Print the CUDA device summary for the current kernel.
# NOTE(review): the saved output below lists 4 devices even though
# CUDA_VISIBLE_DEVICES is set to '1' earlier in the notebook — confirm that
# the environment cell ran in this kernel before CUDA was initialised.
get_gpu_properties()

----------------------------------------------------------------------------------------------------
GPU availability: True
----------------------------------------------------------------------------------------------------
Available GPU counts: 4
0 NVIDIA Tesla V100-DGXS-32GB
1 NVIDIA Tesla V100-DGXS-32GB
2 NVIDIA Tesla V100-DGXS-32GB
3 NVIDIA Tesla V100-DGXS-32GB
----------------------------------------------------------------------------------------------------
Current device: 0 NVIDIA Tesla V100-DGXS-32GB
----------------------------------------------------------------------------------------------------


# load DeepLoc data set

In [8]:
# Read the DeepLoc table; its first column is used as the sequence column.
df = pd.read_csv('../data/DeepLoc/DeepLocAll.csv')
# Remove every whitespace character so each sequence is one contiguous string.
sequences = [''.join(raw_seq.split()) for raw_seq in df.iloc[:, 0].tolist()]

In [6]:
# Output-file label -> name of the Embedder method that computes the features.
embedding_methods = {
    'Albert_BFD': 'albert_embedding',
    'BERT_BFD': 'bert_embedding',
    'T5_BFD': 't5_embedding',
    'T5_FT': 't5ft_embedding',
    'XLNet_Uniref100': 'xlnet_embedding',
}

for name, method_name in embedding_methods.items():
    # A fresh Embedder is created for every model.
    embedder = Embedder()
    feature = getattr(embedder, method_name)(sequences)
    # Keep every column except the first and append the features column-wise.
    combined = pd.concat([df.iloc[:,1:], pd.DataFrame(feature)], axis=1)
    combined.to_csv(f'../data/DeepLoc/DeepLocEmbedd_{name}.csv', index=False)

Some weights of the model checkpoint at /home/kurosaki/.cache/bio_embeddings/prottrans_bert_bfd/model_directory were not used when initializing BertModel: ['cls.predictions.bias', 'cls.seq_relationship.bias', 'cls.seq_relationship.weight', 'cls.predictions.transform.dense.weight', 'cls.predictions.transform.LayerNorm.weight', 'cls.predictions.transform.dense.bias', 'cls.predictions.transform.LayerNorm.bias', 'cls.predictions.decoder.weight', 'cls.predictions.decoder.bias']
- This IS expected if you are initializing BertModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


In [1]:
from bio_embeddings import embed
import pandas as pd
import numpy as np
from Bio import SeqIO
from Bio.Seq import Seq
from Bio.SeqRecord import SeqRecord
from tqdm import tqdm

# load embedding models

# DeepLoc

In [6]:
df = pd.read_csv('../data/DeepLoc/DeepLocAll.csv')
# The first column is used as the sequence column; remove all whitespace so
# each sequence is one contiguous string.
sequences = [''.join(seq.split()) for seq in df.iloc[:, 0].tolist()]

# Output-file label -> bio_embeddings embedder class. The embedders are built
# inside the loop instead of relying on variables from another cell, so this
# cell runs on a fresh kernel and only one model is referenced at a time.
embedder_classes = {
    'Albert_BFD': embed.ProtTransAlbertBFDEmbedder,
    'BERT_BFD': embed.ProtTransBertBFDEmbedder,
    'T5_BFD': embed.ProtTransT5BFDEmbedder,
    'T5_FT': embed.ProtTransT5XLU50Embedder,
    'XLNet_Uniref100': embed.ProtTransXLNetUniRef100Embedder,
}

for name, embedder_class in embedder_classes.items():
    embedder = embedder_class()
    generator = embedder.embed_many(sequences)
    # Average each per-sequence embedding over axis 0 to get one vector per sequence.
    features = np.array([np.mean(v, axis=0) for v in generator])
    print('-'*100)
    print(name, features.shape)
    pd.concat([df.iloc[:,1:], pd.DataFrame(features)], axis=1).to_csv(f'../data/DeepLoc/DeepLocEmbedd_{name}.csv', index=False)
    # Drop the reference so the model can be collected before the next one loads.
    del embedder

RuntimeError for sequence with 6620 residues: CUDA out of memory. Tried to allocate 10.46 GiB (GPU 0; 31.74 GiB total capacity; 13.37 GiB already allocated; 10.18 GiB free; 19.95 GiB reserved in total by PyTorch). This most likely means that you don't have enough GPU RAM to embed a protein this long. Embedding on the CPU instead, which is very slow
Some weights of the model checkpoint at /home/kurosaki/.cache/bio_embeddings/prottrans_albert_bfd/model_directory were not used when initializing AlbertModel: ['predictions.decoder.bias', 'predictions.dense.weight', 'predictions.LayerNorm.bias', 'sop_classifier.classifier.weight', 'predictions.dense.bias', 'sop_classifier.classifier.bias', 'predictions.decoder.weight', 'predictions.LayerNorm.weight', 'predictions.bias']
- This IS expected if you are initializing AlbertModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining mod

KeyboardInterrupt: 

# DeepPPI

In [16]:
fasta_path = '../data/DeepPPI/DeepPPI.fasta'
# total=4424 is the expected record count and only sizes the progress bar.
records = tqdm(SeqIO.parse(fasta_path, 'fasta'), total=4424)
# Key: second '|'-separated field of the record id; value: the sequence as a string.
# A repeated key keeps the sequence of the last record seen.
seq_dict = {record.id.split('|')[1]: str(record.seq) for record in records}
df = pd.DataFrame(list(seq_dict), columns=['UniprotID'])
sequences = list(seq_dict.values())

100%|██████████| 4424/4424 [00:00<00:00, 45474.18it/s]


In [25]:
# Output-file label -> bio_embeddings embedder class.
embedder_classes = {
    'Albert_BFD': embed.ProtTransAlbertBFDEmbedder,
    'BERT_BFD': embed.ProtTransBertBFDEmbedder,
    'T5_BFD': embed.ProtTransT5BFDEmbedder,
    'T5_FT': embed.ProtTransT5XLU50Embedder,
    'XLNet_Uniref100': embed.ProtTransXLNetUniRef100Embedder,
}

# `models` is not defined in this cell; it is expected to be a list of the
# labels above set by an earlier cell — TODO confirm.
for model in models:
    if model not in embedder_classes:
        # Fail loudly instead of silently reusing the previous model's embedder,
        # which would write a mislabelled feature file.
        raise ValueError(f'Unknown embedding model: {model!r}')
    # Always bind the embedder that is actually used below (the earlier code
    # assigned some of them to a differently spelled variable).
    emb = embedder_classes[model]()
    generator = emb.embed_many(sequences)
    # Average each per-sequence embedding over axis 0 to get one vector per sequence.
    features = [np.mean(v, axis=0) for v in generator]
    pd.concat([df, pd.DataFrame(features)], axis=1).to_csv(f'../data/DeepPPI/DeepPPIEmbedd_{model}.csv', index=False)

Some weights of the model checkpoint at /home/kurosaki/.cache/bio_embeddings/prottrans_albert_bfd/model_directory were not used when initializing AlbertModel: ['predictions.decoder.bias', 'predictions.bias', 'sop_classifier.classifier.bias', 'predictions.LayerNorm.weight', 'predictions.decoder.weight', 'predictions.LayerNorm.bias', 'predictions.dense.bias', 'sop_classifier.classifier.weight', 'predictions.dense.weight']
- This IS expected if you are initializing AlbertModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing AlbertModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of the model checkpoint at /home/kurosaki/.cache/bio_embeddings/prottrans_bert_bfd/model_directory were no

# test

In [1]:
import pandas as pd

In [5]:
# Reload one of the written embedding tables and display it as a sanity check.
embedding_csv = '../data/DeepLoc/DeepLocEmbedd_T5_FT.csv'
df = pd.read_csv(embedding_csv)
df

Unnamed: 0,input,loc,membrane,0,1,2,3,4,5,6,...,1014,1015,1016,1017,1018,1019,1020,1021,1022,1023
0,M M K T L S S G N C T L N V P A K N S Y R M V ...,Cell.membrane,M,0.005140,0.101230,0.012850,0.021046,0.004626,0.021493,-0.054627,...,-0.039923,0.003448,-0.050793,-0.024625,0.042415,0.019373,-0.024845,-0.014186,0.000061,0.017455
1,M A K R T F S N L E T F L I F L L V M M S A I ...,Cell.membrane,M,0.033010,0.066597,0.041995,0.030470,-0.013707,0.019532,-0.076544,...,-0.047641,-0.003586,-0.002714,-0.039838,0.021222,-0.004296,0.008132,0.012745,0.030368,-0.012899
2,M G N C Q A G H N L H L C L A H H P P L V C A ...,Cell.membrane,M,0.002815,0.004737,0.024810,-0.006830,-0.011687,0.030223,-0.058251,...,0.003590,-0.052906,0.009581,0.003414,-0.049453,-0.044577,-0.007692,-0.065482,0.037445,0.031141
3,M D P S K Q G T L N R V E N S V Y R T A F K L ...,Cell.membrane,M,-0.047677,-0.010981,0.025459,-0.002435,-0.053464,0.061748,-0.055489,...,0.021491,-0.015486,-0.000625,-0.028873,-0.020675,0.038899,0.005726,0.007258,-0.000905,0.031985
4,M L L A W V Q A F L V S N M L L A E A Y G S G ...,Cell.membrane,M,0.014964,-0.009305,0.060831,-0.016183,-0.008358,0.084196,-0.048217,...,-0.022407,-0.037921,0.001360,-0.010348,-0.019215,0.042999,0.003124,-0.061529,0.022786,0.041758
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
8459,I I E C F F S C E I E K D G K S K E G K P C K ...,Extracellular,S,0.095750,0.074050,0.077187,-0.126235,-0.026397,-0.143217,-0.126798,...,-0.020201,-0.008499,-0.002725,-0.155470,0.088511,-0.078960,-0.028378,0.036604,0.064585,0.118211
8460,M R V S V P V L A L A F G S L A A A A P N A G ...,Extracellular,S,0.040690,0.023037,0.054974,-0.005959,-0.037790,-0.007396,-0.017116,...,-0.094401,-0.014769,-0.026801,0.015351,0.028712,0.048774,-0.000919,0.013360,-0.008210,0.003581
8461,M L F W T A F S M A L S L R L A L A R S S I E ...,Extracellular,S,-0.021006,-0.000614,0.031062,0.010280,-0.026196,0.022467,-0.063973,...,-0.005070,-0.006231,0.014474,-0.011704,-0.034645,0.007463,0.003945,-0.010791,0.030907,0.019197
8462,M M A F P P Q S C V H V L P P K S I Q M W E P ...,Extracellular,S,0.024560,0.018751,0.134095,0.047219,-0.044477,0.046516,0.005344,...,-0.070051,-0.000845,0.071466,-0.060846,0.017049,-0.071921,0.011296,-0.033704,0.031455,0.018192
