In [1]:
# Render matplotlib figures inline in the notebook output.
%matplotlib inline

In [1]:
import os
import glob
import torch
from torch import nn
from torch.utils.data import Dataset
from torch.utils.data import DataLoader
from torchvision.transforms import ToTensor
from torchvision import datasets, transforms
from ast import literal_eval
import pandas as pd
import itertools

In [2]:
# Pin work to the second GPU when the host actually has one. Calling
# set_device unconditionally raises on CPU-only machines and on hosts with a
# single GPU, which would stop the notebook at its second cell.
CUDA_DEVICE_INDEX = 1
if torch.cuda.is_available() and torch.cuda.device_count() > CUDA_DEVICE_INDEX:
    torch.cuda.set_device(CUDA_DEVICE_INDEX)
else:
    print('CUDA device', CUDA_DEVICE_INDEX, 'is not available; keeping the default device')

In [3]:
# Source table for the labels: later cells read its 'pdb_id' and 'length'
# columns and slice values starting at column position 5.
df = pd.read_csv(filepath_or_buffer='../data/cluster_rmsds_out.csv')

In [124]:
dir_path = '/mnt/nasdata/vera/msa_transformer_embeddings/*'
# Base names of everything the glob finds, kept in a set for fast membership tests.
embedding_files = set(found.rsplit('/', 1)[-1] for found in glob.glob(dir_path))

# Maps embedding file name -> one-element list wrapping that row's value list.
# The extra wrapping makes DataFrame.from_dict(orient='index') store the whole
# list in a single cell.
data = {}
for index, row in df.iterrows():
    # Take `length` values starting at column position 5
    # (presumably the per-residue RMSDs -- confirm against the CSV layout).
    n_values = int(row['length'])
    rmsd_list = [row.iloc[5:5 + n_values].values.tolist()]
    pdb_id = row['pdb_id']
    # File name is built from the first four characters and the lower-cased fifth.
    embedding_file = ''.join(['embeddings_', pdb_id[:4], '_', pdb_id[4].lower(), '.pt'])
    if embedding_file not in embedding_files:
        continue
    data[embedding_file] = rmsd_list
print(len(data))

10403


In [5]:
# One row per embedding file: the dict keys become the index and the wrapped
# value list lands in a single column. Written out so the Dataset classes can
# read file name (first column) and label list (second column) back from disk.
rmsd_dataset = pd.DataFrame.from_dict(data=data, orient='index')
rmsd_dataset.to_csv(path_or_buf='../data/rmsd_dataset.csv')

## Create a Dataset

In [3]:
class SmallDataset(Dataset):
    """Random subset of a label CSV paired with embeddings saved on disk.

    Each item corresponds to one CSV row: column 0 holds the embedding file
    name (relative to ``root_dir``) and column 1 holds the label list stored
    as a Python-literal string.

    Args:
        csv_file: Path to the label CSV.
        root_dir: Directory containing the embedding files named in column 0.
        transform: Optional callable applied to each loaded embedding.
        n_samples: Number of rows to draw at random (default 100). The value
            is clamped to the number of rows available, so small CSVs no
            longer raise. ``None`` keeps every row in file order.
        random_state: Forwarded to ``DataFrame.sample`` so the draw can be
            reproduced; ``None`` keeps the draw unseeded.
    """

    def __init__(self, csv_file, root_dir, transform=None, n_samples=100,
                 random_state=None):
        table = pd.read_csv(csv_file)
        if n_samples is not None:
            table = table.sample(n=min(n_samples, len(table)),
                                 random_state=random_state)
        self.labels = table
        self.root_dir = root_dir
        self.transform = transform

    def __len__(self):
        """Return the number of rows kept after sampling."""
        return len(self.labels)

    def __getitem__(self, idx):
        """Load item ``idx`` and return ``(embedding, label, embedding_path)``."""
        embedding_path = os.path.join(self.root_dir, self.labels.iloc[idx, 0])
        embedding = torch.load(embedding_path)
        # Labels are stored as text in the CSV; parse them back into a list.
        label = literal_eval(self.labels.iloc[idx, 1])
        if self.transform:
            embedding = self.transform(embedding)
        return embedding, label, embedding_path

In [4]:
# Random draw of sequences from the label CSV; embeddings are read lazily
# from root_dir each time an item is indexed.
test_dataset = SmallDataset(
    csv_file='../data/rmsd_dataset.csv',
    root_dir='/mnt/nasdata/vera/msa_transformer_embeddings/',
)

In [59]:
# Number of rows the dataset sampled from the CSV.
len(test_dataset)

100

In [60]:
# Shape of the embedding of item 25 after taking index 0 on the first axis and
# index 1 on the second.
# NOTE(review): this is not the same slice as `[0, :, 1, :]` used in the next
# cell (index 1 on the third axis) -- confirm which one was intended.
test_dataset[25][0][0, 1].shape

torch.Size([209, 768])

In [48]:
# 2-D slice of item 23's embedding: entry 0 of the first axis and index 1 of
# the third axis, converted to a NumPy array on the CPU for scikit-learn.
matrix = test_dataset[23][0][0,:,1,:].cpu().numpy()

In [13]:
from sklearn.decomposition import PCA

In [55]:
# PCA reducer that keeps the 16 leading principal components.
pca = PCA(n_components=16)

In [56]:
# Fit the PCA on `matrix` and project it in one step; the result keeps one
# row per input row with 16 columns.
new_feature_set = pca.fit_transform(matrix)

In [57]:
# Sanity check: the second dimension should equal the PCA's n_components.
print(new_feature_set.shape)

(512, 16)


In [58]:
# Display the projected features.
# NOTE(review): this echoes the whole array; a slice such as
# new_feature_set[:5] would keep the output short.
new_feature_set

array([[16.451086  ,  4.957172  ,  1.2965112 , ...,  2.123791  ,
        -1.2510049 ,  1.1796682 ],
       [ 1.2180768 ,  2.2817183 ,  0.42695534, ...,  0.43066245,
         1.4244059 , -0.11070893],
       [ 5.4178905 ,  3.094024  , -0.93604875, ..., -0.75943094,
        -0.13720031,  1.4712237 ],
       ...,
       [ 1.6147488 , -3.0943813 ,  1.1871496 , ...,  0.590542  ,
         2.2659724 ,  1.325006  ],
       [-0.26194412, -2.5441074 ,  0.2858513 , ..., -0.6241302 ,
        -0.12774006,  0.34780914],
       [-0.49971464, -1.0685382 , -0.09630635, ..., -0.4897208 ,
         0.61182183,  0.6384848 ]], dtype=float32)

In [26]:
class SingleResidueDataset(Dataset):
    """Per-residue view of the label CSV: one item per (sequence, residue).

    The CSV layout matches what ``SmallDataset`` reads: column 0 is the
    embedding file name and column 1 is the label list stored as a
    Python-literal string. The label lists of the sampled rows are flattened
    so that item ``idx`` is a single residue with a single label.

    Args:
        csv_file: Path to the label CSV.
        root_dir: Directory containing the embedding files named in column 0.
        transform: Optional callable applied to each residue embedding.
        n_samples: Number of sequences (CSV rows) to draw at random
            (default 100), clamped to the rows available. ``None`` keeps
            every row in file order.
        random_state: Forwarded to ``DataFrame.sample`` for a reproducible draw.

    Note:
        ``__getitem__`` loads the whole embedding file of the owning sequence
        for every residue, so random access is expensive.
    """

    def __init__(self, csv_file, root_dir, transform=None, n_samples=100,
                 random_state=None):
        table = pd.read_csv(csv_file)
        if n_samples is not None:
            table = table.sample(n=min(n_samples, len(table)),
                                 random_state=random_state)

        # Flat, parallel lists: labels[k] belongs to residue_index[k].
        self.labels = []
        self.residue_index = []  # (embedding file name, residue position)
        for file_name, raw_labels in zip(table.iloc[:, 0], table.iloc[:, 1]):
            for position, value in enumerate(literal_eval(raw_labels)):
                self.labels.append(value)
                self.residue_index.append((file_name, position))

        self.root_dir = root_dir
        self.transform = transform

    def __len__(self):
        """Return the total number of residues across the sampled sequences."""
        return len(self.labels)

    def __getitem__(self, idx):
        """Return ``(embedding, label, embedding_path)`` for residue ``idx``."""
        file_name, position = self.residue_index[idx]
        embedding_path = os.path.join(self.root_dir, file_name)
        full_embedding = torch.load(embedding_path)
        # Residue k lives at index k + 1 of the third axis, matching the
        # `residue+1` slicing used by the other cells of this notebook
        # (index 0 is presumably a start token -- confirm).
        # clone() detaches the slice from the full tensor's storage so the
        # large tensor can be freed once this method returns.
        embedding = full_embedding[:, :, position + 1, :].clone()
        label = self.labels[idx]
        if self.transform:
            embedding = self.transform(embedding)
        return embedding, label, embedding_path

In [23]:
# NOTE(review): `sample` is not assigned in any cell visible in this notebook,
# so this line raises NameError on a fresh kernel -- restore the defining cell
# or remove this one.
sample.shape

torch.Size([1, 108527616])

In [4]:
# Draw 100 random rows (file name + label string) from the label CSV.
df4 = pd.read_csv('../data/rmsd_dataset.csv')
df4 = df4.sample(n=100)

In [5]:
# Flatten the label lists of the sampled rows into one list: the second
# column stores each list as text, so parse it before extending.
all_labels = []
for seq in df4.iloc[:, 1]:
    all_labels.extend(literal_eval(seq))
len(all_labels)

22115

In [61]:
root_dir = '/mnt/nasdata/vera/msa_transformer_embeddings/'
all_embeddings = []
for i in range(50):
    embedding, labels, path = test_dataset[i]
    n_residues = embedding.shape[2] - 1
    for residue in range(n_residues):
        # Index 0 of the third axis is skipped, hence residue + 1.
        residue_matrix = embedding[0, :, residue + 1, :].cpu().numpy()
        # NOTE(review): fit_transform refits the PCA for every residue, so each
        # residue is projected onto its own basis -- confirm this is intended
        # rather than fitting once and calling transform().
        small_embedding = pca.fit_transform(residue_matrix)
        # Extending a list with a 2-D tensor appends one row tensor per entry
        # of its first axis, so every residue contributes many list elements.
        # NOTE(review): use append() if one tensor per residue was intended.
        all_embeddings.extend(torch.from_numpy(small_embedding))
    print(i)
print(len(all_embeddings))

0
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
5908480


In [158]:
import gc

In [159]:
def clear_model():
    """Release the accumulated embeddings and return cached GPU memory.

    Rebinds the notebook-level ``all_embeddings`` to an empty list, runs the
    garbage collector, and then asks PyTorch to release its cached CUDA blocks.
    """
    # Without `global`, the assignment would only create a local variable and
    # the notebook-level list would stay alive.
    global all_embeddings
    all_embeddings = []
    # Collect after dropping the reference so the tensors are reclaimable
    # before the CUDA cache is emptied.
    gc.collect()
    torch.cuda.empty_cache()

In [160]:
# Run the cleanup helper defined in the previous cell.
clear_model()

In [98]:
import shutil

In [3]:
# Fresh random draw of 100 rows; this rebinds df4, replacing any earlier sample.
label_table = pd.read_csv('../data/rmsd_dataset.csv')
df4 = label_table.sample(n=100)

In [5]:
# Directory holding the embedding files named in the first column of the label CSV.
root_dir='/mnt/nasdata/vera/msa_transformer_embeddings/'

In [9]:
# Print the tensor shape and PDB id of every sampled embedding file.
#
# The per-residue pass below is still work in progress. It used to sit behind
# an unconditional `continue`, which silently made it (and the 50-file cap)
# unreachable; the switch makes that explicit while keeping the default
# behaviour: shapes and ids only.
EXPORT_PER_RESIDUE = False
MAX_EXPORTED_FILES = 50

i = 0
for index, row in df4.iterrows():
    # .iloc: integer keys on a label-indexed Series are a deprecated
    # positional fallback.
    file_name = row.iloc[0]
    embedding_path = os.path.join(root_dir, file_name)
    embeddings = torch.load(embedding_path)
    pdb_id = file_name.replace('embeddings_', '').replace('.pt', '')
    print(embeddings.shape)
    print(pdb_id)
    if not EXPORT_PER_RESIDUE:
        continue
    # Index 0 of the third axis is skipped, hence residue + 1.
    for residue in range(embeddings.shape[2]-1):
        residue_embeddings = embeddings[:,:,residue+1,:]
        print(residue_embeddings.shape)
        #output_path = "../data/100_embeddings/" + pdb_id + '_' + str(residue) + '.pt'
        #torch.save(residue_embeddings, output_path)
    i += 1
    if i == MAX_EXPORTED_FILES:
        break

torch.Size([1, 512, 305, 768])
4l00_b
torch.Size([1, 512, 232, 768])
6gqf_a
torch.Size([1, 512, 142, 768])
5z1t_a
torch.Size([1, 512, 157, 768])
3ljl_a
torch.Size([1, 512, 148, 768])
6gxg_b
torch.Size([1, 512, 336, 768])
2zbw_b
torch.Size([1, 512, 267, 768])
4hqo_b
torch.Size([1, 512, 210, 768])
1zgl_u
torch.Size([1, 512, 165, 768])
3pu2_b
torch.Size([1, 512, 308, 768])
4n7w_a
torch.Size([1, 512, 328, 768])
5z75_a
torch.Size([1, 512, 323, 768])
4z90_a
torch.Size([1, 512, 266, 768])
5lk5_a
torch.Size([1, 512, 228, 768])
6o3l_h
torch.Size([1, 512, 251, 768])
4iqz_a
torch.Size([1, 512, 291, 768])
3c18_a
torch.Size([1, 512, 281, 768])
4c16_a
torch.Size([1, 512, 139, 768])
6a6f_a
torch.Size([1, 512, 209, 768])
4qtu_b
torch.Size([1, 512, 287, 768])
1jr2_a
torch.Size([1, 512, 232, 768])
6bft_a
torch.Size([1, 512, 187, 768])
5cdj_b
torch.Size([1, 512, 126, 768])
2cxy_a
torch.Size([1, 512, 140, 768])
6kta_a
torch.Size([1, 512, 390, 768])
6ny5_a
torch.Size([1, 512, 302, 768])
4kwv_f
torch.Size([

In [10]:
import datetime
import time

# Wall-clock timestamp taken when this cell runs.
ct = datetime.datetime.now()

In [18]:
# Single embedding file used by the load-time measurement in the next cell.
path = '/mnt/nasdata/vera/msa_transformer_embeddings/embeddings_6pon_b.pt'

In [19]:
# Time one torch.load of `path`. perf_counter is monotonic and
# high-resolution, so the difference cannot be skewed by wall-clock
# adjustments the way time.time() can.
ts1 = time.perf_counter()
tensor = torch.load(path)
ts2 = time.perf_counter()
print(ts2 - ts1)

3.9751322269439697


In [39]:
# 100 randomly chosen rows of the label CSV, used by the load-time benchmark.
all_rows = pd.read_csv('../data/rmsd_dataset.csv')
df_names = all_rows.sample(n=100)

In [43]:
# Load time of a few embedding files, normalised by the number of labels.
# min() keeps the loop in range if fewer than 5 rows were sampled.
for i in range(min(5, len(df_names))):
    # Note: the first column holds the embedding file name, not a bare PDB id.
    pdb_id = df_names.iloc[i,0]
    length = len(literal_eval(df_names.iloc[i,1]))
    # perf_counter is monotonic, unlike time.time(), so the interval is reliable.
    ts1 = time.perf_counter()
    tensor = torch.load(os.path.join('/mnt/nasdata/vera/msa_transformer_embeddings/', pdb_id))
    ts2 = time.perf_counter()
    diff = ts2 - ts1
    # Guard against an empty label list so the summary line never divides by zero.
    per_residue_ms = round(1000*diff/length, 2) if length else float('nan')
    print(pdb_id, length, round(diff, 2), '–>', per_residue_ms, 'ms/residue')

embeddings_3m3w_b.pt 320 0.18 –> 0.55 ms/residue
embeddings_2hzk_d.pt 365 0.2 –> 0.54 ms/residue
embeddings_5ans_a.pt 175 0.1 –> 0.57 ms/residue
embeddings_2c35_b.pt 172 0.1 –> 0.56 ms/residue
embeddings_5lp5_c.pt 248 0.14 –> 0.55 ms/residue


In [53]:


# Time the load of one file from the 100_embeddings directory, for comparison
# with the full embedding files. perf_counter is monotonic, so the measured
# interval is not affected by wall-clock adjustments.
ts1 = time.perf_counter()
tensor = torch.load('/home/vera/projects/masters_project/data/100_embeddings/4bkm_a_23.pt')
ts2 = time.perf_counter()
diff = ts2 - ts1
print(diff)

0.17844319343566895


In [57]:
# Time torch.save for the files of one chain in 100_embeddings.
# The tensor is written to a scratch file next to the original rather than
# over it: a save that is interrupted midway would otherwise leave a
# truncated data file behind.
SCRATCH_SUFFIX = '.timing_tmp'
for pdb_path in glob.glob('/home/vera/projects/masters_project/data/100_embeddings/6aae_b_*'):
    if pdb_path.endswith(SCRATCH_SUFFIX):
        # Leftover scratch file from an aborted run; not part of the data.
        continue
    tensor = torch.load(pdb_path)
    scratch_path = pdb_path + SCRATCH_SUFFIX
    try:
        ts1 = time.perf_counter()
        torch.save(tensor, scratch_path)
        ts2 = time.perf_counter()
    finally:
        if os.path.exists(scratch_path):
            os.remove(scratch_path)
    diff = ts2 - ts1
    print(diff)

7.2348949909210205
7.745619297027588
7.0855584144592285
9.984408378601074
9.019051790237427
