In [3]:
import optuna as opt
import torch
import torch.nn as nn
import torch.optim as optim
import numpy as np
#from twilio.rest import Client
import pickle

import time

import sys
import os 


sys.path.append('../../pscapes')
sys.path.append('../../nk-ml-2024/')


from torch.utils.data import DataLoader

from pscapes.landscape_class import ProteinLandscape
from pscapes.utils import dict_to_np_array, np_array_to_dict

from src.architectures import SequenceRegressionCNN, SequenceRegressionLinear, SequenceRegressionMLP, SequenceRegressionLSTM, SequenceRegressionTransformer

from src.ml_utils import train_val_test_split_ohe, landscapes_ohe_to_numpy
from src.hyperopt import objective_NK, sklearn_objective_NK

from sklearn.ensemble import RandomForestRegressor, GradientBoostingRegressor 

from src.train_utils import train_models_from_hparams_NK, read_MLP_hparams, read_CNN_hparams, read_LSTM_hparams, read_transformer_hparams

from src.analysis import get_latent_representation, instantiate_model_from_study

from sklearn.neighbors import kneighbors_graph
import math
import networkx as nx
from scipy.sparse import diags

ModuleNotFoundError: No module named 'architectures'

In [None]:
HPARAM_PATH = '../hyperopt/results/NK_hyperopt_results.pkl'
DATA_PATH = '../data/nk_landscapes/'
MODEL_SAVEPATH = '../models/'
RESULT_PATH = '../results/'
SEQ_LEN = 6
AA_ALPHABET  = 'ACDEFG'
N_REPLICATES = 1

In [3]:
with open(HPARAM_PATH, 'rb') as handle: 
    NK_hparams = pickle.load(handle)

In [4]:
#load landscape data 
landscapes = []
print('Loading landscapes.')
for k in range(SEQ_LEN):
    replicate_list = []
    for r in range(N_REPLICATES):
        landscape = ProteinLandscape(csv_path=DATA_PATH+'/k{0}_r{1}.csv'.format(k,r), amino_acids=AA_ALPHABET)
        replicate_list.append(landscape)
    landscapes.append(replicate_list)
landscapes = [[i.fit_OHE() for i in j] for j in landscapes]

print('Calculating train-test-val splits')
splits = [train_val_test_split_ohe(i, random_state=1) for i in landscapes]
#landscapes_ohe, xy_train, xy_val, xy_test, x_tests, y_tests = splits[k_index] 

Loading landscapes.
Calculating train-test-val splits


In [5]:
model_names = ['linear', 'mlp', 'cnn', 'ulstm', 'blstm', 'transformer']# 'RF', 'GB']

In [6]:
def instantiate_model_from_study(model_name, study, alphabet_size=6, seq_length=6): 
    if model_name == 'linear':
        model_instance = SequenceRegressionLinear(alphabet_size=alphabet_size, sequence_length=seq_length)
    elif model_name == 'mlp': 
        hparams = read_MLP_hparams(study.best_params)
        model_instance = SequenceRegressionMLP(**hparams, alphabet_size=alphabet_size, sequence_length=seq_length)
    elif model_name == 'cnn': 
        hparams = read_CNN_hparams(study.best_params)
        model_instance = SequenceRegressionCNN(**hparams, input_channels=alphabet_size, sequence_length=seq_length)
    elif model_name == 'ulstm': 
        hparams = read_LSTM_hparams(study.best_params)
        model_instance = SequenceRegressionLSTM(**hparams, input_size=alphabet_size, bidirectional=False)
    elif model_name == 'blstm': 
        hparams = read_LSTM_hparams(study.best_params)
        model_instance = SequenceRegressionLSTM(**hparams, input_size=alphabet_size, bidirectional=True)
    elif model_name == 'transformer': 
        hparams = read_transformer_hparams(study.best_params)
        model_instance = SequenceRegressionTransformer(**hparams, input_dim=alphabet_size)
    elif model_name == 'RF': 
        hparams = study.best_params
        model_instance = RandomForestRegressor(**hparams)
    elif model_name == 'GB': 
        hparams = study.best_params
        model_instance = GradientBoostingRegressor(**hparams)
    return model_instance

In [7]:
instantiated_models = {x:[] for x in model_names}
for model_name in NK_hparams.keys(): 
    if model_name != 'RF' and model_name != 'GB': 
        for k_value, study in enumerate(NK_hparams[model_name]):
            replicate_models = []
            for replicate in range(N_REPLICATES): 
                model = instantiate_model_from_study(model_name, study)
                replicate_models.append(model)
            instantiated_models[model_name].append(replicate_models)
            



In [8]:
#let's load model weights 
for model_name in model_names: 
    for k_value in range(SEQ_LEN): 
        for replicate in range(N_REPLICATES): 
            model_path = MODEL_SAVEPATH + '{}_NK_k{}_r{}.pt'.format(model_name, k_value, replicate)
            model_instance = instantiated_models[model_name][k_value][replicate]
            model_instance.load_state_dict(torch.load(model_path, weights_only=True))

In [39]:
def adjacency_to_diag_laplacian(A): 
    """
    Calculates degree and laplacian matrices from an adjacency matrix.  
    
    Args: 
        A (scipy sparse matrix): adjacency of graph
    Returns: 
        D (scipy sparse matrix): degree matrix of graph
        L (scipy sparse matrix) : laplacian matrix of graph
    """
    degrees = A.sum(axis=1).A1
    D = diags(degrees, format='csr')
    L = D-A

    
    return D, L

def sparse_dirichlet(L, f): 
    """
    Calculates the Dirichlet energy of a signal f over a graph. 
    
    Args: 
    L (scipy sparse matrix): graph laplacian
    f (np array): signal over graph
    """

    f = f.astype('float64') 
    Lf  = L.dot(f)
    f_T = f.T
    fLf = f_T.dot(Lf)
    return fLf.item()

In [10]:
landscapes_as_tensor = [[torch.from_numpy(i).to(torch.float32) for i in j[0]] for j in splits]
flattened_landscapes = [[i.reshape(i.shape[0], i.shape[1]*i.shape[2]) for i in j[0]] for j in splits]

In [49]:
mlp_models   = [instantiated_models['mlp'][i][0].to('cuda') for i in range(SEQ_LEN)]
layer_outputs = []
hooks = []
# Define a hook function to capture the output
def hook_fn(module, input, output):
    layer_outputs.append(output)

hooks = [i.fc_layers[-1].register_forward_hook(hook_fn) for i in mlp_models]

In [57]:
[hook.remove() for hook in hooks ]

[None, None, None, None, None, None]

In [58]:
model_forward_output = [mlp_models[i](landscapes_as_tensor[i][0].to('cuda')) for i in range(len(mlp_models))]

In [59]:
layer_outputs_np = [i.cpu().detach().numpy() for i in layer_outputs]


degree=30


In [60]:
len(layer_outputs_np)

48

In [25]:
nk_landscapes_knn = [[kneighbors_graph(flattened_landscapes[i][j], n_neighbors=degree, n_jobs=-1)for j in range(N_REPLICATES)] for i in range(len(flattened_landscapes))]

In [32]:
latent_rep_mlp_knn = [kneighbors_graph(layer_outputs_np[i], n_neighbors=degree, n_jobs=-1 ) for i in range(len(layer_outputs))]

In [35]:
landscape_y = [[i.fitnesses.reshape(-1,1) for i in j] for j in landscapes]

In [43]:
nk_landscapes_laplacians = [adjacency_to_diag_laplacian(i[0])[-1] for i in nk_landscapes_knn]
latent_rep_mlp_laplacians = [adjacency_to_diag_laplacian(i)[-1] for i in latent_rep_mlp_knn]

In [47]:
len(latent_rep_mlp_laplacians)

12

In [46]:
nk_landscapes_dirichlets = [sparse_dirichlet(nk_landscapes_laplacians[i], landscape_y[i][0]) for i in range(len(nk_landscapes_laplacians))]
latent_rep_mlp_dirichlets = [sparse_dirichlet(latent_rep_mlp_laplacians[i], landscape_y[i][0]) for i in range(len(latent_rep_mlp_laplacians))]

IndexError: list index out of range

In [19]:
latent_representations = {x:{k:{r:None for r in range(N_REPLICATES)} for k in range(SEQ_LEN)} for x in model_names}

def get_latent_representation_hook(model_name, k_value, replicate_number):
    def hook_fn(module, input, output):
        latent_representations[model_name][k_value][replicate_number] = output
    return hook_fn


hooks = [[] for k_val in range(SEQ_LEN)]

model_name = 'mlp' 
for k_val in range(SEQ_LEN): 
    for replicate in range(N_REPLICATES): 
        model = instantiated_models[model_name][k_val][replicate]
        hook = model.fc_layers[-1].register_forward_hook(get_latent_representation_hook(model_name, k_val, replicate))
        hooks[k_val].append(hook)

for k_val in range(SEQ_LEN): 

    for replicate in range(N_REPLICATES):
        in_data = landscapes_as_tensor[k_val][replicate]
        print(in_data.shape)
        model = instantiated_models[model_name][k_val][replicate]
        _ = model(in_data)


torch.Size([46656, 6, 6])


RuntimeError: Expected all tensors to be on the same device, but found at least two devices, cuda:0 and cpu! (when checking argument for argument mat1 in method wrapper_CUDA_addmm)

In [77]:
landscapes_as_tensor = [[torch.from_numpy(i).to(torch.float32) for i in j[0]] for j in splits]
flattened_landscapes = [[i.reshape(i.shape[0], i.shape[1]*i.shape[2]) for i in j[0]] for j in splits]

In [78]:
landscapes_as_tensor[0][0].shape

torch.Size([46656, 6, 6])

In [79]:
# Run forward pass for each model and capture latent representations
for k_val in range(SEQ_LEN): 

    for replicate in range(N_REPLICATES):
        in_data = landscapes_as_tensor[k_val][replicate]
        print(in_data.shape)
        model = instantiated_models[model_name][k_val][replicate]
        _ = model(in_data).cpu().detach().numpy() 


torch.Size([46656, 6, 6])
torch.Size([46656, 6, 6])
torch.Size([46656, 6, 6])
torch.Size([46656, 6, 6])
torch.Size([46656, 6, 6])
torch.Size([46656, 6, 6])


In [73]:
degree=30

In [80]:
latent_representations['mlp'][0][0]

tensor([[0.5851],
        [0.5791],
        [0.5643],
        ...,
        [0.5685],
        [0.6438],
        [0.4562]], grad_fn=<AddmmBackward0>)

In [65]:
latent_rep_knn    = []
model_name = 'mlp'
for k_val in range(SEQ_LEN):
    klist = []
    for replicate in range(N_REPLICATES): 
        kneighbors_graph(latent_representations[model_name][k_val][replicate], n_ne)

[<Compressed Sparse Row sparse matrix of dtype 'float64'
 	with 1399680 stored elements and shape (46656, 46656)>]