https://colab.research.google.com/drive/1iDc3BahJig-TKtN1c2woTjN6PW7zz0_m?usp=sharing

Code to generate and save pretrained Autoencoders and .csv-files of latent represenations.

How to use this code:
1.   Upload data sets to content pane. To run the code wo. modifications the data sets should be called:
*   exprs_intersect.csv for gene data
*   intersect_test_genes_imputed.csv for gene data of test cohorts

2.   Create folders 'models', 'csv' and 'csv_eval' in content pane  
3.   Run the Notebook

In [15]:
import torch
import torch.nn as nn
import torch.optim as optim
import torchvision
import torchvision.transforms as transforms
from torch.utils.data import DataLoader
import pandas as pd
from typing import Callable
import torch
import torch.nn as nn
import torch.nn.functional as F
from sklearn.model_selection import train_test_split
import copy
torch.manual_seed(42)
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
"""
The code in this chunk is based on the autoencoder implementation
in this repository https://github.com/phcavelar/pathwayae.
"""
class MLP(nn.Module):
    def __init__(
            self,
            input_dim:int,
            hidden_dims:list[int],
            output_dim:int,
            nonlinearity:Callable,
            dropout_rate:float=0.5,
            bias:bool=True,
            ):
        super().__init__()
        in_dims = [input_dim] + hidden_dims
        out_dims = hidden_dims + [output_dim]

        self.layers = nn.ModuleList([nn.Linear(d_in, d_out, bias=bias) for d_in, d_out in zip(in_dims, out_dims)])
        self.nonlinearity = nonlinearity
        self.dropout = nn.Dropout(dropout_rate)

    def forward(self, x:torch.Tensor) -> torch.Tensor:
        for layer in self.layers[:-1]:
            x = self.dropout(self.nonlinearity(layer(x)))
        return self.layers[-1](x)

    def layer_activations(self, x:torch.Tensor) -> list[torch.Tensor]:
        # To allow for activation normalisation
        activations = [x]
        for layer in self.layers[:-1]:
            activations.append(self.dropout(self.nonlinearity(layer(activations[-1]))))
        return activations[1:] + [self.layers[-1](activations[-1])]

class NopLayer(nn.Module):
    def __init__(
            self,
            *args,
            **kwargs,
            ):
        super().__init__()

    def forward(self, x:torch.Tensor) -> torch.Tensor:
        return x

    def update_temperature(self,*args,**kwargs) -> None:
        pass

    def layer_activations(self,*args,**kwargs) -> list[torch.Tensor]:
        return []

class Autoencoder(nn.Module):
    def __init__(
            self,
            input_dim:int=None,
            hidden_dims:list[int]=[128],
            encoding_dim:int=64,
            nonlinearity=F.relu,
            final_nonlinearity=lambda x:x,
            dropout_rate:float=0.5,
            bias:bool=True,
            ):
        super().__init__()
        if input_dim is None:
            raise ValueError("Must specify input dimension before initialising the model")
        try:
            len(hidden_dims)
        except TypeError:
            hidden_dims = [hidden_dims]

        self.encoder = MLP(input_dim, hidden_dims, encoding_dim, nonlinearity, dropout_rate, bias)
        self.decoder = MLP(encoding_dim, hidden_dims[-1::-1], input_dim, nonlinearity, dropout_rate, bias)
        self.final_nonlinearity = final_nonlinearity

    def encode(self,x:torch.Tensor) -> torch.Tensor:
        return self.encoder(x)

    def decode(self,x:torch.Tensor) -> torch.Tensor:
        return self.final_nonlinearity(self.decoder(x))

    def forward(self, x:torch.Tensor) -> torch.Tensor:
        z = self.encode(x)
        x_hat = self.decode(z)
        return x_hat

    def layer_activations(self,x:torch.Tensor) -> list[torch.Tensor]:
        # To allow for activation normalisation
        encoder_activations = self.encoder.layer_activations(x)
        decoder_activations = self.decoder.layer_activations(encoder_activations[-1])
        return encoder_activations + decoder_activations

    def get_feature_importance_matrix(self) -> torch.Tensor:
        with torch.no_grad():
            feature_importance_matrix = self.encoder.layers[0].weight.T
            for layer in self.encoder.layers[1:]:
                feature_importance_matrix = torch.matmul(feature_importance_matrix, layer.weight.T)
        return feature_importance_matrix.detach()



In [16]:
def fit_autoenc(X):
  """
  Trains an autoencoder model on the given dataset X.

  Args:
      X (DataFrame): Input dataset.

  Returns:
      model (Autoencoder): Trained autoencoder model with the best state (based on validation loss).
  """
  model = Autoencoder(input_dim = len(X.columns)).to(device)
  from torch.utils.data import Dataset, DataLoader, TensorDataset

  train, test = train_test_split(X, test_size=0.1, random_state=True, shuffle=True, stratify=None)

  data_train = torch.FloatTensor(train.values).to(device)
  dataset_train = TensorDataset(data_train)

  data_test = torch.FloatTensor(test.values).to(device)
  dataset_test = TensorDataset(data_test)

  training_history =  {'train_loss': [], 'val_loss': []}
  patience = 15
  counter = 0
  best_val_loss = float('inf')
  best_model_state = None


  train_loader = DataLoader(
      dataset=dataset_train,
      batch_size=128,
      shuffle=True
  )

  val_loader = DataLoader(
      dataset=dataset_test,
      batch_size=128,
      shuffle=False
  )

  criterion = nn.MSELoss()
  optimizer = optim.Adam(model.parameters(), lr=1e-3)

  num_epochs = 500
  for epoch in range(num_epochs):
      total_loss = 0
      for i, (genes,) in enumerate(train_loader):
          genes = genes.to(device)

          outputs = model(genes)
          loss = criterion(outputs, genes)

          optimizer.zero_grad()
          loss.backward()
          optimizer.step()

          total_loss += loss.item()

      avg_loss = total_loss / len(train_loader)
      print(f'Epoch [{epoch+1}/{num_epochs}], Average Loss: {avg_loss:.4f}')
      training_history['train_loss'].append(avg_loss)

      model.eval()
      val_loss = 0.0
      with torch.no_grad():
          for i, (genes, ) in enumerate(val_loader):
              genes = genes.to(device)
              outputs = model(genes)
              val_loss += criterion(outputs, genes).item()

      val_loss = val_loss / len(val_loader)
      training_history['val_loss'].append(val_loss)

      if val_loss < best_val_loss:
          best_val_loss = val_loss
          best_model_state = copy.deepcopy(model.state_dict())
          counter = 0
      else:
          counter += 1

      if counter > patience:
          print(f"Early stopping at epoch {epoch+1}")
          break

        # Restore best model
  if best_model_state is not None:
      model.load_state_dict(best_model_state)

  return model

def check_early_stopping(counter, training_history):
  """
  Checks for early stopping by comparing the last two validation losses.

  Args:
      counter (int): Current early stopping counter.
      training_history (dict): Dictionary containing training and validation losses.

  Returns:
      counter (int): Updated counter value based on validation loss comparison.
  """
  if len(training_history['val_loss']) < 2:
      return 0.0

  print(training_history['val_loss'][-1])
  print(training_history['val_loss'][-2])

  if training_history['val_loss'][-1] < training_history['val_loss'][-2]:
      counter = 0.0
  else:
      counter += 1.0
  return counter



In [17]:
import os
import sys
import pandas as pd
import numpy as np
from pathlib import Path
import logging
import joblib
from google.colab import files

def remove_cohort(X, c):
    """
    Removes specified cohort from the dataset.

    Args:
        X (DataFrame): Intersection genes with cohort info.
        c (str): The cohort to be removed from the dataset.

    Returns:
        filtered_X (DataFrame): Dataset without the specified cohort.
        missing_X (DataFrame): Dataset containing only the samples from the specified cohort.
    """
    X['c'] = X.index.to_series().str.split('.').str[0]
    indices = X[X['c'] != c].index
    numeric_indices = [X.index.get_loc(idx) for idx in indices]
    filtered_X = X.loc[indices].drop(['c'], axis = 1)

    missing_indices = X[X['c'] == c].index
    missing_numeric_indices = [X.index.get_loc(idx) for idx in missing_indices]
    missing_X = X.loc[missing_indices].drop(['c'], axis = 1)
    return filtered_X, missing_X


def fit_feature_sel(X, current_c = ''):
    """
    Trains an autoencoder for each cohort in the input dataset and saves trained autoencoder.
    In addition, it saves the latent representations of the training and test cohorts (Group A) and
    the latent representations of the test cohorts (Group B).

    Args:
        X (DataFrame): The input dataset containing samples from multiple cohorts.
        current_c (str): A prefix for saving models and CSV files (default is an empty string).

    Returns:
        None
    """
    cohorts = X.index.to_series().str.split('.').str[0].unique()

    print(current_c)
    for c in cohorts:
        print(c)

        X_tmp, X_missing = remove_cohort(X, c)
        X_val = pd.read_csv('/content/intersect_test_genes_imputed.csv', index_col = 0)
        estimator_c = fit_autoenc(X_tmp)
        latent_X = estimator_c.encoder(torch.FloatTensor(X_tmp.values).to(device)).detach().cpu().numpy()
        latent_X_test = estimator_c.encoder(torch.FloatTensor(X_missing.values).to(device)).detach().cpu().numpy()
        latent = np.concatenate((latent_X, latent_X_test), axis = 0)
        idx = np.concatenate((X_tmp.index, X_missing.index), axis = 0)
        df = pd.DataFrame(latent, index = idx)

        latent_eval = estimator_c.encoder(torch.FloatTensor(X_val.values).to(device)).detach().cpu().numpy()
        idx_eval = X_val.index
        df_eval = pd.DataFrame(latent_eval, index = idx_eval)

        if current_c == '':
            c_path = '/content/models/' + c + '.pth'
            torch.save(estimator_c.state_dict(), c_path)
            df.to_csv(c + '.csv')
            path_csv = '/content/csv/' + c + '.csv'
            df.to_csv(path_csv)

            path_csv_eval = '/content/csv_eval/' + c + '.csv'
            df_eval.to_csv(path_csv_eval)
        else:
            c_path = '/content/models/' + current_c + '_' + c + '.pth'
            torch.save(estimator_c.state_dict(), c_path)
            path_csv = '/content/csv/' + current_c + '_' + c + '.csv'
            df.to_csv(path_csv)
            path_csv_eval = '/content/csv_eval/' + current_c + '_' + c + '.csv'
            df_eval.to_csv(path_csv_eval)


In [18]:
X = pd.read_csv('/content/exprs_intersect.csv', index_col = 0)
cohorts = X.index.to_series().str.split('.').str[0].unique()

In [19]:
# Latent representations/AE for Hyperparameter tuning
fit_feature_sel(X)
print("Outer round done!")


Atlanta_2014_Long
Epoch [1/5], Average Loss: 1.1041
Epoch [2/5], Average Loss: 1.0062
Epoch [3/5], Average Loss: 0.9846
Epoch [4/5], Average Loss: 0.9344
Epoch [5/5], Average Loss: 0.8757
Belfast_2018_Jain
Epoch [1/5], Average Loss: 1.1445
Epoch [2/5], Average Loss: 1.0207
Epoch [3/5], Average Loss: 1.0026
Epoch [4/5], Average Loss: 0.9753
Epoch [5/5], Average Loss: 0.9302
CamCap_2016_Ross_Adams
Epoch [1/5], Average Loss: 1.1132
Epoch [2/5], Average Loss: 0.9954
Epoch [3/5], Average Loss: 0.9701
Epoch [4/5], Average Loss: 0.9239
Epoch [5/5], Average Loss: 0.8628
CancerMap_2017_Luca
Epoch [1/5], Average Loss: 1.0868
Epoch [2/5], Average Loss: 0.9915
Epoch [3/5], Average Loss: 0.9747
Epoch [4/5], Average Loss: 0.9507
Epoch [5/5], Average Loss: 0.9110
CPC_GENE_2017_Fraser
Epoch [1/5], Average Loss: 1.1236
Epoch [2/5], Average Loss: 1.0287
Epoch [3/5], Average Loss: 0.9926
Epoch [4/5], Average Loss: 0.9854
Epoch [5/5], Average Loss: 0.8749
CPGEA_2020_Li
Epoch [1/5], Average Loss: 1.1204
E

In [20]:
cohorts = X.index.to_series().str.split('.').str[0].unique()
# Latent representations/AEs for nested resampling
for c in cohorts:
    X_tmp, X_missing = remove_cohort(X, c)
    fit_feature_sel(X_tmp, c)

print("Inner round done")

Atlanta_2014_Long
Belfast_2018_Jain
Epoch [1/5], Average Loss: 1.1509
Epoch [2/5], Average Loss: 1.0230
Epoch [3/5], Average Loss: 0.9957
Epoch [4/5], Average Loss: 0.9682
Epoch [5/5], Average Loss: 0.9208
CamCap_2016_Ross_Adams
Epoch [1/5], Average Loss: 1.1530
Epoch [2/5], Average Loss: 1.0197
Epoch [3/5], Average Loss: 0.9773
Epoch [4/5], Average Loss: 0.9606
Epoch [5/5], Average Loss: 0.8960
CancerMap_2017_Luca
Epoch [1/5], Average Loss: 1.1381
Epoch [2/5], Average Loss: 1.0427
Epoch [3/5], Average Loss: 0.9878
Epoch [4/5], Average Loss: 0.9412
Epoch [5/5], Average Loss: 0.9081
CPC_GENE_2017_Fraser
Epoch [1/5], Average Loss: 1.1153
Epoch [2/5], Average Loss: 1.0127
Epoch [3/5], Average Loss: 0.9873
Epoch [4/5], Average Loss: 0.9466
Epoch [5/5], Average Loss: 0.8971
CPGEA_2020_Li
Epoch [1/5], Average Loss: 1.0711
Epoch [2/5], Average Loss: 1.0190
Epoch [3/5], Average Loss: 0.9548
Epoch [4/5], Average Loss: 0.9034
Epoch [5/5], Average Loss: 0.8539
DKFZ_2018_Gerhauser
Epoch [1/5], Ave

In [21]:
X = X.drop(['c'], axis = 1)
est_final = fit_autoenc(X)

# Latent representations for performance evaluation on final models
latent_X = est_final.encoder(torch.FloatTensor(X.values).to(device)).detach().cpu().numpy()
df = pd.DataFrame(latent_X, index = X.index)
df.to_csv('/content/csv/pretrnd_cmplt.csv')
torch.save(est_final.state_dict(), '/content/models/pretrnd_cmplt.pth')

X_val = pd.read_csv('/content/intersect_test_genes_imputed.csv', index_col = 0)
latent_eval = est_final.encoder(torch.FloatTensor(X_val.values).to(device)).detach().cpu().numpy()
idx_eval = X_val.index
df_eval = pd.DataFrame(latent_eval, index = idx_eval)
df_eval.to_csv('/content/csv_eval/pretrnd_cmplt.csv')


Epoch [1/5], Average Loss: 1.1858
Epoch [2/5], Average Loss: 1.0073
Epoch [3/5], Average Loss: 0.9770
Epoch [4/5], Average Loss: 0.9345
Epoch [5/5], Average Loss: 0.8905


In [22]:
!zip -r csv.zip csv/

  adding: csv/ (stored 0%)
  adding: csv/CPC_GENE_2017_Fraser_CPGEA_2020_Li.csv (deflated 56%)
  adding: csv/pretrnd_cmplt.csv (deflated 56%)
  adding: csv/Atlanta_2014_Long_CancerMap_2017_Luca.csv (deflated 56%)
  adding: csv/MSKCC_2010_Taylor_Stockholm_2016_Ross_Adams.csv (deflated 56%)
  adding: csv/MSKCC_2010_Taylor_CPC_GENE_2017_Fraser.csv (deflated 56%)
  adding: csv/Atlanta_2014_Long.csv (deflated 56%)
  adding: csv/DKFZ_2018_Gerhauser_Stockholm_2016_Ross_Adams.csv (deflated 56%)
  adding: csv/Stockholm_2016_Ross_Adams_CPC_GENE_2017_Fraser.csv (deflated 56%)
  adding: csv/CancerMap_2017_Luca_MSKCC_2010_Taylor.csv (deflated 56%)
  adding: csv/CancerMap_2017_Luca_CPGEA_2020_Li.csv (deflated 56%)
  adding: csv/CamCap_2016_Ross_Adams_Atlanta_2014_Long.csv (deflated 56%)
  adding: csv/CPGEA_2020_Li_Stockholm_2016_Ross_Adams.csv (deflated 56%)
  adding: csv/CamCap_2016_Ross_Adams_Stockholm_2016_Ross_Adams.csv (deflated 56%)
  adding: csv/Stockholm_2016_Ross_Adams_MSKCC_2010_Taylor.csv

In [23]:
!zip -r csv_eval.zip csv_eval/

  adding: csv_eval/ (stored 0%)
  adding: csv_eval/CPC_GENE_2017_Fraser_CPGEA_2020_Li.csv (deflated 56%)
  adding: csv_eval/pretrnd_cmplt.csv (deflated 56%)
  adding: csv_eval/Atlanta_2014_Long_CancerMap_2017_Luca.csv (deflated 56%)
  adding: csv_eval/MSKCC_2010_Taylor_Stockholm_2016_Ross_Adams.csv (deflated 56%)
  adding: csv_eval/MSKCC_2010_Taylor_CPC_GENE_2017_Fraser.csv (deflated 56%)
  adding: csv_eval/Atlanta_2014_Long.csv (deflated 56%)
  adding: csv_eval/DKFZ_2018_Gerhauser_Stockholm_2016_Ross_Adams.csv (deflated 56%)
  adding: csv_eval/Stockholm_2016_Ross_Adams_CPC_GENE_2017_Fraser.csv (deflated 56%)
  adding: csv_eval/CancerMap_2017_Luca_MSKCC_2010_Taylor.csv (deflated 56%)
  adding: csv_eval/CancerMap_2017_Luca_CPGEA_2020_Li.csv (deflated 56%)
  adding: csv_eval/CamCap_2016_Ross_Adams_Atlanta_2014_Long.csv (deflated 56%)
  adding: csv_eval/CPGEA_2020_Li_Stockholm_2016_Ross_Adams.csv (deflated 56%)
  adding: csv_eval/CamCap_2016_Ross_Adams_Stockholm_2016_Ross_Adams.csv (defla

In [24]:
!zip -r models.zip models/

  adding: models/ (stored 0%)
  adding: models/DKFZ_2018_Gerhauser.pth (deflated 8%)
  adding: models/Stockholm_2016_Ross_Adams_MSKCC_2010_Taylor.pth (deflated 8%)
  adding: models/Stockholm_2016_Ross_Adams_Belfast_2018_Jain.pth (deflated 8%)
  adding: models/MSKCC_2010_Taylor_Stockholm_2016_Ross_Adams.pth (deflated 8%)
  adding: models/CPGEA_2020_Li_Atlanta_2014_Long.pth (deflated 8%)
  adding: models/DKFZ_2018_Gerhauser_CancerMap_2017_Luca.pth (deflated 8%)
  adding: models/CamCap_2016_Ross_Adams_CancerMap_2017_Luca.pth (deflated 8%)
  adding: models/Atlanta_2014_Long_DKFZ_2018_Gerhauser.pth (deflated 8%)
  adding: models/Atlanta_2014_Long_CancerMap_2017_Luca.pth (deflated 8%)
  adding: models/CamCap_2016_Ross_Adams_Stockholm_2016_Ross_Adams.pth (deflated 8%)
  adding: models/DKFZ_2018_Gerhauser_CPC_GENE_2017_Fraser.pth (deflated 8%)
  adding: models/CamCap_2016_Ross_Adams.pth (deflated 8%)
  adding: models/Atlanta_2014_Long_MSKCC_2010_Taylor.pth (deflated 8%)
  adding: models/Atlant

In [25]:
#files.download('models.zip')

In [26]:
#files.download('csv_eval.zip')

In [27]:
#files.download('csv.zip')