## Libraries importing

In [135]:
# Load packages and classes
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import tiffslide
import seaborn as sns
import gget
import tifffile
import zarr
import torch
import torch.nn as nn
import torch.optim as optim
from torch.utils.data import Dataset, DataLoader

# MosaicDataset and BruceDataset classes allow loading and visualisation of the different data sources
from gbmhackathon import MosaicDataset

## Getting data

In [57]:
source_dict_mosaic = MosaicDataset.load_tabular()

In [79]:
print(source_dict_mosaic.keys())
print(source_dict_mosaic['wes'].keys())

dele = source_dict_mosaic['wes']['WES CNV deletion']
amp = source_dict_mosaic['wes']['WES CNV amplification']
mut = source_dict_mosaic['wes']['WES mutations']

dict_keys(['clinical', 'bulk_rna', 'wes', 'he'])
dict_keys(['WES CNV deletion', 'WES CNV amplification', 'WES CNV oncogenic', 'WES mutations'])


## Organizing the data to have the same columns

In [80]:
int_size = len(set(mut.columns).intersection(set(amp.columns)))

print(f"The size of the intersection is {int_size}")

The size of the intersection is 498


In [98]:
## I discovered that we have a duplicated column at amp and dele
amp = amp[mut.columns]
dele = dele[mut.columns]

len(set(amp.columns))
column_counts = pd.Series(amp.columns).value_counts()
duplicated_columns = column_counts[column_counts > 1].index.tolist()
duplicated_columns

['P2RY8']

In [102]:
## Droping the duplicated column
column_counts = pd.Series(amp.columns).value_counts()
duplicated_columns = column_counts[column_counts > 1].index.tolist()

amp = amp.loc[:, ~amp.columns.duplicated()]
dele = dele.loc[:, ~dele.columns.duplicated()]

print(amp.shape)
print(dele.shape)
print(mut.shape)

(107, 498)
(107, 498)
(107, 498)


## Creating the architecture to embed

In [207]:
## In this case is constructed as the article say, and using the concatenation

class DNAModel(nn.Module):
    def __init__(self, input_dim, middle_dim, output_dim,dropout_rate=0.2):
        super(DNAModel, self).__init__()
        fused_input_dim = input_dim * 3
        self.embedding = nn.Sequential(
            nn.Linear(fused_input_dim, input_dim),
            nn.Linear(input_dim, input_dim),
            nn.Linear(input_dim, input_dim),
            nn.Linear(input_dim, middle_dim),
            nn.Dropout(dropout_rate),
            nn.ReLU(),
            nn.Linear(middle_dim, middle_dim),
            nn.Linear(middle_dim, middle_dim),
            nn.Linear(middle_dim, middle_dim),
            nn.Linear(middle_dim, output_dim),
            nn.Dropout(dropout_rate),
            nn.ReLU()
        )
    
    def forward(self, x):
        x = x.view(1, -1)
        x = self.embedding(x)
        return x

In [208]:
input_dim = 498
middle_dim = 128     
output_dim = 64      

model = DNAModel(input_dim, middle_dim, output_dim)

for i in range(mut.shape[0]):
    mut_row = torch.tensor(mut.iloc[i, :].to_numpy(), dtype=torch.float32)
    dele_row = torch.tensor(dele.iloc[i, :].to_numpy(), dtype=torch.float32)
    amp_row = torch.tensor(amp.iloc[i, :].to_numpy(), dtype=torch.float32)

    input_data = torch.stack([mut_row, dele_row, amp_row], dim=0)
    torch.unsqueeze(input_data, dim = 0)
    output = model(input_data)

output.shape

torch.Size([1, 64])

In [134]:
## testing line
input_data = mut_row
input_data = torch.stack([mut_row, dele_row, amp_row], dim=0)

input_data.view(1,-1)
output = model(input_data)

tensor([[0., 0., 0.,  ..., 0., 0., 0.]])

### Alternative model with an embedding before

In [None]:
class DNAModel(nn.Module):
    def __init__(self, input_dim, middle_dim, output_dim,dropout_rate=0.2):
        super(DNAModel, self).__init__()
        self.embedding = nn.Sequential(
            nn.Linear(input_dim, input_dim),
            nn.Linear(input_dim, input_dim),
            nn.Linear(input_dim, input_dim),
            nn.Linear(input_dim, middle_dim),
            nn.Dropout(dropout_rate),
            nn.ReLU(),
            nn.Linear(middle_dim, middle_dim),
            nn.Linear(middle_dim, middle_dim),
            nn.Linear(middle_dim, middle_dim),
            nn.Linear(middle_dim, output_dim),
            nn.Dropout(dropout_rate),
            nn.ReLU()
        )
    
    def forward(self, x):
        x = x.view(1, -1)
        x = self.embedding(x)
        return x

input_dim = 498
middle_dim = 128     
output_dim = 64      

model = DNAModel(input_dim, middle_dim, output_dim)

for i in range(mut.shape[0]):
    mut_row = torch.tensor(mut.iloc[i, :].to_numpy(), dtype=torch.float32)
    dele_row = torch.tensor(dele.iloc[i, :].to_numpy(), dtype=torch.float32)
    amp_row = torch.tensor(amp.iloc[i, :].to_numpy(), dtype=torch.float32)

    input_data = torch.stack([mut_row, dele_row, amp_row], dim=0)
    torch.unsqueeze(input_data, dim = 0)
    output = model(input_data)

output.shape

In [None]:
class DNAModelCNN(nn.Module):
    def __init__(self, input_dim, middle_dim, output_dim, dropout_rate=0.2):
        super(DNAModelCNN, self).__init__()

        self.conv1d = nn.Conv1d(in_channels=3, out_channels=8, kernel_size=3, padding=1)
        self.relu = nn.ReLU()
        self.pool = nn.MaxPool1d(kernel_size=2) 

        conv_out_dim = (input_dim // 2) * 8  # Ajuste do tamanho da saída após convolução + pooling
        self.fc = nn.Sequential(
            nn.Linear(conv_out_dim, middle_dim),
            nn.ReLU(),
            nn.Dropout(dropout_rate),
            nn.Linear(middle_dim, output_dim)
        )

    def forward(self, x):
        x = self.conv1d(x)  # Aplica convolução 1D (mantém a relação entre as 3 linhas)
        x = self.relu(x)
        x = self.pool(x)  # Reduz dimensão pela metade
        x = x.view(x.shape[0], -1)  # Flatten para entrada na rede densa
        x = self.fc(x)  # Passa pelas camadas densas
        return x


In [None]:
# Definição das dimensões
input_dim = 498
middle_dim = 128
output_dim = 64

# Criar modelo
model = DNAModelCNN(input_dim, middle_dim, output_dim)

# Simulação dos dados (3 linhas, input_dim colunas)
mut_row = torch.rand(input_dim)
dele_row = torch.rand(input_dim)
amp_row = torch.rand(input_dim)

# Empilhando as linhas no formato (3, input_dim)
input_data = torch.stack([mut_row, dele_row, amp_row], dim=0).unsqueeze(0)  # (1, 3, input_dim) para batch

# Passando pelo modelo
output = model(input_data)

print("Saída do modelo:", output.shape)  # Deve ser (1, output_dim)


In [137]:
# 🔹 Definição do Modelo CNN 1D
class DNAModelCNN(nn.Module):
    def __init__(self, input_dim, middle_dim, output_dim, dropout_rate=0.2):
        super(DNAModelCNN, self).__init__()

        self.conv1d = nn.Conv1d(in_channels=3, out_channels=8, kernel_size=3, padding=1)
        self.relu = nn.ReLU()
        self.pool = nn.MaxPool1d(kernel_size=2)

        conv_out_dim = (input_dim // 2) * 8  # Ajuste do tamanho da saída
        self.fc = nn.Sequential(
            nn.Linear(conv_out_dim, middle_dim),
            nn.ReLU(),
            nn.Dropout(dropout_rate),
            nn.Linear(middle_dim, output_dim)
        )

    def forward(self, x):
        x = self.conv1d(x)
        x = self.relu(x)
        x = self.pool(x)
        x = x.view(x.shape[0], -1)  # Flatten
        x = self.fc(x)
        return x

# 🔹 Dataset Customizado
class DNADataset(Dataset):
    def __init__(self, mut_data, dele_data, amp_data):
        self.mut_data = torch.tensor(mut_data.to_numpy(), dtype=torch.float32)
        self.dele_data = torch.tensor(dele_data.to_numpy(), dtype=torch.float32)
        self.amp_data = torch.tensor(amp_data.to_numpy(), dtype=torch.float32)
        self.labels = torch.tensor(labels.to_numpy(), dtype=torch.long)

    def __len__(self):
        return len(self.amp_data.shape[0])

    def __getitem__(self, idx):
        mut_row = self.mut_data[idx]
        dele_row = self.dele_data[idx]
        amp_row = self.amp_data[idx]

        input_data = torch.stack([mut_row, dele_row, amp_row], dim=0)

        return input_data

# 🔹 Função para Treinar o Modelo
def train_model(model, dataloader, criterion, optimizer, num_epochs=10):
    model.train()
    for epoch in range(num_epochs):
        total_loss = 0
        for inputs, labels in dataloader:
            inputs, labels = inputs.unsqueeze(1), labels  # Adiciona batch dim
            optimizer.zero_grad()
            outputs = model(inputs)
            loss = criterion(outputs, labels)
            loss.backward()
            optimizer.step()
            total_loss += loss.item()

        print(f"Epoch {epoch+1}/{num_epochs}, Loss: {total_loss / len(dataloader)}")

# 🔹 Função para Obter Representações
def extract_representations(model, dataloader):
    model.eval()
    representations = []
    with torch.no_grad():
        for inputs, _ in dataloader:
            inputs = inputs.unsqueeze(1)
            reps = model(inputs)  # Pegamos a saída do modelo
            representations.append(reps)

    return torch.cat(representations, dim=0)

# 🔹 Parâmetros
input_dim = 498
middle_dim = 128
output_dim = 64
batch_size = 32
learning_rate = 0.001
num_epochs = 20

dataset = DNADataset(mut, dele, amp)
dataloader = DataLoader(dataset, batch_size=batch_size, shuffle=True)

# 🔹 Criando Modelo e Treinando
model = DNAModelCNN(input_dim, middle_dim, output_dim)
criterion = nn.CrossEntropyLoss()
optimizer = optim.Adam(model.parameters(), lr=learning_rate)

train_model(model, dataloader, criterion, optimizer, num_epochs)

# 🔹 Extraindo Representações do Modelo Treinado
representations = extract_representations(model, dataloader)
print("Formato das Representações:", representations.shape)  # Deve ser (1000, 64)


TypeError: DNADataset.__init__() missing 1 required positional argument: 'labels'

In [145]:
import torch
import torch.nn as nn
import torch.optim as optim
from torch.utils.data import Dataset, DataLoader

# 🔹 Modelo CNN Encoder para Representação Não Supervisionada
class DNAModelCNN(nn.Module):
    def __init__(self, input_dim, middle_dim, output_dim, dropout_rate=0.2):
        super(DNAModelCNN, self).__init__()

        self.conv1d = nn.Conv1d(in_channels=3, out_channels=8, kernel_size=3, padding=1)
        self.relu = nn.ReLU()
        self.pool = nn.MaxPool1d(kernel_size=2)

        conv_out_dim = (input_dim // 2) * 8  # Ajuste da saída após pooling
        self.encoder = nn.Sequential(
            nn.Linear(conv_out_dim, middle_dim),
            nn.ReLU(),
            nn.Dropout(dropout_rate),
            nn.Linear(middle_dim, output_dim)  # O output agora é o embedding final
        )

    def forward(self, x):
        x = self.conv1d(x)
        x = self.relu(x)
        x = self.pool(x)
        x = x.view(x.shape[0], -1)  # Flatten
        x = self.encoder(x)  # Gerar embedding final
        return x

In [170]:
class DNADataset(Dataset):
    def __init__(self, mut_data, dele_data, amp_data):
        self.mut_data = torch.tensor(mut_data.to_numpy(), dtype=torch.float32)
        self.dele_data = torch.tensor(dele_data.to_numpy(), dtype=torch.float32)
        self.amp_data = torch.tensor(amp_data.to_numpy(), dtype=torch.float32)

    def __len__(self):
        return len(self.mut_data)

    def __getitem__(self, idx):
        mut_row = self.mut_data[idx]
        dele_row = self.dele_data[idx]
        amp_row = self.amp_data[idx]

        input_data = torch.stack([mut_row, dele_row, amp_row], dim=0)

        return input_data  # Agora retorna só o input, sem label


In [187]:
def train_model(model, dataloader, criterion, optimizer, num_epochs=10):
    model.train()
    for epoch in range(num_epochs):
        total_loss = 0
        for inputs in dataloader:
            #inputs = inputs.unsqueeze(1)  # Adiciona batch dim
            optimizer.zero_grad()
            outputs = model(inputs)
            loss = criterion(outputs, outputs)  # Tenta preservar a própria estrutura
            loss.backward()
            optimizer.step()
            total_loss += loss.item()

        print(f"Epoch {epoch+1}/{num_epochs}, Loss: {total_loss / len(dataloader)}")


In [190]:
def extract_representations(model, dataloader):
    model.eval()
    representations = []
    with torch.no_grad():
        for inputs in dataloader:
            #inputs = inputs.unsqueeze(1)
            reps = model(inputs)  # Pegamos a saída do encoder
            representations.append(reps)

    return torch.cat(representations, dim=0)


In [191]:
# 🔹 Definição dos parâmetros
input_dim = 498
middle_dim = 128
output_dim = 64
batch_size = 32
learning_rate = 0.001
num_epochs = 20

dataset = DNADataset(mut, dele, amp)
dataloader = DataLoader(dataset, batch_size=batch_size, shuffle=True)

# 🔹 Criando o modelo
model = DNAModelCNN(input_dim, middle_dim, output_dim)
criterion = nn.MSELoss()  # Agora estamos tentando preservar a estrutura das representações
optimizer = optim.Adam(model.parameters(), lr=learning_rate)

# 🔹 Treinando
train_model(model, dataloader, criterion, optimizer, num_epochs)

# 🔹 Extraindo Representações
representations = extract_representations(model, dataloader)
print("Formato das Representações:", representations.shape)  # Deve ser (1000, 64)

Epoch 1/20, Loss: 0.0
Epoch 2/20, Loss: 0.0
Epoch 3/20, Loss: 0.0
Epoch 4/20, Loss: 0.0
Epoch 5/20, Loss: 0.0
Epoch 6/20, Loss: 0.0
Epoch 7/20, Loss: 0.0
Epoch 8/20, Loss: 0.0
Epoch 9/20, Loss: 0.0
Epoch 10/20, Loss: 0.0
Epoch 11/20, Loss: 0.0
Epoch 12/20, Loss: 0.0
Epoch 13/20, Loss: 0.0
Epoch 14/20, Loss: 0.0
Epoch 15/20, Loss: 0.0
Epoch 16/20, Loss: 0.0
Epoch 17/20, Loss: 0.0
Epoch 18/20, Loss: 0.0
Epoch 19/20, Loss: 0.0
Epoch 20/20, Loss: 0.0
Formato das Representações: torch.Size([107, 64])


## On y va

In [196]:
class DNAAutoencoderCNN(nn.Module):
    def __init__(self, input_dim, middle_dim, output_dim, dropout_rate=0.2):
        super(DNAAutoencoderCNN, self).__init__()

        # 🔹 Encoder
        self.conv1d = nn.Conv1d(in_channels=3, out_channels=8, kernel_size=3, padding=1)
        self.relu = nn.ReLU()
        self.pool = nn.MaxPool1d(kernel_size=2)

        conv_out_dim = (input_dim // 2) * 8  # reduced size by convolution
        self.encoder = nn.Sequential(
            nn.Linear(conv_out_dim, middle_dim),
            nn.ReLU(),
            nn.Dropout(dropout_rate),
            nn.Linear(middle_dim, output_dim) 
        )

        self.decoder = nn.Sequential(
            nn.Linear(output_dim, middle_dim),
            nn.ReLU(),
            nn.Linear(middle_dim, conv_out_dim),
            nn.ReLU()
        )

        self.upsample = nn.Upsample(scale_factor=2, mode="linear")  # To restore the dimension
        self.deconv1d = nn.ConvTranspose1d(in_channels=8, out_channels=3, kernel_size=3, padding=1)

    def forward(self, x):
        # 🔹 Encoder
        x = self.conv1d(x)
        x = self.relu(x)
        x = self.pool(x)
        x = x.view(x.shape[0], -1)  
        encoded = self.encoder(x)  
        
        # 🔹 Decoder
        x = self.decoder(encoded)
        x = x.view(x.shape[0], 8, -1)  # Restauration to the format of deconv1
        x = self.upsample(x)  # 🔹 restauration of original size
        x = self.deconv1d(x)  # Return [batch, 3, input_dim]

        return encoded, x  # Embedding + reconstruction

In [197]:
def train_autoencoder(model, dataloader, criterion, optimizer, num_epochs=10):
    model.train()
    for epoch in range(num_epochs):
        total_loss = 0
        for inputs in dataloader:
            optimizer.zero_grad()
            encoded, reconstructed = model(inputs)  # Pegamos a saída reconstruída
            loss = criterion(reconstructed, inputs)  # 🔹 Comparação com a entrada original!
            loss.backward()
            optimizer.step()
            total_loss += loss.item()

        print(f"Epoch {epoch+1}/{num_epochs}, Loss: {total_loss / len(dataloader)}")


In [198]:
def extract_representations(model, dataloader):
    model.eval()
    representations = []
    with torch.no_grad():
        for inputs in dataloader:
            encoded, _ = model(inputs)  # Pegamos apenas o embedding
            representations.append(encoded)

    return torch.cat(representations, dim=0)


In [206]:
input_dim = 498
middle_dim = 128
output_dim = 64
batch_size = 32
learning_rate = 0.001
num_epochs = 400

model = DNAAutoencoderCNN(input_dim, middle_dim, output_dim)
criterion = nn.MSELoss() 
optimizer = optim.Adam(model.parameters(), lr=learning_rate)

train_autoencoder(model, dataloader, criterion, optimizer, num_epochs)

representations = extract_representations(model, dataloader)
print("Formato das Representações:", representations.shape)  


Epoch 1/400, Loss: 0.048531231470406055
Epoch 2/400, Loss: 0.03448156779631972
Epoch 3/400, Loss: 0.027836423832923174
Epoch 4/400, Loss: 0.02715453878045082
Epoch 5/400, Loss: 0.02653001481667161
Epoch 6/400, Loss: 0.025478763040155172
Epoch 7/400, Loss: 0.0252101537771523
Epoch 8/400, Loss: 0.02554304664954543
Epoch 9/400, Loss: 0.025003886315971613
Epoch 10/400, Loss: 0.023920131847262383
Epoch 11/400, Loss: 0.02502440381795168
Epoch 12/400, Loss: 0.02389764552935958
Epoch 13/400, Loss: 0.024430389516055584
Epoch 14/400, Loss: 0.023858182597905397
Epoch 15/400, Loss: 0.02285435190424323
Epoch 16/400, Loss: 0.023500895127654076
Epoch 17/400, Loss: 0.02481219545006752
Epoch 18/400, Loss: 0.02326290449127555
Epoch 19/400, Loss: 0.023387568537145853
Epoch 20/400, Loss: 0.023261358495801687
Epoch 21/400, Loss: 0.023011695593595505
Epoch 22/400, Loss: 0.024504899978637695
Epoch 23/400, Loss: 0.023283464834094048
Epoch 24/400, Loss: 0.02474353276193142
Epoch 25/400, Loss: 0.022635506931692

In [221]:
import torch
import torch.nn as nn
import torch.optim as optim

class DNAModel(nn.Module):
    def __init__(self, input_dim, middle_dim, output_dim, dropout_rate=0.2):
        super(DNAModel, self).__init__()
        fused_input_dim = input_dim * 3
        self.encoder = nn.Sequential(
            nn.Linear(fused_input_dim, input_dim),
            nn.Linear(input_dim, input_dim),
            nn.Linear(input_dim, input_dim),
            nn.Linear(input_dim, middle_dim),
            nn.Dropout(dropout_rate),
            nn.ReLU(),
            nn.Linear(middle_dim, middle_dim),
            nn.Linear(middle_dim, middle_dim),
            nn.Linear(middle_dim, middle_dim),
            nn.Linear(middle_dim, output_dim),  # Embedding de saída
            nn.Dropout(dropout_rate),
            nn.ReLU()
        )

        # Decoder para reconstrução dos dados
        self.decoder = nn.Sequential(
            nn.Linear(output_dim, middle_dim),
            nn.ReLU(),
            nn.Linear(middle_dim, middle_dim),
            nn.ReLU(),
            nn.Linear(middle_dim, middle_dim),
            nn.ReLU(),
            nn.Linear(middle_dim, fused_input_dim)  # Reconstrução da entrada original
        )

    def forward(self, x):
        x = x.view(1, -1)  # Ajuste da dimensão
        embedding = self.encoder(x)  # Criando a representação latente
        reconstruction = self.decoder(embedding)  # Reconstrução da entrada
        return embedding, reconstruction  # Retorna tanto o embedding quanto a reconstrução


def train_autoencoder(model, mut, dele, amp, criterion, optimizer, num_epochs=10):
    #model.train() 
    for epoch in range(num_epochs):
        total_loss = 0
        num_samples = mut.shape[0]
        for i in range(num_samples):
            mut_row = torch.tensor(mut.iloc[i, :].to_numpy(), dtype=torch.float32)
            dele_row = torch.tensor(dele.iloc[i, :].to_numpy(), dtype=torch.float32)
            amp_row = torch.tensor(amp.iloc[i, :].to_numpy(), dtype=torch.float32)
    
            input_data = torch.stack([mut_row, dele_row, amp_row], dim=0)
            input_data = input_data.view(1, -1)  
    
            optimizer.zero_grad()
            embedding, reconstruction = model(input_data)
    
            loss = criterion(reconstruction, input_data)
            loss.backward()
            optimizer.step()
    
            total_loss += loss.item()

        print(f"Epoch {epoch+1}, Loss: {total_loss / num_samples:.6f}")

# Parâmetros do modelo
input_dim = 498
middle_dim = 128     
output_dim = 64      

# Criar o modelo
model = DNAModel(input_dim, middle_dim, output_dim)

# Definir otimizador e função de perda
optimizer = optim.Adam(model.parameters(), lr=0.001)
criterion = nn.MSELoss()

# Simulação dos dados
num_samples = mut.shape[0]  # Número de amostras
num_epochs = 80  # Número de épocas de treinamento

train_autoencoder(model, mut, dele, amp, criterion, optimizer, num_epochs)

sample_input = torch.stack([
    torch.tensor(mut.iloc[0, :].to_numpy(), dtype=torch.float32),
    torch.tensor(dele.iloc[0, :].to_numpy(), dtype=torch.float32),
    torch.tensor(amp.iloc[0, :].to_numpy(), dtype=torch.float32)
], dim=0).view(1, -1)

embedding, _ = model(sample_input)
print("Embedding gerado:", embedding)

Epoch 1, Loss: 0.024076
Epoch 2, Loss: 0.022949
Epoch 3, Loss: 0.022862
Epoch 4, Loss: 0.022896
Epoch 5, Loss: 0.023200
Epoch 6, Loss: 0.022764
Epoch 7, Loss: 0.035530
Epoch 8, Loss: 0.023515
Epoch 9, Loss: 0.022819
Epoch 10, Loss: 0.022632
Epoch 11, Loss: 0.022617
Epoch 12, Loss: 0.022589
Epoch 13, Loss: 0.022583
Epoch 14, Loss: 0.022574
Epoch 15, Loss: 0.022704
Epoch 16, Loss: 0.022615
Epoch 17, Loss: 0.022568
Epoch 18, Loss: 0.022545
Epoch 19, Loss: 0.022555
Epoch 20, Loss: 0.022547
Epoch 21, Loss: 0.022539
Epoch 22, Loss: 0.022524
Epoch 23, Loss: 0.022555
Epoch 24, Loss: 0.022541
Epoch 25, Loss: 0.022529
Epoch 26, Loss: 0.022528
Epoch 27, Loss: 0.022521
Epoch 28, Loss: 0.022527
Epoch 29, Loss: 0.022510
Epoch 30, Loss: 0.022483
Epoch 31, Loss: 0.022539
Epoch 32, Loss: 0.022523
Epoch 33, Loss: 0.022515
Epoch 34, Loss: 0.022513
Epoch 35, Loss: 0.022505
Epoch 36, Loss: 0.022504
Epoch 37, Loss: 0.022501
Epoch 38, Loss: 0.022497
Epoch 39, Loss: 0.022496
Epoch 40, Loss: 0.022492
Epoch 41,