In [11]:
%load_ext autoreload
%autoreload 2

import pytorch_lightning as pl
import os
import torch.nn as nn
from utils import data
import torch
from dotenv import load_dotenv

load_dotenv(".env")

The autoreload extension is already loaded. To reload it, use:
  %reload_ext autoreload


True

In [12]:
X_test, y_test = data.get_data("data/splits/test")
X_val, y_val = data.get_data("data/splits/val")


train_loader = torch.utils.data.DataLoader(torch.utils.data.TensorDataset(X_test, y_test), batch_size=32, shuffle=True)
val_loader = torch.utils.data.DataLoader(torch.utils.data.TensorDataset(X_val, y_val), batch_size=256)

X_test.shape

torch.Size([9120, 5045])

In [13]:
class Encoder(nn.Module):
    def __init__(self, feature_dim: int, latent_dim: int):
        super().__init__()
        self.latent_dim = latent_dim
        self.embedding = nn.Linear(feature_dim, latent_dim)

    def forward(self, x: torch.Tensor):
        return self.embedding(x)



class Decoder(nn.Module):
    def __init__(self, feature_dim: int, latent_dim: int):
        super().__init__()
        self.latent_dim = latent_dim
        self.embedding = nn.Linear(latent_dim, feature_dim)

    def forward(self, x: torch.Tensor):
        return self.embedding(x).sigmoid()

In [14]:
class Autoencoder(pl.LightningModule):
    def __init__(self, encoder, decoder, learning_rate=1e-3):
        super().__init__()
        self.encoder = encoder
        self.decoder = decoder
        self.learning_rate = learning_rate
        self.criterion = nn.MSELoss()

    def forward(self, x):
        z = self.encoder(x)
        x_hat = self.decoder(z)
        return x_hat

    def training_step(self, batch, batch_idx):
        x, _ = batch
        x_hat = self.forward(x)
        loss = self.criterion(x_hat, x)
        self.log("train_loss", loss, prog_bar=True)
        return loss

    def validation_step(self, batch, batch_idx):
        x, _ = batch
        x_hat = self.forward(x)
        loss = self.criterion(x_hat, x)
        self.log("val_loss", loss, prog_bar=True)
        self.log("val_accuracy", torch.mean((x_hat == x).to(torch.float32)), prog_bar=True)
        return loss

    def configure_optimizers(self):
        return torch.optim.Adam(self.parameters(), lr=self.learning_rate)


encoder = Encoder(5045, 200)
decoder = Decoder(5045, 200)

autoencoder = Autoencoder(encoder, decoder, 0.01)

neptune_logger = pl.loggers.NeptuneLogger(
    name="Autoencoder",
    project="JPL/rna-sequencing",
    api_key=os.getenv("NEPTUNE_API_TOKEN"),
    capture_stdout=False,
    capture_stderr=False,
    capture_traceback=False,
    capture_hardware_metrics=False,
)

trainer = pl.Trainer(logger=neptune_logger)

trainer.fit(autoencoder, train_loader, val_loader)

GPU available: True (mps), used: True
TPU available: False, using: 0 TPU cores
HPU available: False, using: 0 HPUs
/Users/philipp/Documents/Studium/Informatik/Semester 2/Applied Machine Learning in Genomic Data Science/Project/.venv/lib/python3.10/site-packages/pytorch_lightning/loops/utilities.py:72: `max_epochs` was not set. Setting it to 1000 epochs. To train without an epoch limit, set `max_epochs=-1`.


[neptune] [info   ] Neptune initialized. Open in the app: https://app.neptune.ai/JPL/rna-sequencing/e/RNAS-114



  | Name      | Type    | Params | Mode 
----------------------------------------------
0 | encoder   | Encoder | 1.0 M  | train
1 | decoder   | Decoder | 1.0 M  | train
2 | criterion | MSELoss | 0      | train
----------------------------------------------
2.0 M     Trainable params
0         Non-trainable params
2.0 M     Total params
8.093     Total estimated model params size (MB)
5         Modules in train mode
0         Modules in eval mode


Sanity Checking: |          | 0/? [00:00<?, ?it/s]

/Users/philipp/Documents/Studium/Informatik/Semester 2/Applied Machine Learning in Genomic Data Science/Project/.venv/lib/python3.10/site-packages/pytorch_lightning/trainer/connectors/data_connector.py:424: The 'val_dataloader' does not have many workers which may be a bottleneck. Consider increasing the value of the `num_workers` argument` to `num_workers=10` in the `DataLoader` to improve performance.
/Users/philipp/Documents/Studium/Informatik/Semester 2/Applied Machine Learning in Genomic Data Science/Project/.venv/lib/python3.10/site-packages/pytorch_lightning/trainer/connectors/data_connector.py:424: The 'train_dataloader' does not have many workers which may be a bottleneck. Consider increasing the value of the `num_workers` argument` to `num_workers=10` in the `DataLoader` to improve performance.


Training: |          | 0/? [00:00<?, ?it/s]

Validation: |          | 0/? [00:00<?, ?it/s]

Validation: |          | 0/? [00:00<?, ?it/s]


Detected KeyboardInterrupt, attempting graceful shutdown ...


NameError: name 'exit' is not defined

In [24]:
def make_visible(vector):
    nonzero_indices = torch.nonzero(vector, as_tuple=True)[0]
    nonzero_values = vector[nonzero_indices]

    return list(zip(nonzero_indices.tolist(), nonzero_values.tolist()))

In [25]:
make_visible(X_test[0])

[(37, 2.442145824432373),
 (45, 0.7177269458770752),
 (50, 1.832396388053894),
 (91, 2.346402883529663),
 (94, 1.1312528848648071),
 (97, 2.529517889022827),
 (98, 0.7177269458770752),
 (100, 1.1312528848648071),
 (111, 1.1312528848648071),
 (120, 1.832396388053894),
 (124, 0.7177269458770752),
 (127, 0.7177269458770752),
 (128, 1.4229412078857422),
 (130, 1.1312528848648071),
 (140, 1.1312528848648071),
 (150, 4.158666133880615),
 (160, 2.12206768989563),
 (166, 2.346402883529663),
 (173, 0.7177269458770752),
 (183, 0.7177269458770752),
 (188, 1.1312528848648071),
 (197, 0.7177269458770752),
 (214, 1.4229412078857422),
 (244, 1.1312528848648071),
 (249, 0.7177269458770752),
 (273, 0.7177269458770752),
 (283, 0.7177269458770752),
 (290, 1.1312528848648071),
 (293, 0.7177269458770752),
 (294, 0.7177269458770752),
 (309, 1.4229412078857422),
 (323, 0.7177269458770752),
 (331, 1.4229412078857422),
 (337, 0.7177269458770752),
 (339, 0.7177269458770752),
 (373, 0.7177269458770752),
 (387, 0

In [26]:
make_visible(autoencoder(X_test[0]))

[(1, 9.384789527741422e-37),
 (2, 1.3161322553750058e-28),
 (6, 2.3596159290438046e-35),
 (8, 3.7780024766863535e-33),
 (9, 2.751830334298703e-26),
 (12, 1.0700001182728294e-35),
 (14, 4.006815241603583e-15),
 (16, 9.02963261165117e-23),
 (17, 2.223845901044942e-34),
 (19, 3.166671407273836e-27),
 (27, 1.969472407894185e-21),
 (31, 5.776276384185024e-34),
 (33, 2.008903714904966e-24),
 (34, 2.651244001160987e-30),
 (37, 1.0),
 (39, 5.221493635410204e-28),
 (40, 1.4947469533976087e-33),
 (44, 6.295856642191983e-33),
 (45, 5.440501790286943e-21),
 (46, 1.2055788932234473e-27),
 (47, 4.81316740565795e-27),
 (48, 3.0211931382235277e-30),
 (50, 1.0),
 (53, 3.288429459702178e-31),
 (54, 4.2108831415278165e-36),
 (56, 9.82937820978166e-26),
 (57, 1.9717504659178817e-28),
 (61, 4.78286909406076e-30),
 (63, 1.543989998985222e-19),
 (64, 4.3872472544158085e-31),
 (66, 3.0641573764889704e-25),
 (67, 9.914733253603045e-37),
 (68, 7.981972808573613e-32),
 (69, 1.218124569400056e-28),
 (72, 1.0),
 (