In [1]:
import sys
import os
import pandas as pd
import torch
import torch.nn as nn
import numpy as np
import matplotlib.pyplot as plt
from tqdm.notebook import tqdm
sys.path.append("../../")
from utils.evaluation import evaluate
from utils.metrics import Metrics



In [2]:
import neptune
from neptune_pytorch import NeptuneLogger
from neptune.utils import stringify_unsupported
from dotenv import load_dotenv
# Read key/value pairs from a local .env file into the process environment.
# The Neptune credentials used further down are fetched with os.getenv, so this
# has to run before the run is initialised.
load_dotenv()

True

In [3]:

import pickle
from git import Repo

# Resolve the repository root so the data paths do not depend on the
# directory the notebook kernel was started from.
repo = Repo(".", search_parent_directories=True)
git_root = repo.git.rev_parse("--show-toplevel")


def _load_pickle(path):
    """Unpickle the object stored at ``path`` and close the file afterwards.

    NOTE: unpickling can execute arbitrary code. Only point this at split
    files produced by this project, never at untrusted data.
    """
    with open(path, "rb") as handle:
        return pickle.load(handle)


# Load data
X_Train_pd = _load_pickle(f"{git_root}/data/splits/train/X_pandas.pck")
y_Train_pd = _load_pickle(f"{git_root}/data/splits/train/y_pandas.pck")

X_Val_pd = _load_pickle(f"{git_root}/data/splits/val/X_pandas.pck")
y_Val_pd = _load_pickle(f"{git_root}/data/splits/val/y_pandas.pck")

In [4]:
# Convert every split to a float32 tensor in one pass; the order of the
# targets on the left matches the order of the frames on the right.
X_Train, y_Train, X_Val, y_Val = (
    torch.tensor(frame.values, dtype=torch.float32)
    for frame in (X_Train_pd, y_Train_pd, X_Val_pd, y_Val_pd)
)


In [5]:
# class SimpleNetwork(torch.nn.Module):
#     def __init__(self):
#         super().__init__()

#         self.layers = torch.nn.Sequential(
#             torch.nn.Linear(5045, 5045),
#             torch.nn.LeakyReLU(0.1),
#             torch.nn.Linear(5045, 5045),
#             torch.nn.LeakyReLU(0.1),
#             torch.nn.Linear(5045, 2000),
#             torch.nn.LeakyReLU(0.1),
#             torch.nn.Linear(2000, 1000),
#             torch.nn.LeakyReLU(0.1),
#             torch.nn.Linear(1000, 300),
#             torch.nn.LeakyReLU(0.1),
#             torch.nn.Linear(300, 105),
#         )

#     def forward(self, x):
#         return self.layers(x)

class SimpleNetwork(torch.nn.Module):
    """Fully connected network that outputs raw logits.

    Every hidden stage is ``Linear -> LeakyReLU -> BatchNorm1d -> Dropout`` and
    the stack ends with a plain ``Linear`` layer (no activation), so the output
    is meant for a logit-based loss or an explicit sigmoid.

    The default arguments rebuild the original hard-coded architecture
    (5045 inputs, five hidden layers, 105 outputs) with the same module order,
    so parameter names such as ``layers.0.weight`` are unchanged.

    Args:
        in_features: Width of the input vectors.
        hidden_sizes: Output width of each hidden ``Linear`` layer, in order.
        out_features: Number of logits produced per sample.
        negative_slope: Slope used by every ``LeakyReLU``.
        dropout: Drop probability used by every ``Dropout``.
    """

    def __init__(
        self,
        in_features=5045,
        hidden_sizes=(3934, 2246, 1955, 745, 702),
        out_features=105,
        negative_slope=0.26250042355767117,
        dropout=0.26081286435916,
    ):
        super().__init__()

        stack = []
        width = in_features
        for hidden in hidden_sizes:
            stack.extend(
                [
                    torch.nn.Linear(width, hidden),
                    torch.nn.LeakyReLU(negative_slope),
                    torch.nn.BatchNorm1d(hidden),
                    torch.nn.Dropout(p=dropout),
                ]
            )
            width = hidden
        # Output layer: left without activation so the network emits logits.
        stack.append(torch.nn.Linear(width, out_features))

        self.layers = torch.nn.Sequential(*stack)

        # Apply Xavier initialization (weights only; biases keep the
        # torch.nn.Linear default initialisation).
        for layer in self.layers:
            if isinstance(layer, torch.nn.Linear):
                torch.nn.init.xavier_normal_(layer.weight)

    def forward(self, x):
        """Return the logits for a batch ``x`` of shape ``(batch, in_features)``."""
        return self.layers(x)


In [6]:
class rnaDataset(torch.utils.data.Dataset):
    """Map-style dataset that pairs row ``i`` of ``X`` with entry ``i`` of ``y``."""

    def __init__(self, X, y):
        # Only references are stored; nothing is copied or converted here.
        self.X, self.y = X, y

    def __getitem__(self, idx):
        """Return the ``(features, target)`` pair stored at position ``idx``."""
        features = self.X[idx]
        target = self.y[idx]
        return features, target

    def __len__(self):
        """Number of samples, taken from the first dimension of ``X``."""
        n_samples = self.X.shape[0]
        return n_samples

In [7]:
def label_from_logits(y_hat: torch.Tensor, threshold=0.5) -> torch.Tensor:
    """Convert raw logits into hard 0.0/1.0 labels.

    The logits are squashed with a sigmoid and an entry becomes ``1.0`` when
    its probability is strictly greater than ``threshold``, otherwise ``0.0``.
    The computation runs without gradient tracking and the result has the same
    shape as ``y_hat``.
    """
    with torch.no_grad():
        probabilities = torch.sigmoid(y_hat)
        is_positive = probabilities > threshold
        return is_positive.float()


def evaluate_from_dataframe(X: pd.DataFrame) -> pd.DataFrame:
    """Predict hard labels for a feature frame using the notebook-level ``model``.

    Side effects: the global ``model`` is switched to evaluation mode and moved
    to the CPU, and it stays that way after the call.

    Args:
        X: Feature frame; its values are converted to a float32 tensor.

    Returns:
        A new DataFrame holding the 0.0/1.0 predictions with a default integer
        index and default column labels (the index of ``X`` is not carried over).
    """
    X_tensor = torch.tensor(X.to_numpy(), dtype=torch.float32)

    # model: a pytorch model defined at notebook scope, which maps X -> logits
    model.eval()
    model.cpu()
    # Inference only: without no_grad the forward pass would record an autograd
    # graph for the whole frame and keep every activation alive for nothing.
    with torch.no_grad():
        y_pred_tensor = label_from_logits(model(X_tensor))

    return pd.DataFrame(y_pred_tensor.numpy())

def training(model, optimizer, criterion, train_dataloader, val_dataloder, epochs, device, neptune_logger=None, run=None):
    """Train ``model`` and validate it after every epoch.

    Each epoch runs one optimisation pass over ``train_dataloader``, then a
    gradient-free pass over ``val_dataloder``, optionally appends the epoch
    metrics to Neptune, and prints a one-line summary. Nothing is returned.

    Args:
        model: Network that returns logits; moved to ``device``.
        optimizer: Optimizer already bound to the parameters of ``model``.
        criterion: Loss called as ``criterion(logits, targets)``; moved to ``device``.
        train_dataloader: Iterable yielding ``(x, y)`` training batches.
        val_dataloder: Iterable yielding ``(x, y)`` validation batches (the
            parameter keeps its original spelling so keyword callers still work).
        epochs: Number of passes over the training data.
        device: Device the model, loss and batches are moved to.
        neptune_logger: Optional logger; its ``base_namespace`` is the key under
            which the metric series are appended.
        run: Neptune run that receives the metrics; required whenever
            ``neptune_logger`` is given.

    Raises:
        ValueError: If ``neptune_logger`` is given without ``run``.

    Notes:
        ``train_loss`` and ``val_loss`` are sums of the per-batch losses, not
        means. ``train_acc`` is the mean of the per-batch accuracies, while the
        validation metrics are computed once over all validation samples.
    """
    if neptune_logger and run is None:
        # Fail before training instead of after the first full epoch.
        raise ValueError("`run` must be provided when `neptune_logger` is given")

    criterion = criterion.to(device)
    model = model.to(device)
    for epoch in tqdm(range(epochs)):
        train_acc = 0
        train_loss = 0
        val_loss = 0
        n_train_batches = 0

        model.train()
        for x, y in train_dataloader:
            x = x.to(device)
            y = y.to(device)
            y_pred = model(x)
            loss = criterion(y_pred, y)
            train_loss += loss.item()
            loss.backward()
            optimizer.step()
            optimizer.zero_grad()
            train_acc += Metrics.calculate_accuracy(y.cpu().numpy(), label_from_logits(y_pred).cpu().numpy())
            n_train_batches += 1

        # Guard against an empty training loader (would divide by zero).
        mean_train_acc = train_acc / n_train_batches if n_train_batches else 0.0

        model.eval()
        pred_batches = []
        true_batches = []
        with torch.no_grad():
            for x, y in val_dataloder:
                x = x.to(device)
                y = y.to(device)
                y_pred = model(x)
                # .item() keeps val_loss a plain float; accumulating the tensor
                # itself would print and log a device tensor instead of a number.
                val_loss += criterion(y_pred, y).item()
                pred_batches.append(label_from_logits(y_pred).cpu().numpy())
                true_batches.append(y.cpu().numpy())

        # Join the batches once instead of re-stacking the growing array on
        # every iteration (which copies all previous rows each time).
        y_preds = np.concatenate(pred_batches, axis=0) if pred_batches else np.array([])
        y_trues = np.concatenate(true_batches, axis=0) if true_batches else np.array([])

        # NOTE(review): these calls pass (predictions, targets) while the
        # training loop above passes (targets, predictions). If the Metrics
        # helpers expect a fixed order, precision and recall may be swapped
        # here - confirm against utils.metrics before trusting them.
        val_acc = Metrics.calculate_accuracy(y_preds, y_trues)
        val_precision = Metrics.calculate_precision(y_preds, y_trues)
        val_recall = Metrics.calculate_recall(y_preds, y_trues)
        val_f1 = Metrics.calculate_f1_score(y_preds, y_trues)

        if neptune_logger:
            namespace = run[neptune_logger.base_namespace]
            namespace['train_loss'].append(train_loss)
            namespace['train_acc'].append(mean_train_acc)
            namespace['val_loss'].append(val_loss)
            namespace['val_acc'].append(val_acc)
            namespace['val_precision'].append(val_precision)
            namespace['val_recall'].append(val_recall)
            namespace['val_f1'].append(val_f1)

        print(f"Epoch: {epoch} Train Loss: {train_loss} Train Acc: {mean_train_acc} Val Loss: {val_loss} Val Acc: {val_acc}")
        #print(f"CUDA memory allocated: {torch.cuda.memory_allocated(device)/1024**3:.2f} GB")




In [8]:
# Use the GPU when PyTorch can see one, otherwise fall back to the CPU.
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
print(device)

cuda


In [9]:
# Pair each feature tensor with its target tensor so a DataLoader can serve
# (x, y) batches for the training and validation splits.
train_dataset, val_dataset = (
    torch.utils.data.TensorDataset(features, targets)
    for features, targets in ((X_Train, y_Train), (X_Val, y_Val))
)

In [10]:
# Hyperparameters and run metadata for this experiment.
# Only batch_size, shuffle, lr, weight_decay and epochs are read by the code
# below; the remaining entries describe the setup and are just recorded with
# the run. Key spellings are kept as-is because they are logged by name.
parameters = dict(
    batch_size=161,
    lr=0.00005667968103081318,
    epochs=35,
    shuffle=True,
    model_name="SimpleNetwork",
    optimizer="AdamW",
    criterion="BCEWithLogitsLoss",
    device=device,
    LayerInitialization="Xavier",
    drop_out=True,
    layerNormaization=False,
    batchNormaization=True,
    Threshold=0.5,
    weight_decay=0.004941861623778181,
)

In [11]:
# Mini-batch iterators: the training order follows the "shuffle" setting,
# the validation order is always fixed.
train_dataloader = torch.utils.data.DataLoader(
    train_dataset,
    batch_size=parameters["batch_size"],
    shuffle=parameters["shuffle"],
)
val_dataloader = torch.utils.data.DataLoader(
    val_dataset,
    batch_size=parameters["batch_size"],
    shuffle=False,
)

model = SimpleNetwork()

# AdamW with the configured learning rate and weight decay.
optimizer = torch.optim.AdamW(
    model.parameters(),
    lr=parameters["lr"],
    weight_decay=parameters["weight_decay"],
)
# The loss takes the raw logits produced by the model.
criterion = torch.nn.BCEWithLogitsLoss()


In [12]:
# Credentials and project name come from the environment (see load_dotenv above
# in the imports cell), so nothing secret is stored in the notebook.
run = neptune.init_run(
    api_token=os.getenv("NEPTUNE_API_KEY"),
    project=os.getenv("NEPTUNE_PROJECT_NAME"),
    name="SimpleNetwork - best optuna",
)

# try/finally guarantees the run is closed even when training or evaluation
# raises; otherwise an interrupted cell would leave the run open.
try:
    run["model/structure"] = str(model)

    neptune_logger = NeptuneLogger(run=run, model=model)

    # stringify_unsupported converts values Neptune cannot store natively
    # (for example the torch.device entry) before they are logged.
    run[neptune_logger.base_namespace]["hyperparams"] = stringify_unsupported(parameters)

    training(model, optimizer, criterion, train_dataloader, val_dataloader, parameters["epochs"], device=device, neptune_logger=neptune_logger, run=run)

    # evaluate() is a project helper that receives the prediction callable;
    # presumably it scores it on the held-out test split - confirm in utils.evaluation.
    metrics_test = evaluate(evaluate_from_dataframe)

    run["test"] = metrics_test.as_dict()

    neptune_logger.log_model()
finally:
    run.stop()



[neptune] [info   ] Neptune initialized. Open in the app: https://app.neptune.ai/JPL/rna-sequencing/e/RNAS-184


  0%|          | 0/35 [00:00<?, ?it/s]

Epoch: 0 Train Loss: 340.4381865262985 Train Acc: 0.0 Val Loss: 34.52456283569336 Val Acc: 0.005701754385964913
Epoch: 1 Train Loss: 237.21992021799088 Train Acc: 0.032342770952472205 Val Loss: 20.001052856445312 Val Acc: 0.11019736842105263
Epoch: 2 Train Loss: 118.08926412463188 Train Acc: 0.09816212685035709 Val Loss: 9.237150192260742 Val Acc: 0.10657894736842105
Epoch: 3 Train Loss: 58.92049115151167 Train Acc: 0.1027320128875148 Val Loss: 5.323086261749268 Val Acc: 0.11896929824561403
Epoch: 4 Train Loss: 37.39048132300377 Train Acc: 0.1279598017621147 Val Loss: 3.7923271656036377 Val Acc: 0.1543859649122807
Epoch: 5 Train Loss: 28.451277647167444 Train Acc: 0.15798962637152167 Val Loss: 3.056426525115967 Val Acc: 0.1875
Epoch: 6 Train Loss: 23.372356072068214 Train Acc: 0.18859490861082992 Val Loss: 2.533606767654419 Val Acc: 0.21611842105263157
Epoch: 7 Train Loss: 20.09332551434636 Train Acc: 0.22553492762743904 Val Loss: 2.1967129707336426 Val Acc: 0.25855263157894737
Epoch: 

In [16]:
# Persist only the learned weights (state_dict), not the module object itself;
# reloading therefore needs a fresh SimpleNetwork() plus load_state_dict().
torch.save(
    model.state_dict(),
    f"{git_root}/experiments/generating/model.pth",
)

In [14]:
# model = SimpleNetwork()
# model.load_state_dict(torch.load(f"{git_root}/experiments/generating/model.pth"))

In [15]:
# metrics_test = evaluate(evaluate_from_dataframe)

# for metric, value in metrics_test:
#     print(f"{metric}: {value}")