In [1]:
import sys
import os
import pandas as pd
import torch
import torch.nn as nn
import numpy as np
import matplotlib.pyplot as plt
from tqdm.notebook import tqdm
# Extend the import search path so the project-local `utils` package can be imported.
# NOTE(review): "../../" is resolved against the kernel's working directory, so this
# presumably assumes the notebook is started from its own folder — confirm.
sys.path.append("../../")
from utils.evaluation import evaluate
from utils.metrics import Metrics



In [2]:
import neptune
from neptune_pytorch import NeptuneLogger
from neptune.utils import stringify_unsupported
from dotenv import load_dotenv
# Load environment variables from a .env file; the NEPTUNE_* settings are read
# with os.getenv when the Neptune run is created further down.
load_dotenv()

True

In [3]:

import pickle
from git import Repo

# Resolve the repository root so data paths do not depend on the working directory.
repo = Repo(".", search_parent_directories=True)
git_root = repo.git.rev_parse("--show-toplevel")


def _load_pickle(path):
    """Unpickle and return the object stored at ``path``, closing the file afterwards.

    NOTE(review): ``pickle.load`` can execute arbitrary code; only use it on
    split files produced by this project.
    """
    with open(path, "rb") as handle:
        return pickle.load(handle)


# Load data
X_Train_pd = _load_pickle(f"{git_root}/data/splits/train/X_pandas.pck")
y_Train_pd = _load_pickle(f"{git_root}/data/splits/train/y_pandas.pck")

X_Val_pd = _load_pickle(f"{git_root}/data/splits/val/X_pandas.pck")
y_Val_pd = _load_pickle(f"{git_root}/data/splits/val/y_pandas.pck")

In [4]:
# Convert every pandas split into a float32 tensor (same order as the names on the left).
X_Train, y_Train, X_Val, y_Val = (
    torch.tensor(frame.values, dtype=torch.float32)
    for frame in (X_Train_pd, y_Train_pd, X_Val_pd, y_Val_pd)
)


In [5]:
# class SimpleNetwork(torch.nn.Module):
#     def __init__(self):
#         super().__init__()

#         self.layers = torch.nn.Sequential(
#             torch.nn.Linear(5045, 5045),
#             torch.nn.LeakyReLU(0.1),
#             torch.nn.Linear(5045, 5045),
#             torch.nn.LeakyReLU(0.1),
#             torch.nn.Linear(5045, 2000),
#             torch.nn.LeakyReLU(0.1),
#             torch.nn.Linear(2000, 1000),
#             torch.nn.LeakyReLU(0.1),
#             torch.nn.Linear(1000, 300),
#             torch.nn.LeakyReLU(0.1),
#             torch.nn.Linear(300, 105),
#         )

#     def forward(self, x):
#         return self.layers(x)

class SimpleNetwork(torch.nn.Module):
    """Fully connected network that outputs raw (pre-sigmoid) scores.

    Every hidden stage is ``Linear -> LeakyReLU -> BatchNorm1d -> Dropout``; the
    final stage is a bare ``Linear`` layer. All ``Linear`` weights are
    re-initialised with Xavier-normal; biases keep PyTorch's default init.

    Args:
        in_features: Width of the input vectors.
        hidden_sizes: Output width of each hidden stage, in order.
        out_features: Width of the output layer.
        negative_slope: Slope used by every ``LeakyReLU``.
        dropout: Drop probability used by every ``Dropout``.

    The defaults reproduce the original hard-coded architecture
    (5045 -> 5045 -> 5045 -> 2000 -> 1000 -> 300 -> 105), including the
    ``layers.<index>`` parameter names, so ``SimpleNetwork()`` and existing
    checkpoints keep working.
    """

    def __init__(
        self,
        in_features=5045,
        hidden_sizes=(5045, 5045, 2000, 1000, 300),
        out_features=105,
        negative_slope=0.1,
        dropout=0.2,
    ):
        super().__init__()

        modules = []
        fan_in = in_features
        for width in hidden_sizes:
            modules.extend(
                [
                    torch.nn.Linear(fan_in, width),
                    torch.nn.LeakyReLU(negative_slope),
                    torch.nn.BatchNorm1d(width),
                    torch.nn.Dropout(p=dropout),
                ]
            )
            fan_in = width
        # Output layer: no activation, normalisation or dropout.
        modules.append(torch.nn.Linear(fan_in, out_features))

        self.layers = torch.nn.Sequential(*modules)

        # Apply Xavier initialization
        for layer in self.layers:
            if isinstance(layer, torch.nn.Linear):
                torch.nn.init.xavier_normal_(layer.weight)

    def forward(self, x):
        """Run ``x`` through the layer stack and return the resulting scores."""
        return self.layers(x)


In [6]:
class rnaDataset(torch.utils.data.Dataset):
    """Map-style dataset that pairs entry ``i`` of ``X`` with entry ``i`` of ``y``."""

    def __init__(self, X, y):
        # Both containers are stored as given; nothing is copied or converted.
        self.X, self.y = X, y

    def __len__(self):
        """Number of samples, read from the first dimension of ``X``."""
        n_samples = self.X.shape[0]
        return n_samples

    def __getitem__(self, idx):
        """Return the ``(features, target)`` pair stored at ``idx``."""
        features = self.X[idx]
        target = self.y[idx]
        return features, target

In [7]:
def label_from_logits(y_hat: torch.Tensor, threshold = 0.5) -> torch.Tensor:
    """Turn raw logits into hard 0/1 labels.

    A sigmoid is applied to ``y_hat`` and every entry strictly greater than
    ``threshold`` becomes ``1.0``; all others become ``0.0``. The computation
    runs with gradient tracking disabled.
    """
    with torch.no_grad():
        probabilities = torch.sigmoid(y_hat)
        is_positive = probabilities > threshold
        return is_positive.float()


def evaluate_from_dataframe(X: pd.DataFrame) -> pd.DataFrame:
    """Predict 0/1 labels for every row of ``X`` using the notebook-level ``model``.

    Args:
        X: Feature table; it is converted to a float32 tensor and passed to
            the model in a single forward pass.

    Returns:
        A DataFrame of thresholded predictions (default threshold of
        ``label_from_logits``), one row per input row.

    Side effects:
        The global ``model`` is switched to eval mode and moved to the CPU.
    """
    X_tensor = torch.tensor(X.to_numpy(), dtype=torch.float32)

    # model: a pytorch model, which transforms X -> y in torch.Tensor format
    model.eval()
    model.cpu()
    # Inference only: skip autograd bookkeeping so no graph is kept alive for
    # the whole evaluation set.
    with torch.no_grad():
        logits = model(X_tensor)
    y_pred_tensor = label_from_logits(logits)

    return pd.DataFrame(y_pred_tensor.numpy())

def training(model, optimizer, criterion, train_dataloader, val_dataloder, epochs, device, neptune_logger=None, run = None):
    """Train ``model`` for ``epochs`` epochs and validate after each epoch.

    Args:
        model: Network to optimise; moved to ``device``.
        optimizer: Optimiser already bound to ``model``'s parameters.
        criterion: Loss taking ``(model_output, target)``; moved to ``device``.
        train_dataloader: Iterable of ``(x, y)`` training batches.
        val_dataloder: Iterable of ``(x, y)`` validation batches.
        epochs: Number of passes over ``train_dataloader``.
        device: Device on which batches, model and loss are placed.
        neptune_logger: When truthy, per-epoch metrics are appended to ``run``
            under ``neptune_logger.base_namespace``.
        run: Neptune run receiving the metrics; required when
            ``neptune_logger`` is given.

    Returns:
        None. Metrics are printed once per epoch and optionally logged.

    Notes:
        ``train_loss`` and ``val_loss`` are sums of per-batch losses, not
        means, so their magnitudes depend on the number of batches.
        ``train_acc`` is the unweighted mean of per-batch accuracies.
    """
    criterion = criterion.to(device)
    model = model.to(device)
    for epoch in tqdm(range(epochs)):
        train_acc = 0
        train_loss = 0
        val_loss = 0
        n_train_batches = 0

        model.train()
        for x, y in train_dataloader:
            x = x.to(device)
            y = y.to(device)
            y_pred = model(x)
            loss = criterion(y_pred, y)
            train_loss += loss.item()
            loss.backward()
            optimizer.step()
            optimizer.zero_grad()
            train_acc += Metrics.calculate_accuracy(y.cpu().numpy(), label_from_logits(y_pred).cpu().numpy())
            n_train_batches += 1
        # Guard against an empty training loader instead of dividing by zero.
        mean_train_acc = train_acc / n_train_batches if n_train_batches else 0.0

        model.eval()
        pred_batches = []
        true_batches = []
        with torch.no_grad():
            for x, y in val_dataloder:
                x = x.to(device)
                y = y.to(device)
                y_pred = model(x)
                # .item() keeps val_loss a plain Python float, like train_loss,
                # instead of accumulating (and logging) a device tensor.
                val_loss += criterion(y_pred, y).item()
                pred_batches.append(label_from_logits(y_pred).cpu().numpy())
                true_batches.append(y.cpu().numpy())

        # Stack once after the loop; re-stacking inside the loop copies all
        # earlier batches on every iteration.
        y_preds = np.vstack(pred_batches) if pred_batches else np.array([])
        y_trues = np.vstack(true_batches) if true_batches else np.array([])

        # NOTE(review): predictions are passed first here, while the training
        # loop passes the targets first. If Metrics expects (y_true, y_pred),
        # precision and recall below are swapped — confirm against utils.metrics.
        val_acc = Metrics.calculate_accuracy(y_preds, y_trues)
        val_precision = Metrics.calculate_precision(y_preds, y_trues)
        val_recall = Metrics.calculate_recall(y_preds, y_trues)
        val_f1 = Metrics.calculate_f1_score(y_preds, y_trues)

        if neptune_logger:
            run[neptune_logger.base_namespace]['train_loss'].append(train_loss)
            run[neptune_logger.base_namespace]['train_acc'].append(mean_train_acc)
            run[neptune_logger.base_namespace]['val_loss'].append(val_loss)
            run[neptune_logger.base_namespace]['val_acc'].append(val_acc)
            run[neptune_logger.base_namespace]['val_precision'].append(val_precision)
            run[neptune_logger.base_namespace]['val_recall'].append(val_recall)
            run[neptune_logger.base_namespace]['val_f1'].append(val_f1)

        print(f"Epoch: {epoch} Train Loss: {train_loss} Train Acc: {mean_train_acc} Val Loss: {val_loss} Val Acc: {val_acc}")
        #print(f"CUDA memory allocated: {torch.cuda.memory_allocated(device)/1024**3:.2f} GB")




In [8]:
# Use the GPU when CUDA is available, otherwise fall back to the CPU.
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
print(device)

cuda


In [9]:
# Pair each feature tensor with its target tensor: training split first, then validation.
train_dataset, val_dataset = (
    torch.utils.data.TensorDataset(features, targets)
    for features, targets in ((X_Train, y_Train), (X_Val, y_Val))
)

In [10]:
# Run configuration. The whole dict is logged as the run's hyperparameters.
# "batch_size", "lr", "epochs", "shuffle" and "seed" are read by code in this
# notebook; the remaining entries appear to be descriptive labels only.
parameters = {
    "batch_size": 512,
    "lr": 0.001,
    "epochs": 50,
    "shuffle": True,
    "model_name": "SimpleNetwork",
    "optimizer": "AdamW",
    "criterion": "BCEWithLogitsLoss",
    "device": device,
    "LayerInitialization": "Xavier",
    "drop_out": True,
    "layerNormaization": False,
    "batchNormaization": True,
    "Threshold": 0.5,
    "seed": 42,
}

# Seed the random number generators before the model and data loaders are
# created, so weight initialisation, shuffling and dropout are repeatable
# across kernel restarts (GPU kernels may still be non-deterministic).
torch.manual_seed(parameters["seed"])
np.random.seed(parameters["seed"])

In [11]:
# Mini-batch loaders: the training order follows the configured shuffle flag,
# validation always keeps its original order.
train_dataloader = torch.utils.data.DataLoader(
    dataset=train_dataset,
    shuffle=parameters["shuffle"],
    batch_size=parameters["batch_size"],
)
val_dataloader = torch.utils.data.DataLoader(
    dataset=val_dataset,
    shuffle=False,
    batch_size=parameters["batch_size"],
)

model = SimpleNetwork()

# Loss on raw logits, plus AdamW over all model parameters.
criterion = nn.BCEWithLogitsLoss()
optimizer = torch.optim.AdamW(params=model.parameters(), lr=parameters["lr"])


In [12]:
# Create the Neptune run; credentials and project name come from the environment.
run = neptune.init_run(
    api_token=os.getenv("NEPTUNE_API_KEY"),
    project=os.getenv("NEPTUNE_PROJECT_NAME"),
    name="SimpleNetwork - with B.Norm",
)

# try/finally ensures the run is stopped even when training or evaluation
# raises; otherwise the run would stay open.
try:
    run["model/structure"] = str(model)

    neptune_logger = NeptuneLogger(run=run, model=model)

    run[neptune_logger.base_namespace]["hyperparams"] = stringify_unsupported(parameters)

    training(model, optimizer, criterion, train_dataloader, val_dataloader, parameters["epochs"], device=device, neptune_logger=neptune_logger, run=run)

    # `evaluate` receives the prediction callback and returns the test metrics.
    metrics_test = evaluate(evaluate_from_dataframe)

    run["test"] = metrics_test.as_dict()

    neptune_logger.log_model()
finally:
    run.stop()



[neptune] [info   ] Neptune initialized. Open in the app: https://app.neptune.ai/JPL/rna-sequencing/e/RNAS-142


  0%|          | 0/50 [00:00<?, ?it/s]

Epoch: 0 Train Loss: 54.263460136950016 Train Acc: 0.06099136884427833 Val Loss: 1.2756699323654175 Val Acc: 0.13168859649122808
Epoch: 1 Train Loss: 7.525874361395836 Train Acc: 0.1636092669901937 Val Loss: 0.7360912561416626 Val Acc: 0.20877192982456141
Epoch: 2 Train Loss: 5.341028776019812 Train Acc: 0.26032838539573455 Val Loss: 0.5790475606918335 Val Acc: 0.31030701754385964
Epoch: 3 Train Loss: 4.3627725429832935 Train Acc: 0.35589655046820995 Val Loss: 0.5163073539733887 Val Acc: 0.3712719298245614
Epoch: 4 Train Loss: 3.7141436003148556 Train Acc: 0.426295761829167 Val Loss: 0.4726189076900482 Val Acc: 0.41853070175438595
Epoch: 5 Train Loss: 3.2597478330135345 Train Acc: 0.4823995299480214 Val Loss: 0.45921918749809265 Val Acc: 0.44956140350877194
Epoch: 6 Train Loss: 2.901451548561454 Train Acc: 0.5268239528072503 Val Loss: 0.45880505442619324 Val Acc: 0.4611842105263158
Epoch: 7 Train Loss: 2.6060511264950037 Train Acc: 0.5680087617722316 Val Loss: 0.45976361632347107 Val A

In [13]:
#torch.save(model.state_dict(), f"{git_root}/experiments/generating/model.pth")

In [14]:
# model = SimpleNetwork()
# model.load_state_dict(torch.load(f"{git_root}/experiments/generating/model.pth"))

In [15]:
# metrics_test = evaluate(evaluate_from_dataframe)

# for metric, value in metrics_test:
#     print(f"{metric}: {value}")