In [1]:
import sys
import pandas as pd
import torch
import torch.nn as nn
import numpy as np
import matplotlib.pyplot as plt
from tqdm.notebook import tqdm
sys.path.append("../../")
from utils.evaluation import evaluate
from utils.metrics import Metrics


In [2]:

import pickle
from git import Repo

# Get the git root directory so data paths work from any notebook location
repo = Repo(".", search_parent_directories=True)
git_root = repo.git.rev_parse("--show-toplevel")


def _load_pickle(path):
    """Unpickle the object stored at ``path`` and close the file afterwards.

    SECURITY: ``pickle`` can execute arbitrary code while loading, so this must
    only be pointed at files produced by this project.
    """
    # A context manager closes the handle deterministically; the previous
    # `pickle.load(open(...))` form left every file open until garbage collection.
    with open(path, "rb") as handle:
        return pickle.load(handle)


# Load data
X_Train_pd = _load_pickle(f"{git_root}/data/splits/train/X_pandas.pck")
y_Train_pd = _load_pickle(f"{git_root}/data/splits/train/y_pandas.pck")

X_Val_pd = _load_pickle(f"{git_root}/data/splits/val/X_pandas.pck")
y_Val_pd = _load_pickle(f"{git_root}/data/splits/val/y_pandas.pck")

In [3]:
# Convert every split to a float32 tensor in one pass. `.values` is read from
# each loaded object and `torch.tensor` copies that data into a new tensor.
X_Train, y_Train, X_Val, y_Val = (
    torch.tensor(frame.values, dtype=torch.float32)
    for frame in (X_Train_pd, y_Train_pd, X_Val_pd, y_Val_pd)
)


In [4]:
class SimpleNetwork(torch.nn.Module):
    """Fully connected network with LeakyReLU activations that outputs raw logits.

    The layer stack is ``Linear -> LeakyReLU`` for every hidden width, followed
    by a final ``Linear`` with no activation. The defaults rebuild the original
    hard-coded architecture (5045 -> 5045 -> 5045 -> 2000 -> 1000 -> 300 -> 105)
    with the same ``state_dict`` keys, so existing checkpoints still load.

    Args:
        in_features: Size of each input sample.
        out_features: Size of each output sample (one logit per label).
        hidden_sizes: Output widths of the hidden ``Linear`` layers, in order.
            May be empty, in which case the network is a single ``Linear``.
        negative_slope: Slope used by every ``LeakyReLU``.
    """

    def __init__(
        self,
        in_features=5045,
        out_features=105,
        hidden_sizes=(5045, 5045, 2000, 1000, 300),
        negative_slope=0.1,
    ):
        super().__init__()

        widths = [in_features, *hidden_sizes, out_features]
        modules = []
        for fan_in, fan_out in zip(widths[:-1], widths[1:]):
            modules.append(torch.nn.Linear(fan_in, fan_out))
            modules.append(torch.nn.LeakyReLU(negative_slope))
        # The output layer stays linear: drop the activation appended after it.
        modules.pop()

        self.layers = torch.nn.Sequential(*modules)

    def forward(self, x):
        """Return the un-activated outputs of the final linear layer for ``x``."""
        return self.layers(x)


In [5]:
class rnaDataset(torch.utils.data.Dataset):
    """Map-style dataset that pairs entry ``i`` of ``X`` with entry ``i`` of ``y``."""

    def __init__(self, X, y):
        # Both containers are stored by reference; nothing is copied or converted.
        self.X, self.y = X, y

    def __len__(self):
        # The number of samples is the size of the first axis of X.
        n_samples = self.X.shape[0]
        return n_samples

    def __getitem__(self, idx):
        features = self.X[idx]
        target = self.y[idx]
        return features, target

In [11]:
def label_from_logits(y_hat: torch.Tensor, threshold = 0.5) -> torch.Tensor:
    """Turn raw logits into hard 0/1 labels.

    A sigmoid is applied to ``y_hat`` and every entry strictly greater than
    ``threshold`` becomes ``1.0``; all others become ``0.0``. The computation
    runs with gradient tracking disabled, so the result is detached from any
    autograd graph attached to ``y_hat``.
    """
    with torch.no_grad():
        probabilities = torch.sigmoid(y_hat)
        is_positive = torch.gt(probabilities, threshold)
        return is_positive.to(torch.float32)


def evaluate_from_dataframe(X: pd.DataFrame) -> pd.DataFrame:
    """Predict hard 0/1 labels for the rows of ``X`` with the global ``model``.

    Args:
        X: Feature frame; it is converted to a float32 tensor and fed to the
            module-level ``model`` as a single batch.

    Returns:
        A new DataFrame of 0.0/1.0 predictions built from the thresholded
        sigmoid of the model output. It carries a fresh default index and
        default column labels rather than those of ``X``.

    Note:
        ``model`` is read from module scope at call time, so whichever network
        is bound to that name when this function runs is the one evaluated.
        The call also leaves that model in eval mode.
    """
    X_tensor = torch.tensor(X.to_numpy(), dtype=torch.float32)

    # model: a pytorch model, which transforms X -> y in torch.Tensor format
    model.eval()
    # Inference only: disabling autograd avoids building a graph (and keeping
    # every intermediate activation) for the whole input frame.
    with torch.no_grad():
        logits = model(X_tensor)
    y_pred_tensor = label_from_logits(logits)

    return pd.DataFrame(y_pred_tensor.numpy())

def training(model, optimizer, criterion, train_dataloader, val_dataloder, epochs):
    """Train ``model`` for ``epochs`` epochs and print a summary line per epoch.

    Args:
        model: Network to optimise; switched between train and eval mode here.
        optimizer: Optimizer already bound to ``model``'s parameters.
        criterion: Loss called as ``criterion(logits, targets)``.
        train_dataloader: Iterable of ``(x, y)`` training batches.
        val_dataloder: Iterable of ``(x, y)`` validation batches. The misspelt
            name is kept so existing keyword callers keep working.
        epochs: Number of passes over ``train_dataloader``.

    Returns:
        None. Per-epoch metrics are printed only. The reported losses are sums
        over batches, while the accuracies are averaged over batches.
    """
    for epoch in range(epochs):
        train_acc = 0
        val_acc = 0
        train_loss = 0
        val_loss = 0

        model.train()
        for x, y in tqdm(train_dataloader):
            # Clear gradients before the backward pass so anything accumulated
            # before this call cannot leak into the first update.
            optimizer.zero_grad()
            y_pred = model(x)
            loss = criterion(y_pred, y)
            loss.backward()
            optimizer.step()

            train_loss += loss.item()
            train_acc += Metrics.calculate_accuracy(y.numpy(), label_from_logits(y_pred).numpy())

        model.eval()
        with torch.no_grad():
            for x, y in val_dataloder:
                y_pred = model(x)
                # .item() keeps the running total a Python float, matching train_loss.
                val_loss += criterion(y_pred, y).item()
                val_acc += Metrics.calculate_accuracy(y.numpy(), label_from_logits(y_pred).numpy())

        # Guard the averages so an empty loader does not raise ZeroDivisionError.
        n_train_batches = max(len(train_dataloader), 1)
        n_val_batches = max(len(val_dataloder), 1)

        print(f"Epoch: {epoch} Train Loss: {train_loss} Train Acc: {train_acc/n_train_batches} Val Loss: {val_loss} Val Acc: {val_acc/n_val_batches}")




In [7]:
# Pair each split's feature tensor with its label tensor so a DataLoader can
# yield (x, y) batches.
train_dataset, val_dataset = (
    torch.utils.data.TensorDataset(features, labels)
    for features, labels in ((X_Train, y_Train), (X_Val, y_Val))
)

In [16]:
# Hyper-parameters of this run, named so they can be tuned in one place.
SEED = 42
BATCH_SIZE = 512
LEARNING_RATE = 0.001
EPOCHS = 10

# Seed torch's global RNG before anything stochastic happens, so weight
# initialisation and batch shuffling are repeatable on "Restart & Run All".
torch.manual_seed(SEED)

train_dataloader = torch.utils.data.DataLoader(train_dataset, batch_size=BATCH_SIZE, shuffle=True)
# Validation order does not matter for the metrics, so it is left unshuffled.
val_dataloader = torch.utils.data.DataLoader(val_dataset, batch_size=BATCH_SIZE, shuffle=False)

model = SimpleNetwork()

optimizer = torch.optim.AdamW(model.parameters(), lr=LEARNING_RATE)
# BCEWithLogitsLoss applies the sigmoid itself, so the model outputs raw logits.
criterion = torch.nn.BCEWithLogitsLoss()

training(model, optimizer, criterion, train_dataloader, val_dataloader, EPOCHS)

  0%|          | 0/143 [00:00<?, ?it/s]

Epoch: 0 Train Loss: 11.91609650477767 Train Acc: 0.07617663706025775 Val Loss: 1.084060549736023 Val Acc: 0.08850827991452992


  0%|          | 0/143 [00:00<?, ?it/s]

Epoch: 1 Train Loss: 7.790685672312975 Train Acc: 0.10257116715899044 Val Loss: 0.8984967470169067 Val Acc: 0.12370626335470086


In [19]:

# Sanity check: push the first validation row (converted to a sparse tensor)
# through the network in eval mode and show the per-label sigmoid outputs.
model.eval()
y_hat = model(X_Val[:1].to_sparse())
print(y_hat.sigmoid())



tensor([[1.0634e-05, 1.0258e-05, 1.9013e-04, 5.8805e-06, 1.4959e-04, 1.8044e-04,
         3.6646e-04, 1.6590e-05, 9.0268e-07, 1.9725e-04, 6.3482e-02, 1.6193e-03,
         7.5242e-06, 6.7861e-04, 1.8010e-05, 6.7315e-03, 6.1281e-03, 5.8049e-02,
         3.5786e-04, 1.0614e-04, 1.4006e-04, 1.4745e-05, 1.4580e-02, 3.3396e-05,
         5.1725e-05, 7.4024e-06, 1.4748e-04, 5.4765e-06, 7.0496e-06, 4.0057e-04,
         1.8596e-03, 1.1568e-02, 5.3106e-05, 9.5422e-02, 4.1390e-05, 8.5477e-05,
         2.0739e-04, 1.3044e-03, 2.6656e-05, 3.4249e-04, 3.6442e-05, 5.5484e-04,
         7.8637e-05, 2.3134e-04, 1.8367e-03, 1.8434e-04, 2.2175e-05, 1.0316e-05,
         3.1426e-03, 7.0401e-06, 5.0942e-04, 1.1907e-03, 1.3823e-03, 1.3899e-04,
         1.6465e-04, 7.1561e-04, 3.1723e-04, 8.0935e-04, 3.4670e-05, 4.2775e-06,
         1.4803e-03, 4.0289e-02, 6.6162e-05, 9.4952e-05, 1.5338e-03, 1.3086e-02,
         1.3491e-03, 1.0604e-03, 5.8041e-05, 2.8500e-05, 2.8337e-04, 3.9093e-05,
         9.7599e-03, 2.5940e

In [62]:
# Evaluate the model on the first 300 validation rows.
# NOTE(review): this passes (model, X, y), while the last cell of the notebook
# calls `evaluate(evaluate_from_dataframe)` with a single callable. Presumably
# utils.evaluation.evaluate changed between the two runs (the execution counts
# are out of order) — confirm this call still works on a fresh kernel.
evaluate(model, X_Val[0:300], y_Val[0:300])

torch.Size([300, 105])
torch.Size([300, 105])


tensor(0.1967)

In [82]:
#torch.save(model.state_dict(), f"{git_root}/experiments/generating/model.pth")

In [18]:
# Rebuild the architecture, then restore the trained weights from disk.
model = SimpleNetwork()
# weights_only=True restricts unpickling to tensors and plain containers, so a
# tampered checkpoint cannot execute code on load. map_location="cpu" makes the
# load independent of the device the checkpoint was saved from. The result is
# passed straight to load_state_dict so no second copy of the weights lingers
# in a notebook global.
model.load_state_dict(
    torch.load(
        f"{git_root}/experiments/generating/model.pth",
        map_location="cpu",
        weights_only=True,
    )
)

  model.load_state_dict(torch.load(f"{git_root}/experiments/generating/model.pth"))


<All keys matched successfully>

In [19]:
# Hand the DataFrame -> DataFrame prediction function to the project's
# evaluation helper. evaluate_from_dataframe reads the module-level `model`,
# so the network bound to that name when this cell runs is the one scored.
metrics_test = evaluate(evaluate_from_dataframe)

# NOTE(review): the unpacking assumes iterating `metrics_test` yields
# (name, value) pairs — confirm against utils.evaluation.evaluate.
for metric, value in metrics_test:
    print(f"{metric}: {value}")

accuracy: 0.42774122807017545
precision: 0.650244324304447
recall: 0.42416622145821437
auc: 0.7111511141993914
f1_score: 0.4896933622744449
