# SWEEM Model Implementation

This file is used to illustrate the preprocessing, training, and evaluation 
stages of our model. Comments and more information will be provided per section.

## Preprocessing

Here we load the pre-split training and test sets and hold out part of the training
data as a validation set. We then wrap each split in a DataLoader so it can be consumed
batch by batch in our training loops.

In [1]:
%load_ext autoreload
%autoreload 2

import pandas as pd
from sklearn.model_selection import train_test_split
import torch
from torch.utils.data import DataLoader, TensorDataset

# Load the pre-split training and testing tables.
TRAIN_CSV = './Data/Multiple/train.csv'
TEST_CSV = './Data/Multiple/test.csv'
train_data = pd.read_csv(TRAIN_CSV)
test_data = pd.read_csv(TEST_CSV)

# Positional column ranges of each group, shared by both tables.
# NOTE(review): these four feature ranges cover 7957 columns and the targets
# cover 3 more (7960 total), while the recorded table shape has 7961 columns;
# column 7957 is not assigned to any group here -- confirm what it holds.
COLUMN_GROUPS = (
    slice(None, 2708),     # 2708 rna
    slice(2708, 5404),     # 2696 scna
    slice(5404, 5591),     # 187 mutation
    slice(5591, 7957),     # 2366 methy
    slice(-3, None),       # 3 target
)

## Training Data
rna_train, scna_train, mutation_train, methy_train, target_train = (
    train_data.columns[group] for group in COLUMN_GROUPS
)

## Testing Data
rna_test, scna_test, mutation_test, methy_test, target_test = (
    test_data.columns[group] for group in COLUMN_GROUPS
)

In [2]:
### Sanity Checks on Data

# Data shapes; both tables should have the same number of columns
for caption, frame in (('Training Data Shape: ', train_data),    # (380, 7961)
                       ('Testing Data Shape: ', test_data)):     # (48 , 7961)
    print(caption, frame.shape)
print()

# Check header information: first two and last two column names of each group
header = train_data.columns
feature_groups = (
    ("RNA Labels:         ", 0, 2708),
    ("SCNA Labels:        ", 2708, 5404),
    ("Mutation Labels:    ", 5404, 5591),
    ("Methylation Labels: ", 5591, 7957),
)
for caption, start, stop in feature_groups:
    leading = list(header[start:start + 2])
    trailing = list(header[stop - 2:stop])
    print(f"{caption}{leading} ... {trailing}")
print(f"Target Labels:      {list(header[-3:])}")

Training Data Shape:  (380, 7961)
Testing Data Shape:  (48, 7961)

RNA Labels:         ['ST3GAL2_rna', 'ELOVL1_rna'] ... ['TK2_rna', 'DHODH_rna']
SCNA Labels:        ['ST3GAL2_scna', 'ELOVL1_scna'] ... ['TK2_scna', 'DHODH_scna']
Mutation Labels:    ['ABCG4_mutation', 'DAPK1_mutation'] ... ['CDH4_mutation', 'ACSL4_mutation']
Methylation Labels: ['ST3GAL2_methy', 'ELOVL1_methy'] ... ['PRKCQ_methy', 'TK2_methy']
Target Labels:      ['SAMPLE_ID', 'OS_MONTHS', 'OS_EVENT']


In [None]:
## Ground truth: OS_MONTHS, OS_EVENT
# Per the header check, the last three columns are SAMPLE_ID, OS_MONTHS, OS_EVENT:
# column -2 is OS_MONTHS and columns -2: are the (OS_MONTHS, OS_EVENT) pair.


def _to_dataset(features, labels):
    """Wrap a features frame and a labels frame/series as a TensorDataset."""
    return TensorDataset(torch.tensor(features.values), torch.tensor(labels.values))


# Split the data into train and validation sets (single-column label).
train_features, val_features, train_labels, val_labels = train_test_split(
    train_data.iloc[:, :-3], train_data.iloc[:, -2], test_size=0.2, random_state=42)

# Data in month, event format. Same random_state as above, so the rows land in
# the same train/validation partitions.
train_features_alt, val_features_alt, train_labels_alt, val_labels_alt = train_test_split(
    train_data.iloc[:, :-3], train_data.iloc[:, -2:], test_size=0.2, random_state=42)

test_features, test_labels = test_data.iloc[:, :-3], test_data.iloc[:, -2]

# Create Tensor datasets
train_dataset     = _to_dataset(train_features, train_labels)
train_dataset_alt = _to_dataset(train_features_alt, train_labels_alt)
val_dataset       = _to_dataset(val_features, val_labels)
val_dataset_alt   = _to_dataset(val_features_alt, val_labels_alt)
test_dataset      = _to_dataset(test_features, test_labels)

# Create DataLoader objects
batch_size = 8
train_dataloader     = DataLoader(train_dataset, batch_size=batch_size, shuffle=True)
train_dataloader_alt = DataLoader(train_dataset_alt, batch_size=batch_size, shuffle=True)
val_dataloader       = DataLoader(val_dataset, batch_size=batch_size, shuffle=False)
# Validation must read the held-out split (not the training set) and needs no shuffling.
val_dataloader_alt   = DataLoader(val_dataset_alt, batch_size=batch_size, shuffle=False)
test_dataloader      = DataLoader(test_dataset, batch_size=batch_size, shuffle=False)

## Define Self-Attention Model

In [None]:
from model import SelfAttentionModel 
import torch.optim as optim
import torch.nn as nn
from tqdm import tqdm

# Input width: the 7961 table columns minus the 3 target columns.
n_inputs = 7961 - 3
model = SelfAttentionModel(n_inputs, 2000, 2000)

# Use the first CUDA device when one is available, otherwise the CPU.
device_name = "cuda:0" if torch.cuda.is_available() else "cpu"
device = torch.device(device_name)
print("Running on", device)
model.to(device)

# Mean-squared-error objective optimised with Adam.
criterion = nn.MSELoss()
optimizer = optim.Adam(model.parameters(), lr=0.1)

In [None]:
class LinearRegression(nn.Module):
    """Fully connected network with five ReLU hidden layers and a sigmoid output.

    Layer widths are input_size -> hidden_size -> 300 -> 250 -> 100 -> 5 -> output_size.
    Because the last activation is a sigmoid, every output value lies in (0, 1).

    Args:
        input_size: number of input features per sample.
        hidden_size: width of the first hidden layer.
        output_size: number of values predicted per sample.
    """

    def __init__(self, input_size, hidden_size, output_size):
        super().__init__()
        self.linear1 = nn.Linear(input_size, hidden_size)
        self.linear2 = nn.Linear(hidden_size, 300)
        self.linear3 = nn.Linear(300, 250)
        self.linear4 = nn.Linear(250, 100)
        self.linear5 = nn.Linear(100, 5)
        self.linearOut = nn.Linear(5, output_size)
        self.relu = nn.ReLU()

    def forward(self, x, x2=None):
        """Run the network on a batch.

        Args:
            x: float tensor whose last dimension equals input_size.
            x2: unused. It is optional so the model can be called with a single
                tensor, as the training and evaluation loops do; two-argument
                calls keep working.

        Returns:
            Tensor with last dimension output_size and values in (0, 1).
        """
        hidden_layers = (self.linear1, self.linear2, self.linear3,
                         self.linear4, self.linear5)
        out = x
        for layer in hidden_layers:
            out = self.relu(layer(out))
        return torch.sigmoid(self.linearOut(out))
    
# Input width: the 7961 table columns minus the 3 target columns.
n_inputs = 7961 - 3
model = LinearRegression(n_inputs, 6000, 1)

# Use the first CUDA device when one is available, otherwise the CPU.
device_name = "cuda:0" if torch.cuda.is_available() else "cpu"
device = torch.device(device_name)
print("Running on", device)
model.to(device)

# Mean-squared-error objective optimised with Adam.
criterion = nn.MSELoss()
optimizer = optim.Adam(model.parameters(), lr=0.001)

In [None]:
### Training Loop
# Trains `model` with `criterion` on the single-column labels and records the
# mean per-batch loss of every epoch for both the training and validation sets.
from loss import R_set, neg_par_log_likelihood

num_epochs = 10
epoch_train_losses = []
epoch_val_losses   = []

for epoch in range(num_epochs):
    epoch_train_loss = 0
    epoch_val_loss   = 0
    print(f"Epoch {epoch + 1} training:")

    ## Training
    model.train()
    # Iterating through tqdm advances the bar once per batch and closes it when
    # the epoch ends, so bars from earlier epochs are not left open.
    for batchX, batchY in tqdm(train_dataloader):
        inputs = batchX.to(device).to(torch.float32)
        targets = batchY.to(device).to(torch.float32)

        # Clear gradients before the backward pass so nothing left over from a
        # previous (possibly interrupted) run leaks into this step.
        optimizer.zero_grad()

        # Forward pass; match the target shape explicitly so a final batch of
        # size 1 keeps its batch dimension.
        outputs = model(inputs).reshape(targets.shape)

        # MSE Backward pass
        loss = criterion(outputs, targets)
        loss.backward()
        optimizer.step()

        epoch_train_loss += loss.item()

    ## Validation
    model.eval()
    with torch.no_grad():
        for batchX, batchY in val_dataloader:
            inputs = batchX.to(device).to(torch.float32)
            targets = batchY.to(device).to(torch.float32)
            outputs = model(inputs).reshape(targets.shape)
            epoch_val_loss += criterion(outputs, targets).item()

    # Save and print losses (averaged over the number of batches)
    epoch_train_loss /= len(train_dataloader)
    epoch_val_loss /= len(val_dataloader)
    epoch_train_losses.append(epoch_train_loss)
    epoch_val_losses.append(epoch_val_loss)
    print(f"Epoch {epoch + 1} training loss: {epoch_train_loss}")
    print(f"Epoch {epoch + 1} validation loss: {epoch_val_loss}")

## Train Loop with Alt Dataloader

In [None]:
### Training Loop
# Trains `model` on the two-column labels: column 0 of each label batch is
# passed to the loss as `time` and column 1 as `event`. The mean per-batch loss
# of every epoch is recorded for both dataloaders.
from loss import neg_par_log_likelihood, temp_loss
num_epochs = 10
epoch_train_losses = []
epoch_val_losses   = []

for epoch in range(num_epochs):
    epoch_train_loss = 0
    epoch_val_loss   = 0
    print(f"Epoch {epoch + 1} training:")

    ## Training
    model.train()
    # Iterating through tqdm advances the bar once per batch and closes it when
    # the epoch ends, so bars from earlier epochs are not left open.
    for batchX, batchY in tqdm(train_dataloader_alt):
        # Clear gradients before the backward pass so nothing left over from a
        # previous (possibly interrupted) run leaks into this step.
        optimizer.zero_grad()

        # Forward pass (outputs are passed to the loss without squeezing)
        outputs = model(batchX.to(device).to(torch.float32))

        # Alt Backward Pass
        # neg_par_log_likelihood(outputs, time, event) is the alternative loss.
        time, event = batchY[:, 0], batchY[:, 1]
        loss = temp_loss(outputs, time, event)
        loss.backward()
        optimizer.step()

        epoch_train_loss += loss.item()

    ## Validation
    model.eval()
    with torch.no_grad():
        for batchX, batchY in val_dataloader_alt:
            outputs = model(batchX.to(device).to(torch.float32))
            time, event = batchY[:, 0], batchY[:, 1]
            epoch_val_loss += temp_loss(outputs, time, event).item()

    # Save and print losses (averaged over the number of batches)
    epoch_train_loss /= len(train_dataloader_alt)
    epoch_val_loss /= len(val_dataloader_alt)
    epoch_train_losses.append(epoch_train_loss)
    epoch_val_losses.append(epoch_val_loss)
    print(f"Epoch {epoch + 1} training loss: {epoch_train_loss}")
    print(f"Epoch {epoch + 1} validation loss: {epoch_val_loss}")

In [106]:
### Sanity Checking Outputs

# Compare the model's predictions with the labels on the first test batch.
model.eval()
sample_number = 5  # not read in this cell; kept in case another cell uses it

# Take only the first batch. The batch gets its own names so the `test_labels`
# Series created during preprocessing is not overwritten.
first_batch = next(iter(test_dataloader), None)
if first_batch is not None:
    sample_features, sample_targets = first_batch
    # Inference only: skip autograd bookkeeping.
    with torch.no_grad():
        outputs = model(sample_features.to(device).to(torch.float32))
    print("Intended Output: ", sample_targets)
    print("Actual Output: ", outputs)

Intended Output:  tensor([0, 0, 0, 0, 1, 1, 0, 0])
Actual Output:  tensor([[0.3996],
        [0.3996],
        [0.3996],
        [0.3996],
        [0.3996],
        [0.3996],
        [0.3996],
        [0.3996]], device='cuda:0', grad_fn=<SigmoidBackward0>)


